1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
# Benchmark for evaluating the best ways to read from a PyTables Table
- import sys
- import bcolz
- import tables as tb
- import numpy as np
- from time import time
# Python 2/3 compatibility: make a lazy ``xrange`` and a list-returning
# ``range`` available on both major versions.
PY2 = sys.version_info[0] == 2

if not PY2:
    # Keep a handle on the lazy builtin before shadowing it below.
    xrange = range

    def range(*args):
        """Return the integers produced by ``xrange(*args)`` as a list."""
        lazy = xrange(*args)
        return list(lazy)
# Where the benchmark table lives: the HDF5 file on disk and the node
# path inside that file.
filepath = 'fromhdf5.h5'
nodepath = '/ctable'

# Table geometry: NR rows by NC columns.
NR = 10 ** 6
NC = 10

# Payload size in units of 2**30 bytes, counting 4 bytes per value (the
# columns are created with dtype 'i4' below).  Used for throughput figures.
dsize = NR * NC * 4 / 2.0 ** 30

# Default compression level for the bcolz containers created afterwards.
bcolz.cparams.setdefaults(clevel=5)

# Reference data: the same integer ramp repeated NC times as the columns of
# a ctable, then sliced with [:] (presumably yielding an in-memory NumPy
# structured array -- confirm against the bcolz docs).
a = bcolz.arange(NR, dtype='i4')
# Alternative construction kept from the original script:
#ra = np.rec.fromarrays([a]*NC, names=['f%d'%i for i in range(NC)])
ra = bcolz.ctable((a,) * NC)[:]
# Write the reference data to a fresh HDF5 file and time the operation.
t0 = time()
# Open through the context manager so the file is closed (and flushed) even
# if create_table() raises; the original only closed it on success.
with tb.open_file(filepath, "w") as f:
    # nodepath[1:] drops the leading '/' to get the node name under the root.
    f.create_table(f.root, nodepath[1:], ra)
tt = time() - t0
print("time for storing the HDF5 table: %.2f (%.2f GB/s)" % (tt, dsize / tt))
# Using an iterator
t0 = time()
# The context manager guarantees the HDF5 file is closed even if iterating
# the table or building the container fails part-way through.
with tb.open_file(filepath) as f:
    t = f.get_node(nodepath)
    # ``t`` is rebound from the HDF5 table to the bcolz result; every
    # argument (the row generator's source, dtype and count) is evaluated
    # against the table before the rebinding takes place.
    t = bcolz.fromiter((r[:] for r in t), dtype=t.dtype, count=len(t))
tt = time() - t0
print("time with fromiter: %.2f (%.2f GB/s)" % (tt, dsize / tt))
# Using blocked read
t0 = time()
# The context manager guarantees the HDF5 file is closed even if a read or
# an append fails part-way through.
with tb.open_file(filepath) as f:
    t = f.get_node(nodepath)
    names = t.colnames
    # Look every dtype up by column name so ``dtypes`` is guaranteed to line
    # up with ``names``.  Iterating ``dtype.fields.values()`` relies on the
    # mapping's iteration order, which is not guaranteed to follow column
    # order on every Python version this script supports.
    dtypes = [t.dtype.fields[name][0] for name in names]
    # Start from zero-length columns; the ctable grows as blocks are appended.
    cols = [np.zeros(0, dtype=dt) for dt in dtypes]
    ct = bcolz.ctable(cols, names)
    # Block size: first entry of the table's chunkshape, so each slice read
    # below spans that many rows.
    bs = t._v_chunkshape[0]
    for i in xrange(0, len(t), bs):
        ct.append(t[i:i+bs])
tt = time() - t0
print("time with blocked read: %.2f (%.2f GB/s)" % (tt, dsize / tt))
# Using generic implementation
# Let bcolz load the table itself via ctable.fromhdf5() and time that call.
t0 = time()
t = bcolz.ctable.fromhdf5(filepath, nodepath)
tt = time() - t0
throughput = dsize / tt
print("time with fromhdf5: %.2f (%.2f GB/s)" % (tt, throughput))
#print(repr(ct))
|