# cjcpp / bcolz
# Benchmark to evaluate the best ways to read from a PyTables Table

import sys
import bcolz
import tables as tb
import numpy as np
from time import time


# True when the interpreter running this script is Python 2.
PY2 = sys.version_info[0] == 2
if not PY2:
    # Python 3 has no ``xrange``: alias it to the lazy built-in ``range``
    # first, then shadow ``range`` with a list-returning version so both
    # names behave the way they do on Python 2.
    xrange = range

    def range(*args):
        """Return the values of ``xrange(*args)`` as a fully built list."""
        lazy = xrange(*args)
        return list(lazy)


# Scratch HDF5 file used by the benchmark, and the table node inside it.
filepath = "fromhdf5.h5"
nodepath = "/ctable"

# Table shape: NR rows by NC columns.
NR = 10 ** 6
NC = 10

# Payload size in units of 2**30 bytes, counting 4 bytes per value; the
# reports below divide it by the elapsed time to print a "GB/s" figure.
dsize = float(NR * NC * 4) / (1 << 30)

# Make compression level 5 the default for the bcolz containers built below.
bcolz.cparams.setdefaults(clevel=5)

# Benchmark payload: one int32 ramp of NR values, repeated as each of the
# NC columns of a ctable.  The full slice ``[:]`` of that ctable is what
# gets written to the HDF5 file later on.
a = bcolz.arange(NR, dtype='i4')
#ra = np.rec.fromarrays([a]*NC, names=['f%d'%i for i in range(NC)])
columns = (a,) * NC
ra = bcolz.ctable(columns)[:]

# Storing the table: time how long it takes to write `ra` into a fresh HDF5
# file.  The `with` block closes the file even if table creation raises;
# on success the close still happens inside the timed region, exactly as
# the explicit f.close() did.
t0 = time()
with tb.open_file(filepath, "w") as f:
    # nodepath[1:] drops the leading "/" to get the node name under the root.
    f.create_table(f.root, nodepath[1:], ra)
tt = time() - t0
print("time for storing the HDF5 table: %.2f (%.2f GB/s)" % (tt, dsize / tt))

# Using an iterator: feed the table to bcolz.fromiter one row at a time.
# The `with` block guarantees the HDF5 file is closed even if the read
# fails part-way; the close remains inside the timed region.
t0 = time()
with tb.open_file(filepath) as f:
    t = f.get_node(nodepath)
    # The right-hand side is evaluated completely (row generator, dtype and
    # row count all taken from the HDF5 table) before `t` is rebound to the
    # resulting bcolz object.
    t = bcolz.fromiter((r[:] for r in t), dtype=t.dtype, count=len(t))
tt = time() - t0
print("time with fromiter: %.2f (%.2f GB/s)" % (tt, dsize / tt))

# Using blocked read: create an empty ctable with the table's columns, then
# append the HDF5 table to it one chunk-sized slice of rows at a time.
# The `with` block guarantees the HDF5 file is closed even if the read
# fails part-way; the close remains inside the timed region.
t0 = time()
with tb.open_file(filepath) as f:
    t = f.get_node(nodepath)
    names = t.colnames
    # Look every dtype up by column name so that `dtypes` is guaranteed to
    # line up with `names`; iterating `dtype.fields.values()` relies on
    # dict ordering, which is not guaranteed on Python 2.
    dtypes = [t.dtype.fields[name][0] for name in names]
    # Zero-length columns: they only fix the dtypes, the rows come below.
    cols = [np.zeros(0, dtype=dt) for dt in dtypes]
    ct = bcolz.ctable(cols, names)
    # Step through the rows using the first entry of the table's chunkshape
    # as the block size.
    bs = t._v_chunkshape[0]
    for i in xrange(0, len(t), bs):
        ct.append(t[i:i+bs])
tt = time() - t0
print("time with blocked read: %.2f (%.2f GB/s)" % (tt, dsize / tt))

# Using generic implementation: let bcolz's own ctable.fromhdf5 import the
# table, and time the whole call.
started = time()
t = bcolz.ctable.fromhdf5(filepath, nodepath)
elapsed = time() - started
throughput = dsize / elapsed
print("time with fromhdf5: %.2f (%.2f GB/s)" % (elapsed, throughput))


#print(repr(ct))