# pytables-fromhdf5.py (1.5 KB)
# Benchmark to evaluate the best ways to read from a PyTables Table
import sys
from time import time

import bcolz
import numpy as np
import tables as tb
  7. PY2 = sys.version_info[0] == 2
  8. if not PY2:
  9. xrange = range
  10. def range(*args):
  11. return list(xrange(*args))
  12. filepath = 'fromhdf5.h5'
  13. nodepath = '/ctable'
  14. NR = int(1e6)
  15. NC = 10
  16. dsize = (NR * NC * 4) / 2. ** 30
  17. bcolz.cparams.setdefaults(clevel=5)
  18. a = bcolz.arange(NR, dtype='i4')
  19. #ra = np.rec.fromarrays([a]*NC, names=['f%d'%i for i in range(NC)])
  20. ra = bcolz.ctable((a,)*NC)[:]
  21. t0 = time()
  22. f = tb.open_file(filepath, "w")
  23. f.create_table(f.root, nodepath[1:], ra)
  24. f.close()
  25. tt = time() - t0
  26. print("time for storing the HDF5 table: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  27. # Using an iterator
  28. t0 = time()
  29. f = tb.open_file(filepath)
  30. t = f.get_node(nodepath)
  31. t = bcolz.fromiter((r[:] for r in t), dtype=t.dtype, count=len(t))
  32. f.close()
  33. tt = time() - t0
  34. print("time with fromiter: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  35. # Using blocked read
  36. t0 = time()
  37. f = tb.open_file(filepath)
  38. t = f.get_node(nodepath)
  39. names = t.colnames
  40. dtypes = [dt[0] for dt in t.dtype.fields.values()]
  41. cols = [np.zeros(0, dtype=dt) for dt in dtypes]
  42. ct = bcolz.ctable(cols, names)
  43. bs = t._v_chunkshape[0]
  44. for i in xrange(0, len(t), bs):
  45. ct.append(t[i:i+bs])
  46. f.close()
  47. tt = time() - t0
  48. print("time with blocked read: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  49. # Using generic implementation
  50. t0 = time()
  51. t = bcolz.ctable.fromhdf5(filepath, nodepath)
  52. tt = time() - t0
  53. print("time with fromhdf5: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  54. #print(repr(ct))