# pandas-fromdataframe.py (1.1 KB)

  1. # Benchmark for evaluate best ways to convert from a pandas dataframe
  2. from collections import OrderedDict
  3. import bcolz
  4. import pandas as pd
  5. import numpy as np
  6. from time import time
  7. NR = int(1e6)
  8. NC = 100
  9. #bcolz.cparams.setdefaults(clevel=0)
  10. print("Creating inputs...")
  11. a = bcolz.arange(NR, dtype='i4')
  12. df = pd.DataFrame.from_dict(OrderedDict(('f%d'%i, a[:]) for i in range(NC)))
  13. dsize = (NR * NC * 4) / 2. ** 30
  14. # Adding a column once a time
  15. t0 = time()
  16. names = list(df.columns.values)
  17. firstk = names.pop(0)
  18. t = bcolz.ctable([df[firstk]], names=(firstk,))
  19. for key in names:
  20. t.addcol(np.array(df[key]), key)
  21. tt = time() - t0
  22. print("time with adding cols: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  23. del t
  24. # Using an iterator
  25. t0 = time()
  26. names = list(df.columns.values)
  27. t = bcolz.ctable([df[key] for key in names], names)
  28. tt = time() - t0
  29. print("time with constructor: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  30. # Using generic implementation
  31. t0 = time()
  32. t = bcolz.ctable.fromdataframe(df)
  33. tt = time() - t0
  34. print("time with fromdataframe: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  35. #print(repr(t))