# pandas-fromdataframe.py (1.0 KB)
  1. # Benchmark for evaluate best ways to convert from a pandas dataframe
  2. import bcolz
  3. import pandas as pd
  4. import numpy as np
  5. from time import time
  6. NR = int(1e6)
  7. NC = 100
  8. #bcolz.cparams.setdefaults(clevel=0)
  9. print("Creating inputs...")
  10. a = bcolz.arange(NR, dtype='i4')
  11. df = pd.DataFrame.from_items((('f%d'%i, a[:]) for i in range(NC)))
  12. dsize = (NR * NC * 4) / 2. ** 30
  13. # Adding a column once a time
  14. t0 = time()
  15. names = list(df.columns.values)
  16. firstk = names.pop(0)
  17. t = bcolz.ctable([df[firstk]], names=(firstk,))
  18. for key in names:
  19. t.addcol(np.array(df[key]), key)
  20. tt = time() - t0
  21. print("time with adding cols: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  22. del t
  23. # Using an iterator
  24. t0 = time()
  25. names = list(df.columns.values)
  26. t = bcolz.ctable([df[key] for key in names], names)
  27. tt = time() - t0
  28. print("time with constructor: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  29. # Using generic implementation
  30. t0 = time()
  31. t = bcolz.ctable.fromdataframe(df)
  32. tt = time() - t0
  33. print("time with fromdataframe: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  34. #print(repr(t))