"""pandas-todataframe.py

Benchmark comparing ways to convert a bcolz ctable into a pandas DataFrame.

Three strategies are timed against the same table:

1. build a one-column DataFrame with ``DataFrame.from_dict`` and then add
   the remaining columns one at a time;
2. build the whole DataFrame with a single ``DataFrame.from_dict`` call;
3. call the ctable's own ``todataframe()`` method.

For each strategy the elapsed time in seconds and the resulting throughput
are printed.
"""
from collections import OrderedDict
# default_timer is the clock the stdlib itself uses for benchmarking; unlike
# time.time() it is not meant to be a wall-clock reading.
from timeit import default_timer as timer

import bcolz
import pandas as pd

NR = int(1e6)  # number of rows
NC = 100       # number of columns

# NOTE(review): clevel=0 presumably switches compression off so the timings
# reflect the conversion itself -- confirm against the bcolz documentation.
bcolz.cparams.setdefaults(clevel=0)

# Every one of the NC columns is built from the same 'i4' arange of NR items.
a = bcolz.arange(NR, dtype='i4')
t = bcolz.ctable((a,) * NC)

# Payload size used for the throughput figure: NR * NC values of 4 bytes
# each ('i4'), divided by 2**30.
dsize = (NR * NC * 4) / 2. ** 30


def _report(label, elapsed):
    """Print ``label`` with the elapsed seconds and the derived throughput.

    The throughput is ``dsize / elapsed``; a zero ``elapsed`` (possible with
    a coarse clock) is reported as ``inf`` instead of raising
    ZeroDivisionError.
    """
    rate = dsize / elapsed if elapsed > 0 else float('inf')
    print("%s: %.2f (%.2f GB/s)" % (label, elapsed, rate))


# Strategy 1: create the frame from the first column, then add the remaining
# columns one at a time.
t0 = timer()
names = list(t.names)
first_name, other_names = names[0], names[1:]
df = pd.DataFrame.from_dict(OrderedDict([(first_name, t[first_name][:])]))
for key in other_names:
    df[key] = t[key][:]
tt = timer() - t0
_report("time with from_dict (adding cols)", tt)
# Drop the frame before the next run so two full copies are never alive at
# the same time.
del df

# Strategy 2: feed every column to from_dict at once through a generator.
t0 = timer()
df = pd.DataFrame.from_dict(OrderedDict((key, t[key][:]) for key in t.names))
tt = timer() - t0
_report("time with from_dict", tt)
del df

# Strategy 3: the ctable's generic todataframe() implementation.
t0 = timer()
df = t.todataframe()
tt = timer() - t0
_report("time with todataframe", tt)

# Uncomment to inspect the last DataFrame that was built:
# print(df)