# pandas-todataframe.py (921 B)

  1. # Benchmark for evaluate best ways to convert into a pandas dataframe
  2. import bcolz
  3. import pandas as pd
  4. from time import time
  5. NR = int(1e6)
  6. NC = 100
  7. bcolz.cparams.setdefaults(clevel=0)
  8. a = bcolz.arange(NR, dtype='i4')
  9. t = bcolz.ctable((a,)*NC)
  10. dsize = (NR * NC * 4) / 2. ** 30
  11. # Adding a column once a time
  12. t0 = time()
  13. tnames = list(t.names)
  14. firstk = tnames.pop(0)
  15. df = pd.DataFrame.from_items([(firstk, t[firstk][:])])
  16. for key in tnames:
  17. df[key] = t[key][:]
  18. tt = time() - t0
  19. print("time with from_items (adding cols): %.2f (%.2f GB/s)" % (tt, dsize / tt))
  20. del df
  21. # Using a generator
  22. t0 = time()
  23. df = pd.DataFrame.from_items(((key, t[key][:]) for key in t.names))
  24. tt = time() - t0
  25. print("time with from_items: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  26. # Using generic implementation
  27. t0 = time()
  28. df = t.todataframe()
  29. tt = time() - t0
  30. print("time with todataframe: %.2f (%.2f GB/s)" % (tt, dsize / tt))
  31. #print(df)