# pandas-fromdataframe-strings.py
# Benchmark to evaluate the best ways to convert from a pandas dataframe
# (version with a mix of columns of ints and strings)
import sys
from collections import OrderedDict
from time import time
from timeit import default_timer

import bcolz
import pandas as pd
  8. PY2 = sys.version_info[0] == 2
  9. if not PY2:
  10. xrange = range
  11. def range(*args):
  12. return list(xrange(*args))
  13. NR = int(1e4)
  14. NC = 100
  15. #bcolz.cparams.setdefaults(clevel=0)
  16. print("Creating inputs...")
  17. a = bcolz.arange(NR, dtype='i4')
  18. s = bcolz.fromiter(("%d"%i for i in xrange(NR)), dtype='S7', count=NR)
  19. df = pd.DataFrame.from_dict(OrderedDict(
  20. ('f%d'%i, a[:] if i < (NC//2) else s[:]) for i in range(NC)))
  21. dsize = (NR * (NC//2) * (a.dtype.itemsize + s.dtype.itemsize)) / 2. ** 20
  22. print("Performing benchmarks...")
  23. # # Using an iterator (will get objects)
  24. # t0 = time()
  25. # names = list(df.columns.values)
  26. # t = bcolz.ctable([df[key] for key in names], names)
  27. # tt = time() - t0
  28. # print("time with constructor: %.2f (%.2f MB/s)" % (tt, dsize / tt))
  29. # print(repr(t.dtype))
  30. # Using generic implementation
  31. t0 = time()
  32. t = bcolz.ctable.fromdataframe(df)
  33. tt = time() - t0
  34. print("time with fromdataframe: %.2f (%.2f MB/s)" % (tt, dsize / tt))
  35. print(t.dtype)