cjcpp
/
bcolz


			
# Benchmark to evaluate the best ways to convert a pandas DataFrame into a bcolz ctable

from collections import OrderedDict
import bcolz
import pandas as pd
import numpy as np
from time import time

# Benchmark dimensions.
NR = int(1e6)  # number of rows
NC = 100  # number of columns

# Uncomment to make compression level 0 the bcolz default for this run.
#bcolz.cparams.setdefaults(clevel=0)

print("Creating inputs...")
# A single 0..NR-1 sequence of 'i4' values is the source for every column.
a = bcolz.arange(NR, dtype='i4')
# Columns are labelled 'f0' .. 'f<NC-1>'; `a[:]` is taken afresh for each one
# and the OrderedDict preserves the label order.
col_labels = ['f%d' % idx for idx in range(NC)]
df = pd.DataFrame.from_dict(
    OrderedDict((label, a[:]) for label in col_labels))

# Nominal payload size in units of 2**30 bytes: NR * NC values at 4 bytes
# each (matching the 'i4' dtype above). Used to report throughput.
dsize = (NR * NC * 4) / float(2 ** 30)

# Strategy 1: create the ctable from the first column only, then append each
# remaining column individually with addcol().
t0 = time()
names = list(df.columns.values)
# Split off the first label; `names` keeps the labels still to be appended.
firstk, names = names[0], names[1:]
t = bcolz.ctable([df[firstk]], names=(firstk,))
for key in names:
    t.addcol(np.array(df[key]), key)
tt = time() - t0
report = (tt, dsize / tt)
print("time with adding cols: %.2f (%.2f GB/s)" % report)
# Drop the table before the next strategy is measured.
del t

# Strategy 2: pass all columns to the ctable constructor in a single call.
# Note that the columns are collected into a list first (not a lazy iterator),
# so building that list is part of the measured time.
t0 = time()
names = list(df.columns.values)
t = bcolz.ctable([df[key] for key in names], names)
tt = time() - t0
print("time with constructor: %.2f (%.2f GB/s)" % (tt, dsize / tt))
# Release the table, as the first benchmark does, so it is not kept alive
# while the next strategy is being measured.
del t

# Strategy 3: hand the whole DataFrame to bcolz's own converter,
# ctable.fromdataframe().
t0 = time()
t = bcolz.ctable.fromdataframe(df)
tt = time() - t0
report = (tt, dsize / tt)
print("time with fromdataframe: %.2f (%.2f GB/s)" % report)


#print(repr(t))