# cjcpp / bcolz
# Benchmark to evaluate the best ways to convert a bcolz ctable into a pandas DataFrame

from collections import OrderedDict
import bcolz
import pandas as pd
from time import time

# Benchmark dimensions: NR rows by NC columns (every column is the same
# int32 range, so the table holds NR * NC four-byte values).
NR = int(1e6)
NC = 100

# Build the source table with compression level 0 so the timings below are
# not dominated by (de)compression work.
bcolz.cparams.setdefaults(clevel=0)
a = bcolz.arange(NR, dtype='i4')
t = bcolz.ctable((a,)*NC)

# Payload size in units of 2**30 bytes; 4 is the item size of dtype 'i4'.
dsize = (NR * NC * 4) / 2. ** 30


def _report(label, elapsed):
    """Print elapsed seconds and throughput for one conversion variant.

    label   -- text identifying the variant in the output line
    elapsed -- wall-clock seconds the variant took
    """
    # A coarse clock can return 0 for very small NR; avoid ZeroDivisionError.
    rate = dsize / elapsed if elapsed > 0 else float("inf")
    print("time with %s: %.2f (%.2f GB/s)" % (label, elapsed, rate))


# Adding one column at a time
t0 = time()
tnames = list(t.names)
firstk = tnames.pop(0)
df = pd.DataFrame.from_dict(OrderedDict([(firstk, t[firstk][:])]))
for key in tnames:
    df[key] = t[key][:]
tt = time() - t0
_report("from_dict (adding cols)", tt)
# Free the result so the next variant starts from the same memory state.
del df

# Using a generator
t0 = time()
df = pd.DataFrame.from_dict(OrderedDict((key, t[key][:]) for key in t.names))
tt = time() - t0
_report("from_dict", tt)
# Free the result here too; otherwise this frame stays alive while the next
# variant is being timed, which the other variants do not have to cope with.
del df

# Using generic implementation
t0 = time()
df = t.todataframe()
tt = time() - t0
_report("todataframe", tt)


#print(df)