# Benchmark to compare the times for querying ctable objects.  Numexpr
# is needed in order to execute this.  A comparison with plain NumPy,
# Numexpr and SQLite3 (in-memory) is also done.
from __future__ import print_function

import getopt
import os
import os.path
import sqlite3
import subprocess
import sys
from time import time

import bcolz
import numpy as np
from bcolz.py2help import xrange
# Default number of rows.  Kept as a float; users convert it with int().
NR = 1e5

# Default number of columns.  There is a silly limitation on the number
# of fields for namedtuples for Python 3, hence the smaller value there:
# https://groups.google.com/forum/#!msg/python-ideas/96AwHqs59GM/8bxJsiWLN6UJ
NC = 253 if sys.version_info >= (3, 0) else 500

# The mean value for entries (sig digits = 17 - log10(mv))
mv = 1e10

# Compression settings used for the ctable
clevel = 3          # the compression level
cname = 'blosclz'   # the compressor to be used

# Whether memory statistics are shown next to the timings
show = False

# The query for a ctable (also used as the numexpr expression)
squery = "(f2>.9) & ((f8>.3) & (f8<.4))"

# The same query for a recarray; it expects the array to be named ``t``
nquery = "(t['f2']>.9) & ((t['f8']>.3) & (t['f8']<.4))"

# A time reference, updated at every timing checkpoint
tref = 0
def show_rss(explain):
    """Show the time elapsed since the last checkpoint and, optionally, RSS.

    The elapsed time is measured against the module-level ``tref``, which
    is then reset to the current time.  When the module-level ``show``
    flag is true, the resident memory of this process is appended to the
    same output line (this relies on ``/proc`` and so only works on Linux).

    Parameters
    ----------
    explain : str
        Label identifying the step that has just finished.
    """
    global tref
    newtref = time()
    print("Time (%20s) --> %.3f" % (explain, newtref - tref), end="")
    tref = newtref

    if not show:
        print()
        return

    # Read the status file directly in text mode: no shell or child
    # process is needed, and the lines are ``str`` on Python 2 and 3
    # alike (a subprocess pipe yields bytes on Python 3).
    vmrss = None
    try:
        with open("/proc/%d/status" % os.getpid()) as status:
            for line in status:
                if line.startswith("VmRSS:"):
                    # Reported in kB; convert to whole MB.
                    vmrss = int(line.split()[1]) // 1024
                    break
    except (IOError, OSError):
        # No readable /proc entry; reported as unavailable below.
        vmrss = None

    if vmrss is None:
        print("\t(Resident memory: unavailable)")
    else:
        print("\t(Resident memory: %d MB)" % vmrss)
def enter():
    """Start a timed section by setting the global ``tref`` to now."""
    global tref
    tref = time()
def after_create(mess=""):
    """Report the timing of a creation step through ``show_rss``.

    Parameters
    ----------
    mess : str, optional
        Extra detail appended to the "creation" label after a comma.
    """
    suffix = (", " + mess) if mess else mess
    show_rss("creation" + suffix)
def after_query(mess=""):
    """Report the timing of a query step through ``show_rss``.

    Parameters
    ----------
    mess : str, optional
        Extra detail appended to the "query" label after a comma.
    """
    suffix = (", " + mess) if mess else mess
    show_rss("query" + suffix)
def test_numpy():
    """Build a structured NumPy array and query it with a boolean mask.

    Creation and query timings are reported through ``after_create`` and
    ``after_query``.

    Returns
    -------
    numpy.ndarray
        The (f1, f3) pairs of the selected rows, with dtype "f8,f8".
    """
    enter()
    nrows = int(NR)
    rows = (mv + np.random.rand(NC) - mv for _ in xrange(nrows))
    # The array has to be called ``t``: the ``nquery`` expression that is
    # evaluated below refers to it by that name.
    t = np.fromiter(rows, dtype=dt)
    after_create()

    mask = eval(nquery)
    pairs = ((row['f1'], row['f3']) for row in t[mask])
    out = np.fromiter(pairs, dtype="f8,f8")
    after_query()
    return out
def test_numexpr():
    """Build a structured NumPy array and query it with numexpr.

    numexpr is imported here, so it is only required when this method is
    selected.  Creation and query timings are reported through
    ``after_create`` and ``after_query``.

    Returns
    -------
    numpy.ndarray
        The (f1, f3) pairs of the selected rows, with dtype "f8,f8".
    """
    import numexpr as ne

    enter()
    nrows = int(NR)
    rows = (mv + np.random.rand(NC) - mv for _ in xrange(nrows))
    t = np.fromiter(rows, dtype=dt)
    after_create()

    # Expose every column under its field name so that the query
    # expression can refer to them as plain variables.
    names = ["f%s" % i for i in range(NC)]
    map_field = {name: t[name] for name in names}
    mask = ne.evaluate(squery, map_field)
    pairs = ((row['f1'], row['f3']) for row in t[mask])
    out = np.fromiter(pairs, dtype="f8,f8")
    after_query()
    return out
def test_ctable(clevel):
    """Build a bcolz ctable and query it with ``ctable.where``.

    Creation and query timings are reported through ``after_create`` and
    ``after_query``.

    Parameters
    ----------
    clevel : int
        The compression level passed to ``bcolz.cparams``; the compressor
        itself is taken from the module-level ``cname``.

    Returns
    -------
    numpy.ndarray
        The (f1, f3) pairs of the selected rows, with dtype "f8,f8".
    """
    enter()
    nrows = int(NR)
    rows = (mv + np.random.rand(NC) - mv for _ in xrange(nrows))
    params = bcolz.cparams(clevel, cname=cname)
    tc = bcolz.fromiter(rows, dtype=dt, cparams=params, count=int(NR))
    after_create()

    selected = (row for row in tc.where(squery, 'f1,f3'))
    out = np.fromiter(selected, dtype="f8,f8")
    after_query()
    return out
def test_sqlite():
    """Build an in-memory SQLite table and query it with and without indexes.

    The table creation, the non-indexed query, the index creation and the
    indexed query are each timed and reported through ``after_create`` /
    ``after_query``.

    Returns
    -------
    numpy.ndarray
        The (f1, f3) pairs selected by the indexed query, with dtype
        "f8,f8".
    """
    enter()
    sqlquery = "(f2>.9) and ((f8>.3) and (f8<.4))"  # the query
    select = "select f1, f3 from bench where %s" % sqlquery
    con = sqlite3.connect(":memory:")
    # ``with con:`` below only commits the transaction, so the connection
    # is closed explicitly once all the results have been materialized.
    try:
        # Create table
        fields = "(%s)" % ",".join(["f%d real" % i for i in range(NC)])
        con.execute("create table bench %s" % fields)

        # Insert NR rows of data
        vals = "(%s)" % ",".join(["?"] * NC)
        with con:
            con.executemany("insert into bench values %s" % vals,
                            (mv + np.random.rand(NC) - mv for _ in
                             xrange(int(NR))))
        after_create()

        out = np.fromiter((row for row in con.execute(select)),
                          dtype="f8,f8")
        after_query("non-indexed")

        # Create indexes
        # NOTE(review): the query filters on f2 and f8, but the indexes
        # are built on f1 and f8 (the latter named "f2idx") -- confirm
        # whether an index on f2 was intended.
        con.execute("CREATE INDEX f1idx ON bench (f1)")
        con.execute("CREATE INDEX f2idx ON bench (f8)")
        after_create("index")

        out = np.fromiter((row for row in con.execute(select)),
                          dtype="f8,f8")
        after_query("indexed")
    finally:
        con.close()
    return out
if __name__ == "__main__":
    # Script entry point: parse the command line, override the module-level
    # defaults (NC, NR, cname, clevel, show), build the table dtype ``dt``
    # and run the selected benchmark.  Everything stays at module scope
    # because the test_* functions read these names as globals.
    methods = ("ctable", "numpy", "numexpr", "sqlite")

    usage = """\
usage: %s [-s] [-m method] [-c ncols] [-r nrows] [-n cname] [-z clevel]
  -s show memory statistics (only for Linux)
  -m select the method: "ctable" (def.), "numpy", "numexpr", "sqlite"
  -c the number of columns in table (def. %d)
  -r the number of rows in table (def. %d)
  -n the compressor name (def. '%s')
  -z the compression level (def. %d)
""" % (sys.argv[0], NC, NR, cname, clevel)

    try:
        opts, pargs = getopt.getopt(sys.argv[1:], 'sc:r:m:n:z:')
    except getopt.GetoptError as err:
        # Only option-parsing errors are expected here; anything else
        # should surface as a normal traceback.
        sys.stderr.write("%s\n" % err)
        sys.stderr.write(usage)
        sys.exit(1)

    method = "ctable"

    # Get the options
    try:
        for flag, value in opts:
            if flag == '-s':
                # Memory statistics rely on /proc, so the flag is ignored
                # on other platforms.
                if "linux" in sys.platform:
                    show = True
            elif flag == '-m':
                method = value
            elif flag == '-c':
                NC = int(value)
            elif flag == '-r':
                NR = float(value)  # float so that e.g. "1e6" is accepted
            elif flag == '-n':
                cname = value
            elif flag == '-z':
                clevel = int(value)
    except ValueError as err:
        sys.stderr.write("invalid option value: %s\n" % err)
        sys.stderr.write(usage)
        sys.exit(1)

    # Reject unknown methods up front instead of failing later with a
    # NameError when no benchmark has produced a result.
    if method not in methods:
        sys.stderr.write("unknown method: %r\n" % method)
        sys.stderr.write(usage)
        sys.exit(1)

    np.random.seed(12)  # so as to get reproducible results

    # The dtype for tables
    # dt = np.dtype("f8,"*NC)  # aligned fields
    dt = np.dtype("f8," * (NC - 1) + "i1")  # unaligned fields

    if method == "numexpr":
        mess = "numexpr (+numpy)"
    elif method == "ctable":
        mess = "ctable (clevel=%d, cname='%s')" % (clevel, cname)
    elif method == "sqlite":
        mess = "sqlite (in-memory)"
    else:
        mess = method
    print("########## Checking method: %s ############" % mess)

    print("Querying with %g rows and %d cols" % (NR, NC))
    print("Building database.  Wait please...")

    if method == "ctable":
        out = test_ctable(clevel)
    elif method == "numpy":
        out = test_numpy()
    elif method == "numexpr":
        out = test_numexpr()
    else:  # "sqlite"; the method was validated above
        out = test_sqlite()
    print("Number of selected elements in query:", len(out))