ctable-query.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. # Benchmark to compare the times for querying ctable objects. Numexpr
  2. # is needed in order to execute this. A comparison with SQLite3 and
  3. # PyTables (if installed) is also done.
  4. from __future__ import print_function
  5. import sys
  6. import os
  7. import os.path
  8. import subprocess
  9. import getopt
  10. import sqlite3
  11. from time import time
  12. import numpy as np
  13. import bcolz
  14. from bcolz.py2help import xrange
  15. NR = 1e5 # the number of rows
  16. if sys.version_info >= (3,0):
  17. # There is a silly limitation on the number of fields for namedtuples
  18. # for Python 3:
  19. # https://groups.google.com/forum/#!msg/python-ideas/96AwHqs59GM/8bxJsiWLN6UJ
  20. NC = 253 # the number of columns
  21. else:
  22. NC = 500 # the number of columns
  23. mv = 1e10 # the mean value for entries (sig digits = 17 - log10(mv))
  24. clevel = 3 # the compression level
  25. cname = 'blosclz' # the compressor to be used
  26. show = False # show statistics
  27. # The query for a ctable
  28. squery = "(f2>.9) & ((f8>.3) & (f8<.4))" # the ctable query
  29. # The query for a recarray
  30. nquery = "(t['f2']>.9) & ((t['f8']>.3) & (t['f8']<.4))" # for a recarray
  31. # A time reference
  32. tref = 0
  33. def show_rss(explain):
  34. "Show the used time and RSS memory (only works for Linux 2.6.x)."
  35. global tref
  36. # Build the command to obtain memory info
  37. newtref = time()
  38. print("Time (%20s) --> %.3f" % (explain, newtref - tref), end="")
  39. tref = newtref
  40. if show:
  41. cmd = "cat /proc/%s/status" % os.getpid()
  42. sout = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).stdout
  43. for line in sout:
  44. if line.startswith("VmRSS:"):
  45. vmrss = int(line.split()[1]) // 1024
  46. print("\t(Resident memory: %d MB)" % vmrss)
  47. else:
  48. print()
  49. def enter():
  50. global tref
  51. tref = time()
  52. def after_create(mess=""):
  53. global tref
  54. if mess: mess = ", " + mess
  55. show_rss("creation" + mess)
  56. def after_query(mess=""):
  57. global tref
  58. if mess: mess = ", " + mess
  59. show_rss("query" + mess)
  60. def test_numpy():
  61. enter()
  62. t = np.fromiter((mv + np.random.rand(NC) - mv for i in xrange(int(NR))),
  63. dtype=dt)
  64. after_create()
  65. out = np.fromiter(((row['f1'], row['f3']) for row in t[eval(nquery)]),
  66. dtype="f8,f8")
  67. after_query()
  68. return out
  69. def test_numexpr():
  70. import numexpr as ne
  71. enter()
  72. t = np.fromiter((mv + np.random.rand(NC) - mv for i in xrange(int(NR))),
  73. dtype=dt)
  74. after_create()
  75. map_field = dict(("f%s" % i, t["f%s" % i]) for i in range(NC))
  76. out = np.fromiter(((row['f1'], row['f3']) for row in
  77. t[ne.evaluate(squery, map_field)]),
  78. dtype="f8,f8")
  79. after_query()
  80. return out
  81. def test_ctable(clevel):
  82. enter()
  83. tc = bcolz.fromiter(
  84. (mv + np.random.rand(NC) - mv for i in xrange(int(NR))),
  85. dtype=dt,
  86. cparams=bcolz.cparams(clevel, cname=cname),
  87. count=int(NR))
  88. after_create()
  89. out = np.fromiter((row for row in tc.where(squery, 'f1,f3')),
  90. dtype="f8,f8")
  91. after_query()
  92. return out
  93. def test_sqlite():
  94. enter()
  95. sqlquery = "(f2>.9) and ((f8>.3) and (f8<.4))" # the query
  96. con = sqlite3.connect(":memory:")
  97. # Create table
  98. fields = "(%s)" % ",".join(["f%d real" % i for i in range(NC)])
  99. con.execute("create table bench %s" % fields)
  100. # Insert a NR rows of data
  101. vals = "(%s)" % ",".join(["?" for i in range(NC)])
  102. with con:
  103. con.executemany("insert into bench values %s" % vals,
  104. (mv + np.random.rand(NC) - mv for i in
  105. xrange(int(NR))))
  106. after_create()
  107. out = np.fromiter(
  108. (row for row in con.execute(
  109. "select f1, f3 from bench where %s" % sqlquery)),
  110. dtype="f8,f8")
  111. after_query("non-indexed")
  112. # Create indexes
  113. con.execute("CREATE INDEX f1idx ON bench (f1)")
  114. con.execute("CREATE INDEX f2idx ON bench (f8)")
  115. after_create("index")
  116. out = np.fromiter(
  117. (row for row in con.execute(
  118. "select f1, f3 from bench where %s" % sqlquery)),
  119. dtype="f8,f8")
  120. after_query("indexed")
  121. return out
  122. if __name__ == "__main__":
  123. global dt
  124. usage = """\
  125. usage: %s [-s] [-m method] [-c ncols] [-r nrows] [-n cname] [-z clevel]
  126. -s show memory statistics (only for Linux)
  127. -m select the method: "ctable" (def.), "numpy", "numexpr", "sqlite"
  128. -c the number of columns in table (def. %d)
  129. -r the number of rows in table (def. %d)
  130. -n the compressor name (def. '%s')
  131. -z the compression level (def. %d)
  132. """ % (sys.argv[0], NC, NR, cname, clevel)
  133. try:
  134. opts, pargs = getopt.getopt(sys.argv[1:], 'sc:r:m:n:z:')
  135. except:
  136. sys.stderr.write(usage)
  137. sys.exit(1)
  138. method = "ctable"
  139. # Get the options
  140. for option in opts:
  141. if option[0] == '-s':
  142. if "linux" in sys.platform:
  143. show = True
  144. elif option[0] == '-m':
  145. method = option[1]
  146. elif option[0] == '-c':
  147. NC = int(option[1])
  148. elif option[0] == '-r':
  149. NR = float(option[1])
  150. elif option[0] == '-n':
  151. cname = option[1]
  152. elif option[0] == '-z':
  153. clevel = int(option[1])
  154. np.random.seed(12) # so as to get reproducible results
  155. # The dtype for tables
  156. # dt = np.dtype("f8,"*NC) # aligned fields
  157. dt = np.dtype("f8," * (NC - 1) + "i1") # unaligned fields
  158. if method == "numexpr":
  159. mess = "numexpr (+numpy)"
  160. elif method == "ctable":
  161. mess = "ctable (clevel=%d, cname='%s')" % (clevel, cname)
  162. elif method == "sqlite":
  163. mess = "sqlite (in-memory)"
  164. else:
  165. mess = method
  166. print("########## Checking method: %s ############" % mess)
  167. print("Querying with %g rows and %d cols" % (NR, NC))
  168. print("Building database. Wait please...")
  169. if method == "ctable":
  170. out = test_ctable(clevel)
  171. elif method == "numpy":
  172. out = test_numpy()
  173. elif method == "numexpr":
  174. out = test_numexpr()
  175. elif method == "sqlite":
  176. out = test_sqlite()
  177. print("Number of selected elements in query:", len(out))