# column-iter.py
from __future__ import print_function

import contextlib
import sys
import time

import bcolz
import numpy
  5. PY2 = sys.version_info[0] == 2
  6. if not PY2:
  7. xrange = range
  8. def range(*args):
  9. return list(xrange(*args))
  10. @contextlib.contextmanager
  11. def ctime(label=""):
  12. "Counts the time spent in some context"
  13. t = time.time()
  14. yield
  15. print(label, round(time.time() - t, 3), "sec")
  16. N = 1000 * 1000
  17. ct = bcolz.fromiter(((i, i*i, i*i*i) for i in xrange(N)), dtype='i8,i8,i8', count=N)
  18. b = numpy.array(numpy.arange(N) % 2, dtype="bool")
  19. c = bcolz.carray(b)
  20. sorted_index = range(1, N, 2)
  21. with ctime():
  22. r0 = (ct['f0'][sorted_index]).tolist()
  23. with ctime():
  24. r1 = [x.f0 for x in ct.where(b)]
  25. assert r0 == r1
  26. with ctime():
  27. r2 = [x.f0 for x in ct.where(c)]
  28. assert r0 == r2
  29. with ctime():
  30. r3 = [x for x in ct['f0'].where(b)]
  31. assert r0 == r3
  32. with ctime():
  33. r4 = [x for x in ct['f0'].where(c)]
  34. assert r0 == r4
  35. # sum
  36. with ctime("sum list"):
  37. r5 = sum([x for x in ct['f0'].where(c)])
  38. with ctime("sum generator"):
  39. r6 = sum(x for x in ct['f0'].where(c))
  40. assert r5 == r6
  41. with ctime("sum method"):
  42. r7 = bcolz.fromiter((x for x in ct['f0'].where(c)),
  43. dtype=ct['f0'].dtype,
  44. count=c.wheretrue().sum()).sum()
  45. assert r7 == r5
  46. # sum with no NA's
  47. with ctime("sum with no NA (list)"):
  48. r8 = sum([x for x in ct['f0'].where(c) if x == x]) # x==x check to leave out NA values
  49. # sum with no NA's
  50. with ctime("sum with no NA (generator)"):
  51. r9 = sum((x for x in ct['f0'].where(c) if x == x)) # x==x check to leave out NA values