{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Querying with ranges\n",
    "This is a small tutorial showing how you can accelerate the queries of large tables by using `ctable.whereblocks()` and a combination of boolean conditions and the selections of columns."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import bcolz\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n",
      "bcolz version:     1.2.1\n",
      "NumPy version:     1.16.2\n",
      "Blosc version:     1.14.3 ($Date:: 2018-04-06 #$)\n",
      "Blosc compressors: ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd']\n",
      "Numexpr version:   2.6.9\n",
      "Dask version:      1.2.2\n",
      "Python version:    3.7.2 (default, Dec 29 2018, 00:00:04) \n",
      "[Clang 4.0.1 (tags/RELEASE_401/final)]\n",
      "Platform:          darwin-x86_64\n",
      "Byte-ordering:     little\n",
      "Detected cores:    4\n",
      "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"
     ]
    }
   ],
   "source": [
    "bcolz.print_versions()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create a table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The number of entries in the table\n",
    "N = int(1e7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ctable((10000000,), [('f0', '<i4'), ('f1', '<f8')])\n",
       "  nbytes: 114.44 MB; cbytes: 12.93 MB; ratio: 8.85\n",
       "  cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)\n",
       "[(0, 0.) (1, 1.) (2, 4.) ..., (9999997, 9.999994e+13)\n",
       " (9999998, 9.999996e+13) (9999999, 9.999998e+13)]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create the table with two fields, one integer and the other double\n",
    "ct = bcolz.fromiter(((i,i*i) for i in range(N)), dtype=\"i4,f8\", count=N)\n",
    "ct"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Add a timestamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "carray((10000000,), float64)\n",
       "  nbytes := 76.29 MB; cbytes := 1.25 MB; ratio: 60.83\n",
       "  cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)\n",
       "  chunklen := 65536; chunksize: 524288; blocksize: 524288\n",
       "[  1.55833865e+09   1.55833871e+09   1.55833877e+09 ...,   2.15833847e+09\n",
       "   2.15833853e+09   2.15833859e+09]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create some arbitrary timestamps\n",
    "import datetime\n",
    "now = datetime.datetime.timestamp(datetime.datetime.now())\n",
    "timestamps = bcolz.fromiter((now + 60 * i for i in range(N)), dtype=np.float64, count=N)\n",
    "timestamps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ctable((10000000,), [('f0', '<i4'), ('f1', '<f8'), ('timestamp', '<f8')])\n",
       "  nbytes: 190.73 MB; cbytes: 14.18 MB; ratio: 13.45\n",
       "  cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)\n",
       "[(0, 0., 1.55833865e+09) (1, 1., 1.55833871e+09) (2, 4., 1.55833877e+09)\n",
       " ..., (9999997, 9.999994e+13, 2.15833847e+09)\n",
       " (9999998, 9.999996e+13, 2.15833853e+09)\n",
       " (9999999, 9.999998e+13, 2.15833859e+09)]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Add the timestamps top the table, in the field \"timestamp\" (note that the trailing 's' is out now)\n",
    "ct.addcol(timestamps, name=\"timestamp\")\n",
    "ct"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Playing with `ctable.whereblocks`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 2.74 s, sys: 113 ms, total: 2.85 s\n",
      "Wall time: 2.84 s\n"
     ]
    }
   ],
   "source": [
    "# Do a query with the whole range of timestamps\n",
    "%time res = [row for row in ct.whereblocks(\"timestamp > 0\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 1.11 s, sys: 58.4 ms, total: 1.17 s\n",
      "Wall time: 1.15 s\n"
     ]
    }
   ],
   "source": [
    "# Do a query with the whole range of timestamps, but using a single column as output\n",
    "%time res = [row for row in ct.whereblocks(\"timestamp > 0\", outcols=\"f0\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 118 ms, sys: 12.6 ms, total: 131 ms\n",
      "Wall time: 83.2 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[array([(    2, 4.00000000e+00, 1.55833877e+09),\n",
       "        (    3, 9.00000000e+00, 1.55833883e+09),\n",
       "        (    4, 1.60000000e+01, 1.55833889e+09), ...,\n",
       "        (16664, 2.77688896e+08, 1.55933849e+09),\n",
       "        (16665, 2.77722225e+08, 1.55933855e+09),\n",
       "        (16666, 2.77755556e+08, 1.55933861e+09)],\n",
       "       dtype=[('f0', '<i4'), ('f1', '<f8'), ('timestamp', '<f8')])]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Do a query in a much more small time range\n",
    "cond = \"(timestamp > now + 100) & (timestamp < now + 1e6)\"\n",
    "%time [row for row in ct.whereblocks(cond)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "carray((10000000,), bool)\n",
       "  nbytes := 9.54 MB; cbytes := 257.10 KB; ratio: 37.98\n",
       "  cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)\n",
       "  chunklen := 262144; chunksize: 262144; blocksize: 65536\n",
       "[False False  True ..., False False False]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create a boolean array in-memory for ultimate speed\n",
    "carray_cond = bcolz.eval(\"(timestamps > now + 100) & (timestamps < now + 1e6)\")\n",
    "carray_cond"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note that compression ratio is very good, to the point that it usually fits in CPU caches."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.89 ms, sys: 1.56 ms, total: 7.45 ms\n",
      "Wall time: 7.8 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[array([(    2,), (    3,), (    4,), ..., (16664,), (16665,), (16666,)],\n",
       "       dtype=[('f0', '<i4')])]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now, re-run with the boolean array:\n",
    "%time [row for row in ct.whereblocks(carray_cond, outcols=\"f0\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 5.53 ms, sys: 1.09 ms, total: 6.62 ms\n",
      "Wall time: 5.55 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[array([    2,     3,     4, ..., 16664, 16665, 16666], dtype=int32)]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Get rid of the structured type and use just the dtype for the integer field\n",
    "%time [row.astype(ct.dtype['f0']) for row in ct.whereblocks(carray_cond, outcols=\"f0\")]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Apply vectorized operations to the blocks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 58.6 ms, sys: 2.38 ms, total: 61 ms\n",
      "Wall time: 60.6 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[array([ 0,  0,  0, ..., 35, 35, 35], dtype=int32)]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now, perform some bytewise operation on each element\n",
    "def some_bitwise_op(x):\n",
    "    y = np.empty(x.shape, x.dtype)\n",
    "    for i in range(len(x)):\n",
    "        y[i] = x[i] >> 3 & 0xFF\n",
    "    return y\n",
    "%time [some_bitwise_op(row.astype(ct.dtype['f0'])) for row in ct.whereblocks(carray_cond, outcols=\"f0\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 201 ms, sys: 39.8 ms, total: 241 ms\n",
      "Wall time: 310 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[array([ 0,  0,  0, ..., 35, 35, 35], dtype=int32)]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Now, compile the function with numba\n",
    "import numba\n",
    "@numba.jit(nopython=True, cache=True)\n",
    "def some_bitwise_op_numba(x):\n",
    "    y = np.empty(x.shape, x.dtype)\n",
    "    for i in range(len(x)):\n",
    "        y[i] = x[i] >> 3 & 0xFF\n",
    "    return y\n",
    "%time [some_bitwise_op_numba(row.astype(ct.dtype['f0'])) for row in ct.whereblocks(carray_cond, outcols=\"f0\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 4.69 ms, sys: 1.04 ms, total: 5.73 ms\n",
      "Wall time: 4.86 ms\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[array([ 0,  0,  0, ..., 35, 35, 35], dtype=int32)]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Ops, we have got too much time.  Re-run again now that the numba has compiled the function \n",
    "%time [some_bitwise_op_numba(row.astype(ct.dtype['f0'])) for row in ct.whereblocks(carray_cond, outcols=\"f0\")]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Hey, numba is really, really fast.  That's all folks!"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}