{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Querying with ranges\n", "This is a small tutorial showing how you can accelerate the queries of large tables by using `ctable.whereblocks()` and a combination of boolean conditions and the selections of columns." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import bcolz\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n", "bcolz version: 1.2.1\n", "NumPy version: 1.16.2\n", "Blosc version: 1.14.3 ($Date:: 2018-04-06 #$)\n", "Blosc compressors: ['blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd']\n", "Numexpr version: 2.6.9\n", "Dask version: 1.2.2\n", "Python version: 3.7.2 (default, Dec 29 2018, 00:00:04) \n", "[Clang 4.0.1 (tags/RELEASE_401/final)]\n", "Platform: darwin-x86_64\n", "Byte-ordering: little\n", "Detected cores: 4\n", "-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n" ] } ], "source": [ "bcolz.print_versions()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a table" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# The number of entries in the table\n", "N = int(1e7)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ctable((10000000,), [('f0', ' 0\")]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.11 s, sys: 58.4 ms, total: 1.17 s\n", "Wall time: 1.15 s\n" ] } ], "source": [ "# Do a query with the whole range of timestamps, but using a single column as output\n", "%time res = [row for row in ct.whereblocks(\"timestamp > 0\", outcols=\"f0\")]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 118 ms, sys: 12.6 ms, total: 131 ms\n", "Wall time: 83.2 ms\n" ] }, { "data": { "text/plain": [ "[array([( 2, 4.00000000e+00, 1.55833877e+09),\n", " ( 3, 9.00000000e+00, 1.55833883e+09),\n", " ( 4, 1.60000000e+01, 1.55833889e+09), ...,\n", " (16664, 2.77688896e+08, 1.55933849e+09),\n", " (16665, 2.77722225e+08, 1.55933855e+09),\n", " (16666, 2.77755556e+08, 1.55933861e+09)],\n", " dtype=[('f0', ' now + 100) & (timestamp < now + 1e6)\"\n", "%time [row for row in ct.whereblocks(cond)]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "carray((10000000,), bool)\n", " nbytes := 9.54 MB; cbytes := 257.10 KB; ratio: 37.98\n", " cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)\n", " chunklen := 262144; chunksize: 262144; blocksize: 65536\n", "[False False True ..., False False False]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Create a boolean array in-memory for ultimate speed\n", "carray_cond = bcolz.eval(\"(timestamps > now + 100) & (timestamps < now + 1e6)\")\n", "carray_cond" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note that compression ratio is very good, to the point that it usually fits in CPU caches." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 5.89 ms, sys: 1.56 ms, total: 7.45 ms\n", "Wall time: 7.8 ms\n" ] }, { "data": { "text/plain": [ "[array([( 2,), ( 3,), ( 4,), ..., (16664,), (16665,), (16666,)],\n", " dtype=[('f0', '> 3 & 0xFF\n", " return y\n", "%time [some_bitwise_op(row.astype(ct.dtype['f0'])) for row in ct.whereblocks(carray_cond, outcols=\"f0\")]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 201 ms, sys: 39.8 ms, total: 241 ms\n", "Wall time: 310 ms\n" ] }, { "data": { "text/plain": [ "[array([ 0, 0, 0, ..., 35, 35, 35], dtype=int32)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Now, compile the function with numba\n", "import numba\n", "@numba.jit(nopython=True, cache=True)\n", "def some_bitwise_op_numba(x):\n", " y = np.empty(x.shape, x.dtype)\n", " for i in range(len(x)):\n", " y[i] = x[i] >> 3 & 0xFF\n", " return y\n", "%time [some_bitwise_op_numba(row.astype(ct.dtype['f0'])) for row in ct.whereblocks(carray_cond, outcols=\"f0\")]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4.69 ms, sys: 1.04 ms, total: 5.73 ms\n", "Wall time: 4.86 ms\n" ] }, { "data": { "text/plain": [ "[array([ 0, 0, 0, ..., 35, 35, 35], dtype=int32)]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Ops, we have got too much time. Re-run again now that the numba has compiled the function \n", "%time [some_bitwise_op_numba(row.astype(ct.dtype['f0'])) for row in ct.whereblocks(carray_cond, outcols=\"f0\")]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hey, numba is really, really fast. That's all folks!" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 2 }