diff --git a/docs/mkdocs/docs/notebooks/ArcticDB_billion_row_challenge.ipynb b/docs/mkdocs/docs/notebooks/ArcticDB_billion_row_challenge.ipynb
new file mode 100644
index 0000000000..72d5e21305
--- /dev/null
+++ b/docs/mkdocs/docs/notebooks/ArcticDB_billion_row_challenge.ipynb
@@ -0,0 +1,572 @@
+{
+ "cells": [
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ArcticDB Billion Row Challenge Notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "* installs\n", + "* imports\n", + "* create ArticDB object store\n", + "* define the parameters of the problem\n", + "* create an ArticDB library to hold the data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install arcticdb" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from arcticdb_ext import set_config_int\n", + "set_config_int('VersionStore.NumCPUThreads', 16)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import arcticdb as adb" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "arctic = adb.Arctic(\"lmdb://arcticdb_brc\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "block_size: 62,500,000, total records: 1,000,000,000\n" + ] + } + ], + "source": [ + "sym_1brc = 'weather_stations_1brc'\n", + "num_cities = 10_000\n", + "num_rows = 1_000_000_000\n", + "aggs = {\n", + " 'max': ('Temperature', 'max'), \n", + " 'min': ('Temperature', 'min'),\n", + " 'mean': ('Temperature', 'mean')\n", + "}\n", + "num_blocks = 16\n", + "block_size = num_rows // num_blocks\n", + "seed = 17\n", + "cities = np.array([f\"city_{i:04d}\" for i in range(num_cities)])\n", + "print(f\"block_size: {block_size:,d}, total records: {block_size*num_blocks:,d}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "lib_name = 'arcticdb_brc'\n", + "# delete the library if it already exists\n", + "arctic.delete_library(lib_name)\n", + "# performance tuning: a large rows_per_segment value can improve performance for dataframes with a large number of rows\n", + "lib_options = adb.LibraryOptions(rows_per_segment=10_000_000)\n", + "lib = arctic.get_library(lib_name, create_if_missing=True, library_options=lib_options)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Write the Data to ArcticDB\n", + "* Generate the data: each row has a city chosen at random from the list and a random temperature between -99.9 and 99.9\n", + "* Data is written in blocks to control memory usage" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def create_block_df(rng, cities, block_size):\n", + " random_cities = rng.choice(cities, size=block_size)\n", + " random_temperatures = np.round(rng.uniform(-99.9, 99.9, size=block_size), 4)\n", + " return pd.DataFrame({'City': random_cities, 'Temperature': random_temperatures})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing blocks: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, Finished\n" + ] + } + ], + "source": [ + "rng = np.random.default_rng(seed)\n", + "print('Writing blocks: ', end='')\n", + "for b in range(num_blocks):\n", + " block_df = create_block_df(rng, cities, block_size)\n", + " if b==0:\n", + " lib.write(sym_1brc, block_df)\n", + " else:\n", + " lib.append(sym_1brc, block_df, 
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Writing blocks: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, Finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "rng = np.random.default_rng(seed)\n",
+    "print('Writing blocks: ', end='')\n",
+    "for b in range(num_blocks):\n",
+    "    block_df = create_block_df(rng, cities, block_size)\n",
+    "    if b==0:\n",
+    "        lib.write(sym_1brc, block_df)\n",
+    "    else:\n",
+    "        # validate_index=False: skip the sorted-index check on append,\n",
+    "        # which is not needed for this non-timeseries data\n",
+    "        lib.append(sym_1brc, block_df, validate_index=False)\n",
+    "    print(f'{b}, ', end='')\n",
+    "print(' Finished')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read and Aggregate the Data\n",
+    "* Uses the ArcticDB QueryBuilder to group and aggregate the data\n",
+    "* This allows the performant multi-threaded C++ layer to do the heavy lifting\n",
+    "* This code uses too much memory to run on the free tier of Google Colab. The chunked version below will work there"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "7.01 s ± 604 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "# this runs the query several times to get an accurate timing\n",
+    "lib.read(sym_1brc, query_builder=adb.QueryBuilder().groupby('City').agg(aggs))"
+   ]
+  },
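+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For a rough sense of what the QueryBuilder is doing under the hood, the sketch below (an illustrative addition, not part of the timed challenge) runs the equivalent pandas groupby. It operates on `block_df`, the last 62.5 million row block left over from the write loop, since the full billion rows would not fit in memory as a single dataframe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# illustrative comparison: the same aggregation in plain pandas,\n",
+    "# on one block of the data rather than the full billion rows\n",
+    "block_df.groupby('City')['Temperature'].agg(['max', 'min', 'mean'])"
+   ]
+  },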
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
minmeanmax
City
city_0000-99.9-0.199.9
city_0001-99.90.399.9
city_0002-99.9-0.099.9
city_0003-99.9-0.399.9
city_0004-99.90.499.9
............
city_9995-99.9-0.199.9
city_9996-99.90.199.9
city_9997-99.9-0.099.9
city_9998-99.9-0.199.9
city_9999-99.9-0.399.9
\n", + "

10000 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " min mean max\n", + "City \n", + "city_0000 -99.9 -0.1 99.9\n", + "city_0001 -99.9 0.3 99.9\n", + "city_0002 -99.9 -0.0 99.9\n", + "city_0003 -99.9 -0.3 99.9\n", + "city_0004 -99.9 0.4 99.9\n", + "... ... ... ...\n", + "city_9995 -99.9 -0.1 99.9\n", + "city_9996 -99.9 0.1 99.9\n", + "city_9997 -99.9 -0.0 99.9\n", + "city_9998 -99.9 -0.1 99.9\n", + "city_9999 -99.9 -0.3 99.9\n", + "\n", + "[10000 rows x 3 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# run the query once more to see the output\n", + "agg_results = lib.read(sym_1brc, query_builder=adb.QueryBuilder().groupby('City').agg(aggs))\n", + "agg_results.data.sort_index().round(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusions\n", + "* It was easy to solve the 1 billion row challenge using ArticDB\n", + "* The code is short and easy to read\n", + "* Almost no tuning was needed to get good performance" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus: Chunked Read and Aggregate\n", + "* This version reads and aggregates in chunks\n", + "* It has the same end result as the simpler version above\n", + "* It performs almost as well and needs less memory\n", + "* In particular, it will run within the memory on the free version of Google colab" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# we need to aggregate sum and count to get the aggregated mean\n", + "aggs_chunked = {\n", + " 'max': ('Temperature', 'max'), \n", + " 'min': ('Temperature', 'min'),\n", + " 'sum': ('Temperature', 'sum'),\n", + " 'count': ('Temperature', 'count')\n", + "}\n", + "\n", + "# define a list of ReadRequests - the chunks are based on row_ranges\n", + "read_requests = [adb.ReadRequest(symbol=sym_1brc, \n", + " row_range=(block_size*b, block_size*(b+1)), \n", + " query_builder=adb.QueryBuilder().groupby('City').agg(aggs_chunked))\n", + " for b in range(num_blocks)\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "# these functions merge the results of the chunks into one result\n", + "def merge_results_pair(r0, r1):\n", + " join_r = r0.join(r1, lsuffix='_0', rsuffix='_1')\n", + " return pd.DataFrame(index=join_r.index,\n", + " data={\n", + " 'min': join_r[['min_0', 'min_1']].min(axis=1),\n", + " 'max': join_r[['max_0', 'max_1']].max(axis=1),\n", + " 'count': join_r[['count_0', 'count_1']].sum(axis=1),\n", + " 'sum': join_r[['sum_0', 'sum_1']].sum(axis=1),\n", + " }\n", + " )\n", + "\n", + "def merge_results(r):\n", + " res = r[0].data.sort_index()\n", + " for b in range(1, len(r)):\n", + " next_res = r[b].data.sort_index()\n", + " res = merge_results_pair(res, next_res)\n", + " res['mean'] = res['sum'] / res['count']\n", + " res = res.drop(columns=['sum', 'count']).loc[:, ['min', 'mean', 'max']].round(1)\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
minmeanmax
City
city_0000-99.9-0.199.9
city_0001-99.90.399.9
city_0002-99.9-0.099.9
city_0003-99.9-0.399.9
city_0004-99.90.499.9
............
city_9995-99.9-0.199.9
city_9996-99.90.199.9
city_9997-99.9-0.099.9
city_9998-99.9-0.199.9
city_9999-99.9-0.399.9
\n", + "

10000 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " min mean max\n", + "City \n", + "city_0000 -99.9 -0.1 99.9\n", + "city_0001 -99.9 0.3 99.9\n", + "city_0002 -99.9 -0.0 99.9\n", + "city_0003 -99.9 -0.3 99.9\n", + "city_0004 -99.9 0.4 99.9\n", + "... ... ... ...\n", + "city_9995 -99.9 -0.1 99.9\n", + "city_9996 -99.9 0.1 99.9\n", + "city_9997 -99.9 -0.0 99.9\n", + "city_9998 -99.9 -0.1 99.9\n", + "city_9999 -99.9 -0.3 99.9\n", + "\n", + "[10000 rows x 3 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# run the query\n", + "read_results = lib.read_batch(read_requests)\n", + "results = merge_results(read_results)\n", + "results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/mkdocs/mkdocs.yml b/docs/mkdocs/mkdocs.yml index 833eac2acc..adabebbdc1 100644 --- a/docs/mkdocs/mkdocs.yml +++ b/docs/mkdocs/mkdocs.yml @@ -109,6 +109,7 @@ nav: - Snapshots Notebook: 'notebooks/ArcticDB_demo_snapshots.ipynb' - Equity Analytics Notebook: 'notebooks/ArcticDB_demo_equity_analytics.ipynb' - Equity Options Notebook: 'notebooks/ArcticDB_demo_equity_options.ipynb' + - 1 Billion Row Challenge Notebook: 'notebooks/ArcticDB_billion_row_challenge.ipynb' - Python API Reference: - Introduction: 'api/index.md' - Arctic: 'api/arctic.md'