From 1f97cc2d625e08a2d3ae19ecb01df51f36e80073 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sat, 18 Jul 2020 16:29:00 -0700 Subject: [PATCH 01/11] Add hyper-pararmeter-optimization notebook with Hyperband Currently depends on https://github.com/dask/dask-ml/pull/701 This could be improved by using an estimator that benefitted from large amounts of data. --- hyper-parameter-optimization.ipynb | 253 +++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 hyper-parameter-optimization.ipynb diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb new file mode 100644 index 0000000..e6d16ef --- /dev/null +++ b/hyper-parameter-optimization.ipynb @@ -0,0 +1,253 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get resources" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import coiled\n", + "import dask.distributed" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using existing cluster: play\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 20
  • \n", + "
  • Cores: 40
  • \n", + "
  • Memory: 85.90 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cluster = coiled.Cluster(\n", + " name=\"play\", \n", + " n_workers=20, \n", + " configuration=\"coiled/dask-examples\", \n", + " shutdown_on_close=False\n", + ")\n", + "client = dask.distributed.Client(cluster)\n", + "\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get and pre-process data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as dd\n", + "df = dd.read_csv(\n", + " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv\", \n", + " parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"],\n", + " dtype={\n", + " \"VendorID\": \"UInt8\",\n", + " \"passenger_count\": \"UInt8\",\n", + " \"RatecodeID\": \"UInt8\",\n", + " \"store_and_fwd_flag\": \"category\",\n", + " \"PULocationID\": \"UInt16\",\n", + " \"DOLocationID\": \"UInt16\", \n", + " \"payment_type\": \"UInt8\",\n", + " },\n", + ")\n", + "\n", + "data = df[[\"passenger_count\", \"trip_distance\", \"RatecodeID\", \"payment_type\", \"fare_amount\"]]\n", + "data = data.fillna(0)\n", + "\n", + "labels = (df.tip_amount / df.fare_amount) > 0.25\n", + "labels = labels.fillna(False)\n", + "\n", + "from dask_ml.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(data, labels, shuffle=True)\n", + "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Train model" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import SGDClassifier\n", + "from dask_ml.model_selection import HyperbandSearchCV\n", + "from scipy.stats import uniform, loguniform\n", + "\n", + "\n", + "clf = 
SGDClassifier(tol=1e-3, penalty='elasticnet', random_state=0)\n", + "\n", + "params = {'alpha': loguniform(1e-2, 1e0), # or np.logspace\n", + " 'l1_ratio': uniform(0, 1)} # or np.linspace\n", + "\n", + "search = HyperbandSearchCV(clf, params, max_iter=81, random_state=0)\n", + "\n", + "search.fit(X_train, y_train, classes=[0, 1]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Score" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.766619056755278" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "search.score(X_test.sample(frac=0.1, random_state=123), y_test.sample(frac=0.1, random_state=123))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What if we just sampled instead?" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "SGDClassifier(penalty='elasticnet', random_state=0)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.fit(\n", + " X_train.sample(frac=0.01, random_state=123).compute(), \n", + " y_train.sample(frac=0.01, random_state=123).compute()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7487741904919819" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf.score(\n", + " X_test.sample(frac=0.01, random_state=123).compute(), \n", + " y_test.sample(frac=0.01, random_state=123).compute()\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Depending on our business needs, we maybe didn't need to do all of this :)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda 
env:coiled-coiled-dask-examples]", + "language": "python", + "name": "conda-env-coiled-coiled-dask-examples-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 72b479b505ab9b6cca3ff195c76b1a27f6f516e7 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Fri, 24 Jul 2020 10:29:38 -0700 Subject: [PATCH 02/11] squashme --- hyper-parameter-optimization.ipynb | 138 ++++++++++++++++++----------- 1 file changed, 85 insertions(+), 53 deletions(-) diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb index e6d16ef..dabcf13 100644 --- a/hyper-parameter-optimization.ipynb +++ b/hyper-parameter-optimization.ipynb @@ -37,23 +37,23 @@ "\n", "

Client

\n", "\n", "\n", "\n", "

Cluster

\n", "
    \n", - "
  • Workers: 20
  • \n", - "
  • Cores: 40
  • \n", - "
  • Memory: 85.90 GB
  • \n", + "
  • Workers: 15
  • \n", + "
  • Cores: 60
  • \n", + "
  • Memory: 128.85 GB
  • \n", "
\n", "\n", "\n", "" ], "text/plain": [ - "" + "" ] }, "execution_count": 2, @@ -64,8 +64,8 @@ "source": [ "cluster = coiled.Cluster(\n", " name=\"play\", \n", - " n_workers=20, \n", - " configuration=\"coiled/dask-examples\", \n", + " n_workers=15, \n", + " configuration=\"coiled/default\", \n", " shutdown_on_close=False\n", ")\n", "client = dask.distributed.Client(cluster)\n", @@ -74,12 +74,32 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 8, "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client\n", + "_GatheringFuture exception was never retrieved\n", + "future: <_GatheringFuture finished exception=CancelledError()>\n", + "asyncio.exceptions.CancelledError\n" + ] + } + ], "source": [ - "## Get and pre-process data" + "cluster.scale(0)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": 3, @@ -99,6 +119,7 @@ " \"DOLocationID\": \"UInt16\", \n", " \"payment_type\": \"UInt8\",\n", " },\n", + " blocksize=\"16 MiB\",\n", ")\n", "\n", "data = df[[\"passenger_count\", \"trip_distance\", \"RatecodeID\", \"payment_type\", \"fare_amount\"]]\n", @@ -113,6 +134,26 @@ "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "131504474" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.passenger_count.sum().compute()" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -124,8 +165,32 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "TypeError", + "evalue": "Cannot operate on Dask array with unknown chunk sizes. 
Use the following the compute chunk sizes:\n\n >>> X.compute_chunk_sizes() # if Dask.Array\n >>> ddf.to_dask_array(lengths=True) # if Dask.Dataframe", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 672\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 673\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mdefault_client\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 674\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 675\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mif_delegate_has_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelegate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"best_estimator_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"estimator\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/distributed/client.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, 
**kwargs)\u001b[0m\n\u001b[1;32m 830\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 831\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 832\u001b[0;31m return sync(\n\u001b[0m\u001b[1;32m 833\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 834\u001b[0m )\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mtyp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36mf\u001b[0;34m()\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback_timeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0mfuture\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0masyncio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 323\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 324\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/tornado/gen.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 734\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 735\u001b[0;31m 
\u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 736\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[0mexc_info\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/tornado/gen.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;31m# performance penalty for the synchronous case.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 209\u001b[0;31m \u001b[0myielded\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 210\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mStopIteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mReturn\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 211\u001b[0m future_set_result_unless_cancelled(\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/model_selection/_hyperband.py\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 387\u001b[0m 
\u001b[0;34m@\u001b[0m\u001b[0mgen\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcoroutine\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 388\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 389\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscorer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_parameters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 390\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[0mbrackets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_hyperband_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_iter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meta\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maggressiveness\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py\u001b[0m in \u001b[0;36m_validate_parameters\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dask_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccept_unknown_chunks\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_dask_dataframe\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[0mscorer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_scoring\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py\u001b[0m in \u001b[0;36m_check_array\u001b[0;34m(self, X, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 526\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/utils.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_dask_array, accept_dask_dataframe, accept_unknown_chunks, accept_multiple_blocks, preserve_pandas_dataframe, remove_zero_chunks, *args, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0maccept_unknown_chunks\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnan\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 144\u001b[0;31m raise TypeError(\n\u001b[0m\u001b[1;32m 145\u001b[0m \u001b[0;34m\"Cannot operate on Dask array with 
unknown chunk sizes. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 146\u001b[0m \u001b[0;34m\"Use the following the compute chunk sizes:\\n\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mTypeError\u001b[0m: Cannot operate on Dask array with unknown chunk sizes. Use the following the compute chunk sizes:\n\n >>> X.compute_chunk_sizes() # if Dask.Array\n >>> ddf.to_dask_array(lengths=True) # if Dask.Dataframe" + ] + } + ], "source": [ + "%%time\n", + "\n", "from sklearn.linear_model import SGDClassifier\n", "from dask_ml.model_selection import HyperbandSearchCV\n", "from scipy.stats import uniform, loguniform\n", @@ -150,20 +215,9 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.766619056755278" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "search.score(X_test.sample(frac=0.1, random_state=123), y_test.sample(frac=0.1, random_state=123))" ] @@ -177,20 +231,9 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "SGDClassifier(penalty='elasticnet', random_state=0)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "clf.fit(\n", " X_train.sample(frac=0.01, random_state=123).compute(), \n", @@ -200,20 +243,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7487741904919819" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "clf.score(\n", " X_test.sample(frac=0.01, random_state=123).compute(), \n", @@ -231,9 +263,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda 
env:coiled-coiled-dask-examples]", + "display_name": "Python [conda env:coiled-coiled-default]", "language": "python", - "name": "conda-env-coiled-coiled-dask-examples-py" + "name": "conda-env-coiled-coiled-default-py" }, "language_info": { "codemirror_mode": { @@ -245,7 +277,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.4" + "version": "3.8.5" } }, "nbformat": 4, From 3bedf545cd3cf4e4977d30710b53cf7f1f0e71da Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Sat, 25 Jul 2020 09:45:52 -0700 Subject: [PATCH 03/11] cleanup --- hyper-parameter-optimization.ipynb | 121 ++--------------------------- 1 file changed, 8 insertions(+), 113 deletions(-) diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb index dabcf13..cc30473 100644 --- a/hyper-parameter-optimization.ipynb +++ b/hyper-parameter-optimization.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -19,54 +19,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using existing cluster: play\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "

Client

\n", - "\n", - "
\n", - "

Cluster

\n", - "
    \n", - "
  • Workers: 15
  • \n", - "
  • Cores: 60
  • \n", - "
  • Memory: 128.85 GB
  • \n", - "
\n", - "
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "cluster = coiled.Cluster(\n", - " name=\"play\", \n", - " n_workers=15, \n", + " n_workers=20, \n", " configuration=\"coiled/default\", \n", - " shutdown_on_close=False\n", ")\n", "client = dask.distributed.Client(cluster)\n", "\n", @@ -74,23 +33,10 @@ ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client\n", - "_GatheringFuture exception was never retrieved\n", - "future: <_GatheringFuture finished exception=CancelledError()>\n", - "asyncio.exceptions.CancelledError\n" - ] - } - ], "source": [ - "cluster.scale(0)" + "## Get and pre-process data" ] }, { @@ -98,13 +44,6 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], "source": [ "import dask.dataframe as dd\n", "df = dd.read_csv(\n", @@ -134,26 +73,6 @@ "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" ] }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "131504474" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.passenger_count.sum().compute()" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -163,34 +82,10 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "TypeError", - "evalue": "Cannot operate on Dask array with unknown chunk sizes. 
Use the following the compute chunk sizes:\n\n >>> X.compute_chunk_sizes() # if Dask.Array\n >>> ddf.to_dask_array(lengths=True) # if Dask.Dataframe", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 671\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 672\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mcontext\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 673\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mdefault_client\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 674\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 675\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mif_delegate_has_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelegate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"best_estimator_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"estimator\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/distributed/client.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, 
**kwargs)\u001b[0m\n\u001b[1;32m 830\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 831\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 832\u001b[0;31m return sync(\n\u001b[0m\u001b[1;32m 833\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 834\u001b[0m )\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 337\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mtyp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 339\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtb\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 340\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 341\u001b[0m \u001b[0;32mreturn\u001b[0m 
\u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36mf\u001b[0;34m()\u001b[0m\n\u001b[1;32m 321\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallback_timeout\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 322\u001b[0m \u001b[0mfuture\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0masyncio\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfuture\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 323\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 324\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 325\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/tornado/gen.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 733\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 734\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 735\u001b[0;31m 
\u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfuture\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 736\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 737\u001b[0m \u001b[0mexc_info\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/tornado/gen.py\u001b[0m in \u001b[0;36mwrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 207\u001b[0m \u001b[0;31m# performance penalty for the synchronous case.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 208\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 209\u001b[0;31m \u001b[0myielded\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresult\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 210\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mStopIteration\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mReturn\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 211\u001b[0m future_set_result_unless_cancelled(\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/model_selection/_hyperband.py\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 387\u001b[0m 
\u001b[0;34m@\u001b[0m\u001b[0mgen\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcoroutine\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 388\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 389\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscorer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_validate_parameters\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 390\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 391\u001b[0m \u001b[0mbrackets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_get_hyperband_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax_iter\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meta\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0maggressiveness\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py\u001b[0m in \u001b[0;36m_validate_parameters\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 501\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_dask_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 502\u001b[0m \u001b[0mkwargs\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0maccept_unknown_chunks\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_dask_dataframe\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 503\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 504\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mensure_2d\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[0mscorer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_scoring\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscoring\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/model_selection/_incremental.py\u001b[0m in \u001b[0;36m_check_array\u001b[0;34m(self, X, **kwargs)\u001b[0m\n\u001b[1;32m 522\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mda\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 524\u001b[0;31m \u001b[0mX\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 525\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 526\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/miniconda/envs/coiled-coiled-default/lib/python3.8/site-packages/dask_ml/utils.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_dask_array, accept_dask_dataframe, accept_unknown_chunks, accept_multiple_blocks, preserve_pandas_dataframe, remove_zero_chunks, *args, **kwargs)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0maccept_unknown_chunks\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 143\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misnan\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 144\u001b[0;31m raise TypeError(\n\u001b[0m\u001b[1;32m 145\u001b[0m \u001b[0;34m\"Cannot operate on Dask array with 
unknown chunk sizes. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 146\u001b[0m \u001b[0;34m\"Use the following the compute chunk sizes:\\n\\n\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mTypeError\u001b[0m: Cannot operate on Dask array with unknown chunk sizes. Use the following the compute chunk sizes:\n\n >>> X.compute_chunk_sizes() # if Dask.Array\n >>> ddf.to_dask_array(lengths=True) # if Dask.Dataframe" - ] - } - ], + "outputs": [], "source": [ - "%%time\n", - "\n", "from sklearn.linear_model import SGDClassifier\n", "from dask_ml.model_selection import HyperbandSearchCV\n", "from scipy.stats import uniform, loguniform\n", From 7ab89eb0eef2093bc8c1ba8b014d835cae80e7d0 Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 27 Jul 2020 22:41:37 -0500 Subject: [PATCH 04/11] Make edits to Hyperband example --- hyper-parameter-optimization.ipynb | 347 ++++++++++++++++++++++++++--- 1 file changed, 311 insertions(+), 36 deletions(-) diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb index cc30473..7d88c7b 100644 --- a/hyper-parameter-optimization.ipynb +++ b/hyper-parameter-optimization.ipynb @@ -4,12 +4,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Get resources" + "This example will walk through the following:\n", + "\n", + "* **Getting and processing the data.**\n", + "* **Defining a model and parameters.**\n", + "* **Finding the best parameters,** and some details on why we're using the chosen search algorithm.\n", + "* **Scoring** and deploying.\n", + "\n", + "All of these tasks will be performed on the New York City Taxi Cab dataset." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup cluster" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -39,37 +53,101 @@ "## Get and pre-process data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example will mirror the Kaggle \"[NYC Taxi Trip Duration][1]\" example with different data.\n", + "\n", + "This data has 84 million taxi rides.\n", + "\n", + "[1]:https://www.kaggle.com/c/nyc-taxi-trip-duration/" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import dask.dataframe as dd\n", + "\n", + "features = [\"passenger_count\", \"trip_distance\", \"RatecodeID\", \"payment_type\", \"fare_amount\"]\n", + "output = [\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"]\n", "df = dd.read_csv(\n", " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv\", \n", - " parse_dates=[\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"],\n", + " parse_dates=output,\n", + " usecols=features + output,\n", " dtype={\n", - " \"VendorID\": \"UInt8\",\n", " \"passenger_count\": \"UInt8\",\n", " \"RatecodeID\": \"UInt8\",\n", - " \"store_and_fwd_flag\": \"category\",\n", - " \"PULocationID\": \"UInt16\",\n", - " \"DOLocationID\": \"UInt16\", \n", " \"payment_type\": \"UInt8\",\n", " },\n", " blocksize=\"16 MiB\",\n", ")\n", + "df = df.persist()\n", "\n", - "data = df[[\"passenger_count\", \"trip_distance\", \"RatecodeID\", \"payment_type\", \"fare_amount\"]]\n", - "data = data.fillna(0)\n", - "\n", - "labels = (df.tip_amount / df.fare_amount) > 0.25\n", - "labels = labels.fillna(False)\n", + "data = df[features]\n", + "data = data.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "durations = (df[\"tpep_dropoff_datetime\"] - df[\"tpep_pickup_datetime\"]).dt.total_seconds() / 60 # minutes" + ] + 
}, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "84399019" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(durations)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "from dask_ml.preprocessing import OneHotEncoder\n", + "rates = df[\"RatecodeID\"]\n", "\n", + "# Difficulty with this command\n", + "# rates_flags = OneHotEncoder().fit_transform((rates * 1.0).to_dask_array(lengths=True).reshape(-1, 1))\n", + "## After that's done, I'd stick df and rate_flags together and call that the training set\n", + "## It might be simpler to skip this cell" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ "from dask_ml.model_selection import train_test_split\n", "\n", - "X_train, X_test, y_train, y_test = train_test_split(data, labels, shuffle=True)\n", + "features = data.to_dask_array(lengths=True) # because MLPRegressor doesn't support dataframes\n", + "output = durations.to_dask_array(lengths=True)\n", + "X_train, X_test, y_train, y_test = train_test_split(features, output, shuffle=True)\n", + "\n", + "# persist the data so it's not re-computed\n", "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" ] }, @@ -77,28 +155,143 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Train model" + "## Define model and hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use Scikit-Learn's neural network as a stand-in for a more complicated model that needs GPUs.\n", + "\n", + "If desired, [PyTorch] can be used seamlessly in Dask-ML through the Scikit-Learn wrapper [skorch]. 
PyTorch is a popular deep learning that has strong GPU support, and Skorch is a simple wrapper that provides a Scikit-Learn API for PyTorch.\n", + "\n", + "[PyTorch]:https://pytorch.org/\n", + "[skorch]:https://skorch.readthedocs.io/en/stable/" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "from sklearn.linear_model import SGDClassifier\n", - "from dask_ml.model_selection import HyperbandSearchCV\n", + "from sklearn.neural_network import MLPRegressor\n", "from scipy.stats import uniform, loguniform\n", "\n", + "# Input: XXX features\n", + "# Output: 1 scalar, estimated trip duration\n", + "model = MLPRegressor()\n", + "\n", + "params = {\n", + " \"hidden_layer_sizes\": [\n", + " (100, ),\n", + " (50, ) * 2,\n", + " (34, 33, 33),\n", + " (25, ) * 4,\n", + " (20, ) * 5,\n", + " (10, ) * 10,\n", + " ], # 100 neurons; how much does width/depth help?\n", + " \"activation\": [\"logistic\", \"tanh\", \"relu\"],\n", + " \"alpha\": loguniform(1e-5, 1e-3),\n", + " \"batch_size\": [128, 256, 512, 1024],\n", + " \"learning_rate_init\": loguniform(1e-4, 1e-2),\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All of these parameters control model architecture, except for two basic optimization parameters, `batch_size` and `learning_rate_init`. They control finding the best model of a particular architecture." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find the best hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our search is \"compute constrained\" because (hypothetically) it requires GPUs and has a pretty complicated search space (in reality it has neither of those features). 
And obviously it's \"memory-constrained\" because the dataset doesn't fit in memory.\n", "\n", - "clf = SGDClassifier(tol=1e-3, penalty='elasticnet', random_state=0)\n", + "[Dask-ML's documentation on hyperparameter searches][2] indicates that we should use `HyperbandSearchCV`. To use this class, we need to know two items:\n", "\n", - "params = {'alpha': loguniform(1e-2, 1e0), # or np.logspace\n", - " 'l1_ratio': uniform(0, 1)} # or np.linspace\n", + "* `n_params`, the (approximate) number of parameters to sample.\n", + "* `n_examples`, the largest number of examples any model will see.\n", "\n", - "search = HyperbandSearchCV(clf, params, max_iter=81, random_state=0)\n", + "[2]:https://ml.dask.org/hyper-parameter-search.html" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "from dask_ml.model_selection import HyperbandSearchCV\n", "\n", - "search.fit(X_train, y_train, classes=[0, 1]);" + "n_params = 25\n", + "n_examples = 1e6" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`HyperbandSearchCV` comes with a rule-of-thumb to compute the inputs:" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "max_iter = n_params # how many partial_fit calls?\n", + "chunksize = n_examples // n_params # how many examples does each partial_fit call see?\n", + "\n", + "X_train2 = X_train.rechunk(chunks=(chunksize, -1))\n", + "y_train2 = y_train.rechunk(chunks=chunksize)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's run the search. 
Because this is an initial search, let's set `aggressiveness=4`:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0msearch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mHyperbandSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0msearch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclasses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/dask_ml/model_selection/_incremental.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[0mAdditional\u001b[0m \u001b[0mpartial\u001b[0m \u001b[0mfit\u001b[0m \u001b[0mkeyword\u001b[0m \u001b[0marguments\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
609\u001b[0m \"\"\"\n\u001b[0;32m--> 610\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mdefault_client\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 611\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 612\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mif_delegate_has_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelegate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"best_estimator_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"estimator\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/distributed/client.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 830\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 831\u001b[0m return sync(\n\u001b[0;32m--> 832\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 833\u001b[0m )\n\u001b[1;32m 834\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(loop, func, 
callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_set\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mtyp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.7/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 550\u001b[0m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_flag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 552\u001b[0;31m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cond\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 553\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 554\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/anaconda3/lib/python3.7/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 300\u001b[0;31m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 301\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "search = HyperbandSearchCV(model, params, max_iter=n_params, aggressiveness=4, random_state=0)\n", + "\n", + "search.fit(X_train2, y_train2, classes=[0, 1]);" ] }, { @@ -121,7 +314,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## What if we just sampled instead?" 
+ "We can also obtain the best estimator through the `best_estimator_` attribute:" ] }, { @@ -130,10 +323,14 @@ "metadata": {}, "outputs": [], "source": [ - "clf.fit(\n", - " X_train.sample(frac=0.01, random_state=123).compute(), \n", - " y_train.sample(frac=0.01, random_state=123).compute()\n", - ")" + "search.best_estimator_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This means we can score on the entire dataset:" ] }, { @@ -142,25 +339,103 @@ "metadata": {}, "outputs": [], "source": [ - "clf.score(\n", - " X_test.sample(frac=0.01, random_state=123).compute(), \n", - " y_test.sample(frac=0.01, random_state=123).compute()\n", - ")" + "from dask_ml.wrappers import ParallelPostFit\n", + "deployed_model = ParallelPostFit(search.best_estimator_)\n", + "deployed_model.score(X_test, y_test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Depending on our business needs, we maybe didn't need to do all of this :)" + "`HyperbandSearchCV` and the like mirror the Scikit-Learn model selection interface, so all attributes of Scikit-Learn's [RandomizedSearchCV][rscv] are available:\n", + "\n", + "[rscv]:https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "search.best_score_" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "search.best_params_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Why not simply sample instead?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sampling solves the memory issues:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_small = X_train.sample(frac=0.01, random_state=123).compute()\n", + "y_train_small = y_train.sample(frac=0.01, random_state=123).compute()\n", + "\n", + "X_train_small # NumPy ndarray; must fit in memory" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But `HyperbandSearchCV` is meant for computationally-constrained problems, regardless of their memory usage (which [Dask-ML's documentation on hyperparameter searches][2] also indicate). `HyperbandSearchCV` would still be relevant:\n", + "\n", + "[2]:https://ml.dask.org/hyper-parameter-search.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "search = HyperbandSearchCV(model, params, max_iter=81, random_state=0)\n", + "search.fit(X_train_small, y_train_small, classes=[0, 1]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`HyperbandSearchCV` would not be relevant when the search problem is not computationally-constrained, which happens with a smaller search space or a simpler model that doesn't require GPUs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:coiled-coiled-default]", + "display_name": "Python [conda env:root] *", "language": "python", - "name": "conda-env-coiled-coiled-default-py" + "name": "conda-root-py" }, "language_info": { "codemirror_mode": { @@ -172,7 +447,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.7.4" } }, "nbformat": 4, From a49ccdaa330f5fb644dc38e9480b9dee72128ebf Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 27 Jul 2020 22:46:19 -0500 Subject: [PATCH 05/11] Note about when to use IncrementalSearchCV --- hyper-parameter-optimization.ipynb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb index 7d88c7b..1ed73b8 100644 --- a/hyper-parameter-optimization.ipynb +++ b/hyper-parameter-optimization.ipynb @@ -420,7 +420,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`HyperbandSearchCV` would not be relevant when the search problem is not computationally-constrained, which happens with a smaller search space or a simpler model that doesn't require GPUs." + "`HyperbandSearchCV` would not be relevant when the search problem is not computationally-constrained, which happens with a smaller search space or a simpler model that doesn't require GPUs.\n", + "\n", + "If we had a simpler model and a massive dataset, `IncrementalSearchCV` is recommended. It mirrors Scikit-Learn's `RandomizedSearchCV` but works on Dask Arrays/Dataframes, both of which can be larger than memory." 
] }, { From 0dd0a060c5d8ec377de910874129aa88baa7265d Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 27 Jul 2020 22:50:57 -0500 Subject: [PATCH 06/11] wording --- hyper-parameter-optimization.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb index 1ed73b8..bc1b61d 100644 --- a/hyper-parameter-optimization.ipynb +++ b/hyper-parameter-optimization.ipynb @@ -59,7 +59,7 @@ "source": [ "This example will mirror the Kaggle \"[NYC Taxi Trip Duration][1]\" example with different data.\n", "\n", - "This data has 84 million taxi rides.\n", + "These data have records on 84 million taxi rides.\n", "\n", "[1]:https://www.kaggle.com/c/nyc-taxi-trip-duration/" ] @@ -217,7 +217,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Our search is \"compute constrained\" because (hypothetically) it requires GPUs and has a pretty complicated search space (in reality it has neither of those features). And obviously it's \"memory-constrained\" because the dataset doesn't fit in memory.\n", + "Our search is \"computationally-constrained\" because (hypothetically) it requires GPUs and has a pretty complicated search space (in reality it has neither of those features). And obviously it's \"memory-constrained\" because the dataset doesn't fit in memory.\n", "\n", "[Dask-ML's documentation on hyperparameter searches][2] indicates that we should use `HyperbandSearchCV`. 
To use this class, we need to know two items:\n", "\n", From 14c41c04855d8f275ba14caa0d8b429e87af9708 Mon Sep 17 00:00:00 2001 From: Scott Date: Mon, 27 Jul 2020 22:57:41 -0500 Subject: [PATCH 07/11] Add visualiation --- hyper-parameter-optimization.ipynb | 72 ++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb index bc1b61d..0bf66c6 100644 --- a/hyper-parameter-optimization.ipynb +++ b/hyper-parameter-optimization.ipynb @@ -371,6 +371,78 @@ "search.best_params_" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What does the error distribution look like?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = deployed_model.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Prediction error')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "image/png": { + "height": 277, + "width": 388 + }, + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "## This cell was run after the fact from a fresh notebook; rerun to \n", + "## y_test := as above. Ground truth trip durations\n", + "## y_pred := model output (as in the cell above?)\n", + "\n", + "y_test = np.random.uniform(0, 1, size=1000)\n", + "y_pred = np.random.uniform(0, 1, size=1000)\n", + "\n", + "err = np.abs(y_pred - y_test)\n", + "ax = pd.Series(err).plot.hist()\n", + "ax.set_xlabel(\"Prediction error (minutes)\")\n", + "ax.set_title(\"Prediction error\")" + ] + }, { "cell_type": "markdown", "metadata": {}, From fc2a73e63db2ab91582cd16368ae8e68ab7eb18e Mon Sep 17 00:00:00 2001 From: Scott Date: Thu, 30 Jul 2020 17:06:50 -0500 Subject: [PATCH 08/11] pytorch --- hyper-parameter-optimization.ipynb | 481 +++++++++++++++++++---------- 1 file changed, 321 insertions(+), 160 deletions(-) diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb index 0bf66c6..ecf26bf 100644 --- a/hyper-parameter-optimization.ipynb +++ b/hyper-parameter-optimization.ipynb @@ -23,7 +23,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -66,34 +66,42 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import dask.dataframe as dd\n", "\n", - "features = [\"passenger_count\", \"trip_distance\", \"RatecodeID\", \"payment_type\", \"fare_amount\"]\n", + "features = [\"passenger_count\", \"trip_distance\", \"fare_amount\"]\n", + "categorical_features = [\"RatecodeID\", \"payment_type\"]\n", "output = [\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"]\n", + "\n", "df = dd.read_csv(\n", " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv\", \n", " parse_dates=output,\n", - " 
usecols=features + output,\n", + " usecols=features + categorical_features + output,\n", " dtype={\n", " \"passenger_count\": \"UInt8\",\n", - " \"RatecodeID\": \"UInt8\",\n", - " \"payment_type\": \"UInt8\",\n", + " \"RatecodeID\": \"category\",\n", + " \"payment_type\": \"category\",\n", " },\n", " blocksize=\"16 MiB\",\n", ")\n", + "\n", + "# one hot encode the categorical columns;\n", + "# if df[\"foo\"].unique() == [1, 3, 4], add columns foo_1, foo_3, foo_4\n", + "df = dd.get_dummies(df, columns=categorical_features)\n", + "\n", + "# persist so only download once\n", "df = df.persist()\n", "\n", - "data = df[features]\n", + "data = df[[c for c in df.columns if c not in output]]\n", "data = data.fillna(0)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -104,47 +112,13 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "84399019" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(durations)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "from dask_ml.preprocessing import OneHotEncoder\n", - "rates = df[\"RatecodeID\"]\n", - "\n", - "# Difficulty with this command\n", - "# rates_flags = OneHotEncoder().fit_transform((rates * 1.0).to_dask_array(lengths=True).reshape(-1, 1))\n", - "## After that's done, I'd stick df and rate_flags together and call that the training set\n", - "## It might be simpler to skip this cell" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, "outputs": [], "source": [ "from dask_ml.model_selection import train_test_split\n", + "import dask\n", "\n", - "features = data.to_dask_array(lengths=True) # because MLPRegressor doesn't support dataframes\n", - "output = durations.to_dask_array(lengths=True)\n", + "features = 
 data.to_dask_array(lengths=True).astype(\"float32\")\n", + "output = durations.to_dask_array(lengths=True).astype(\"float32\")\n", "X_train, X_test, y_train, y_test = train_test_split(features, output, shuffle=True)\n", "\n", "# persist the data so it's not re-computed\n", @@ -162,136 +136,207 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's use Scikit-Learn's neural network as a stand-in for a more complicated model that needs GPUs.\n", + "Let's use a simple neural network from [PyTorch] using [Skorch], a simple wrapper that provides a Scikit-Learn API for PyTorch.\n", "\n", - "If desired, [PyTorch] can be used seamlessly in Dask-ML through the Scikit-Learn wrapper [skorch]. PyTorch is a popular deep learning that has strong GPU support, and Skorch is a simple wrapper that provides a Scikit-Learn API for PyTorch.\n", + "This network is only small for demonstration. If desired, we could use much larger networks on GPUs.\n", "\n", "[PyTorch]:https://pytorch.org/\n", "[skorch]:https://skorch.readthedocs.io/en/stable/" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If desired, this model could use GPUs." 
+ ] + }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "from sklearn.neural_network import MLPRegressor\n", - "from scipy.stats import uniform, loguniform\n", + "import torch.optim as optim\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", "\n", - "# Input: XXX features\n", - "# Output: 1 scalar, estimated trip duration\n", - "model = MLPRegressor()\n", + "class HiddenLayerNet(nn.Module):\n", + " def __init__(self, n_features=10, n_outputs=1, n_hidden=100, activation=\"relu\"):\n", + " super().__init__()\n", + " self.fc1 = nn.Linear(n_features, n_hidden)\n", + " self.fc2 = nn.Linear(n_hidden, n_outputs)\n", + " self.activation = getattr(F, activation)\n", "\n", - "params = {\n", - " \"hidden_layer_sizes\": [\n", - " (100, ),\n", - " (50, ) * 2,\n", - " (34, 33, 33),\n", - " (25, ) * 4,\n", - " (20, ) * 5,\n", - " (10, ) * 10,\n", - " ], # 100 neurons; how much does width/depth help?\n", - " \"activation\": [\"logistic\", \"tanh\", \"relu\"],\n", - " \"alpha\": loguniform(1e-5, 1e-3),\n", - " \"batch_size\": [128, 256, 512, 1024],\n", - " \"learning_rate_init\": loguniform(1e-4, 1e-2),\n", - "}" + " def forward(self, x, **kwargs):\n", + " return self.fc2(self.activation(self.fc1(x)))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 6, "metadata": {}, + "outputs": [], "source": [ - "All of these parameters control model architecture, execpt for two basic optimizatino parameters, `batch_size` and `learning_rate_init`. They control finding the best model of a particular architecture." 
+ "from torch_model import HiddenLayerNet" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 7, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(900, 14)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## Find the best hyperparameters" + "X_train.shape" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 8, "metadata": {}, + "outputs": [], "source": [ - "Our search is \"computationally-constrained\" because (hypothetically) it requires GPUs and has a pretty complicated search space (in reality it has neither of those features). And obviously it's \"memory-constrained\" because the dataset doesn't fit in memory.\n", - "\n", - "[Dask-ML's documentation on hyperparameter searches][2] indicates that we should use `HyperbandSearchCV`. To use this class, we need to know two items:\n", + "from skorch import NeuralNetRegressor\n", "\n", - "* `n_params`, the (approximate) number of parameters to sample.\n", - "* `n_examples`, the largest number of examples any model will see.\n", + "niceties = {\n", + " \"callbacks\": False,\n", + " \"warm_start\": True,\n", + " \"train_split\": None,\n", + " \"max_epochs\": 1,\n", + "}\n", "\n", - "[2]:https://ml.dask.org/hyper-parameter-search.html" + "model = NeuralNetRegressor(\n", + " module=HiddenLayerNet,\n", + " module__n_features=X_train.shape[1],\n", + " optimizer=optim.SGD,\n", + " criterion=nn.MSELoss,\n", + " lr=0.0001,\n", + " **niceties,\n", + ")" ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "from dask_ml.model_selection import HyperbandSearchCV\n", + "from scipy.stats import loguniform, uniform\n", "\n", - "n_params = 25\n", - "n_examples = 1e6" + "params = {\n", + " \"module__activation\": [\"relu\", \"elu\", \"softsign\", \"leaky_relu\", \"rrelu\"],\n", + " \"batch_size\": [32, 64, 128, 256],\n", + " 
\"optimizer__lr\": loguniform(1e-4, 1e-3),\n", + " \"optimizer__weight_decay\": loguniform(1e-6, 1e-3),\n", + " \"optimizer__momentum\": uniform(0, 1),\n", + " \"optimizer__nesterov\": [True],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All of these parameters control model architecture, execpt for two basic optimizatino parameters, `batch_size` and `learning_rate_init`. They control finding the best model of a particular architecture." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find the best hyperparameters" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "`HyperbandSearchCV` comes with a rule-of-thumb to computer the inputs:" + "Our search is \"computationally-constrained\" because (hypothetically) it requires GPUs and has a pretty complicated search space (in reality it has neither of those features). And obviously it's \"memory-constrained\" because the dataset doesn't fit in memory.\n", + "\n", + "[Dask-ML's documentation on hyperparameter searches][2] indicates that we should use `HyperbandSearchCV`.\n", + "\n", + "[2]:https://ml.dask.org/hyper-parameter-search.html" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "max_iter = n_params # how many partial_fit calls?\n", - "chunksize = n_examples // n_params # how many examples does each partial_fit call see?\n", - "\n", - "X_train2 = X_train.rechunk(chunks=(chunksize, -1))\n", - "y_train2 = y_train.rechunk(chunks=chunksize)" + "from dask_ml.model_selection import HyperbandSearchCV\n", + "search = HyperbandSearchCV(model, params, random_state=42, verbose=True, max_iter=9)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's run the search. Because this is an initial search, let's set `aggressiveness=4`:\n" + "By default, `HyperbandSearchCV` will call `partial_fit` on each chunk of the Dask Array. 
`HyperbandSearchCV`'s rule of thumb specifies how to train for longer or sample more parameters." ] }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 12, "metadata": {}, "outputs": [ { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0msearch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mHyperbandSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmax_iter\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mn_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrandom_state\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0msearch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX_train2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclasses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m;\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/dask_ml/model_selection/_incremental.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, **fit_params)\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[0mAdditional\u001b[0m \u001b[0mpartial\u001b[0m \u001b[0mfit\u001b[0m \u001b[0mkeyword\u001b[0m \u001b[0marguments\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mthe\u001b[0m 
\u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 609\u001b[0m \"\"\"\n\u001b[0;32m--> 610\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mdefault_client\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msync\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mfit_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 611\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 612\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mif_delegate_has_method\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdelegate\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"best_estimator_\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"estimator\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/distributed/client.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 830\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 831\u001b[0m return sync(\n\u001b[0;32m--> 832\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloop\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcallback_timeout\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcallback_timeout\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 833\u001b[0m )\n\u001b[1;32m 834\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/distributed/utils.py\u001b[0m in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[0;32mwhile\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_set\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 336\u001b[0;31m \u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 337\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mtyp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mexc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtb\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0merror\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 550\u001b[0m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_flag\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 551\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 552\u001b[0;31m \u001b[0msignaled\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_cond\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 553\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0msignaled\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 554\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/anaconda3/lib/python3.7/threading.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 298\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 299\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtimeout\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 300\u001b[0;31m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 301\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 302\u001b[0m \u001b[0mgotit\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mwaiter\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0macquire\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV, bracket=2] creating 9 models\n", + "[CV, bracket=1] creating 5 models\n", + "[CV, bracket=0] creating 3 models\n", + "[CV, bracket=0] For training there are between 360 and 360 examples in each chunk\n", + "[CV, bracket=1] For training there are between 360 and 360 examples in each 
chunk\n", + "[CV, bracket=0] validation score of -0.3191 received after 1 partial_fit calls\n", + "[CV, bracket=2] For training there are between 360 and 360 examples in each chunk\n", + "[CV, bracket=1] validation score of 0.0170 received after 1 partial_fit calls\n", + "[CV, bracket=1] validation score of 0.0322 received after 3 partial_fit calls\n", + "[CV, bracket=2] validation score of 0.2228 received after 1 partial_fit calls\n", + "[CV, bracket=2] validation score of -0.7214 received after 3 partial_fit calls\n", + "[CV, bracket=1] validation score of 0.0183 received after 9 partial_fit calls\n", + "[CV, bracket=0] validation score of -0.2677 received after 9 partial_fit calls\n", + "[CV, bracket=2] validation score of -0.5336 received after 9 partial_fit calls\n" ] + }, + { + "data": { + "text/plain": [ + "HyperbandSearchCV(estimator=[uninitialized](\n", + " module=,\n", + " module__n_features=14,\n", + "),\n", + " max_iter=9,\n", + " parameters={'batch_size': [32, 64, 128, 256],\n", + " 'module__activation': ['relu', 'elu', 'softsign',\n", + " 'leaky_relu', 'rrelu'],\n", + " 'optimizer__lr': ,\n", + " 'optimizer__momentum': ,\n", + " 'optimizer__nesterov': [True],\n", + " 'optimizer__weight_decay': },\n", + " random_state=42, verbose=True)" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "search = HyperbandSearchCV(model, params, max_iter=n_params, aggressiveness=4, random_state=0)\n", - "\n", - "search.fit(X_train2, y_train2, classes=[0, 1]);" + "y_train2 = y_train.reshape(-1, 1).persist()\n", + "_ = search.fit(X_train, y_train2)" ] }, { @@ -301,27 +346,81 @@ "## Score" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`HyperbandSearchCV` and the like mirror the Scikit-Learn model selection interface, so all attributes of Scikit-Learn's [RandomizedSearchCV][rscv] are available:\n", + "\n", + 
"[rscv]:https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.018286365127180515" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "search.score(X_test.sample(frac=0.1, random_state=123), y_test.sample(frac=0.1, random_state=123))" + "search.best_score_" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 18, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'batch_size': 32,\n", + " 'module__activation': 'rrelu',\n", + " 'optimizer__lr': 0.0002668107973843001,\n", + " 'optimizer__momentum': 0.5920831762255758,\n", + " 'optimizer__nesterov': True,\n", + " 'optimizer__weight_decay': 3.6363529586270234e-05}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "We can also obtain the best estimator through the `best_estimator_` attribute:" + "search.best_params_" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[initialized](\n", + " module_=HiddenLayerNet(\n", + " (fc1): Linear(in_features=14, out_features=100, bias=True)\n", + " (fc2): Linear(in_features=100, out_features=1, bias=True)\n", + " ),\n", + ")" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "search.best_estimator_" ] @@ -330,14 +429,25 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This means we can score on the entire dataset:" + "This means we can deploy the best model and score on the entire dataset:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + 
"text/plain": [ + "0.33630241003610284" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from dask_ml.wrappers import ParallelPostFit\n", "deployed_model = ParallelPostFit(search.best_estimator_)\n", @@ -348,77 +458,136 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "`HyperbandSearchCV` and the like mirror the Scikit-Learn model selection interface, so all attributes of Scikit-Learn's [RandomizedSearchCV][rscv] are available:\n", - "\n", - "[rscv]:https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html" + "## Visualization" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "search.best_score_" + "What does the error distribution look like on this larger dataset?" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 103, "metadata": {}, "outputs": [], "source": [ - "search.best_params_" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Visualization" + "y_pred = deployed_model.predict(X_test)\n", + "y_pred = y_pred.flatten()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 108, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 800 B 400 B
Shape (100,) (50,)
Count 4 Tasks 2 Chunks
Type int64 numpy.ndarray
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 100\n", + " 1\n", + "\n", + "
" + ], + "text/plain": [ + "dask.array<_predict, shape=(100,), dtype=int64, chunksize=(50,), chunktype=numpy.ndarray>" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "What does the error distribution look like?" + "y_pred" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ - "y_pred = deployed_model.predict(X_test)" + "import numpy as np\n", + "import pandas as pd\n", + "import dask.array as da\n", + "\n", + "err = np.abs(y_pred - y_test)\n", + "max_min_err = 20\n", + "vals, edges = da.histogram(err, range=(0, max_min_err), bins=max_min_err)\n", + "vals, edges = dask.compute(vals, edges)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Text(0.5, 1.0, 'Prediction error')" + "(100,)" ] }, - "execution_count": 7, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" - }, + } + ], + "source": [ + "y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ { "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAwkAAAIqCAYAAABv1AagAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAWJQAAFiUBSVIk8AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nOzdd7wtVX338c9X6ZduQ4PxipESRYlXxUoztmDBAFGDBpIYNbErMdivJQqPPlaMDRCVGGwRo4DloSMaFYLEhCLiFVAUEQGpAvf3/DFz5Mxxn3pnn33K5/16zWvOnlmz1tr7zD13vntmzaSqkCRJkqQxdxp1ByRJkiQtLIYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESVrGkqxJUkn2mLD8oHb5qfPcn2qnlfPZriSpy5AgSesgydHjDmzHT9clOTfJO5NsO+p+jlqSPZKsTrLPqPsiSZqeIUGS+nEr8It2uhLYFHgwcDDw30keM8K+zcW1wIXApT3VtwfwJmC6kHBhO93aU7uSpDlYb9QdkKQl4qyq2mPsRZJNgH2B9wNbAp9Lsl1V3TSi/s1KVX0R+OII2t1xvtuUJP0+zyRI0hBU1Y1V9Sngpe2ibZj+W3RJkhYEQ4IkDddngbXtz6vGFk4cGJzkgCSnJflVu7wTKJJsmuS1Sb6b5NokNyf5YZL3J7n3VB1o6/52kuuTXJ3k5CR7T7PNtAOXk+yU5MNJLkpyQ5Jrkvx326dVbZmVSYrmUiOAAweM31g5rs4pBy4nuV+SjyS5pP0Mfp3k9CTPS3LnSbY5ta3zoCQbt2MjLkxyU5Irkxyb5P5TfR7TSfKYtp7Lk9zS/h7/X5JnJ8mA8nu0fVrTvn5ykhPb/qxN8vJ2+Wz3k3X9fLZMcliSC5LcmOSadflcJC1eXm4kSUNUVbckuQq4O7D5oDJJ3g+8hCZMXMsdoWJs/U7AicB92kW3AbcAf9Ru95wkT62qbw6o+3DgRe3LtTTX+u8B7JnkZXN9X0leArwHGDvwvAHYAHhgOz2obed2mnEamwIrgJvb9zje7TNs8ynA54CN2kXXtnU+tp2emWSfqrphkio2B74J/AnN57cWuBvwTODxSR5eVT+aSV8m9Osw4NXjFv2G5hKzx7XT05IcUFVrJ9n+VcC7gGLA739cuen2k3X9fO4GnA1sR/P5/Hbydy1pqfNMgiQNUZKNaQ6+AAZ9K7sKeDHNN+13qaqtga2As9rttwBOoAkIxwEPATauqk2B+wKfast/IcmWE9o+gDsCwrva+rcC7gl8sl12N2Ypyf40Yy3uDHwe+OO2PyuAewHPoTnYpKouq6pt2rYAPlNV20yYLptBm/cDjqU5AD4N2LGqtgQ2A15Ac1D7p8D7pqjmzTSf1ZPavm4K7AZcDmwNvGPmn8Lv+vUymoDwS+AfgK2qavO2/r8ArgCeBfzTJFXcAzgM+Bfgnu3vZ1Oaz3W86faTPj6fNwLrA08GNmnfx0Nn9EFIWnqqysnJyclpjhNwNM03wKdOsv7F7foC9h23/KBxy98+Rf1va8scB2SSMse3ZQ4etyzAD9vlRw/YJsA3xvVhjwnrx/p36oTl6wOXtes+PYvPafVkfZlQbqw/KycsP7JdfjHNAezE7Z7frl8L/NGEdae2626cuK5dv2+7/mZgg1m8py1pzhrcCjx8kjKPaPt09fi6ac6yjL3XST/HWewnfXw+vwUeOMp/T05OTgtn8kyCJPUsjZVJDgb+T7v4J8CXBxS/HXj3FNUd2M7fU1U1SZl/a+ePH7dsF5rLkWDAN+RtXW+fot3JPA7Ylqbf/ziH7WetvaZ/3/ble6rqxgHFjgB+ShN+9pukqs9X1cUDlv8HzUHyhtzxmc3EvjTf+p9ZVd8ZVKCqvg1cQvOt/6p
BZYB3zqCtSfeTHj+fE6vqBzPoi6RlwDEJktSP3dsBuoNcAexTVYOu8b64qq4atFE7IHnsQWyfSzLwWnWasQAA4wcwP6SdX1lVF06y3Vk04xtm83/BI9r596vqp7PYbl1sB2zR/nzKoAJVtbYd3HsAd7z3ib47yba3JrmS5tKfrWbRr0e1812T/HyKclu383sD35qw7ibg+zNoa9L9hP4+n4l9k7SMGRIkqR+30lxSAs230jfQfIP8DeCIqvr1JNv9coo67znu55mMHdhkQPlJD+TrjkHV28yg7jH3aOd9PWRtJsa/96mCyeUDyo/3mym2vbmdrz/TTnHH72fjdprOJgOW/aomGdA8wVT7SV+fz1RtSFpmDAmS1I/Ow9RmYao7+4y/JHSLqrpuDvVP5/duz9lz+b5tOOL2xxv7/bynql45xzpmdGenWZRbl89npm1IWgYckyBJC9cvxv38x7Pcduxb4XtNViDJBsBdZlnv2GU195myVL/Gf8M9Vbtjl2bN1zfiY7+f2f5u+rZQPx9Ji5ghQZIWqKr6MXcciP75LDc/p53fI8n2k5R5FLM/o/ztdv6gJH8wi+3GLqmZy5mIS7jj9rF7DiqQ5E40dwyCO977sI1dw797ktmGrT4t1M9H0iJmSJCkhe3odv4P7UPVBmrvqLTFuEXn0twOEwbco7+9I84hc+jPSTTXvd+Zmd2VZ8zYpVJbTllqgPZOTP/evnxZkkHX9j8P+AOa8SATnzEwLJ+jGXuyEdN8FklmMyB6Vhbw5yNpETMkSNLCdijNN8UrgNOSHJhk07GVSe6d5O9oHl72jLHl7YHj6vbl3yQ5bOxha0nuARwF7EXz7IAZq6pbgVe1L5+d5LNJdhzXn3sm+bv26cDj/U87f0yS+8+mzdbbaQ7I7wUcn2SHtr0N2/c/1t6Rk9zmtHdV9SvgNe3Lv24/iweOrU+yUZLHJPkgzZOeh2nBfT6SFjcHLkvSAlZV1yR5Is29/HeiObNwVJJr+P276tSEbf81ySNpnrr8auBVSa6j+TY/wMuAVzLL8QVV9Zn2UqN3AvsD+ye5nubswlh/Tpuw2anAj4D7ARe2d1UaCyiPqarLmUJV/SjJs4HP0lw2c0H7GazgjjsSnQS8fDbvZV1V1QfaMzhv4Y7P4kaaJxxvwR1fxq0Zcj8W5OcjafHyTIIkLXDtN79/AvwDzX3wrwY2p3nGwXnAB4DdgU8N2PbFwHOA/6Q5cA3NAfxTqmrit/2z6dO72z59nOYAeH2a24ieB7wPeMWE8rfSPIjtUzSXK21FE07uwwy/sKqqLwM7Ax9r29yEJmicSfNE4SdW1Q1zfU9zVVVvAx4MfJTmKdehOTi/AjgR+Htg13nox4L8fCQtTpn8AZ6SJEmSliPPJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqWG/UHViOkvwY2BxYM+KuSJIkaWlbCVxXVfedzUaGhNHYfOONN956p5122nrUHZEkSdLSdf7553PTTTfNejtDwmis2WmnnbY+++yzR90PSZIkLWGrVq3inHPOWTPb7RyTIEmSJKnDkCBJkiSpw5AgSZIkqcOQIEmSJKnDkCBJkiSpw5AgSZIkqcOQIEmSJKnDkCBJkiSpw5AgSZIkqcOQIEmSJKnDkCBJkiSpw5AgSZIkqcOQIEmSJKnDkCBJkiSpw5AgSZIkqWPRhYQk+yX5QJIzklyXpJIcM0nZo9v1U00nTdjmoGnKv3B+3qkkSZI0GuuNugNz8HrgwcD1wOXAjlOUPQ5YM8m65wLbASdOsv5LwLkDln9vRr2UJEmSFqnFGBJeQRMOLgZ2B06ZrGBVHUcTFDqSbAm8GvgtcPQkmx9XVZOtkyRJkpasRRcSqup3oSDJXKt5LrAxcGxVXdVHvxaLlYccP+ouzLs1h+496i5IkiQtKosuJPTk79r5R6cos0u
SlwMbAT8FTqmqy4feM0mSJGnEll1ISPJIYGfgovFnJQZ42YTXtyc5Anh5Vd08w7bOnmTVVOMoJEmSpJFadHc36sHz2/nHJln/Y+AlwA7ACuBewF/QDIB+AXDUkPsnSZIkjdSyOpOQZAuaA/5JByxX1WnAaeMW3Qh8Lsm3ge8Dz05yWFV9f7r2qmrVJP04G3jI7HovSZIkzY/ldibhOcAmwL/PdsByVV0GnNC+3K3vjkmSJEkLxXILCWMDlj8yx+1/2c5X9NAXSZIkaUFaNiEhya40D2G7qKpOnWM1u7bzS3rplCRJkrQALZuQwB0Dlqe67SlJHjtgWZK8BngkcBXw1f67J0mSJC0Mi27gcpJ9gH3al9u080cmObr9+aqqOnjCNpsDz6QZsPyJaZo4PclFwHdpno+wBfBo4IE0g5gPqKrr1vV9SJIkSQvVogsJwC7AgROWbddOAD8BDp6w/gCacQQzecLyu4CHA3sBWwNrgUuBDwLvriovNZIkSdKStuhCQlWtBlbPcpsPAR+aYdl/nH2vJEmSpKVjOY1JkCRJkjQDhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkd6426A5L6t/KQ40fdhXm35tC9R90FSZKWDM8kSJIkSeowJEiSJEnqMCRIkiRJ6lh0ISHJfkk+kOSMJNclqSTHTFJ2Zbt+sunYKdo5MMl3klyf5NokpyZ5yvDemSRJkrQwLMaBy68HHgxcD1wO7DiDbb4PHDdg+Q8GFU7yLuBVbf0fAzYAngV8OclLqurwOfRbkiRJWhQWY0h4Bc3B+8XA7sApM9jm3KpaPZPKkzyKJiD8CHhYVf26Xf5O4GzgXUm+UlVrZt91SZIkaeFbdJcbVdUpVfXDqqohNfHCdv7PYwGhbXcN8EFgQ+Cvh9S2JEmSNHKLLiTM0b2SvCDJa9v5g6You1c7/+qAdSdOKCNJkiQtOYvxcqO5eHw7/U6SU4EDq+rScctWAH8AXF9VVwyo54ftfPuZNJrk7ElWzWQchSRJkjQSS/1Mwo3AW4FVwFbtNDaOYQ/gpDYYjNminV87SX1jy7fsvaeSJEnSArGkzyRU1ZXAGycsPj3JE4AzgV2B5wHvm23VM2x/1aDl7RmGh8yyTUmSJGleLOmQMJmqui3JETQhYTfuCAljZwq2GLjh9GcatACtPOT4UXdBkiRpUVnqlxtN5Zft/HeXG1XVDcBPgU2T3HPANvdv5xcNuW+SJEnSyCznkPCIdn7JhOUnt/MnDdjmyRPKSJIkSUvOkg4JSXZNssGA5XvRPJQN4JgJqz/czl+XZKtx26wEXgTcAny8985KkiRJC8SiG5OQZB9gn/blNu38kUmObn++qqoObn8+DHhAe7vTy9tlD+KO5xy8oarOGl9/VZ2V5N3AK4Hzknwe2AB4JrA18BKftixJkqSlbNGFBGAX4MAJy7ZrJ4CfAGMh4VPAM4CH0VwqtD7wC+CzwOFVdcagBqrqVUnOA14MPB9YC5wDvLOqvtLfW5EkSZIWnkUXEqpqNbB6hmWPBI6cYzufAD4xl20lSZKkxWxJj0mQJEmSNHuGBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHeu
NugOS1IeVhxw/6i7MuzWH7j3qLkiSlijPJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqWHQhIcl+ST6Q5Iwk1yWpJMdMUvb+Sf4pyclJLkvy2yS/SPKlJHtOss1BbZ2TTS8c7juUJEmSRmu9UXdgDl4PPBi4Hrgc2HGKsm8Fngn8L3ACcDWwA/A04GlJXlZV759k2y8B5w5Y/r059luSJElaFBZjSHgFTTi4GNgdOGWKsl8FDquq/xq/MMnuwDeAdyb5XFVdMWDb46rq6H66LEmSJC0ei+5yo6o6pap+WFU1g7JHTwwI7fLTgFOBDYBH9d9LSZIkafFajGcS+nJrO79tkvW7JHk5sBHwU+CUqrp8Ng0kOXuSVVNdIiVJkiSN1LIMCUnuAzwOuBE4fZJiL5vw+vYkRwAvr6qbh9k/SZIkaZSWXUhIsiHwr8CGwKur6tcTivwYeAnwdZqxD1sAjwHeAbwA2Bz4y5m0VVWrJunD2cBD5tJ/SZIkadgW3ZiEdZHkzsCngEcDnwHeNbFMVZ1WVYdX1UVVdWNVXVFVnwP2BH4NPDvJg+e145IkSdI8WjYhoQ0IxwD7A58FnjOTwc9jquoymtuoAuzWfw8lSZKkhWFZhIQk6wH/BjwL+DTwl1U12YDlqfyyna/oq2+SJEnSQrPkxyQk2YDmzMHTgU8Cf11Va+dY3a7t/JI++iZJkiQtREv6TEI7SPmLNAHhSGYQEJI8dsCyJHkN8EjgKpqHtEmSJElL0qI7k5BkH2Cf9uU27fyRSY5uf76qqg5uf/4w8Gc0B/Y/Bd6YZGKVp1bVqeNen57kIuC77TZb0Ax0fiDNLVMPqKrrentDkiRJ0gKz6EICsAtw4IRl27UTwE+AsZBw33Z+V+CNU9R56rif3wU8HNgL2BpYC1wKfBB4d1V5qZEkSZKWtEUXEqpqNbB6hmX3mEP9/zjbbSRJkqSlZEmPSZAkSZI0e4YESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkd6426A5KkuVl5yPGj7sK8W3Po3qPugiQtC55JkCRJktRhSJAkSZLUYUiQJEmS1GFIkCRJktRhSJAkSZLU0WtISOLdkiRJkqRFru8zCZcl+eck9+25XkmSJEnzpO+QsCHwGuCHSU5M8vQkXtIkSZIkLSJ9H8DfEzgI+DbwRODfac4uvDnJvXtuS5IkSdIQ9BoSquqWqvpkVT0GeABwOLAR8AbgkiT/kWTvJOmzXUmSJEn9GdqlQFV1flW9DLgXd5xdeArwH8CaJK9Pco9htS9JkiRpboY+XqCqbgG+CPwb8FMgwL2Bt9CEhXcl2WDY/ZAkSZI0M0MNCUkemuRjwM+ADwCbA/8CPBR4PnAJ8Arg3cPshyRJkqSZ6/25BklWAAcALwB2oTlzcB7wIeCYqrqhLXpOko8DXweeCby4775IkiRJmr1eQ0KSDwPPBjYFbqW5xOhfquqsQeWr6vYkJwN79NkPSZIkSXPX95mE5wNrgLcDR1bVVTPY5rS2vCRJkqQFoO+Q8FTghKqqmW5QVWcCZ/bcD0mSJElz1GtIqKrj+6xPkiRJ0vzr9e5GSfZM8tEk95xk/b3a9bv12a4kSZKk/vR9udFLgQdU1RWDVlbVz9qAsBVwes9tS5IkSepB389JWMX04wvOBB7ec7uSJEmSetJ3SLg7zYP
TpvLztpwkSZKkBajvkHAtsO00ZbYFbpimjCRJkqQR6TskfBfYJ8k9Bq1Msg2wT1tOkiRJ0gLUd0g4HNgcOD3JnyVZDyDJekn2pnlw2mbAB3puV5IkSVJP+n5OwleTvAN4DfBlYG2Sq4C70gSSAO+oqhP6bFeSJElSf/o+k0BVvQ54CvB14Dc0g5R/A3wN2LtdL0mSJGmB6vs5CQC0Zwo8WyBJkiQtQr2fSZAkSZK0uA3lTAJAkg2BLYE7D1pfVdM9T0GSJEnSCPR+JiHJs5OcS/MshJ8Blw2YLl2H+vdL8oEkZyS5LkklOWaabR6V5IQkVye5Mcl5SV6eZGCAabd5SpJTk1yb5Pok/5nkwLn2W5IkSVosej2TkOS5wCeAtcC3aQLBbX22AbweeDBwPXA5sOM0fXo68AXgZuAzwNXAU4H3AI8G9h+wzYtpbtP6K+AY4LfAfsDRSXauqoP7ejOSJEnSQtP35Uavpnnq8mOr6gc91z3mFTTh4GJgd+CUyQom2Rz4GHA7sEdVfa9d/gbgZGC/JM+qqmPHbbMSeBdNmHhoVa1pl7+F5iFwr0ryhar6Vu/vTJIkSVoA+r7c6P7AZ4cYEKiqU6rqh1VVMyi+H3A34NixgNDWcTPNGQmAv5+wzd8AGwKHjwWEdptfA29vX75wjt2XJEmSFry+Q8KvgZt6rnNd7NXOvzpg3enAjcCj2kHWM9nmxAllJEmSpCWn78uNjgf2SJIZftM/bDu084smrqiq25L8GHgAsB1w/gy2uSLJDcC2STapqhunajzJ2ZOsmnIchSRJkjRKfZ9JOARYAXwwySY91z0XW7TzaydZP7Z8yzlss8Uk6yVJkqRFre8zCZ+mOYh+AXBAkguBawaUq6p6Ys9tz0Xa+WzOesx4m6paNbCC5gzDQ2bRpiRJkjRv+g4Jfzru582Ah05Sbr4uRZruW//NJ5Qb+/mu7Ta/mmKb69a5d5IkSdIC1PflRuvPcNqg53Ync2E7337iiiTrAfeleY7DJTPc5p40l1NdPt14BEmSJGmx6jUkVNXtM536bHcKJ7fzJw1YtxuwCXBWVd0yw22ePKGMJEmStOT0fSZhofk8cBXwrCS/u/QpyUbA29qXH5qwzceBW4AXtw9WG9tmK+C17csPD6m/kiRJ0sj1PSaBJKF52NgBwE7AiqraqF23C83Dyj5QVT+cY/37APu0L7dp549McnT781VVdTBAVV2X5O9owsKpSY6leZLy02hudfp54DPj66+qHyf5R+D9wPeSfAb4Lc2D2bYF/q9PW5YkSdJS1mtISLI+zbMSHkczAPhmurcX/QnwfJoD9dVzbGYX4MAJy7Zrp7E2Dh5bUVXHJdkdeB2wL7ARcDHwSuD9g57nUFUfSLKmreevaM64/C/w+qr6xBz7LUmSJC0KfV9udDDNHY7eBtwN+Oj4lVX1a+AMYM63P62q1VWVKaaVA7b5ZlX9WVVtVVUbV9XOVfWeqcZGVNWXq2r3qtqsqlZU1cMMCJIkSVoO+g4JzwG+VVVvag/AB93q9BLgPj23K0mSJKknfYeE7YCzpilzNXCXntuVJEmS1JO+Q8LNTP7gsjF/yOCnMEuSJElaAPoOCecCj08y8GFpSTYHngB8p+d2JUmSJPWk75BwBM14g08k2XT8ijYgHAVsDXyk53YlSZIk9aTXW6BW1b8meQLwXJpnGfwaIMm3gZ2BjYGPVNVX+mxXkiRJUn96f+JyVR1I8yyEi2kedhbg4cClwAuq6u/7blOSJElSf3p/4jJAVR0BHNFecrQ1cG1VXTuMtiRJkiT1ayghYUxVXQ9cP8w2JEmSJPWr98uNJEmSJC1uvZ5JSHLRDItWVe3QZ9uSJEmS+tH35UabADVg+RbA2C1RfwHc1nO7kiRJknrS9y1Qt51sXZIdgfcB6wNP7rNdSZIkSf2ZtzEJVXUB8AxgJfCG+WpXkiRJ0uzM68DlqroR+BrwnPlsV5IkSdLMjeLuRrfSPGRNkiRJ0gI
0ryEhydY0lxxdPp/tSpIkSZq5vm+B+top2rk3TUDYCnh9n+1KkiRJ6k/ft0B92zTrrwcOrap39NyuJEmSpJ70HRIeP8nytcCvgf+tqt/23KYkSZKkHvX9nIST+qxPkiRJ0vwbxd2NJEmSJC1gfQ9cvtdct62qn/XZF0mSJElz0/eYhMuBmsN2NYS+SJIkSZqDvg/MPw38IfAY4DfAecDPaR6e9iBgM+AM4NKe25UkSZLUk75DwpuBbwEfAN5UVdeMrUiyJfBW4NnA31bVxT23LUmSJKkHfQ9cPgw4v6peNj4gAFTVNVX1EuCCtpwkSZKkBajvkLA7cPo0ZU5vy0mSJElagPoOCRsC95imzDbARj23K0mSJKknfY9J+D7wrCTvr6rzJq5MsgvwTOCcntuVJC0DKw85ftRdmHdrDt171F2QtAz1HRLeAhwPfCfJJ2kuLfoFzdmF3YHntm2+ped2JUmSJPWk15BQVV9LcgDwYeB5wN+OWx3gWuCFVfWNPtuVJEmS1J/eH2BWVZ9JcgLwDOAhwBY04eAc4ItV9Zu+25QkSZLUn6E85bgNAp9sJ0mSJEmLSN93N+pIslmSew6zDUmSJEn96j0kJFmR5LAklwPXAJeNW/fwJP/R3uVIkiRJ0gLU6+VGSTYDzgR2Bn4AXAfsMK7I/wB70Tx1+dw+25YkSZLUj77PJLyeJiA8r6oeBHx2/MqqugE4DXhcz+1KkiRJ6knfIWFf4OtVdVT7ugaUWQNs23O7kiRJknrSd0jYluapy1O5nua2qJIkSZIWoL5DwvXA3aYpc1/gqp7blSRJktSTvkPCd4GnJNl00Mok2wBPBs7quV1JkiRJPek7JLwfuCvwlST3H7+iff0ZYOO2nCRJkqQFqNdboFbViUneRnOXowuAWwCS/JzmMqQAr6uqM/tsV5IkSVJ/en+YWlW9EXgicAJwQ7t4Q+DrwBOr6h19tzmVJAclqWmm28eVXzlN2WPns/+SJEnSfOv1TMKYqvoG8I1h1D0H5wJvnmTdY2ke7nbigHXfB44bsPwHPfVLkiRJWpD6fuLy14Gzqmp1n/Wui6o6l0me7pzkW+2PHx2w+tyF9D4kSZKk+dL35UaPATbouVf8jHkAAB/oSURBVM6hSPJA4BHAT4HjR9wdSZIkacHo+3Kji4F791znsLygnR9ZVbcPWH+vJC8A7gL8CvhWVZ03b72TJEmSRqTvkHAk8MYk21bV5T3X3ZskGwPPAdYCR0xS7PHtNH67U4EDq+rSGbZz9iSrdpxZTyVJkqT51/flRl8Avgl8M8kLk6xK8gdJ7jVx6rnd2foLYEvgxKq6bMK6G4G3AquArdppd+AUYA/gpCQr5q+rkiRJ0vzq+0zCpUDRPA/hg1OUqyG0PRvPb+cfmbiiqq4E3jhh8elJngCcCewKPA9433SNVNWqQcvbMwwPmU2HJUmSpPnS94H6p2kCwIKV5I+BRwGX0zzLYUaq6rYkR9CEhN2YQUiQJEmSFqO+n7j8nD7rG5LpBixP5Zft3MuNJEmStGT1/sTlhSzJRsBzaQYsHzmHKh7Rzi/prVOSJEnSArPOZxKS/BXNg8cWw+1B96cZiPyVAQOWAUiyK/BfVfXbCcv3Al7RvjxmqL2UJKm18pDl9SifNYfuPeouSKKfy42OBlYDvwsJSQ6kuVXoXj3U36exAcuDnrA85jDgAe3tTsdu4/ogYOy9vKGqzhpO9yRJkqTRG9YdhlbS3DZ0wUiyE80ToacbsPwp4BnAw4AnA+sDvwA+CxxeVWcMuauSJEnSSI3yNqTzqqrOp7k163TljmRu4xUkSZKkJWFZDVyWJEmSND1DgiRJkqSOvkLCgn6AmiRJkqSZ62tMwuokqycuTDLZw8qqqpbNeAhJkiRpMenrQH3aAcHrWF6SJEnSPFnnkFBVjmuQJEmSlhAP8CVJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBI
kSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdSyLkJBkTZKaZPr5JNs8KskJSa5OcmOS85K8PMmd57v/kiRJ0nxab9QdmEfXAu8dsPz6iQuSPB34AnAz8BngauCpwHuARwP7D6+bkiRJ0mgtp5BwTVWtnq5Qks2BjwG3A3tU1ffa5W8ATgb2S/Ksqjp2mJ2VJEmSRmVZXG40S/sBdwOOHQsIAFV1M/D69uXfj6JjkiRJ0nxYTmcSNkzyHOAPgRuA84DTq+r2CeX2audfHVDH6cCNwKOSbFhVtwytt5IkSdKILKeQsA3wqQnLfpzkr6vqtHHLdmjnF02soKpuS/Jj4AHAdsD5UzWY5OxJVu04sy5LkiRJ82+5XG70ceBxNEFhBbAz8BFgJXBikgePK7tFO792krrGlm/ZfzclSZKk0VsWZxKq6s0TFv0AeGGS64FXAauBZ8ywuoxVO4N2Vw2soDnD8JAZtidJkiTNq+VyJmEyH27nu41bNnamYAsG23xCOUmSJGlJWe4h4cp2vmLcsgvb+fYTCydZD7gvcBtwyXC7JkmSJI3Gcg8Jj2zn4w/4T27nTxpQfjdgE+As72wkSZKkpWrJh4QkD0iy9YDl9wEOb18eM27V54GrgGcleei48hsBb2tffmhI3ZUkSZJGbjkMXN4fOCTJKcCPgd8A9wP2BjYCTgDeNVa4qq5L8nc0YeHUJMcCVwNPo7k96ueBz8zrO5AkSZLm0XIICafQHNz/Cc3lRSuAa4AzaZ6b8Kmq6typqKqOS7I78DpgX5owcTHwSuD9E8tLkiRJS8mSDwntg9JOm7bg72/3TeDP+u+RJEmStLAt+TEJkiRJkmbHkCBJkiSpw5AgSZIkqWPJj0mQJEmLx8pDjh91F+bdmkP3HnUXpN/jmQRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHYYESZIkSR2GBEmSJEkdhgRJkiRJHeuNugOSJEnL2cpDjh91F+bdmkP3HnUXNA3PJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6jAkSJIkSeowJEiSJEnqMCRIkiRJ6ljyISHJXZI8L8kXk1yc5KYk1yY5M8nfJrnThPIrk9QU07Gjei+SJEnSfFhv1B2YB/sDHwKuAE4BLgXuAfw5cATw5CT7V1VN2O77wHED6vvBEPsqSZIkjdxyCAkXAU8Djq+qtWMLk7wW+A6wL01g+MKE7c6tqtXz1UlJkiRpoVjylxtV1clV9eXxAaFd/nPgw+3LPea9Y5IkSdICtRzOJEzl1nZ+24B190ryAuAuwK+Ab1XVefPWM0mSJGlElm1ISLIe8Ffty68OKPL4dhq/zanAgVV16QzbOHuSVTvOsJuSJEnSvFvylxtN4VDggcAJVfW1cctvBN4KrAK2aqfdaQY97wGclGTF/HZVkiRJmj/L8kxCkpcCrwIuAJ47fl1VXQm8ccImpyd5AnAmsCvwPOB907VTVasmaf9s4CGz77kkSZI0fMvuTEKSF9Ec4P8vsGd
VXT2T7arqNppbpgLsNqTuSZIkSSO3rEJCkpcDh9M862DP9g5Hs/HLdu7lRpIkSVqylk1ISPJPwHuAc2kCwpVzqOYR7fyS3jomSZIkLTDLIiQkeQPNQOWzgcdV1VVTlN01yQYDlu8FvKJ9ecxQOipJkiQtAEt+4HKSA4G3ALcDZwAvTTKx2JqqOrr9+TDgAe3tTi9vlz0I2Kv9+Q1VddYw+yxJkiSN0pIPCcB92/mdgZdPUuY04Oj2508BzwAeBjwZWB/4BfBZ4PCqOmNoPZUkSVoGVh5y/Ki7MO/WHLr3qLswK0s+JFTVamD1LMofCRw5rP5IkiRJC92yGJMgSZIkaeYMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmTSLJtkqOS/CzJLUnWJHlvkq1G3TdJkiRpmNYbdQcWoiT3A84C7g58CbgAeDjwMuBJSR5dVb8aYRclSZKkofFMwmD/QhMQXlpV+1TVIVW1F/AeYAfgn0faO0mSJGmIDAkTJNkOeAKwBvjghNVvAm4AnptkxTx3TZIkSZoXhoTft1c7/3pVrR2/oqp+A3wT2AR4xHx3TJIkSZoPjkn4fTu084smWf9DmjMN2wMnTVVRkrMnWfXg888/n1WrVs2th+vgip9eO+9tSpIkLXervvHGkbR7/vnnA6yc7XaGhN+3RTuf7Gh6bPmW69DG7TfddNO155xzzpp1qGMudmznF8xzu1o43AfkPiD3AbkPjMA5vxhZ0yuB62a7kSFh9tLOa7qCVTX/pwqmMHZmY6H1S/PHfUDuA3IfkPuAZsIxCb9v7EzBFpOs33xCOUmSJGlJMST8vgvb+faTrL9/O59szIIkSZK0qBkSft8p7fwJSTqfT5LNgEcDNwHfnu+OSZIkSfPBkDBBVf0I+DrNII8XTVj9ZmAF8MmqumGeuyZJkiTNCwcuD/YPwFnA+5M8Djgf2BXYk+Yyo9eNsG+SJEnSUKVq2pv0LEtJ7g28BXgScBfgCuA44M1VdfUo+yZJkiQNkyFBkiRJUodjEiRJkiR1GBIkSZIkdRgSJEmSJHUYEiRJkiR1GBIkSZIkdRgSJEmSJHUYEhaxJNsmOSrJz5LckmRNkvcm2WqW9Wzdbremrednbb3bDqvv6se67gNJViQ5IMmnk1yQ5IYkv0nyvSSvSrLBsN+D1k1ffwcm1LlbktuTVJK39dlfDUef+0GSnZN8MsllbV1XJjktyV8No+/qR4/HBI9J8qV2+5uTXJrkhCRPGlbftTD5nIRFKsn9aJ4KfXfgS8AFwMNpngp9IfDoqvrVDOq5S1vP9sDJwHeBHYGnA1cCj6yqS4bxHrRu+tgH2j/6JwJXA6cAFwNbA08Ftmnrf1xV3Tykt6F10NffgQl1bgacB9wV2BT456p6fZ/9Vr/63A+SHAQcAdwIfAVYA2wJPBD4WVU9q+fuqwc9HhP8PfAvwA3AF4HLgW2BPwc2AV5fVf88jPegBaiqnBbhBHwNKOAlE5a/u13+4RnW85G2/LsnLH9pu/yro36vTsPbB4BdgAOADSYs3ww4u63nVaN+r07D2wcG1HkUTWh8bVvH20b9Pp3mZz8AHgHcBpwLbDNg/fqjfq9Ow9sHgPWBa4CbgB0mrNsJuJkmPG4
46vfrND+TZxIWoSTbAT+i+YbnflW1dty6zYArgAB3r6obpqhnBfBLYC1wz6r6zbh1d2rbWNm24dmEBaSvfWCaNv4S+FfgK1X11HXutHo1jH0gydOB44DnAusBH8czCQtan/tBktOBxwI7V9UPhtZp9arHY4J7AD8HzquqBw9Yfx6wM3DXmuUZSi1OjklYnPZq518f/8cAoD3Q/ybNacFHTFPPI4GNgW+ODwhtPWuBr7cv91znHqtvfe0DU7m1nd+2DnVoeHrdB5LcHfgYcFxVHdNnRzVUvewH7Ri0xwLfA/4nyZ5JDm7HJj2u/eJIC1NffwuupPnicPsk9x+/Isn2wP2Bcw0Iy4f/6BenHdr5RZOs/2E7336e6tH8m4/f3d+086+uQx0anr73gY/S/J/wwnXplOZdX/vBw8aVP7md3gm8C/h/wLlJ/mgd+qnh6WUfqObSkhfR/B04O8knkrwjySdpLj/9H2D/HvqrRWK9UXdAc7JFO792kvVjy7ecp3o0/4b6u0vyYuBJNNcmHzWXOjR0ve0DSf6G5mYFz6yqX/TQN82fvvaDu7fzvwCuohmoehJwN+BNNJegHZ9k56r67dy7qyHo7W9BVX0uyc+AfwPG383qFzSXH3rp8TLimYSlKe18XQec9FWP5t+cf3dJ/hx4L821qftW1a3TbKKFaUb7QJKVNL/vz1XVZ4fcJ82/mf4tuPO4+fOq6otVdV1V/Qg4kOYypO2BfYfTTQ3RjP8/SPIcmjNHZ9AMVt6knZ8EHA4cO6Q+agEyJCxOY98KbDHJ+s0nlBt2PZp/Q/ndJdmH5j+BK4E9HLC+oPW1DxxFczeTf+ijU5p3fe0Hv27ntwAnjF/RXobypfblw2fbQQ1dL/tAO+7gKJrLip5bVRdU1U1VdQHNmaSzgf2T7LHuXdZiYEhYnC5s55NdXzg24Giy6xP7rkfzr/ffXZL9gc/RnFbevaounGYTjVZf+8BDaC41+WX78LRKUjSXFgC8rl123Lp1V0PS9/8Hv5k4+LU1FiI2nkXfND/62geeQHMb1NMGDIBeC5zevlw1l05q8XFMwuJ0Sjt/QpI7Dbjd2aNpvhn89jT1fLst9+gkmw24BeoTJrSnhaOvfWBsm78EPgn8FNjTMwiLQl/7wCdpLimY6P7AbjTjUs4G/mude6xh6Gs/OI9mLMJdk9xjwNiUB7bzNeveZfWsr31gw3Z+t0nWjy13TMoy4ZmERai9RvTrNM8weNGE1W8GVgCfHH8/5CQ7JtlxQj3XA59qy6+eUM+L2/q/5gHjwtPXPtAuP5BmP7gU2M3f9+LQ49+Bl1bV8yZO3HEm4fh22QeH9mY0Zz3uB7fRPFwT4P+Mv+Vpkp2Bg2huh/z5nt+C1lGP/x+c0c73S/Kg8SuS7ALsRzOu4eT+eq+FzIepLVIDHsF+PrArzTMNLgIeNf5exu3lA1RVJtRzl7ae7Wn+4X+HZpDS02muS39U+wdIC0wf+0CSPWkGqd2J5lrUywY0dU1VvXdIb0ProK+/A5PUfRA+TG1R6PH/g01oBqg+gubM0ak03x7vS3OZ0auq6t1Dfjuagx73gaOAv6Y5W/BF4Cc04WMfYAPgvVX1iiG/HS0QhoRFLMm9gbfQ3KryLjRPVTwOeHNVXT2h7KQHB0m2prnF3T7APYFfAScCb6yqy4f5HrRu1nUfGHcgOJWfVNXK/nqtPvX1d2BAvQdhSFg0evz/YBPg1cCzgPsCNwPfBf5vVZ04zPegddPHPpAkNHezOgh4MLAZcB1NaPxYVXl3o2XEkCBJkiSpwzEJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJkiRJkjoMCZIkSZI6DAmSJEmSOgwJktSTJCuTVJKjJyw/ul2+ckjt7tHWv3oY9WtqSU5N8t9JhvZ/attGDav+hSjJqna//ttR90VajgwJkhaV9qBh/HR7kqu
SnJzkgFH3bxgmCx8avST7AbsDb6qqtaPuzzCMKqBU1dnAccDbkmw63+1Ly916o+6AJM3Rm9v5+sAOwD7AnklWVdUrR9etgV4DHAr8dEj1fwfYCbhqSPVrgCQB3gZcBHxxyM39FbDJkNtYiN4B/CfwUuDtI+6LtKwYEiQtSlW1evzrJI8DvgG8PMn7q2rNKPo1SFVdAVwxxPpvBC4YVv2a1J/SBNTXVdVQv2mvqkuHWf9CVVXfSXIB8IIkh1XV7aPuk7RceLmRpCWhqk6iOVAO8DDoXqaTZPskn0lyZZK1SfYY2zbJ1knekeT8JDcluTbJSUmeMKit5P+3d+exexR1HMffHwS50xZEUK4WJHJYbhAIYAVq8IByVKCgUBoIl4gSgmhAmqASVFDBgArllDNyyh2BYrnlvkEqILdIKVDulq9/fGfhebb7/H7P79dfi6WfV7LZdmZ2ZvZ42p3dmVktLuk4Sc9KekfSo5IOpsO/qT2NSZC0YanXc5LelfSCpGsl7VTixwNPluR71LpajS1pOo5JkLSKpDNL/u9Jer78fZWGtONLPiMkjZZ0h6S3JE2RdJ6kZTsd/04kjZF0g6RXy7F6RNLhkhZsSBula8sykk4pdZ7Rsp/VcVxJ0oGS7i/na2JLHvNJ2lfSPyRNk/Rm+fN+TWMGeiuzF1Vf+fMb8h1bnSNJIyVNKvV5WdJpkgaXdOtIurwcn2mSLutwnczU5af1vEtaW9IVkqaWc3ajpE0a8unpWmy7jqrfD9mdqt7Vb2Jt2+Uk/V7Sv8p1/ErZlw0ayllc0hGSHpT0uqQ3JE0uv4P1Go7zecAKZKPMzOYQv0kws08SlXX9qe7KZJeFx4GzgYWB1wEkrQhMBIYCk4CrgUWBbwFXS9onIk7+sIC8ub2ObIjcV/IbDBxBuZnqurLS3sBJwAzgMuCfwGeB9YH9gQtK3QYDB5XyLmnJ4t5e8t8A+BuweMn/YWBVYDdglKQtI+LOhk33B7Yt29wIfBnYGVhL0toR8W6X+zcBGAc8C1wETAU2Ao4CtpQ0MiKm1zZbArgNmFa2+QB4qZbmd8BmwBXAleTxq5wF7Ao8A5xCXgvbAycCm5Z9r+umzPq+CdgCeDEiJveQdFvyWroc+AOwCTAWGCbpMPJamgRMAIYD2wArSxrehzEO6wOHAreS+7wCsCNwXTlfj3WZT91UslvfWGBFPuriB/BU9QdJ6wLXksfxGvIYfobsAniTpO0j4sqSVuRvbJOW+k4HlgdGkMfirlo9bi7rkSV/M5sTIsKLFy9e5pqFvOmLhvCtyJu7D4AVS9jQKj3wiw75TSzb7FILH0zehL8NLN0S/pOS34XAfC3hw4ApJe70Wl6nl/ChLWGrA++XbdZoqNdyLX8e2pRvS/yIEj++JUzAIyV8t1r6nUv4o7V9GF/CXweG17Y5p8Tt1OV5GlvSXwQsXIuryjmo6dwCZwLzN+RZHcfngGEN8WNK/N3AYi3hiwJ3lrhd+1JmD/u3atnur73s/3TgKy3h85Hd4qKc+/q5mVDiRjVcp1ELG9FS/7G1uH1K+Im9XYs9XUedym6Jmx94AnindT9L3OfLuXoBWLCEDS9lXNyQ13zAkIbwQWWbO7o9P168eJn1xd2NzGyuVLpYjJf0c0l/IZ9OCvhtRDxdS/4S7U9BqzzWIp/+XxgR57XGRcRU4EhgIfKpbGVPslFxaLQ86Y2IJ4Hj+7AL+5E3WEdFxEP1yIh4tg95NdmEvJG9NSLOruV9PnAT2Z9+04Ztj4+IB2ph1duUDbss/yDyBnlcRLxdizsKeIXmp/rvAYfEzG8YWv2yHO+6cWV9WERMqwIj4k3gR+Wve/WzzLoVyrq3sSbnRsSNLXX5gHzbAfBg/dyQjRWAtftQl5sj4vRa2Knk8e/2fPXXN8k3dSe07idARDwP/BJYBtiytl39miAiPoiIVxvCXyMbISvU48xs9nF3IzObWx1Z1kF2i5gETIiIPzekvS+au8hsXNaD1Py
NgaXKejXIvtTAF4BnormLycSWevVmo7K+qsv0fbVuWV/fIf56soGwDvD3WlxTF6RnynpIbwVLWgRYi5xt6QfZw2Qm71KOa81TEfGfXoq4o0P4umQDbmJD3I1kt6R1+llm3ZJlPdNNbU3TsXy+rOvdauCjGbCW60NdZiojIt6X9BJdnK9ZVP2GVuzwG6rGvqxGdg17mHxDN6Z09buUbLDeGRHv9VDOFGDpAamxmXXFjQQzmytFROOdZwcvdgivbvRGlqWTao72QWXdqb96p3KaDC7r2TUtalXXTk+6q/DBDXFTG8Kqp+yf6qLsIeRbnaXovtFU6eYYdkozCJjSdLMZEdMl/Zcc89GfMuuqJ+EL9ZLutYaw6V3ELdCHujSdryqvbs7XrKh+Q9/uJd1iABExQ9IWwE+B0cAxJf4NSWcAP259C9RiYRrePpjZ7OPuRmY2L+g0PWV1k3ZQRKiHZc9a+k5PNJfpQ52qG7s+zxjUpaquner0uVq62VH2Pb0c16aGXjdTifZ0PpeQNNMNtqT5ycG0r/ezzLrqzcOSPab6/1N1kWt6SNjUYOxNda5H9XKuP+zuFxGvRsQPI2J58k3DXuT4mO+RA/nblFmpBvPRMTezOcCNBDObl91W1pt1kzgi3iAHaS4raeWGJCP6UfbXu0hbzd7Tl6fC95T1iA7xVfjdfcizK+VJ8EPAGpKWGOj8e3AP+f/a5g1xm5PHb6D29yHyvKw6QPnNKVX3qOUb4tbvsM0MAElN11+ffkN1EfFEREwgxwZNA0Y1JPsi+Waqx9m8zGxguZFgZvOsyOk/JwE7SBrXlEbScEmtXVROI//tPKZ13n1Jw8ivwnbrJLI7yBGSVm8ot7VP+qvk0+6+DNy8GXgM2FTS6Freo8mb5sfJ/uCzw3HAp4FTq28C1OowpEydOZBOLeujy7iIqqxFyC9eQ84eNMvKYNp7gTUlLTwQec4h1XiOvVsDJQ0nB5s3eaWsm66/S4HJwAGSvtG0saSNq/MhaZikNRqSDQEWpLlLUTV+54YO9TOz2cBjEsxsXrcrOYh3gqTvk99TmEoOHF0T+BI5OLPq6nAsOf/7jsDdkq4h+8LvTA4A3rabQiPiYUn7k3Pn3yPpUvI7CUuST3TfAL5a0k6TdDuwmaSzyZv7GcBlEXF/h/xD0h7kdJvnl/wfJZ/Kblfy3z26n4u/TyLi1PJhrP2ByeU4/ZucS38Y2Ug5Ddh3AMs8R9IoYCfgIUmXkI2r7UqZFzTMJjQrLgTWI7+XcMUA5js7VdfZmNIQvZ28+R9V4nZq2OY6cszBRZKuJG/kn46Is8oA6R3I7xdcIekWsvH0Fvm2YgNgJbJ721vkgPaLJd0FPEgO4l6qlL8AH41RaPU18nq/dNZ338y65UaCmc3TIuLZcjN7IHnjvxvZLeVFciaWE4AHWtK/K2krcq7/ncmnr08BPwMupstGQsnrZEkPAoeQ3X+2I2cEup/8yFSr7wK/AbYmvwcg8iNljY2Ekv/t5YNqh5Pfkdim5H8uOfVqfz+y1ZWIOEDSVWRDYCuyX/kUsrHwK6BpJqpZNYacyWgc+a0AyO9FHEtDf/dZNIG8DnZnLmkkRMQ7krYEfk0O1t+AvFnflTw3TY2EU8iPqe1CfrRtfvIYn1XyvL9MJ3ww+eG4aprgF8guYEeS1x3kTExHk92LtibfILxMzvR0fES0zfYlaRD5u7g8Ip7BzOYYRfRnvJaZmZlJ+iOwB/lxsv7MkmQ9kHQg+f2RzSNi0sddH7N5iRsJZmZm/SRpabL7zhkRceDHXZ9PkjLWYzJwS0SM7i29mQ0sD1w2MzPrp4h4CfgO8HzrQHYbEEOBP5Hd8cxsDvObBDMzMzMza+OnHmZmZmZm1saNBDMzMzMza+NGgpmZmZmZtXEjwczMzMzM2riRYGZmZmZmbdxIMDMzMzOzNm4kmJmZmZlZGzcSzMzMzMysjRsJZmZmZmbWxo0EMzMzMzNr40aCmZmZmZm1cSPBzMz
MzMzauJFgZmZmZmZt/gf2ccCXV/MWbgAAAABJRU5ErkJggg==\n", + "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "image/png": { - "height": 277, + "height": 290, "width": 388 }, "needs_background": "light" @@ -427,20 +596,12 @@ } ], "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "## This cell was run after the fact from a fresh notebook; rerun to \n", - "## y_test := as above. Ground truth trip durations\n", - "## y_pred := model output (as in the cell above?)\n", - "\n", - "y_test = np.random.uniform(0, 1, size=1000)\n", - "y_pred = np.random.uniform(0, 1, size=1000)\n", - "\n", - "err = np.abs(y_pred - y_test)\n", - "ax = pd.Series(err).plot.hist()\n", - "ax.set_xlabel(\"Prediction error (minutes)\")\n", - "ax.set_title(\"Prediction error\")" + "df = pd.DataFrame({\"vals\": vals, \"edges\": edges[1:]})\n", + "ax = df.plot.bar(x=\"edges\", y=\"vals\", width=1)\n", + "ax.set_ylabel(\"Frequency\")\n", + "ax.set_xlabel(\"Maximum error (minutes)\")\n", + "ax.set_title(\"Prediction error\")\n", + "ax.legend_.remove()" ] }, { @@ -507,9 +668,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:root] *", + "display_name": "Python [conda env:skorch]", "language": "python", - "name": "conda-root-py" + "name": "conda-env-skorch-py" }, "language_info": { "codemirror_mode": { @@ -521,7 +682,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.4" + "version": "3.8.5" } }, "nbformat": 4, From 848710dbaaa1590157d470a9e09a3c627d948725 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 5 Aug 2020 15:21:18 -0500 Subject: [PATCH 09/11] Some updates --- hyper-parameter-optimmization/environment.yml | 17 + .../hyper-parameter-optimization.ipynb | 1160 +++++++++++++++++ hyper-parameter-optimmization/torch_model.py | 13 + 3 files changed, 1190 insertions(+) create mode 100644 hyper-parameter-optimmization/environment.yml create mode 100644 hyper-parameter-optimmization/hyper-parameter-optimization.ipynb create mode 100644 hyper-parameter-optimmization/torch_model.py diff 
--git a/hyper-parameter-optimmization/environment.yml b/hyper-parameter-optimmization/environment.yml new file mode 100644 index 0000000..b908644 --- /dev/null +++ b/hyper-parameter-optimmization/environment.yml @@ -0,0 +1,17 @@ +name: pytorch +channels: + - conda-forge + - pytorch + - defaults +dependencies: + - python=3.7 + - dask + - numpy + - pandas + - coiled + - dask-ml + - skorch + - scipy + - matplotlib + - pytorch>1.1.0 + - s3fs \ No newline at end of file diff --git a/hyper-parameter-optimmization/hyper-parameter-optimization.ipynb b/hyper-parameter-optimmization/hyper-parameter-optimization.ipynb new file mode 100644 index 0000000..46dccc4 --- /dev/null +++ b/hyper-parameter-optimmization/hyper-parameter-optimization.ipynb @@ -0,0 +1,1160 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hyperparameter Optimization with Dask and Coiled\n", + "\n", + "This example will walk through the following:\n", + "\n", + "* **Getting and processing the data.**\n", + "* **Defining a model and parameters.**\n", + "* **Finding the best parameters,** and some details on why we're using the chosen search algorithm.\n", + "* **Scoring** and deploying.\n", + "\n", + "All of these tasks will be performed on the New York City Taxi Cab dataset." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup cluster" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Creating Cluster. This takes about a minute ... \r" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 0
  • \n", + "
  • Cores: 0
  • \n", + "
  • Memory: 0 B
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import coiled\n", + "import dask.distributed\n", + "\n", + "cluster = coiled.Cluster(\n", + " n_workers=20,\n", + " configuration=\"coiled-examples/pytorch\"\n", + ")\n", + "client = dask.distributed.Client(cluster)\n", + "\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### ☝️ Don’t forget to click the \"Dashboard\" link above to view the cluster dashboard!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get and pre-process data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example will mirror the Kaggle \"[NYC Taxi Trip Duration][1]\" example with different data.\n", + "\n", + "These data have records on 84 million taxi rides.\n", + "\n", + "[1]:https://www.kaggle.com/c/nyc-taxi-trip-duration/" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as dd\n", + "\n", + "features = [\"passenger_count\", \"trip_distance\", \"fare_amount\"]\n", + "categorical_features = [\"RatecodeID\", \"payment_type\"]\n", + "output = [\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"]\n", + "\n", + "df = dd.read_csv(\n", + " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv\", \n", + " parse_dates=output,\n", + " usecols=features + categorical_features + output,\n", + " dtype={\n", + " \"passenger_count\": \"UInt8\",\n", + " \"RatecodeID\": \"category\",\n", + " \"payment_type\": \"category\",\n", + " },\n", + " blocksize=\"16 MiB\",\n", + ")\n", + "\n", + "# one hot encode the categorical columns\n", + "df = df.categorize(categorical_features)\n", + "df = dd.get_dummies(df, columns=categorical_features)\n", + "\n", + "# persist so only download once\n", + "df = df.persist()\n", + "\n", + "data = df[[c for c in df.columns if c not in output]]\n", + "data 
= data.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "durations = (df[\"tpep_dropoff_datetime\"] - df[\"tpep_pickup_datetime\"]).dt.total_seconds() / 60 # minutes" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "from dask_ml.model_selection import train_test_split\n", + "import dask\n", + "\n", + "features = data.to_dask_array(lengths=True).astype(\"float32\")\n", + "output = durations.to_dask_array(lengths=True).astype(\"float32\")\n", + "X_train, X_test, y_train, y_test = train_test_split(features, output, shuffle=True)\n", + "\n", + "# persist the data so it's not re-computed\n", + "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define model and hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use a simple neural network from [PyTorch] using [Skorch], a simple wrapper that provides a Scikit-Learn API for PyTorch.\n", + "\n", + "This network is only small for demonstration. 
If desired, we could use much larger networks on GPUs.\n", + "\n", + "[PyTorch]:https://pytorch.org/\n", + "[skorch]:https://skorch.readthedocs.io/en/stable/" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Import our HiddenLayerNet pytorch model from a local torch_model.py module\n", + "from torch_model import HiddenLayerNet\n", + "# Send module with HiddenLayerNet to workers on cluster\n", + "client.upload_file(\"torch_model.py\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "import torch.optim as optim\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "class HiddenLayerNet(nn.Module):\n", + " def __init__(self, n_features=10, n_outputs=1, n_hidden=100, activation=\"relu\"):\n", + " super().__init__()\n", + " self.fc1 = nn.Linear(n_features, n_hidden)\n", + " self.fc2 = nn.Linear(n_hidden, n_outputs)\n", + " self.activation = getattr(F, activation)\n", + "\n", + " def forward(self, x, **kwargs):\n", + " return self.fc2(self.activation(self.fc1(x)))" + ] + } + ], + "source": [ + "# Print contents of torch_model.py module\n", + "!cat torch_model.py" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "import torch.optim as optim\n", + "import torch.nn as nn\n", + "from skorch import NeuralNetRegressor\n", + "\n", + "niceties = {\n", + " \"callbacks\": False,\n", + " \"warm_start\": True,\n", + " \"train_split\": None,\n", + " \"max_epochs\": 1,\n", + "}\n", + "\n", + "model = NeuralNetRegressor(\n", + " module=HiddenLayerNet,\n", + " module__n_features=X_train.shape[1],\n", + " optimizer=optim.SGD,\n", + " criterion=nn.MSELoss,\n", + " lr=0.0001,\n", + " **niceties,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from 
scipy.stats import loguniform, uniform\n", + "\n", + "params = {\n", + " \"module__activation\": [\"relu\", \"elu\", \"softsign\", \"leaky_relu\", \"rrelu\"],\n", + " \"batch_size\": [32, 64, 128, 256],\n", + "# \"optimizer__lr\": loguniform(1e-4, 1e-3),\n", + "# \"optimizer__weight_decay\": loguniform(1e-6, 1e-3),\n", + "# \"optimizer__momentum\": uniform(0, 1),\n", + "# \"optimizer__nesterov\": [True],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All of these parameters control model architecture, except for two basic optimization parameters, `batch_size` and `learning_rate_init`. They control finding the best model of a particular architecture." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Find the best hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our search is \"computationally-constrained\" because (hypothetically) it requires GPUs and has a pretty complicated search space (in reality it has neither of those features). And obviously it's \"memory-constrained\" because the dataset doesn't fit in memory.\n", + "\n", + "[Dask-ML's documentation on hyperparameter searches][2] indicates that we should use `HyperbandSearchCV`.\n", + "\n", + "[2]:https://ml.dask.org/hyper-parameter-search.html" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from dask_ml.model_selection import HyperbandSearchCV\n", + "search = HyperbandSearchCV(model, params, random_state=2, verbose=True,\n", + " max_iter=2,\n", + "# max_iter=9,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, `HyperbandSearchCV` will call `partial_fit` on each chunk of the Dask Array. `HyperbandSearchCV`'s rule of thumb specifies how to train for longer or sample more parameters." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[CV, bracket=0] creating 1 models\n", + "[CV, bracket=0] For training there are between 2756 and 169772 examples in each chunk\n", + "[CV, bracket=0] validation score of 0.0210 received after 1 partial_fit calls\n", + "[CV, bracket=0] validation score of 0.0270 received after 2 partial_fit calls\n" + ] + }, + { + "data": { + "text/plain": [ + "HyperbandSearchCV(estimator=[uninitialized](\n", + " module=,\n", + " module__n_features=15,\n", + "),\n", + " max_iter=2,\n", + " parameters={'batch_size': [32, 64, 128, 256],\n", + " 'module__activation': ['relu', 'elu', 'softsign',\n", + " 'leaky_relu', 'rrelu']},\n", + " random_state=2, verbose=True)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_train2 = y_train.reshape(-1, 1).persist()\n", + "search.fit(X_train, y_train2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`HyperbandSearchCV` and the like mirror the Scikit-Learn model selection interface, so all attributes of Scikit-Learn's [RandomizedSearchCV][rscv] are available:\n", + "\n", + "[rscv]:https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.02695897736664199" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "search.best_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'module__activation': 'softsign', 'batch_size': 128}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "search.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[initialized](\n", + " module_=HiddenLayerNet(\n", + " (fc1): Linear(in_features=15, out_features=100, bias=True)\n", + " (fc2): Linear(in_features=100, out_features=1, bias=True)\n", + " ),\n", + ")" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "search.best_estimator_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This means we can deploy the best model and score on the entire dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.02479510859559464" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dask_ml.wrappers import ParallelPostFit\n", + "deployed_model = ParallelPostFit(search.best_estimator_)\n", + "deployed_model.score(X_test, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualization" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What does the error distribution look like on this larger dataset?" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "y_pred = deployed_model.predict(X_test)\n", + "y_pred = y_pred.flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Array Chunk
Bytes 67.52 MB 151.23 kB
Shape (8440119,) (18904,)
Count 942 Tasks 471 Chunks
Type int64 numpy.ndarray
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", 
+ " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + 
" \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + " \n", + " \n", + "\n", + " \n", + " 8440119\n", + " 1\n", + "\n", + "
" + ], + "text/plain": [ + "dask.array<_predict, shape=(8440119,), dtype=int64, chunksize=(18904,), chunktype=numpy.ndarray>" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import dask.array as da\n", + "\n", + "err = np.abs(y_pred - y_test)\n", + "max_min_err = 20\n", + "vals, edges = da.histogram(err, range=(0, max_min_err), bins=max_min_err)\n", + "vals, edges = dask.compute(vals, edges)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8440119,)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df = pd.DataFrame({\"vals\": vals, \"edges\": edges[1:]})\n", + "ax = df.plot.bar(x=\"edges\", y=\"vals\", width=1)\n", + "ax.set_ylabel(\"Frequency\")\n", + "ax.set_xlabel(\"Maximum error (minutes)\")\n", + "ax.set_title(\"Prediction error\")\n", + "ax.legend_.remove()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Why not simply sampling instead?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sampling solves the memory issues:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# X_train_small = data.sample(frac=0.01, random_state=123).to_dask_array().astype(\"float32\").compute()\n", + "# y_train_small = durations.sample(frac=0.01, random_state=123).to_dask_array().astype(\"float32\").compute()\n", + "\n", + "# X_train_small # NumPy ndarray; must fit in memory" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But `HyperbandSearchCV` is meant for computationally-constrained problems, regardless of their memory usage (which [Dask-ML's documentation on hyperparameter searches][2] also indicate). 
`HyperbandSearchCV` would still be relevant:\n", + "\n", + "[2]:https://ml.dask.org/hyper-parameter-search.html" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "# search = HyperbandSearchCV(model, params, max_iter=81, random_state=0)\n", + "# search.fit(X_train_small, y_train_small.reshape(-1, 1), classes=[0, 1]);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`HyperbandSearchCV` would not be relevant when the search problem is not computationally-constrained, which happens with a smaller search space or a simpler model that doesn't require GPUs.\n", + "\n", + "If we had a simpler model and a massive dataset, `IncrementalSearchCV` is recommended. It mirrors Scikit-Learn's `RandomizedSearchCV` but works on Dask Arrays/Dataframes, both of which can be larger than memory." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hyper-parameter-optimmization/torch_model.py b/hyper-parameter-optimmization/torch_model.py new file mode 100644 index 0000000..b6683f0 --- /dev/null +++ b/hyper-parameter-optimmization/torch_model.py @@ -0,0 +1,13 @@ +import torch.optim as optim +import torch.nn as nn +import torch.nn.functional as F + +class HiddenLayerNet(nn.Module): + def __init__(self, n_features=10, n_outputs=1, n_hidden=100, activation="relu"): + super().__init__() + self.fc1 = nn.Linear(n_features, n_hidden) + self.fc2 = nn.Linear(n_hidden, n_outputs) + self.activation = getattr(F, activation) + + def forward(self, x, **kwargs): + return self.fc2(self.activation(self.fc1(x))) \ No newline at end of 
file From b8b03198db5c1e4c19ffac25c4b94bf26ddddabd Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 5 Aug 2020 15:21:44 -0500 Subject: [PATCH 10/11] Remove old notebook --- hyper-parameter-optimization.ipynb | 690 ----------------------------- 1 file changed, 690 deletions(-) delete mode 100644 hyper-parameter-optimization.ipynb diff --git a/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization.ipynb deleted file mode 100644 index ecf26bf..0000000 --- a/hyper-parameter-optimization.ipynb +++ /dev/null @@ -1,690 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This example will walk through the following:\n", - "\n", - "* **Getting and processing the data.**\n", - "* **Defining a model and parameters.**\n", - "* **Finding the best parameters,** and some details on why we're using the chosen search algorithm.\n", - "* **Scoring** and deploying.\n", - "\n", - "All of these tasks will be performed on the New York City Taxi Cab dataset." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Setup cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import coiled\n", - "import dask.distributed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "cluster = coiled.Cluster(\n", - " n_workers=20, \n", - " configuration=\"coiled/default\", \n", - ")\n", - "client = dask.distributed.Client(cluster)\n", - "\n", - "client" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Get and pre-process data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This example will mirror the Kaggle \"[NYC Taxi Trip Duration][1]\" example with different data.\n", - "\n", - "These data have records on 84 million taxi rides.\n", - "\n", - "[1]:https://www.kaggle.com/c/nyc-taxi-trip-duration/" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import dask.dataframe as dd\n", - "\n", - "features = [\"passenger_count\", \"trip_distance\", \"fare_amount\"]\n", - "categorical_features = [\"RatecodeID\", \"payment_type\"]\n", - "output = [\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"]\n", - "\n", - "df = dd.read_csv(\n", - " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv\", \n", - " parse_dates=output,\n", - " usecols=features + categorical_features + output,\n", - " dtype={\n", - " \"passenger_count\": \"UInt8\",\n", - " \"RatecodeID\": \"category\",\n", - " \"payment_type\": \"category\",\n", - " },\n", - " blocksize=\"16 MiB\",\n", - ")\n", - "\n", - "# one hot encode the categorical columns;\n", - "# if df[\"foo\"].unique() == [1, 3, 4], add columns foo_1, foo_3, foo_4\n", - "df = dd.get_dummies(df, columns=categorical_features)\n", - "\n", - "# persist so only download once\n", - "df = df.persist()\n", - "\n", - "data = df[[c for c in df.columns if c 
not in output]]\n", - "data = data.fillna(0)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "durations = (df[\"tpep_dropoff_datetime\"] - df[\"tpep_pickup_datetime\"]).dt.total_seconds() / 60 # minutes" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "from dask_ml.model_selection import train_test_split\n", - "import dask\n", - "\n", - "features = data.to_dask_array(lengths=True).astype(\"float32\")\n", - "output = durations.to_dask_array(lengths=True).astype(\"float32\")\n", - "X_train, X_test, y_train, y_test = train_test_split(features, output, shuffle=True)\n", - "\n", - "# persist the data so it's not re-computed\n", - "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Define model and hyperparameters" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's use a simple neural network from [PyTorch] usin Skorch, a simple wrapper that provides a Scikit-Learn API for PyTorch.\n", - "\n", - "This network is only small for demonstration. If desired, we could use much larger networks on GPUs.\n", - "\n", - "[PyTorch]:https://pytorch.org/\n", - "[skorch]:https://skorch.readthedocs.io/en/stable/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If desired, this model could use GPUs." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "import torch.optim as optim\n", - "import torch.nn as nn\n", - "import torch.nn.functional as F\n", - "\n", - "class HiddenLayerNet(nn.Module):\n", - " def __init__(self, n_features=10, n_outputs=1, n_hidden=100, activation=\"relu\"):\n", - " super().__init__()\n", - " self.fc1 = nn.Linear(n_features, n_hidden)\n", - " self.fc2 = nn.Linear(n_hidden, n_outputs)\n", - " self.activation = getattr(F, activation)\n", - "\n", - " def forward(self, x, **kwargs):\n", - " return self.fc2(self.activation(self.fc1(x)))" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from torch_model import HiddenLayerNet" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(900, 14)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_train.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "from skorch import NeuralNetRegressor\n", - "\n", - "niceties = {\n", - " \"callbacks\": False,\n", - " \"warm_start\": True,\n", - " \"train_split\": None,\n", - " \"max_epochs\": 1,\n", - "}\n", - "\n", - "model = NeuralNetRegressor(\n", - " module=HiddenLayerNet,\n", - " module__n_features=X_train.shape[1],\n", - " optimizer=optim.SGD,\n", - " criterion=nn.MSELoss,\n", - " lr=0.0001,\n", - " **niceties,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "from scipy.stats import loguniform, uniform\n", - "\n", - "params = {\n", - " \"module__activation\": [\"relu\", \"elu\", \"softsign\", \"leaky_relu\", \"rrelu\"],\n", - " \"batch_size\": [32, 64, 128, 256],\n", - " \"optimizer__lr\": loguniform(1e-4, 1e-3),\n", - " \"optimizer__weight_decay\": 
loguniform(1e-6, 1e-3),\n", - " \"optimizer__momentum\": uniform(0, 1),\n", - " \"optimizer__nesterov\": [True],\n", - "}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "All of these parameters control model architecture, execpt for two basic optimizatino parameters, `batch_size` and `learning_rate_init`. They control finding the best model of a particular architecture." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Find the best hyperparameters" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our search is \"computationally-constrained\" because (hypothetically) it requires GPUs and has a pretty complicated search space (in reality it has neither of those features). And obviously it's \"memory-constrained\" because the dataset doesn't fit in memory.\n", - "\n", - "[Dask-ML's documentation on hyperparameter searches][2] indicates that we should use `HyperbandSearchCV`.\n", - "\n", - "[2]:https://ml.dask.org/hyper-parameter-search.html" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from dask_ml.model_selection import HyperbandSearchCV\n", - "search = HyperbandSearchCV(model, params, random_state=42, verbose=True, max_iter=9)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "By default, `HyperbandSearchCV` will call `partial_fit` on each chunk of the Dask Array. `HyperbandSearchCV`'s rule of thumb specifies how to train for longer or sample more parameters." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[CV, bracket=2] creating 9 models\n", - "[CV, bracket=1] creating 5 models\n", - "[CV, bracket=0] creating 3 models\n", - "[CV, bracket=0] For training there are between 360 and 360 examples in each chunk\n", - "[CV, bracket=1] For training there are between 360 and 360 examples in each chunk\n", - "[CV, bracket=0] validation score of -0.3191 received after 1 partial_fit calls\n", - "[CV, bracket=2] For training there are between 360 and 360 examples in each chunk\n", - "[CV, bracket=1] validation score of 0.0170 received after 1 partial_fit calls\n", - "[CV, bracket=1] validation score of 0.0322 received after 3 partial_fit calls\n", - "[CV, bracket=2] validation score of 0.2228 received after 1 partial_fit calls\n", - "[CV, bracket=2] validation score of -0.7214 received after 3 partial_fit calls\n", - "[CV, bracket=1] validation score of 0.0183 received after 9 partial_fit calls\n", - "[CV, bracket=0] validation score of -0.2677 received after 9 partial_fit calls\n", - "[CV, bracket=2] validation score of -0.5336 received after 9 partial_fit calls\n" - ] - }, - { - "data": { - "text/plain": [ - "HyperbandSearchCV(estimator=[uninitialized](\n", - " module=,\n", - " module__n_features=14,\n", - "),\n", - " max_iter=9,\n", - " parameters={'batch_size': [32, 64, 128, 256],\n", - " 'module__activation': ['relu', 'elu', 'softsign',\n", - " 'leaky_relu', 'rrelu'],\n", - " 'optimizer__lr': ,\n", - " 'optimizer__momentum': ,\n", - " 'optimizer__nesterov': [True],\n", - " 'optimizer__weight_decay': },\n", - " random_state=42, verbose=True)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_train2 = y_train.reshape(-1, 1).persist()\n", - "_ = search.fit(X_train, y_train2)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 
Score" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`HyperbandSearchCV` and the like mirror the Scikit-Learn model selection interface, so all attributes of Scikit-Learn's [RandomizedSearchCV][rscv] are available:\n", - "\n", - "[rscv]:https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.018286365127180515" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "search.best_score_" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'batch_size': 32,\n", - " 'module__activation': 'rrelu',\n", - " 'optimizer__lr': 0.0002668107973843001,\n", - " 'optimizer__momentum': 0.5920831762255758,\n", - " 'optimizer__nesterov': True,\n", - " 'optimizer__weight_decay': 3.6363529586270234e-05}" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "search.best_params_" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[initialized](\n", - " module_=HiddenLayerNet(\n", - " (fc1): Linear(in_features=14, out_features=100, bias=True)\n", - " (fc2): Linear(in_features=100, out_features=1, bias=True)\n", - " ),\n", - ")" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "search.best_estimator_" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This means we can deploy the best model and score on the entire dataset:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.33630241003610284" - ] - }, - "execution_count": 20, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "from dask_ml.wrappers import ParallelPostFit\n", - "deployed_model = ParallelPostFit(search.best_estimator_)\n", - "deployed_model.score(X_test, y_test)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Visualization" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What does the error distribution look like on this larger dataset?" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = deployed_model.predict(X_test)\n", - "y_pred = y_pred.flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Array Chunk
Bytes 800 B 400 B
Shape (100,) (50,)
Count 4 Tasks 2 Chunks
Type int64 numpy.ndarray
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " 100\n", - " 1\n", - "\n", - "
" - ], - "text/plain": [ - "dask.array<_predict, shape=(100,), dtype=int64, chunksize=(50,), chunktype=numpy.ndarray>" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_pred" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import dask.array as da\n", - "\n", - "err = np.abs(y_pred - y_test)\n", - "max_min_err = 20\n", - "vals, edges = da.histogram(err, range=(0, max_min_err), bins=max_min_err)\n", - "vals, edges = dask.compute(vals, edges)" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(100,)" - ] - }, - "execution_count": 110, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_test.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "image/png": { - "height": 290, - "width": 388 - }, - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "df = pd.DataFrame({\"vals\": vals, \"edges\": edges[1:]})\n", - "ax = df.plot.bar(x=\"edges\", y=\"vals\", width=1)\n", - "ax.set_ylabel(\"Frequency\")\n", - "ax.set_xlabel(\"Maximum error (minutes)\")\n", - "ax.set_title(\"Prediction error\")\n", - "ax.legend_.remove()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Why not simply sampling instead?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sampling solves the memory issues:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "X_train_small = X_train.sample(frac=0.01, random_state=123).compute()\n", - "y_train_small = y_train.sample(frac=0.01, random_state=123).compute()\n", - "\n", - "X_train_small # NumPy ndarray; must fit in memory" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But `HyperbandSearchCV` is meant for computationally-constrained problems, regardless of their memory usage (which [Dask-ML's documentation on hyperparameter searches][2] also indicate). `HyperbandSearchCV` would still be relevant:\n", - "\n", - "[2]:https://ml.dask.org/hyper-parameter-search.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "search = HyperbandSearchCV(model, params, max_iter=81, random_state=0)\n", - "search.fit(X_train_small, y_train_small, classes=[0, 1]);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`HyperbandSearchCV` would not be relevant when the search problem is not computationally-constrained, which happens with a smaller search space or a simpler model that doesn't require GPUs.\n", - "\n", - "If we had a simpler model and a massive dataset, `IncrementalSearchCV` is recommended. 
It mirrors Scikit-Learn's `RandomizedSearchCV` but works on Dask Arrays/Dataframes, both of which can be larger than memory." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python [conda env:skorch]", - "language": "python", - "name": "conda-env-skorch-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From fff894b25fba69ce23fd812bffa7b7159654c9f9 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 12 Aug 2020 10:16:22 -0500 Subject: [PATCH 11/11] Update notebook --- .../hyper-parameter-optimization.ipynb | 820 ++---------------- hyper-parameter-optimmization/torch_model.py | 1 + 2 files changed, 95 insertions(+), 726 deletions(-) diff --git a/hyper-parameter-optimmization/hyper-parameter-optimization.ipynb b/hyper-parameter-optimmization/hyper-parameter-optimization.ipynb index 46dccc4..ffeff57 100644 --- a/hyper-parameter-optimmization/hyper-parameter-optimization.ipynb +++ b/hyper-parameter-optimmization/hyper-parameter-optimization.ipynb @@ -32,9 +32,25 @@ "name": "stdout", "output_type": "stream", "text": [ - "Creating Cluster. This takes about a minute ... \r" + "Creating Cluster. This takes about a minute ...\r" ] - }, + } + ], + "source": [ + "# Create cluster with Coiled\n", + "import coiled\n", + "\n", + "cluster = coiled.Cluster(\n", + " n_workers=20,\n", + " configuration=\"coiled-examples/pytorch\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -43,40 +59,35 @@ "\n", "

Client

\n", "\n", "\n", "\n", "

Cluster

\n", "
    \n", - "
  • Workers: 0
  • \n", - "
  • Cores: 0
  • \n", - "
  • Memory: 0 B
  • \n", + "
  • Workers: 20
  • \n", + "
  • Cores: 80
  • \n", + "
  • Memory: 343.60 GB
  • \n", "
\n", "\n", "\n", "" ], "text/plain": [ - "" + "" ] }, - "execution_count": 1, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "import coiled\n", + "# Connect Dask to the cluster\n", "import dask.distributed\n", "\n", - "cluster = coiled.Cluster(\n", - " n_workers=20,\n", - " configuration=\"coiled-examples/pytorch\"\n", - ")\n", "client = dask.distributed.Client(cluster)\n", - "\n", "client" ] }, @@ -107,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -129,6 +140,8 @@ " blocksize=\"16 MiB\",\n", ")\n", "\n", + "df = df.repartition(partition_size=\"10 MiB\").persist()\n", + "\n", "# one hot encode the categorical columns\n", "df = df.categorize(categorical_features)\n", "df = dd.get_dummies(df, columns=categorical_features)\n", @@ -142,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -151,16 +164,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from dask_ml.model_selection import train_test_split\n", "import dask\n", "\n", - "features = data.to_dask_array(lengths=True).astype(\"float32\")\n", - "output = durations.to_dask_array(lengths=True).astype(\"float32\")\n", - "X_train, X_test, y_train, y_test = train_test_split(features, output, shuffle=True)\n", + "X = data.to_dask_array(lengths=True).astype(\"float32\")\n", + "y = durations.to_dask_array(lengths=True).astype(\"float32\")\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, shuffle=True)\n", "\n", "# persist the data so it's not re-computed\n", "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" @@ -187,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -199,13 +212,14 @@ }, { "cell_type": 
"code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "import torch\n", "import torch.optim as optim\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", @@ -229,10 +243,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ + "import torch\n", "import torch.optim as optim\n", "import torch.nn as nn\n", "from skorch import NeuralNetRegressor\n", @@ -244,7 +259,13 @@ " \"max_epochs\": 1,\n", "}\n", "\n", - "model = NeuralNetRegressor(\n", + "class NonNanLossRegressor(NeuralNetRegressor):\n", + " def get_loss(self, y_pred, y_true, X=None, training=False):\n", + " if torch.abs(y_true - y_pred).abs().mean() > 1e6:\n", + " return torch.tensor([0.0], requires_grad=True)\n", + " return super().get_loss(y_pred, y_true, X=X, training=training)\n", + "\n", + "model = NonNanLossRegressor(\n", " module=HiddenLayerNet,\n", " module__n_features=X_train.shape[1],\n", " optimizer=optim.SGD,\n", @@ -256,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -265,10 +286,10 @@ "params = {\n", " \"module__activation\": [\"relu\", \"elu\", \"softsign\", \"leaky_relu\", \"rrelu\"],\n", " \"batch_size\": [32, 64, 128, 256],\n", - "# \"optimizer__lr\": loguniform(1e-4, 1e-3),\n", - "# \"optimizer__weight_decay\": loguniform(1e-6, 1e-3),\n", - "# \"optimizer__momentum\": uniform(0, 1),\n", - "# \"optimizer__nesterov\": [True],\n", + " \"optimizer__lr\": loguniform(1e-4, 1e-3),\n", + " \"optimizer__weight_decay\": loguniform(1e-6, 1e-3),\n", + " \"optimizer__momentum\": uniform(0, 1),\n", + " \"optimizer__nesterov\": [True],\n", "}" ] }, @@ -299,15 +320,12 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from dask_ml.model_selection import HyperbandSearchCV\n", - "search = 
HyperbandSearchCV(model, params, random_state=2, verbose=True,\n", - " max_iter=2,\n", - "# max_iter=9,\n", - " )" + "search = HyperbandSearchCV(model, params, random_state=2, verbose=True, max_iter=9)" ] }, { @@ -319,34 +337,48 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "[CV, bracket=0] creating 1 models\n", - "[CV, bracket=0] For training there are between 2756 and 169772 examples in each chunk\n", - "[CV, bracket=0] validation score of 0.0210 received after 1 partial_fit calls\n", - "[CV, bracket=0] validation score of 0.0270 received after 2 partial_fit calls\n" + "[CV, bracket=2] creating 9 models\n", + "[CV, bracket=1] creating 5 models\n", + "[CV, bracket=0] creating 3 models\n", + "[CV, bracket=0] For training there are between 119153 and 249047 examples in each chunk\n", + "[CV, bracket=1] For training there are between 119153 and 249047 examples in each chunk\n", + "[CV, bracket=2] For training there are between 119153 and 249047 examples in each chunk\n", + "[CV, bracket=1] validation score of 0.0202 received after 1 partial_fit calls\n", + "[CV, bracket=0] validation score of -3.3790 received after 1 partial_fit calls\n", + "[CV, bracket=1] validation score of 0.0210 received after 3 partial_fit calls\n", + "[CV, bracket=2] validation score of 0.0229 received after 1 partial_fit calls\n", + "[CV, bracket=1] validation score of -299404463816680.2500 received after 9 partial_fit calls\n", + "[CV, bracket=0] validation score of -11.9127 received after 9 partial_fit calls\n", + "[CV, bracket=2] validation score of 0.0232 received after 3 partial_fit calls\n", + "[CV, bracket=2] validation score of 0.0280 received after 9 partial_fit calls\n" ] }, { "data": { "text/plain": [ - "HyperbandSearchCV(estimator=[uninitialized](\n", + "HyperbandSearchCV(estimator=[uninitialized](\n", " module=,\n", " module__n_features=15,\n", "),\n", - " 
max_iter=2,\n", + " max_iter=9,\n", " parameters={'batch_size': [32, 64, 128, 256],\n", " 'module__activation': ['relu', 'elu', 'softsign',\n", - " 'leaky_relu', 'rrelu']},\n", + " 'leaky_relu', 'rrelu'],\n", + " 'optimizer__lr': ,\n", + " 'optimizer__momentum': ,\n", + " 'optimizer__nesterov': [True],\n", + " 'optimizer__weight_decay': },\n", " random_state=2, verbose=True)" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -374,16 +406,16 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.02695897736664199" + "0.028028356182226544" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -394,16 +426,21 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'module__activation': 'softsign', 'batch_size': 128}" + "{'batch_size': 256,\n", + " 'module__activation': 'softsign',\n", + " 'optimizer__lr': 0.00015404537696021744,\n", + " 'optimizer__momentum': 0.15141540401838427,\n", + " 'optimizer__nesterov': True,\n", + " 'optimizer__weight_decay': 0.000576470051148445}" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -414,13 +451,13 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[initialized](\n", + "[initialized](\n", " module_=HiddenLayerNet(\n", " (fc1): Linear(in_features=15, out_features=100, bias=True)\n", " (fc2): Linear(in_features=100, out_features=1, bias=True)\n", @@ -428,7 +465,7 @@ ")" ] }, - "execution_count": 13, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -441,21 +478,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This means we can deploy the best model and score on the 
entire dataset:" + "This means we can deploy the best model and score on the testing dataset:" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.02479510859559464" + "0.028248285332490686" ] }, - "execution_count": 14, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -465,675 +502,6 @@ "deployed_model = ParallelPostFit(search.best_estimator_)\n", "deployed_model.score(X_test, y_test)" ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Visualization" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "What does the error distribution look like on this larger dataset?" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "y_pred = deployed_model.predict(X_test)\n", - "y_pred = y_pred.flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "\n", - "
\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Array Chunk
Bytes 67.52 MB 151.23 kB
Shape (8440119,) (18904,)
Count 942 Tasks 471 Chunks
Type int64 numpy.ndarray
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", 
- " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - 
" \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - " \n", - " \n", - "\n", - " \n", - " 8440119\n", - " 1\n", - "\n", - "
" - ], - "text/plain": [ - "dask.array<_predict, shape=(8440119,), dtype=int64, chunksize=(18904,), chunktype=numpy.ndarray>" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_pred" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import pandas as pd\n", - "import dask.array as da\n", - "\n", - "err = np.abs(y_pred - y_test)\n", - "max_min_err = 20\n", - "vals, edges = da.histogram(err, range=(0, max_min_err), bins=max_min_err)\n", - "vals, edges = dask.compute(vals, edges)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(8440119,)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "y_test.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "df = pd.DataFrame({\"vals\": vals, \"edges\": edges[1:]})\n", - "ax = df.plot.bar(x=\"edges\", y=\"vals\", width=1)\n", - "ax.set_ylabel(\"Frequency\")\n", - "ax.set_xlabel(\"Maximum error (minutes)\")\n", - "ax.set_title(\"Prediction error\")\n", - "ax.legend_.remove()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Why not simply sampling instead?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sampling solves the memory issues:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "# X_train_small = data.sample(frac=0.01, random_state=123).to_dask_array().astype(\"float32\").compute()\n", - "# y_train_small = durations.sample(frac=0.01, random_state=123).to_dask_array().astype(\"float32\").compute()\n", - "\n", - "# X_train_small # NumPy ndarray; must fit in memory" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "But `HyperbandSearchCV` is meant for computationally-constrained problems, regardless of their memory usage (which [Dask-ML's documentation on hyperparameter searches][2] also indicate). 
`HyperbandSearchCV` would still be relevant:\n", - "\n", - "[2]:https://ml.dask.org/hyper-parameter-search.html" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# search = HyperbandSearchCV(model, params, max_iter=81, random_state=0)\n", - "# search.fit(X_train_small, y_train_small.reshape(-1, 1), classes=[0, 1]);" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`HyperbandSearchCV` would not be relevant when the search problem is not computationally-constrained, which happens with a smaller search space or a simpler model that doesn't require GPUs.\n", - "\n", - "If we had a simpler model and a massive dataset, `IncrementalSearchCV` is recommended. It mirrors Scikit-Learn's `RandomizedSearchCV` but works on Dask Arrays/Dataframes, both of which can be larger than memory." - ] } ], "metadata": { diff --git a/hyper-parameter-optimmization/torch_model.py b/hyper-parameter-optimmization/torch_model.py index b6683f0..94d9a90 100644 --- a/hyper-parameter-optimmization/torch_model.py +++ b/hyper-parameter-optimmization/torch_model.py @@ -1,3 +1,4 @@ +import torch import torch.optim as optim import torch.nn as nn import torch.nn.functional as F