diff --git a/hyper-parameter-optimization/environment.yml b/hyper-parameter-optimization/environment.yml
new file mode 100644
index 0000000..b908644
--- /dev/null
+++ b/hyper-parameter-optimization/environment.yml
@@ -0,0 +1,17 @@
+name: pytorch
+channels:
+  - conda-forge
+  - pytorch
+  - defaults
+dependencies:
+  - python=3.7
+  - dask
+  - numpy
+  - pandas
+  - coiled
+  - dask-ml
+  - skorch
+  - scipy
+  - matplotlib
+  - pytorch>1.1.0
+  - s3fs
\ No newline at end of file
diff --git a/hyper-parameter-optimization/hyper-parameter-optimization.ipynb b/hyper-parameter-optimization/hyper-parameter-optimization.ipynb
new file mode 100644
index 0000000..ffeff57
--- /dev/null
+++ b/hyper-parameter-optimization/hyper-parameter-optimization.ipynb
@@ -0,0 +1,528 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Hyperparameter Optimization with Dask and Coiled\n",
+    "\n",
+    "This example will walk through the following:\n",
+    "\n",
+    "* **Getting and processing the data.**\n",
+    "* **Defining a model and its hyperparameters.**\n",
+    "* **Finding the best hyperparameters,** with some details on why we're using the chosen search algorithm.\n",
+    "* **Scoring** and deploying the best model.\n",
+    "\n",
+    "All of these tasks will be performed on the New York City Taxi Cab dataset."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Set up the cluster"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Creating Cluster. This takes about a minute ...\r"
+     ]
+    }
+   ],
+   "source": [
+    "# Create a cluster with Coiled\n",
+    "import coiled\n",
+    "\n",
+    "cluster = coiled.Cluster(\n",
+    "    n_workers=20,\n",
+    "    configuration=\"coiled-examples/pytorch\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
\n", + "

Client

\n", + "\n", + "
\n", + "

Cluster

\n", + "
    \n", + "
  • Workers: 20
  • \n", + "
  • Cores: 80
  • \n", + "
  • Memory: 343.60 GB
  • \n", + "
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Connect Dask to the cluster\n", + "import dask.distributed\n", + "\n", + "client = dask.distributed.Client(cluster)\n", + "client" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### ☝️ Don’t forget to click the \"Dashboard\" link above to view the cluster dashboard!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get and pre-process data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example will mirror the Kaggle \"[NYC Taxi Trip Duration][1]\" example with different data.\n", + "\n", + "These data have records on 84 million taxi rides.\n", + "\n", + "[1]:https://www.kaggle.com/c/nyc-taxi-trip-duration/" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import dask.dataframe as dd\n", + "\n", + "features = [\"passenger_count\", \"trip_distance\", \"fare_amount\"]\n", + "categorical_features = [\"RatecodeID\", \"payment_type\"]\n", + "output = [\"tpep_pickup_datetime\", \"tpep_dropoff_datetime\"]\n", + "\n", + "df = dd.read_csv(\n", + " \"s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv\", \n", + " parse_dates=output,\n", + " usecols=features + categorical_features + output,\n", + " dtype={\n", + " \"passenger_count\": \"UInt8\",\n", + " \"RatecodeID\": \"category\",\n", + " \"payment_type\": \"category\",\n", + " },\n", + " blocksize=\"16 MiB\",\n", + ")\n", + "\n", + "df = df.repartition(partition_size=\"10 MiB\").persist()\n", + "\n", + "# one hot encode the categorical columns\n", + "df = df.categorize(categorical_features)\n", + "df = dd.get_dummies(df, columns=categorical_features)\n", + "\n", + "# persist so only download once\n", + "df = df.persist()\n", + "\n", + "data = df[[c for c in df.columns if c not in output]]\n", + "data = data.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "durations = (df[\"tpep_dropoff_datetime\"] - df[\"tpep_pickup_datetime\"]).dt.total_seconds() / 60 # minutes" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from dask_ml.model_selection import train_test_split\n", + "import dask\n", + "\n", + "X = data.to_dask_array(lengths=True).astype(\"float32\")\n", + "y = durations.to_dask_array(lengths=True).astype(\"float32\")\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, shuffle=True)\n", + "\n", + "# persist the data so it's not re-computed\n", + "X_train, X_test, y_train, y_test = dask.persist(X_train, X_test, y_train, y_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Define model and hyperparameters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's use a simple neural network from [PyTorch] using [Skorch], a simple wrapper that provides a Scikit-Learn API for PyTorch.\n", + "\n", + "This network is only small for demonstration. 
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.optim as optim\n",
+    "import torch.nn as nn\n",
+    "from skorch import NeuralNetRegressor\n",
+    "\n",
+    "niceties = {\n",
+    "    \"callbacks\": False,\n",
+    "    \"warm_start\": True,\n",
+    "    \"train_split\": None,\n",
+    "    \"max_epochs\": 1,\n",
+    "}\n",
+    "\n",
+    "class NonNanLossRegressor(NeuralNetRegressor):\n",
+    "    def get_loss(self, y_pred, y_true, X=None, training=False):\n",
+    "        # Guard against diverging models: return a zero loss (with a\n",
+    "        # gradient) when predictions are wildly off, instead of NaNs\n",
+    "        if (y_true - y_pred).abs().mean() > 1e6:\n",
+    "            return torch.tensor([0.0], requires_grad=True)\n",
+    "        return super().get_loss(y_pred, y_true, X=X, training=training)\n",
+    "\n",
+    "model = NonNanLossRegressor(\n",
+    "    module=HiddenLayerNet,\n",
+    "    module__n_features=X_train.shape[1],\n",
+    "    optimizer=optim.SGD,\n",
+    "    criterion=nn.MSELoss,\n",
+    "    lr=0.0001,\n",
+    "    **niceties,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy.stats import loguniform, uniform\n",
+    "\n",
+    "params = {\n",
+    "    \"module__activation\": [\"relu\", \"elu\", \"softsign\", \"leaky_relu\", \"rrelu\"],\n",
+    "    \"batch_size\": [32, 64, 128, 256],\n",
+    "    \"optimizer__lr\": loguniform(1e-4, 1e-3),\n",
+    "    \"optimizer__weight_decay\": loguniform(1e-6, 1e-3),\n",
+    "    \"optimizer__momentum\": uniform(0, 1),\n",
+    "    \"optimizer__nesterov\": [True],\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`module__activation` controls the model architecture; the remaining parameters, like `batch_size` and `optimizer__lr`, control the optimization procedure that finds the best model of a particular architecture."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Find the best hyperparameters"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Our search is \"computationally constrained\" because (hypothetically) it requires GPUs and has a fairly complicated search space (in reality it has neither of those features). It is also \"memory constrained\" because the dataset doesn't fit in memory.\n",
+    "\n",
+    "[Dask-ML's documentation on hyperparameter searches][2] indicates that we should use `HyperbandSearchCV` in this situation.\n",
+    "\n",
+    "[2]:https://ml.dask.org/hyper-parameter-search.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dask_ml.model_selection import HyperbandSearchCV\n",
+    "\n",
+    "search = HyperbandSearchCV(model, params, random_state=2, verbose=True, max_iter=9)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "By default, `HyperbandSearchCV` calls `partial_fit` on each chunk of the Dask array. Its rule of thumb specifies how to trade off training each model for longer against sampling more parameter combinations."
+   ]
+  },
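+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a rough sketch of that rule of thumb (the numbers below are illustrative assumptions, not values computed by Dask-ML): choose how many examples the best model should see and how many parameter combinations to sample, and those two choices determine `max_iter` and the chunk size. With these assumed values, `max_iter` works out to the `max_iter=9` used above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative sketch of the Hyperband rule of thumb (assumed values)\n",
+    "n_examples = 4 * len(X_train)  # let the best model see the data ~4 times\n",
+    "n_params = 9                   # number of parameter combinations to sample\n",
+    "\n",
+    "# max_iter bounds the number of partial_fit calls per model, and each\n",
+    "# call sees one chunk with roughly this many examples\n",
+    "max_iter = n_params\n",
+    "chunk_size = n_examples // n_params"
+   ]
+  },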
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[CV, bracket=2] creating 9 models\n",
+      "[CV, bracket=1] creating 5 models\n",
+      "[CV, bracket=0] creating 3 models\n",
+      "[CV, bracket=0] For training there are between 119153 and 249047 examples in each chunk\n",
+      "[CV, bracket=1] For training there are between 119153 and 249047 examples in each chunk\n",
+      "[CV, bracket=2] For training there are between 119153 and 249047 examples in each chunk\n",
+      "[CV, bracket=1] validation score of 0.0202 received after 1 partial_fit calls\n",
+      "[CV, bracket=0] validation score of -3.3790 received after 1 partial_fit calls\n",
+      "[CV, bracket=1] validation score of 0.0210 received after 3 partial_fit calls\n",
+      "[CV, bracket=2] validation score of 0.0229 received after 1 partial_fit calls\n",
+      "[CV, bracket=1] validation score of -299404463816680.2500 received after 9 partial_fit calls\n",
+      "[CV, bracket=0] validation score of -11.9127 received after 9 partial_fit calls\n",
+      "[CV, bracket=2] validation score of 0.0232 received after 3 partial_fit calls\n",
+      "[CV, bracket=2] validation score of 0.0280 received after 9 partial_fit calls\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "HyperbandSearchCV(estimator=<class '__main__.NonNanLossRegressor'>[uninitialized](\n",
+       "  module=<class 'torch_model.HiddenLayerNet'>,\n",
+       "  module__n_features=15,\n",
+       "),\n",
+       "                  max_iter=9,\n",
+       "                  parameters={'batch_size': [32, 64, 128, 256],\n",
+       "                              'module__activation': ['relu', 'elu', 'softsign',\n",
+       "                                                     'leaky_relu', 'rrelu'],\n",
+       "                              'optimizer__lr': <scipy.stats._distn_infrastructure.rv_frozen object at 0x...>,\n",
+       "                              'optimizer__momentum': <scipy.stats._distn_infrastructure.rv_frozen object at 0x...>,\n",
+       "                              'optimizer__nesterov': [True],\n",
+       "                              'optimizer__weight_decay': <scipy.stats._distn_infrastructure.rv_frozen object at 0x...>},\n",
+       "                  random_state=2, verbose=True)"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# skorch's NeuralNetRegressor expects a 2-D target array\n",
+    "y_train2 = y_train.reshape(-1, 1).persist()\n",
+    "search.fit(X_train, y_train2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Score"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`HyperbandSearchCV` and the other Dask-ML searches mirror the Scikit-Learn model selection interface, so all attributes of Scikit-Learn's [RandomizedSearchCV][rscv] are available:\n",
+    "\n",
+    "[rscv]:https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.028028356182226544"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "search.best_score_"
+   ]
+  },
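+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For example, `cv_results_` records the parameters and final score of every model the search tried. A minimal sketch of inspecting it with Pandas (the exact column names can vary with the Dask-ML version):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# One row per sampled parameter combination, with its score\n",
+    "pd.DataFrame(search.cv_results_).head()"
+   ]
+  },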
"outputs": [ + { + "data": { + "text/plain": [ + "{'batch_size': 256,\n", + " 'module__activation': 'softsign',\n", + " 'optimizer__lr': 0.00015404537696021744,\n", + " 'optimizer__momentum': 0.15141540401838427,\n", + " 'optimizer__nesterov': True,\n", + " 'optimizer__weight_decay': 0.000576470051148445}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "search.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[initialized](\n", + " module_=HiddenLayerNet(\n", + " (fc1): Linear(in_features=15, out_features=100, bias=True)\n", + " (fc2): Linear(in_features=100, out_features=1, bias=True)\n", + " ),\n", + ")" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "search.best_estimator_" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This means we can deploy the best model and score on the testing dataset:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.028248285332490686" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from dask_ml.wrappers import ParallelPostFit\n", + "deployed_model = ParallelPostFit(search.best_estimator_)\n", + "deployed_model.score(X_test, y_test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/hyper-parameter-optimmization/torch_model.py b/hyper-parameter-optimmization/torch_model.py new file mode 100644 index 0000000..94d9a90 --- /dev/null +++ b/hyper-parameter-optimmization/torch_model.py @@ -0,0 +1,14 @@ +import torch +import torch.optim as optim +import torch.nn as nn +import torch.nn.functional as F + +class HiddenLayerNet(nn.Module): + def __init__(self, n_features=10, n_outputs=1, n_hidden=100, activation="relu"): + super().__init__() + self.fc1 = nn.Linear(n_features, n_hidden) + self.fc2 = nn.Linear(n_hidden, n_outputs) + self.activation = getattr(F, activation) + + def forward(self, x, **kwargs): + return self.fc2(self.activation(self.fc1(x))) \ No newline at end of file