From fe77875262a6599101a01d0fdc208aa25de855b5 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 28 Apr 2022 19:58:12 +0000 Subject: [PATCH 1/8] refactor sequential notebooks into independent notebooks --- .../1_retail_recommend_dataprep.ipynb | 715 ------------- ...rain_tune.ipynb => retail_recommend.ipynb} | 995 ++++++++++-------- ....ipynb => retail_recommend_pipeline.ipynb} | 113 +- 3 files changed, 610 insertions(+), 1213 deletions(-) delete mode 100644 use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb rename use-cases/retail_recommend/{2_retail_recommend_train_tune.ipynb => retail_recommend.ipynb} (68%) rename use-cases/retail_recommend/{3_retail_recommend_pipeline.ipynb => retail_recommend_pipeline.ipynb} (96%) diff --git a/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb b/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb deleted file mode 100644 index 6b4c30b5e4..0000000000 --- a/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb +++ /dev/null @@ -1,715 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 1. Data Preparation\n", - "\n", - "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", - "\n", - "## Dataset\n", - "\n", - "The dataset for this demo comes from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail). It contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. The following attributes are included in our dataset:\n", - "+ InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.\n", - "+ StockCode: Product (item) code. 
Nominal, a 5-digit integral number uniquely assigned to each distinct product.\n", - "+ Description: Product (item) name. Nominal.\n", - "+ Quantity: The quantities of each product (item) per transaction. Numeric.\n", - "+ InvoiceDate: Invice Date and time. Numeric, the day and time when each transaction was generated.\n", - "+ UnitPrice: Unit price. Numeric, Product price per unit in sterling.\n", - "+ CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.\n", - "+ Country: Country name. Nominal, the name of the country where each customer resides. \n", - "\n", - "Citation: Daqing Chen, Sai Liang Sain, and Kun Guo, Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and Customer Strategy Management, Vol. 19, No. 3, pp. 197–208, 2012 (Published online before print: 27 August 2012. doi: 10.1057/dbm.2012.17)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Solution Architecture\n", - "----\n", - "![Architecture](./images/retail_rec_dataprep.png)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -Uq sagemaker boto3" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored variables and their in-db values:\n" - ] - } - ], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import sagemaker\n", - "import sagemaker.amazon.common as smac\n", - "import boto3\n", - "\n", - "import io\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from scipy.sparse import csr_matrix, hstack, save_npz\n", - "from sklearn.preprocessing 
import OneHotEncoder\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.model_selection import train_test_split" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "assert sagemaker.__version__ >= \"2.21.0\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "region = boto3.Session().region_name\n", - "boto3.setup_default_session(region_name=region)\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "s3_client = boto3.client(\"s3\", region_name=region)\n", - "\n", - "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", - ")\n", - "sagemaker_role = sagemaker.get_execution_role()\n", - "\n", - "bucket = sagemaker_session.default_bucket()\n", - "print(f\"using bucket{bucket} in region {region} \\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read the data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(541909, 8)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
053636585123AWHITE HANGING HEART T-LIGHT HOLDER62010-12-01 08:26:002.5517850.0United Kingdom
153636571053WHITE METAL LANTERN62010-12-01 08:26:003.3917850.0United Kingdom
253636584406BCREAM CUPID HEARTS COAT HANGER82010-12-01 08:26:002.7517850.0United Kingdom
353636584029GKNITTED UNION FLAG HOT WATER BOTTLE62010-12-01 08:26:003.3917850.0United Kingdom
453636584029ERED WOOLLY HOTTIE WHITE HEART.62010-12-01 08:26:003.3917850.0United Kingdom
\n", - "
" - ], - "text/plain": [ - " InvoiceNo StockCode Description Quantity \\\n", - "0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \n", - "1 536365 71053 WHITE METAL LANTERN 6 \n", - "2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n", - "3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n", - "4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n", - "\n", - " InvoiceDate UnitPrice CustomerID Country \n", - "0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom \n", - "1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom \n", - "3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(\"data/Online Retail.csv\")\n", - "print(df.shape)\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preprocessing\n", - "\n", - "First, we check for any null (i.e. missing) values." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "InvoiceNo 0\n", - "StockCode 0\n", - "Description 1454\n", - "Quantity 0\n", - "InvoiceDate 0\n", - "UnitPrice 0\n", - "CustomerID 135080\n", - "Country 0\n", - "dtype: int64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Drop any records with a missing CustomerID. If we do not know who the customer is, then it is not helpful to us when we make recommendations." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(406829, 8)\n" - ] - } - ], - "source": [ - "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", - "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", - "print(df.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmYAAAFNCAYAAACqr6PiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3de7icZX3v//d3zVo5kCMh4ZhgQA4aK6JNUXZr1Q2WQ2v5WUXB1qLVclmP9fDrT6pVy5b2p3ZX66nIFhXxAIhWIxvFWjZVKYKJAhI0EkJCwjGQMzmtmbn3H889K5OVmTWzksyslcz7dV3rWjPPaZ5nzayVT+77+9x3pJSQJEnS2Osb6xOQJElSwWAmSZI0ThjMJEmSxgmDmSRJ0jhhMJMkSRonDGaSJEnjhMFMUlsi4vKI+Lv9dKxjI2JLRJTy81si4o3749j5eN+LiIv21/FG8bofjognIuLRbr/23oiIP42IH4z1eUjaJRzHTFJErASOAMpABbgX+DJwRUqpuhfHemNK6Yej2OcW4Csppc+P5rXyvh8CTkgp/dlo992fImIe8BvgaSmlx5tsMxP4R+DlwHTgfuCfUkpXdeH85gMPAAMppXKTbRJwYkppeafPR1Jj/WN9ApLGjZellH4YETOAFwH/AjwfeP3+fJGI6G8WDA5wTwOeHCGUTQB+CDwOnA6sAc4AroqIGSmlT3btTCWNW3ZlStpNSmljSmkR8Grgooj4LYCI+FJEfDg/nh0RN0TEhohYFxE/joi+iLgaOBb4bu6q/JuImB8RKSLeEBEPAjfXLav/z+HTI+KOiNgYEd+JiFn5tV4cEWvqzzEiVkbEmRFxNvC3wKvz692V1w91jebzen9ErIqIxyPiyzl8UnceF0XEg7kb8n3NfjYRMSPvvzYf7/35+GcC/w4cnc/jSw12f23+2ZyfUnogpTSYUvo+8HbgwxExLb9GiogT6l6z/ud+aP65r42I9fnx3Lptb4mI/xERt0bE5oj4QUTMzqt/lL9vyOd4ekS8LiJ+kvetrb8rr391RNwTES+rO/5A/hmd2uxnJGnfGMwkNZRSuoOiVeeFDVa/O6+bQ9EF+rfFLum1wIMUrW9TU0ofrdvnRcAzgbOavOSfA38BHE3RpdqyBSkHm38Ars2v95wGm70uf70EOB6YCnx62Da/B5xM0YL1gYh4ZpOX/BQwIx/nRfmcX5+7bc8BHs7n8boG+74U+F5K6alhy78JHAK8oPmVDukDvkjROncssK3BtbyGopXzcGAC8J68/Pfz95n5HG+r3ymlVFv/nLz+Woru7Pou4nOBR1JKd7ZxrpL2gsFM0kgeBmY1WD4IHEVRTzWYUvpxal2w+qGU0lMppW1N1l+dUronB5e/A15VuzlgH/0p8M8ppRUppS3AJcAFw1rr/j6ltC2ldBdwF7BHwMvn8mrgkpTS5pTSSuB/UrSEtWM28Mjwhblb9wmKkDuilNKTKaVvppS2ppQ2A5dRBMR6X0wp/S
b/nK8D9qV16yvAuRExPT9/LXD1PhxPUgsGM0kjOQZY12D5x4DlwA8iYkVEvLeNY60exfpVwABFmNlXR+fj1R+7n6Klr6b+LsqtFK1qw82maIEafqxj2jyPJyjC7G5yQJwNrG11gIg4JCI+l7tRN1F0T84cFmDbuZa2pJQeBm4FXpFvXDgH+OreHk9SawYzSQ1FxO9QhI6fDF+XW4zenVI6HngZ8K6IOKO2uskhW7Wozat7fCxFq9wTwFMUXX218yqxe+tSq+M+TNH1V3/sMvBYi/2GeyKf0/BjPdTm/j8EzomIKcOWvyIf9478fCt11wscWff43RRdrs9PKU1nV/dktPH6e3sL/lUU3ZnnA7ellNq9Xkl7wWAmaTcRMT0i/gi4hmIIi1822OaPIuKEiAhgE8UQG5W8+jGKGqzR+rOIWBARhwCXAtenlCoUQ1BMiog/jIgB4P3AxLr9HgPmR0Szv2dfB94ZEcdFxFR21aSN6s7QfC7XAZdFxLSIeBrwLoruvnZcTVGX941808FARJxFUUv30ZTSxrzdncBrIqKUb26o76qcRlFXtiHfHPHBUVzCWqDKyO9No/fu28DzgHdQ1JxJ6iCDmaSa70bEZoouxfcB/0zzoTJOpGgB2gLcBnw2pXRLXvePwPvzHZvvabJ/I1cDX6LoiptEcbciObC8Gfg8RevUUxQBp+Yb+fuTEfHzBsf9Qj72jyjG8doOvG0U51Xvbfn1V1C0JH4tH7+llNIO4EyKn+/tFAHr+8AngL+v2/QdFK2QGyjq475dt+4TwGSK1ruf5v3bklLaSlGTdmt+bxrdbPAhiuE7NkTEq/J+2yhuUDgO+Fa7rydp7zjArCSNgdz69z2KsPm6Nm6eGDMR8QHgpLEexFfqBbaYSdIYSCkNUtSX3U9RNzYu5S7TNwBXjPW5SL3AFjNJUkMR8ZcU3adXp5TeNNbnI/UCg5kkSdI4YVemJEnSOGEwkyRJGif6W28y/s2ePTvNnz9/rE9DkiSppSVLljyRUmo4DdtBEczmz5/P4sWLx/o0JEmSWoqIVc3W2ZUpSZI0ThjMJEmSxgmDmSRJ0jhhMJMkSRonDGaSJEnjhMFMkiRpnDCYSZIkjRMGM0mSpHHCYCZJkjROGMwkSZLGCYOZJI3g77+7lJt//dhYn4akHmEwk6Qmntiygy/eupJblq0d61OR1CMMZpLUxJJV6wEoV9MYn4mkXmEwk6QmFq9cB0ClYjCT1B0GM0lqYrEtZpK6zGAmSQ1sH6xwz0MbAahUq2N8NpJ6hcFMkhq4a/UGBnMXpi1mkrrFYCZJDdS6MedMm0g1GcwkdUf/WJ+AJI1Hi1eu44TDpzJQ6qNs8b+kLrHFTJKGqVYTS1atZ+HTDqW/L6jYlSmpSwxmkjTMfY9vYdP2Mgvnz6LUF9aYSeoag5kkDbN4VTF+mS1mkrrNYCZJwyxeuZ7ZUyfytMMOyS1mDpchqTsMZpI0zOJV61j4tEOJCPpLtphJ6h6DmSTVeWzTdlav28bC+YcCUOrrs8ZMUtcYzCSpzuKVxfhlC+fPArDGTFJXGcwkqc7PVq5j0kAfzzp6OkBRY+Y4ZpK6xGAmSXWWrFrPqfNmMlAq/jzaYiapmwxmklTnwXVbOemIaUPPvStTUjcZzCSpTqWahlrLwBYzSd1lMJOkOoOVKv19MfS8z5H/JXWRwUyS6lSqiVJdMLPFTFI3GcwkKUspUa6m3VrMHMdMUjcZzCQpq+WvfmvMJI0Rg5kkZYOV4u7L0m4tZkG54l2ZkrrDYCZJWa1lrN8aM0ljxGAmSVmtlmy3FrOSd2VK6h6DmSRltZax4eOYVZPBTFJ3GMwkKSs3rDHzrkxJ3WMwk6Ss3KTGLCWoGs4kdYHBTJKySqMas/zYVjNJ3WAwk6Ss3KTGDPDOTEldYTCTpKxSbTyOGUC56lhmkjrPYCZJ2WClcY0Z2GImqTsMZpKUNawxy92a1phJ6oa2gllEnB0RyyJieUS8t8H6iR
FxbV5/e0TMr1t3SV6+LCLOGsUxPxURW/busiRp9KwxkzTWWgaziCgBnwHOARYAF0bEgmGbvQFYn1I6Afg48JG87wLgAuBZwNnAZyOi1OqYEbEQmLmP1yZJozJyjZnBTFLntdNidhqwPKW0IqW0E7gGOG/YNucBV+XH1wNnRETk5deklHaklB4AlufjNT1mDm0fA/5m3y5NkkZnxBqzisFMUue1E8yOAVbXPV+TlzXcJqVUBjYCh42w70jHfCuwKKX0yEgnFREXR8TiiFi8du3aNi5DkkY28jhm3pUpqfPaCWbRYNnw/zo222ZUyyPiaOB84FOtTiqldEVKaWFKaeGcOXNabS5JLQ2N/F+qbzEr/kxaYyapG9oJZmuAeXXP5wIPN9smIvqBGcC6EfZttvy5wAnA8ohYCRwSEcvbvBZJ2ie1GrNaGAOo3QdgjZmkbmgnmP0MODEijouICRTF/IuGbbMIuCg/fiVwc0op5eUX5Ls2jwNOBO5odsyU0v9OKR2ZUpqfUpoPbM03FEhSx9VqzIZPYg62mEnqjv5WG6SUyhHxVuAmoAR8IaW0NCIuBRanlBYBVwJX59atdRRBi7zddcC9QBl4S0qpAtDomPv/8iSpfZWGXZnelSmpe1oGM4CU0o3AjcOWfaDu8XaK2rBG+14GXNbOMRtsM7Wd85Ok/WGoxqxB8X/F4n9JXeDI/5KUNaoxG2oxc7gMSV1gMJOkrNywxsyR/yV1j8FMkrKGw2WUrDGT1D0GM0nKyg0HmPWuTEndYzCTpKxSKWrMBhrUmBnMJHWDwUySsqEWs5KTmEsaGwYzScoaDZdhi5mkbjKYSVLmJOaSxprBTJKy2nAZu9eYWfwvqXsMZpKUVapVIqCvvsXM4TIkdZHBTJKywWrarb4MrDGT1F0GM0nKKtW0W30ZeFempO4ymElSVq6k3ebJhLoWs4rF/5I6z2AmSVmlWt1tOiawxUxSdxnMJCkrN6wx865MSd1jMJOkrFzZs8as1rNpi5mkbjCYSVJWtJgNrzGzxUxS9xjMJClrVGNWa0CzxUxSNxjMJCkrNxguIyLo7wsqTskkqQsMZpKUFcNlxB7LS31hi5mkrjCYSVJWtJjt+Wexvy+oVAxmkjrPYCZJWaVaZaBki5mksWMwk6SsUY0ZQH+pz7syJXWFwUySMmvMJI01g5kkZY0mMYeixqxqMJPUBQYzScrK1SoDpT3/LNpiJqlbDGaSlI3UYuY4ZpK6wWAmSVmjSczBFjNJ3WMwk6Ss0STmUMyX6V2ZkrrBYCZJWbla3WMSc7DFTFL3GMwkKatU0x6TmAP0l8IWM0ldYTCTpKzZALO2mEnqFoOZJGXNBpj1rkxJ3WIwk6Ss2STmpb6g7CTmkrrAYCZJWbNJzL0rU1K3GMwkKWtWY9ZnjZmkLjGYSVI2co2ZwUxS5xnMJCmrjFRjZjCT1AUGM0nKyk1rzLwrU1J3GMwkCahWE9WE45hJGlMGM0kCKqkIXtaYSRpLBjNJgqFxyhrXmPU5jpmkrjCYSRJFfRnYYiZpbBnMJAmGglejScxLJWvMJHWHwUySYCh4NWsxqyaDmaTOM5hJEq1qzIJyxeEyJHVeW8EsIs6OiGURsTwi3ttg/cSIuDavvz0i5tetuyQvXxYRZ7U6ZkRcGRF3RcTdEXF9REzdt0uUpNasMZM0HrQMZhFRAj4DnAMsAC6MiAXDNnsDsD6ldALwceAjed8FwAXAs4Czgc9GRKnFMd+ZUnpOSukU4EHgrft4jZLU0og1Zn191phJ6op2WsxOA5anlFaklHYC1wDnDdvmPOCq/Ph64IyIiLz8mpTSjpTSA8DyfLymx0wpbQLI+08G/GsoqeNqwavRALO2mEnqlv42tjkGWF33fA3w/GbbpJTKEbEROCwv/+mwfY/Jj5seMyK+CJwL3Au8u41zlKR9Uqsx66+rMfva7Q8CcO8jmyhXE1/96SqK/zPu7jXPP7Y7JynpoNdOi9mef4X2bMVqts
1olxcPUno9cDTwK+DVDU8q4uKIWBwRi9euXdtoE0lqW63GrFGLWV8OY7aZSeq0doLZGmBe3fO5wMPNtomIfmAGsG6EfVseM6VUAa4FXtHopFJKV6SUFqaUFs6ZM6eNy5Ck5mpdlY0mMa8tqtqdKanD2glmPwNOjIjjImICRTH/omHbLAIuyo9fCdycUkp5+QX5rs3jgBOBO5odMwonwFCN2cuAX+/bJUpSayPVmPXlZeYySZ3WssYs14y9FbgJKAFfSCktjYhLgcUppUXAlcDVEbGcoqXsgrzv0oi4jqJWrAy8JbeE0eSYfcBVETGdorvzLuCv9u8lS9Kehu7KbDCOWa0r00FmJXVaO8X/pJRuBG4ctuwDdY+3A+c32fcy4LI2j1kFfredc5Kk/Wmw0rzGLOzKlNQljvwvSYxcY1ZrMavYYiapwwxmksTINWalsMZMUncYzCQJqDQYx6ymtsgaM0mdZjCTJNobx8waM0mdZjCTJHZ1ZTaaK7PPrkxJXWIwkyTqh8sYaRwzk5mkzjKYSRKN58qsqWU1g5mkTjOYSRJ1NWYjdWVWu3pKknqQwUySqKsxG6n43xYzSR1mMJMkWtWYFd8NZpI6zWAmSbSqMfOuTEndYTCTJNqsMbPFTFKHGcwkiZFrzEpOYi6pSwxmkkT9lEyOYyZp7BjMJImRJzG3xkxStxjMJInirsxSXxDRPJhVTGaSOsxgJknAYLXasLUMHPlfUvcYzCSJosZsoFkw67MrU1J3GMwkiaLGrHmLmcX/krrDYCZJFPVj/aXGfxL7HC5DUpcYzCSJYoDZ1i1m3TwjSb3IYCZJFFMyNRrDDBzHTFL3GMwkiVpXpndlShpbBjNJoij+bzSBOdR1ZdqXKanDDGaShDVmksYHg5kk0arGrPhuV6akTjOYSRKtaszylEwGM0kdZjCTJGoDzDavMQugWu3uOUnqPQYzSSK3mDXpyoQinNmVKanTDGaSBAxWmhf/Q1FnZjCT1GkGM0miaDEbaFJjBkWLmblMUqcZzCSJkWvMoAhmFv9L6jSDmSTRTo2ZA8xK6jyDmSTRTo1ZOMCspI4zmEkS7dWYWfwvqdMMZpJEEcxGrjGzK1NS5xnMJInaJOa2mEkaWwYzSQLK1phJGgcMZpJEOy1mDjArqfMMZpLEyJOYQ+7KtMlMUocZzCSJWovZyAPMmsskdZrBTJKo3ZVpV6aksWUwkySKAWZHrDHrc0omSZ1nMJMk2q0x6+IJSepJBjNJPS+l1HIS85LjmEnqAoOZpJ5XK+ofuSvTGjNJnddWMIuIsyNiWUQsj4j3Nlg/MSKuzetvj4j5desuycuXRcRZrY4ZEV/Ny++JiC9ExMC+XaIkjWywUvRRjlz8b4uZpM5rGcwiogR8BjgHWABcGBELhm32BmB9SukE4OPAR/K+C4ALgGcBZwOfjYhSi2N+FXgG8GxgMvDGfbpCSWqhkpvMWk5ibo2ZpA5rp8XsNGB5SmlFSmkncA1w3rBtzgOuyo+vB86IiMjLr0kp7UgpPQAsz8dresyU0o0pA+4A5u7bJUrSyMo5mLWcxNwWM0kd1k4wOwZYXfd8TV7WcJuUUhnYCBw2wr4tj5m7MF8LfL+Nc5SkvVZrMWs1XIbBTFKntRPMGv2lGv7Xqdk2o11e77PAj1JKP254UhEXR8TiiFi8du3aRptIUlvKbdeYdeuMJPWqdoLZGmBe3fO5wMPNtomIfmAGsG6EfUc8ZkR8EJgDvKvZSaWUrkgpLUwpLZwzZ04blyFJjZXbaTGz+F9SF7QTzH4GnBgRx0XEBIpi/kXDtlkEXJQfvxK4OdeILQIuyHdtHgecSFE31vSYEfFG4CzgwpSSpbaSOm6oK7PUosbMJjNJHdbfaoOUUjki3grcBJSAL6SUlkbEpcDilNIi4Erg6ohYTtFSdkHed2lEXAfcC5SBt6SUKgCNjplf8nJgFXBbcf8A30opXbrfrliShmmrxazPrkxJndcymEFxpyRw47
BlH6h7vB04v8m+lwGXtXPMvLytc5Kk/aVSdRwzSeODI/9L6nmDldYtZiWHy5DUBQYzST2vvRozB5iV1HkGM0k9r/0aM1vMJHWWwUxSz2unxizsypTUBQYzST2vnRqz2gCzyXAmqYMMZpJ6Xrs1ZoBDZkjqKIOZpJ63axLzke/KBLszJXWWwUxSz6vVmLUq/geDmaTOMphJ6nm1GrNWA8wCDpkhqaMMZpJ6Xq3GbKDFXJlgi5mkzjKYSep57dSY2ZUpqRsMZpJ6Xls1Zt6VKakLDGaSel55VDVmJjNJnWMwk9TzhqZkKo0UzIrvdmVK6iSDmaSet2uuzBGK/3MyqxjMJHWQwUxSz6tU2q8xM5dJ6iSDmaSeN3RX5ghdmY78L6kbDGaSet6urkwHmJU0tgxmknpeZRQ1ZraYSeokg5mknlcbLqO9ccwMZpI6x2AmqedVqlUidrWKNVJb5V2ZkjrJYCap5w1W04itZWCNmaTuMJhJ6nmVahqxvgwcYFZSdxjMJPW8cqWNFjOL/yV1gcFMUs+rVKsjjmEGdmVK6g6DmaSeVx5NjZktZpI6yGAmqecVXZnWmEkaewYzST2vXE2UrDGTNA4YzCT1vEq1Sr81ZpLGAYOZpJ7XVouZXZmSusBgJqnnjWa4DEf+l9RJBjNJPa/c1gCztRqzbpyRpF5lMJPU89qpMSsN1ZiZzCR1jsFMUs+zxkzSeGEwk9TzRjclUzfOSFKvMphJ6nntTWJeBLNki5mkDjKYSep55TZqzHIu865MSR1lMJPU8ypt1ZgFgQPMSuosg5mkntfOJOZQ1JlZ/C+pkwxmknpeO5OYQ3FnpsFMUicZzCT1vHK1SqlFjRkU3ZmOYyapkwxmknpepd2uzAiHy5DUUQYzST2vnQFmwa5MSZ1nMJPU88qVxEA7NWYW/0vqMIOZpJ5XrqZR1Jh14YQk9SyDmaSeV6lW26wxsytTUmcZzCT1vPZrzMKR/yV1VFvBLCLOjohlEbE8It7bYP3EiLg2r789IubXrbskL18WEWe1OmZEvDUvSxExe98uT5Jaa2cSc6jVmHXhhCT1rJbBLCJKwGeAc4AFwIURsWDYZm8A1qeUTgA+Dnwk77sAuAB4FnA28NmIKLU45q3AmcCqfbw2SWpLpZroL7X+f2rJccwkdVg7LWanActTSitSSjuBa4Dzhm1zHnBVfnw9cEZERF5+TUppR0rpAWB5Pl7TY6aUfpFSWrmP1yVJbStbYyZpnGgnmB0DrK57viYva7hNSqkMbAQOG2Hfdo45ooi4OCIWR8TitWvXjmZXSRpSrSaqifZqzBwuQ1KHtRPMGv21Gv6Xqdk2o13etpTSFSmlhSmlhXPmzBnNrpI0pFbM78j/ksaDdoLZGmBe3fO5wMPNtomIfmAGsG6Efds5piR1XLmSg1kbNWZ9gTVmkjqqnWD2M+DEiDguIiZQFPMvGrbNIuCi/PiVwM0ppZSXX5Dv2jwOOBG4o81jSlLHlfOIse23mBnMJHVOy2CWa8beCtwE/Aq4LqW0NCIujYg/zptdCRwWEcuBdwHvzfsuBa4D7gW+D7wlpVRpdkyAiHh7RKyhaEW7OyI+v/8uV5J2V8ktYO3XmHX6jCT1sv52Nkop3QjcOGzZB+oebwfOb7LvZcBl7RwzL/8k8Ml2zkuS9lW5OpoaM+/KlNRZjvwvqaeNrsbMrkxJnWUwk9TTajVm7U7J5CTmkjrJYCapp1VG05XpOGaSOsxgJqmnlUdT/G+NmaQOM5hJ6mm1GrOBdufKNJdJ6iCDmaSeNpoas3ASc0kdZjCT1NNGVWNmV6akDjOYSeppo6ox6wsq5jJJHWQwk9TTdrWYtTmOmV2ZkjrIYCappw1W8lyZpdYtZiW7MiV1mMFMUk8bXY2Z45hJ6iyDmaSeNtoaM3syJXWSwUxST6tURlNjhjVmkjrKYCapp9XGMWunxqwvgoR1ZpI6x2AmqaeVRzlXJoC5TFKnGMwk9bTKqObKLLaxxU
xSpxjMJPW08ihrzMA6M0mdYzCT1NNGW2MGeGempI4xmEnqaXtTY1axK1NShxjMJPW00dWYFd+tMZPUKQYzST1tNDVmpfCuTEmdZTCT1NNqNWal0dSYWWQmqUMMZpJ62uhqzIrvdmVK6hSDmaSetmtKpvZbzCz+l9QpBjNJPW1Uk5g7XIakDjOYSepplWqi1BdEjOKuTJOZpA4xmEnqaYPValvdmLDnlEwpJT518318/scrOnZ+knqLwUxST6tUUvvBrG/3rswtO8o8snE7P1n+RKdOT1KPMZhJ6mnl3JXZjuHDZTy2aQcAv3l0c2dOTlLPMZhJ6ikbtu5kR7ky9LxSTfSX2vtTOHzk/8c2bQfg4Y3b2bR9cP+eqKSeZDCT1DNSSvzhJ3/CR7+/bGhZea9qzIrntWAGcN9jtppJ2ncGM0k9Y836bTy0YRs/XfHk0LLyXtWYFcns0U3bmTVlAgC/tjtT0n5gMJPUM+5aswGAZY9uZvtg0Z1Zqaa2pmOC3YfLqKbE45t2cNIR05gyoWSdmaT9wmAmqWfcvWYjUBT83/vIpqHH7UxgDrsPl7Fh6yA7K1WOnD6Jk46cxjK7MiXtBwYzST3jrtUbOHrGJADuXl20no2qxqyvNiXTrvqyI6dP5BlHTmPZo5tJTtUkaR8ZzCT1hEo1cc9DGzlzwRHMmTZxV+tZZTTDZRTfqykNBbPDp0/ipCOmsX7rIE9s2dmRc5fUO/rH+gQkqRtWrN3CUzsrnDJ3Jg9v2DZUb1YMl9FeMCvVjWP26KbtzJw8wKSBEvMPOwQoatfmTJvYmQuQ1BNsMZPUE+7KLWTPmTuDU+bOZMUTT7F5+2AeYHa0NWbw+KYdHDG96BY96chpANaZSdpnBjNJPeHuNRuYOrGf4+dM5ZS5M0gJfvnQxqLFbJQ1ZoOVKms37wpms6dOZPbUCd6ZKWmfGcwk9YS7Vm/gt46ZTqkvOGXuTKC4S3OwMpoBZovvj2/eQSUljpi+q9vypCO8M1PSvjOYSTro7SxX+dUjm3lODmSzpkxg3qzJ3LV6w6hqzGpdmY9u3AbAkfkOTyiC2W8e2zw0j2bNvQ9vYuNWp2uS1B6DmaSD3q8f3cTOSnWopQzglLkzuXvNxr2qMXtk43b6oujCrDn5yGls3VnhoQ3bhpY9unE7/89nbuWSf7t7P12JpIOdwUzSQWXpwxt5YsuO3ZbVCv9PmTtjaNlz5s7goQ3beHzT9lHUmBXfd5SrHDZlIgN1k5+fXLsBoK7O7PM/XsHOSpXv3fMoyx/fslfXI6m3GMwkHTRWrN3Cyz/zX1z0hTsoV6pDy+9evYFZUyYw99DJQ8tq3ZoPbxxFMItd29XXlwGcePhUYNedmeuf2slXb3+Ql5w8h4n9fVz+n/fv3UVJ6ikGM0kHhZQS7//2PQAsfXgTV922amjd3Ws2csrcGURdsPqtY2YMFfOPtsYM4Ii6+jKAaZMGOGbm5KEWsy/+10q2DVa45NxncuFpx/LtXzzEmvVb9zhmfYCUJIOZpIPCt+98iP+6/0k+8LIFvOTkOfzzD5bxyMZtbN1Z5r7HN+9WXwYwZWI/J+RWrnZrzOpyGUdMmyWRZZ4AABFGSURBVLTH+pOPLG4A2LKjzJdufYA/WHAEJx0xjb984fFEwP/60Yrdtl/26GZO//9v5p3X3kml6nROktoMZhFxdkQsi4jlEfHeBusnRsS1ef3tETG/bt0lefmyiDir1TEj4rh8jPvyMSfs2yVKOlisXreV1155Ox/8zj1sH6wMLd+wdScfvuFXPPfYmbzmtGO59LzfopISH1q0lHse2kQ1wanzdtWXfe32B/na7Q8ydeLA0HFry+q/huuLoJbNjpzeOJjdv3YLV/3XSjZtL/Pml5wAwNEzJ/Mnz53LNT9bzdrNRf3b/Wu38Kefv53tOyv82y8e4m+uv3uPOzoB7ntsMxu2OtWT1CtaBrOIKAGfAc4BFgAXRsSCYZu9AVifUjoB+DjwkbzvAu
AC4FnA2cBnI6LU4pgfAT6eUjoRWJ+PLakHbNw6yLU/e5Cblj7K4LAuvv999yOc+y8/Zsmq9Vx12yrO+/St/CbXc33k+79mw7ZB/uHlz6avL5g36xDefsaJ3LT0Mf7lP34DsEeLGTBUc1aK9royoRhktr8vmDV1z/8znnzENAYriU/dfB+/d8JsTp236zXf9OKnM1ip8oVbH2DVk0/xmv/1UyDxb2/5Xd555kl88+dreN+37xmaCH31uq1c/OXFvPTjP+JFH7uFq29buUer2qbtg3znzof4+YPrnUBdOki0M1fmacDylNIKgIi4BjgPuLdum/OAD+XH1wOfjqKY4zzgmpTSDuCBiFiej0ejY0bEr4D/Drwmb3NVPu6/7tXVST0kpcTOSpUd5SoBTOjvY0Kpb6iuarBSZevOCtt2VoiAyRNKHDJQor/UR0qJp3ZW2LRtkE3bBylFMH3yANMm9TN5oES5mlj31E7Wbt7B2i07GOjrY/a0CcyZOpFDD5nAtsEKq57cysonn2LVk1uZ0N/H/MMO4WmHTWHerMmse2onS1atZ8mq9fziwQ1MmVjit489lOc97VCeO+9Q7n1kE9ctXs2Nv3yEHeUikM2eOoFXPG8u5516DF+5fRVfu/1BTp03k09d+FxWPPEU777uTv740z/hz0+fz9fvWM3Fv388zzxq+tDP4y9feDzf/sVD3Lr8SY6ZOXm3oS1qasGszZ7MYtuAOdMm7lZvVnPSEcWdmdsHq7z5JU/fbd1xs6dw7rOP4urbVrHozofZWa5yzcWnc8LhU3n7GSewo1zhs7fcz4RSMPOQCVz+n/fTF8HbzziRJavW8XffWcrX7ljNB1+2gJ3lKtcvWcNNSx8d+nkdP3sKr/jtubz8ucfw1I4yP13xJLeteJIlq9ZzxPRJvOD4wzj9+MNYOP9Qtu2s8KtHN/PrRzZx3+NbmDVlAs84chrPOHI6Tz98CjvKVR58ciur123loQ3bmDapn3mzDuHYWYdw1IzJDFaqPL5pB49t3s7azTuYMrGfw6dN5Ijpkzj0kAHK1cT6rTtZ/9Qg67fuZPJAiVlTJnDolAlMmVCimmDL9jKbtheft0kDJaZN6mf6pGLu0Wo1sXWwwlM7yjy1o8zEgRJTJ/QzZeKuz+uOcpVtOytsG6wwUOrjkAklJg+UdpudYftghR3lKgN9fUwc6GNi/67fh0o1sbNcZUe5Qn+pWNffF0Pra79Pg5ViZogJpb6hY9fWl6uJciVR6gsGSrFbDWPtNcrVKqUISn17rq9WUx6upVg/XEqJSl4/fN/6Y7QSQdP9Nf60E8yOAVbXPV8DPL/ZNimlckRsBA7Ly386bN9j8uNGxzwM2JBSKjfYfsx8+IZ7ueq2lUPPd3VmQJOHu9Wi1G/f7Hdj+H92E6nF+vY0+1VsdB7RZGt/n8e/cv5HppGJ/X1UU2Kw0vhTM6HURyX/A9BIf19QHuGPf6kvRqyPitj1+Z000Mcpc2eycdsgn7nl/t32mzapn/MXzuVVC+fxxJYdXHPHaq78yQN8LtdlvelFT+fdf3ASA6U+5s06hBvf8ULede1dXPGjFRwzczJ/feaJu73uQKmPf3j5s3nl5bftNkxGvSOnT2r6j+JI19uovgzg6YdPyTMLzOD04w/bY/2bX3wCN9z9CBHw9b98wdAQGxHB/3vWyewoV7nyJw8A8LLnHM3fnvsMjpoxmZQS37vnUT58w71ccEXxJ3XG5AFetXAe5516NCvWPsX1S9bwsZuW8bGblg293tEzikD2yMbtfOnWlVwxrMYNipC5cdvg0Oen/v1qdv0jvd+tPi8DpWj6WYTi8zhYrTY9hwn9fZQrVZq9xMT+PsrVxp/niGJ9pdr496EviuM3W18LYNUq7Gxw08ZAKejvK36fBit7XkP9+uHXEAEDfX2U+oJqDn3119AX0F/qoxTF+koOdaMRUfybELlLvngeObjtepzSrn+DisdAXla7pgS7tdLWjkndcRl6vb
w8H3/XcYe9Rt3r1JbXn3vtGAw9bnzsff0n6+sXv6BhC3u3RKvm74g4HzgrpfTG/Py1wGkppbfVbbM0b7MmP7+fomXsUuC2lNJX8vIrgRspulD3OGbd9ifk5fOAG1NKz25wXhcDF+enJwPLhm/TQbOBJ7r4euOJ1957evW6oXevvVevG7z2Xrz2sbjup6WU5jRa0U6L2RpgXt3zucDDTbZZExH9wAxgXYt9Gy1/ApgZEf251azRawGQUroCuKKN89/vImJxSmnhWLz2WPPae+/ae/W6oXevvVevG7z2Xrz28Xbd7VRW/Aw4Md8tOYGimH/RsG0WARflx68Ebk5FU9wi4IJ81+ZxwInAHc2Omff5P/kY5GN+Z+8vT5Ik6cDRssUs14y9FbgJKAFfSCktjYhLgcUppUXAlcDVubh/HUXQIm93HcWNAmXgLSmlCkCjY+aX/P+AayLiw8Av8rElSZIOeu10ZZJSupGiNqx+2QfqHm8Hzm+y72XAZe0cMy9fwa47N8erMelCHSe89t7Tq9cNvXvtvXrd4LX3onF13S2L/yVJktQdTskkSZI0ThjMsog4NSJ+GhF3RsTiiDgtL4+I+GSeOuruiHhe3T4X5amj7ouIi+qW/3ZE/DLv88k82C4RMSsi/j1v/+8RcWj3r3RPEfG2PD3W0oj4aN3y/TKd1khTdo0HEfGeiEgRMTs/P6jf84j4WET8Ol/bv0XEzLp1PfGet9Lseg80ETEvIv5PRPwq/36/Iy9v+Lncn5/98SCKmWZ+ERE35Oej/ryO9ndiPIiImRFxff49/1VEnN4L73lEvDN/zu+JiK9HxKQD8j1PKflVdOf+ADgnPz4XuKXu8fcoxqx7AXB7Xj4LWJG/H5ofH5rX3QGcnvf5Xt1xPwq8Nz9+L/CRcXDdLwF+CEzMzw/P3xcAdwETgeOA+ylu1Cjlx8cDE/I2C/I+1wEX5MeXA3+VH78ZuDw/vgC4dqyvu+7651HchLIKmN0j7/kfAP358Udq59Qr73kbP5+m13ugfQFHAc/Lj6cBv8nvc8PP5f787I+HL+BdwNeAG/LzUX1e9+Z3Yjx8Ucya88b8eAIw82B/zykGo38AmFz3Xr/uQHzPx/wDNF6+KP5xfnV+fCHwtfz4c8CFddsto/hjdyHwubrln8vLjgJ+Xbd8aLvavvnxUcCycXDd1wFnNlh+CXDJsJ/P6fnrpuHb5V/QJ9j1D/7QdrV98+P+vF2M9bXn87keeA6wkl3B7KB+z4dd/8uBr/bSe97Gz6Th9Y71ee2na/sO8NJmn8v9+dkf6y+KcTD/g2Kavxv25vM62t+Jsb7mfC7TKQJKDFt+UL/n7JqBaFZ+D28AzjoQ33O7Mnf5a+BjEbEa+CeKHzo0npLqmBbL1zRYDnBESukRgPz98P18DXvjJOCFuSn3PyPid/Ly0V73SNNp7TZlF1CbsmtMRcQfAw+llO4atupgf8/r/QXF/3ihB97zNjW73gNa7qp5LnA7zT+X+/OzP9Y+AfwNUJs7aW8+r6P9eYwHxwNrgS/mbtzPR8QUDvL3PKX0EMW/3Q8Cj1C8h0s4AN/ztobLOFhExA+BIxuseh9wBvDOlNI3I+JVFOOnnUnjabfSXiwfMy2uu5+imfoFwO8A10XE8TS/jkZhvtV1j9nPpMW1/y1Ft94euzVYdtC85yml7+Rt3kcxvuBXa7s12P6Ae8/3gwP53BuKiKnAN4G/TiltGqEk6ID/7ANExB8Bj6eUlkTEi2uLG2za6vM62t+J8aAfeB7wtpTS7RHxLxRdl80cLO/5ocB5FN2PG4BvAOc02HTcv+c9FcxSSmc2WxcRXwbekZ9+A/h8ftxsWqk1wIuHLb8lL5/bYHuAxyLiqJTSIxFxFPD4Xl3IKLW47r8CvpWKttk7IqJKMW/Y/pxOq9mUXR3X7Noj4tkUv8B35X+k5gI/j+Kmj4P6PYeiqBf4I+CM/N7DQfKe7wftTEN3wIiIAYpQ9tWU0rfy4mafy/
352R9Lvwv8cUScC0yi6N77BKP/vI72d2I8WAOsSSndnp9fTxHMDvb3/EzggZTSWoCI+Bbw3zgQ3/Ox7hceL1/Ar4AX58dnAEvy4z9k98LIO/LyWRT9+IfmrweAWXndz/K2tcLIc/Pyj7F78eVHx8F1vwm4ND8+iaKpNoBnsXsB5AqK4sf+/Pg4dhVAPivv/w12L7J8c378FnYvsrxurK+7wc9hJbtqzA729/xsitk45gxb3lPv+Qg/n6bXe6B95c/jl4FPDFve8HO5Pz/74+WLIlzUiv9H9Xndm9+J8fAF/Bg4OT/+UH6/D+r3HHg+sBQ4JJ/XVcDbDsT3fMw/QOPlC/g9iv7ouyhqMH47Lw/gMxR3Y/wSWFi3z18Ay/PX6+uWLwTuyft8ml0D+R5GUYx6X/4+axxc9wTgK/l8fw7897p178vXsIy6u24o7uL5TV73vrrlx1PcrbM8/zLU7vSclJ8vz+uPH+vrbvBzWMmuYHawv+fLKQL4nfnr8l58z1v8jBpe74H2lf+uJeDuuvf73Gafy/352R8vX+wezEb9eR3t78R4+AJOBRbn9/3bFMHqoH/Pgb8Hfp3P7WqKcHXAveeO/C9JkjROeFemJEnSOGEwkyRJGicMZpIkSeOEwUySJGmcMJhJkiSNEwYzSQeNiJgbEd+JiPsiYkVEfDoiJu7n13hxRPy3uudviog/z49fFxFH78/Xk9RbDGaSDgpRTOHwLeDbKaUTgROBycBH9/NLvZhiRHEAUkqXp5S+nJ++DjCYSdprjmMm6aAQEWcAH0wp/X7dsunAKuDvgGeklN6al98A/FNK6ZaI+FeKeWInA9enlD6Yt1lJMXr4y4AB4HxgO/BToEIxUfTbKGYK2UIxSPGXgIeAbRSDVL4xpfTyfLyXAn+VUvqTjv0QJB3wbDGTdLB4FsXsHUNSSpsoAtNI8wK/L6W0EDgFeFFEnFK37omU0vOAfwXek1JaSTGty8dTSqemlH5c91rXU4y2/qcppVOBG4FnRsScvMnrgS/uw/VJ6gEGM0kHi6CYfqjR8pG8KiJ+DvyCItwtqFtXm/R7CTB/NCeTiu6Iq4E/i4iZwOkU8wpKUlMj/S9Skg4kS4FX1C/IXZlHAE8CJ9WtmpTXHwe8B/idlNL6iPhSbV22I3+vsHd/L78IfJeiC/QbKaXyXhxDUg+xxUzSweI/gEPq7pAsAf+TYpLlB4BTI6IvIuYBp+V9pgNPARsj4gjgnDZeZzMwrZ11KaWHgYeB91PUn0nSiAxmkg4Kuevw5cArI+I+ilayakrpMuBWinD2S+CfgJ/nfe6i6MJcCnwhb9fKd4GXR8SdEfHCYeu+BFye103Oy74KrE4p3bsv1yepN3hXpqSDUh5r7OvAn6SUlrTavoPn8WngFymlK8fqHCQdOAxmktQhEbGEoqv0pSmlHa22lySDmSRJ0jhhjZkkSdI4YTCTJEkaJwxmkiRJ44TBTJIkaZwwmEmSJI0TBjNJkqRx4v8C83vrjX6ytCcAAAAASUVORK5CYII=\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(10, 5))\n", - "sns.distplot(df[\"Quantity\"], kde=True)\n", - "plt.title(\"Distribution of Quantity\")\n", - "plt.xlabel(\"Quantity\");" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Most of our quantities are realteively small (positive) numbers, but there are also some negative quantities as well as extreme outliers (both postiive and negative outliers). " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAmkAAAFNCAYAAABbpPhvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjMsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+AADFEAAAgAElEQVR4nO3dfZRlVX3m8e+TbkANigJtgrzYrXTUxlHjVCAZM4kjcWjU2FlriDYagxFDjJDMaLIUonESIjOiK5JJAnGRgCKCDWJeWqMikSSaFW0oDBga0lo2CC1GICCKL2jjb/64u+VS3qq63VTbu6q+n7Xu6nP32WffvXedKh7Oyz2pKiRJktSXH9rTHZAkSdL3M6RJkiR1yJAmSZLUIUOaJElShwxpkiRJHTKkSZIkdciQJulBkrwjye/OU1uHJbk3ybL2/h+SvHI+2m7tfTjJCfPV3k587puT3Jnk33dT+/cmecLuaLu1vznJs3dX+5LmhyFNWkKS3Jzkm0m+luQrSf45yauSfO9vQVW9qqr+YMy2fm62OlV1S1XtW1X3z0Pffy/Je6a1f2xVXfBQ297JfhwK/Bawpqp+dMT6lyf5pxHlc87XDm3Otrbt3pXkzXP0qZJ8vYW7LyZ5+45gPEP7R1TVP4zTF0l7jiFNWnp+vqoeCTweeAvweuC8+f6QJMvnu81OPB74j6q6fU93ZJqnV9W+wNHAS4BfnV5hEf9MpEXJkCYtUVV1T1VtBF4MnJDkqfDgIzdJDkzywXbU7a4kn0jyQ0kuBA4DPtCO3rwuycp2ROfEJLcAVw6VDYeDJya5Ksk9Sf4myf7ts56dZNtwH3ccfUqyFvgd4MXt865r6793+rT1641JvpDk9iTvTrJfW7ejHyckuaWdqnzDTHOTZL+2/R2tvTe29n8OuAJ4XOvHu3Zl7tscn53kb9tRzU1Jnji0vpIcnuQk4KXA69rnfWCutqvq34BPADt+njcneX2SzwBfT7J8+KhekmVJfifJ51tfrmlHC0ny5CRXtJ/9liQvGurj85Lc0Lb5YpLf3pW5kDQzQ5q0xFXVVcA24L+OWP1bbd0K4EcYBKWqqpcBtzA4KrdvVb11aJufBZ4CHDPDR/4y8ArgccB24I/H6ONHgP8DXNI+7+kjqr28vf4b8ARgX+BPp9X5aeBJDI42vSnJU2b4yD8B9mvt/Gzr869U1d8BxwK3tX68fK6+z+J44PeBxwBTwBnTK1TVucBFwFvb5/38XI0mWcPgZ/kv0z7r+cCjq2r7tE1e29Y/D3gUg5/NN5L8MINAejHw2FbnnCRHtO3OA36tHZV9KnDlOIOW
ND5DmiSA24D9R5R/BzgIeHxVfaeqPlFzP/D396rq61X1zRnWX1hV11fV14HfBV402/VTO+GlwNuramtV3QucBqyfdhTv96vqm1V1HXAd8H1hr/XlxcBpVfW1qroZ+EPgZfPQx2F/WVVXtdB0EfCMh9jep5PcDXwA+AvgnUPr/riqbp3hZ/JK4I1VtaUGrquq/wBeANxcVe+squ1V9Wng/cBxbbvvAGuSPKqq7m7rJc0jQ5okgIOBu0aUv43BUZ6PJtma5NQx2rp1J9Z/AdgLOHCsXs7uca294baXMzgCuMPw3ZjfYHC0bboDgb1HtHXwmP3YzmBM0+3FINjsTF92xjOr6jFV9cSqemNVfXdo3Ww/k0OBz48ofzxwVDvV/ZUkX2EQhHfcLPE/GBx9+0KSf0zyUw+x/5KmMaRJS1ySn2AQQL7vjsR2JOm3quoJwM8Dr01y9I7VMzQ515G2Q4eWD2MQXO4Evg48YqhfyxicZh233dsYBIvhtrcDX55ju+nubH2a3tYXx9z+FuCwJNlRkOQRDE4ZfmHGrWY217gfahu3Ak+cofwfq+rRQ699q+rXAarq6qpax2Bcfw1cOg/9lDTEkCYtUUkeleQFwAbgPVX1ryPqvKBdwB7gq8D97QWD8LMr3+X1S0nWtOByOnBZ+4qOzwIPS/L8JHsBbwT2Gdruy8DKDH1dyDTvBV6TZFWSfXngGrbp12DNqvXlUuCMJI9M8ngG1229Z/Ytv2cT8C3g1CQPa9d2vQWYZNdC2q7O87j+AviDJKsz8LQkBwAfBH4sycuS7NVeP5HkKUn2TvLSJPtV1Xd4YN+QNI8MadLS84EkX2NwpOQNwNuBX5mh7mrg74B7gU8C5wx9v9b/Bd7YToXtzJ19FwLvYnC672HAb8LgblPg1QxCwxcZHFkbvtvzfe3f/0gy6vqn81vbHwduYhCUfmMn+jXsN9rnb2VwhPHi1v6cquo+BhfpP5tB/7cyOBX7ojGu5xvlPAbXfn0lyV/vwvZzeTuDUPpRBmHrPODhVfU14L8D6xkcpfx34EweCM4vA25O8lXgVcAv7Ya+SUtadu1vhiRJknYnj6RJkiR1yJAmSZLUIUOaJElShwxpkiRJHTKkSZIkdWj53FX6d+CBB9bKlSv3dDckSZLmdM0119xZVSvmqrcoQtrKlSuZnJzc092QJEmaU5Kxvtja052SJEkdMqRJkiR1yJAmSZLUIUOaJElShwxpkiRJHTKkSZIkdciQJkmS1CFDmiRJUocMaZIkSR0ypEmSJHXIkCZJktShRfHszh+EizfdMla9lxx12G7uiSRJWgo8kiZJktQhQ5okSVKHDGmSJEkdMqRJkiR1yJAmSZLUIUOaJElShwxpkiRJHTKkSZIkdciQJkmS1CFDmiRJUocMaZIkSR0ypEmSJHVorJCWZG2SLUmmkpw6Yv0+SS5p6zclWTm07rRWviXJMUPl5ye5Pcn109p6W5J/S/KZJH+V5NG7PjxJkqSFac6QlmQZcDZwLLAGOD7JmmnVTgTurqrDgbOAM9u2a4D1wBHAWuCc1h7Au1rZdFcAT62qpwGfBU7byTFJkiQteOMcSTsSmKqqrVX1bWADsG5anXXABW35MuDoJGnlG6rqvqq6CZhq7VFVHwfumv5hVfXRqtre3n4KOGQnxyRJkrTgjRPSDgZuHXq/rZWNrNMC1j3AAWNuO5tXAB/eifqSJEmLwjghLSPKasw642w7+kOTNwDbgYtmWH9Skskkk3fcccc4TUqSJC0Y44S0bcChQ+8PAW6bqU6S5cB+DE5ljrPt90lyAvAC4KVVNTLUVdW5VTVRVRMrVqwYYxiSJEkLxzgh7WpgdZJVSfZmcCPAxml1NgIntOXjgCtbuNoIrG93f64CVgNXzfZhSdYCrwdeWFXfGH8okiRJi8ecIa1dY3YKcDlwI3BpVW1OcnqSF7Zq5wEHJJkCXguc2rbdDFwK3AB8BDi5qu4HSPJe4JPAk5JsS3Jia+tP
gUcCVyS5Nsk75mmskiRJC8bycSpV1YeAD00re9PQ8reAX5xh2zOAM0aUHz9D/cPH6ZMkSdJi5hMHJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDY4W0JGuTbEkyleTUEev3SXJJW78pycqhdae18i1JjhkqPz/J7Umun9bW/kmuSPK59u9jdn14kiRJC9OcIS3JMuBs4FhgDXB8kjXTqp0I3F1VhwNnAWe2bdcA64EjgLXAOa09gHe1sulOBT5WVauBj7X3kiRJS8o4R9KOBKaqamtVfRvYAKybVmcdcEFbvgw4Okla+Yaquq+qbgKmWntU1ceBu0Z83nBbFwC/sBPjkSRJWhTGCWkHA7cOvd/WykbWqartwD3AAWNuO92PVNWXWltfAh47Rh8lSZIWlXFCWkaU1Zh1xtl2lyQ5Kclkksk77rhjPpqUJEnqxjghbRtw6ND7Q4DbZqqTZDmwH4NTmeNsO92XkxzU2joIuH1Upao6t6omqmpixYoVYwxDkiRp4RgnpF0NrE6yKsneDG4E2DitzkbghLZ8HHBlVVUrX9/u/lwFrAaumuPzhts6AfibMfooSZK0qMwZ0to1ZqcAlwM3ApdW1eYkpyd5Yat2HnBAkingtbQ7MqtqM3ApcAPwEeDkqrofIMl7gU8CT0qyLcmJra23AM9N8jngue29JEnSkpLBAa+FbWJioiYnJ3frZ1y86Zax6r3kqMN2az8kSdLCluSaqpqYq55PHJAkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOjRWSEuyNsmWJFNJTh2xfp8kl7T1m5KsHFp3WivfkuSYudpMcnSSTye5Nsk/JTn8oQ1RkiRp4ZkzpCVZBpwNHAusAY5PsmZatROBu6vqcOAs4My27RpgPXAEsBY4J8myOdr8M+ClVfUM4GLgjQ9tiJIkSQvPOEfSjgSmqmprVX0b2ACsm1ZnHXBBW74MODpJWvmGqrqvqm4Cplp7s7VZwKPa8n7Abbs2NEmSpIVr+Rh1DgZuHXq/DThqpjpVtT3JPcABrfxT07Y9uC3P1OYrgQ8l+SbwVeAnR3UqyUnASQCHHXbYGMOQJElaOMY5kpYRZTVmnZ0tB3gN8LyqOgR4J/D2UZ2qqnOraqKqJlasWDGy45IkSQvVOCFtG3Do0PtD+P5TkN+rk2Q5g9OUd82y7cjyJCuAp1fVplZ+CfBfxhqJJEnSIjJOSLsaWJ1kVZK9GdwIsHFanY3ACW35OODKqqpWvr7d/bkKWA1cNUubdwP7Jfmx1tZzgRt3fXiSJEkL05zXpLVrzE4BLgeWAedX1eYkpwOTVbUROA+4MMkUgyNo69u2m5NcCtwAbAdOrqr7AUa12cp/FXh/ku8yCG2vmNcRS5IkLQAZHPBa2CYmJmpycnK3fsbFm24Zq95LjvImBkmSNLMk11TVxFz1fOKAJElShwxpkiRJHTKkSZIkdciQJkmS1CFDmiRJUocMaZIkSR0ypEmSJHXIkCZJktQhQ5okSVKHDGmSJEkdMqRJkiR1yJAmSZLUIUOaJElShwxpkiRJ
HTKkSZIkdciQJkmS1CFDmiRJUocMaZIkSR0ypEmSJHXIkCZJktQhQ5okSVKHDGmSJEkdMqRJkiR1yJAmSZLUIUOaJElShwxpkiRJHTKkSZIkdciQJkmS1CFDmiRJUocMaZIkSR0ypEmSJHVorJCWZG2SLUmmkpw6Yv0+SS5p6zclWTm07rRWviXJMXO1mYEzknw2yY1JfvOhDVGSJGnhWT5XhSTLgLOB5wLbgKuTbKyqG4aqnQjcXVWHJ1kPnAm8OMkaYD1wBPA44O+S/FjbZqY2Xw4cCjy5qr6b5LHzMVBJkqSFZJwjaUcCU1W1taq+DWwA1k2rsw64oC1fBhydJK18Q1XdV1U3AVOtvdna/HXg9Kr6LkBV3b7rw5MkSVqYxglpBwO3Dr3f1spG1qmq7cA9wAGzbDtbm09kcBRuMsmHk6webyiSJEmLxzghLSPKasw6O1sOsA/wraqaAP4cOH9kp5KTWpCbvOOOO0Z2XJIkaaEaJ6RtY3CN2A6HALfNVCfJcmA/4K5Ztp2tzW3A+9vyXwFPG9Wpqjq3qiaqamLFihVjDEOSJGnhGCekXQ2sTrIqyd4MbgTYOK3ORuCEtnwccGVVVStf3+7+XAWsBq6ao82/Bp7Tln8W+OyuDU2SJGnhmvPuzqranuQU4HJgGXB+VW1OcjowWVUbgfOAC5NMMTiCtr5tuznJpcANwHbg5Kq6H2BUm+0j3wJclOQ1wL3AK+dvuJIkSQtDBge8FraJiYmanJzcrZ9x8aZbxqr3kqMO2639kCRJC1uSa9q197PyiQOSJEkdMqRJkiR1yJAmSZLUIUOaJElShwxpkiRJHTKkSZIkdciQJkmS1CFDmiRJUocMaZIkSR0ypEmSJHXIkCZJktQhQ5okSVKHDGmSJEkdMqRJkiR1yJAmSZLUIUOaJElShwxpkiRJHTKkSZIkdciQJkmS1CFDmiRJUocMaZIkSR0ypEmSJHXIkCZJktQhQ5okSVKHDGmSJEkdMqRJkiR1yJAmSZLUIUOaJElShwxpkiRJHTKkSZIkdciQJkmS1KGxQlqStUm2JJlKcuqI9fskuaSt35Rk5dC601r5liTH7ESbf5Lk3l0bliRJ0sI2Z0hLsgw4GzgWWAMcn2TNtGonAndX1eHAWcCZbds1wHrgCGAtcE6SZXO1mWQCePRDHJskSdKCNc6RtCOBqaraWlXfBjYA66bVWQdc0JYvA45Okla+oaruq6qbgKnW3oxttgD3NuB1D21okiRJC9c4Ie1g4Nah99ta2cg6VbUduAc4YJZtZ2vzFGBjVX1pvCFIkiQtPsvHqJMRZTVmnZnKR4XDSvI44BeBZ8/ZqeQk4CSAww47bK7qkiRJC8o4R9K2AYcOvT8EuG2mOkmWA/sBd82y7UzlPw4cDkwluRl4RJKpUZ2qqnOraqKqJlasWDHGMCRJkhaOcULa1cDqJKuS7M3gRoCN0+psBE5oy8cBV1ZVtfL17e7PVcBq4KqZ2qyqv62qH62qlVW1EvhGuxlBkiRpSZnzdGdVbU9yCnA5sAw4v6o2JzkdmKyqjcB5wIXtqNddDEIXrd6lwA3AduDkqrofYFSb8z88SZKkhSmDA14L28TERE1OTu7Wz7h40y1j1XvJUV4fJ0mSZpbkmqqamKueTxyQJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjo0VkhLsjbJliRTSU4dsX6fJJe09ZuSrBxad1or35LkmLnaTHJRK78+yflJ9npoQ5QkSVp45gxpSZYBZwPHAmuA
45OsmVbtRODuqjocOAs4s227BlgPHAGsBc5JsmyONi8Cngz8J+DhwCsf0gglSZIWoHGOpB0JTFXV1qr6NrABWDetzjrggrZ8GXB0krTyDVV1X1XdBEy19mZss6o+VA1wFXDIQxuiJEnSwjNOSDsYuHXo/bZWNrJOVW0H7gEOmGXbOdtspzlfBnxkVKeSnJRkMsnkHXfcMcYwJEmSFo5xQlpGlNWYdXa2fNg5wMer6hOjOlVV51bVRFVNrFixYlQVSZKkBWv5GHW2AYcOvT8EuG2GOtuSLAf2A+6aY9sZ20zyv4EVwK+N0T9JkqRFZ5wjaVcDq5OsSrI3gxsBNk6rsxE4oS0fB1zZrinbCKxvd3+uAlYzuM5sxjaTvBI4Bji+qr770IYnSZK0MM15JK2qtic5BbgcWAacX1Wbk5wOTFbVRuA84MIkUwyOoK1v225OcilwA7AdOLmq7gcY1Wb7yHcAXwA+Obj3gL+sqtPnbcSSJEkLQAYHvBa2iYmJmpyc3K2fcfGmW8aq95KjDtut/ZAkSQtbkmuqamKuej5xQJIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIkyRJ6pAhTZIkqUOGNEmSpA4Z0iRJkjpkSJMkSeqQIU2SJKlDhjRJkqQOGdIkSZI6NFZIS7I2yZYkU0lOHbF+nySXtPWbkqwcWndaK9+S5Ji52kyyqrXxudbm3g9tiJIkSQvPnCEtyTLgbOBYYA1wfJI106qdCNxdVYcDZwFntm3XAOuBI4C1wDlJls3R5pnAWVW1Gri7tS1JkrSkjHMk7Uhgqqq2VtW3gQ3Auml11gEXtOXLgKOTpJVvqKr7quomYKq1N7LNts1zWhu0Nn9h14cnSZK0MC0fo87BwK1D77cBR81Up6q2J7kHOKCVf2ratge35VFtHgB8paq2j6i/x7z7kzfz5g/eOFbdN//tDbu3M5Ikabd5+4ueztqnHrSnuwGMF9IyoqzGrDNT+agjeLPV//5OJScBJ7W39ybZMqrePDoQuHM3f0bvnAPnYAfnwTkA5wCcA1hkc3DsH+zSZjs7B48fp9I4IW0bcOjQ+0OA22aosy3JcmA/4K45th1Vfifw6CTL29G0UZ8FQFWdC5w7Rv/nRZLJqpr4QX1ej5wD52AH58E5AOcAnANwDmD3zcE416RdDaxud13uzeBGgI3T6mwETmjLxwFXVlW18vXt7s9VwGrgqpnabNv8fWuD1ubf7PrwJEmSFqY5j6S1a8xOAS4HlgHnV9XmJKcDk1W1ETgPuDDJFIMjaOvbtpuTXArcAGwHTq6q+wFGtdk+8vXAhiRvBv6ltS1JkrSkjHO6k6r6EPChaWVvGlr+FvCLM2x7BnDGOG228q0M7v7szQ/s1GrHnAPnYAfnwTkA5wCcA3AOYDfNQQZnGCVJktQTHwslSZLUIUPaGOZ6LNZCl+TmJP+a5Nokk61s/yRXtMdzXZHkMa08Sf64zcVnkjxzqJ0TWv3PJTlhps/rQZLzk9ye5Pqhsnkbc5L/3OZ0qm076utl9qgZ5uD3knyx7QvXJnne0LpF94i3JIcm+fskNybZnOR/tvIlsy/MMgdLZl9I8rAkVyW5rs3B77fykf3OPD4KsRezzMG7ktw0tB88o5Uvut+FHTJ4MtK/JPlge7/n9oOq8jXLi8GNDZ8HngDsDVwHrNnT/ZrnMd4MHDit7K3AqW35VODMtvw84MMMvtPuJ4FNrXx/YGv79zFt+TF7emyzjPlngGcC1++OMTO4i/mn2jYfBo7d02Mecw5+D/jtEXXXtH1/H2BV+51YNtvvB3ApsL4tvwP49T095hHjOgh4Zlt+JPDZNtYlsy/M
MgdLZl9oP5t92/JewKb28x3Zb+DVwDva8nrgkl2dm15es8zBu4DjRtRfdL8LQ2N7LXAx8MHZ9t8fxH7gkbS5jfNYrMVo+FFfw4/nWge8uwY+xeB77Q4CjgGuqKq7qupu4AoGz2vtUlV9nMGdyMPmZcxt3aOq6pM1+I19Nx0+3myGOZjJonzEW1V9qao+3Za/BtzI4CknS2ZfmGUOZrLo9oX287y3vd2rvYqZ+z0vj0LczcPaKbPMwUwW3e8CQJJDgOcDf9Hez7b/7vb9wJA2t1GPxdrjj6qaZwV8NMk1GTzJAeBHqupLMPgjDjy2lc80H4thnuZrzAe35enlC8Up7fTF+Wmn+dj5OejyEW+zaacqfpzBEYQluS9MmwNYQvtCO8V1LXA7g2DxeWbu94MehQgMPwpxwf59nD4HVbVjPzij7QdnJdmnlS3W34U/Al4HfLe9n23/3e37gSFtbmM/qmoBe1ZVPRM4Fjg5yc/MUndnHwG2GOzsmBfyXPwZ8ETgGcCXgD9s5Yt6DpLsC7wf+F9V9dXZqo4oWxTzMGIOltS+UFX3V9UzGDzp5kjgKaOqtX+XxBwkeSpwGvBk4CcYnMJ8fau+6OYgyQuA26vqmuHiEVV/YPuBIW1u4zwWa0Grqtvav7cDf8XgD9SX2+Fp2r+3t+ozzcdimKf5GvO2tjy9vHtV9eX2h/q7wJ/zwHcW7uwcfO8Rb9PKu5NkLwbh5KKq+stWvKT2hVFzsBT3BYCq+grwDwyus5qp398ba8Z7FOKC+vs4NAdr2+nwqqr7gHey6/vBQvhdeBbwwiQ3MzgV+RwGR9b23H4w2wVrvgoGX/i7lcHFfzsu9DtiT/drHsf3w8Ajh5b/mcG1ZG/jwRdOv7UtP58HXyx6VSvfH7iJwYWij2nL++/p8c0x9pU8+KL5eRszg0ef/SQPXCD7vD093jHn4KCh5dcwuK4C4AgefCHsVgYXwc74+wG8jwdfbPvqPT3eEeMPg2tj/mha+ZLZF2aZgyWzLwArgEe35YcDnwBeMFO/gZN58AXjl+7q3PTymmUODhraT/4IeMti/V2YNh/P5oEbB/bYfrDHJ2IhvBjcxfJZBtcovGFP92eex/aEtqNcB2zeMT4G59U/Bnyu/bvjlyzA2W0u/hWYGGrrFQwukJwCfmVPj22Ocb+XwSmc7zD4v5sT53PMwARwfdvmT2lfHN3Ta4Y5uLCN8TMMnr07/B/qN7TxbGHorqyZfj/avnVVm5v3Afvs6TGPmIOfZnC64TPAte31vKW0L8wyB0tmXwCexuAxhJ9pP6s3zdZv4GHt/VRb/4RdnZteXrPMwZVtP7geeA8P3AG66H4Xps3Hs3kgpO2x/cAnDkiSJHXIa9IkSZI6ZEiTJEnqkCFNkiSpQ4Y0SZKkDhnSJEmSOmRIk7SkJLk/ybVJrk/yviSPmKHeh5I8+gfdP0nawa/gkLSkJLm3qvZtyxcB11TV24fWh8Hfxu/O1IYk/SB4JE3SUvYJ4PAkK5PcmOQc4NPAoUluTnIgQJJfbg+Yvi7Jha1sRZL3J7m6vZ61B8chaRFaPncVSVp82rP2jgU+0oqexODb0V/d1u+odwSDbw9/VlXdmWT/Vv//AWdV1T8lOQy4nNEP5ZakXWJIk7TUPDzJtW35E8B5wOOAL1TVp0bUfw5wWVXdCVBVd7XynwPW7AhzwKOSPLKqvrb7ui5pKTGkSVpqvllVzxguaEHr6zPUD4NnW073Q8BPVdU357d7kjTgNWmSNLuPAS9KcgDA0OnOj9cCM10AAABuSURBVAKn7KiU5BkjtpWkXWZIk6RZVNVm4AzgH5NcB+y4E/Q3gYl2Q8ENwKv2VB8lLU5+BYckSVKHPJImSZLUIUOaJElShwxpkiRJHTKkSZIkdciQJkmS1CFDmiRJUocMaZIkSR0ypEmSJHXo/wNVPne5VYE07wAAAABJRU5ErkJggg==\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(10, 5))\n", - "sns.distplot(df[\"UnitPrice\"], kde=True)\n", - "plt.title(\"Distribution of Unit Prices\")\n", - "plt.xlabel(\"Price\");" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are no negative prices, which is good, but we can see some extreme outliers." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
QuantityUnitPriceCustomerID
count406829.000000406829.000000406829.000000
mean12.0613033.46047115287.690570
std248.69337069.3151621713.600303
min-80995.0000000.00000012346.000000
25%2.0000001.25000013953.000000
50%5.0000001.95000015152.000000
75%12.0000003.75000016791.000000
max80995.00000038970.00000018287.000000
\n", - "
" - ], - "text/plain": [ - " Quantity UnitPrice CustomerID\n", - "count 406829.000000 406829.000000 406829.000000\n", - "mean 12.061303 3.460471 15287.690570\n", - "std 248.693370 69.315162 1713.600303\n", - "min -80995.000000 0.000000 12346.000000\n", - "25% 2.000000 1.250000 13953.000000\n", - "50% 5.000000 1.950000 15152.000000\n", - "75% 12.000000 3.750000 16791.000000\n", - "max 80995.000000 38970.000000 18287.000000" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(274399, 6)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", - " \"Quantity\"\n", - "].sum()\n", - "df = df.loc[df > 0].reset_index()\n", - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def loadDataset(dataframe):\n", - " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", - " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", - " ohe_output = enc.fit_transform(dataframe[onehot_cols])\n", - "\n", - " vectorizer = TfidfVectorizer(min_df=2)\n", - " unique_descriptions = dataframe[\"Description\"].unique()\n", - " vectorizer.fit(unique_descriptions)\n", - " tfidf_output = vectorizer.transform(dataframe[\"Description\"])\n", - "\n", - " row = range(len(dataframe))\n", - " col = [0] * len(dataframe)\n", - " unit_price = csr_matrix((dataframe[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", - "\n", - " X = hstack([ohe_output, tfidf_output, unit_price], format=\"csr\", dtype=\"float32\")\n", - "\n", - " y = dataframe[\"Quantity\"].values.astype(\"float32\")\n", - "\n", - " return X, y" - ] - }, - { - "cell_type": "code", - "execution_count": 
14, - "metadata": {}, - "outputs": [], - "source": [ - "X, y = loadDataset(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9991284988048746" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# display sparsity\n", - "total_cells = X.shape[0] * X.shape[1]\n", - "(total_cells - X.nnz) / total_cells" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our data is over 99.9% sparse. Because of this high sparsity, the sparse matrix data type allows us to represent our data using only a small fraction of the memory that a dense matrix would require." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Data For Modeling\n", - "\n", - "+ Split the data into training and testing sets\n", - "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((219519, 9284), (54880, 9284), (219519,), (54880,))" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "\n", - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save numpy arrays to local storage in /data folder\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_csv(\"data/online_retail_preprocessed.csv\", index=False)\n", - "save_npz(\"data/X_train.npz\", X_train)\n", - "save_npz(\"data/X_test.npz\", X_test)\n", - "np.savez(\"data/y_train.npz\", y_train)\n", - "np.savez(\"data/y_test.npz\", y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "prefix = \"personalization\"\n", - "\n", - "train_key = \"train.protobuf\"\n", - "train_prefix = f\"{prefix}/train\"\n", - "\n", - "test_key = \"test.protobuf\"\n", - "test_prefix = f\"{prefix}/test\"\n", - "\n", - "output_prefix = f\"s3://{bucket}/{prefix}/output\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def writeDatasetToProtobuf(X, y, bucket, prefix, key):\n", - " buf = io.BytesIO()\n", - " smac.write_spmatrix_to_sparse_tensor(buf, X, y)\n", - " buf.seek(0)\n", - " obj = \"{}/{}\".format(prefix, key)\n", - " boto3.resource(\"s3\").Bucket(bucket).Object(obj).upload_fileobj(buf)\n", - " return \"s3://{}/{}\".format(bucket, obj)\n", - "\n", - "\n", - "train_data_location = writeDatasetToProtobuf(X_train, y_train, bucket, train_prefix, train_key)\n", - "test_data_location = writeDatasetToProtobuf(X_test, y_test, bucket, test_prefix, test_key)\n", - 
"\n", - "print(train_data_location)\n", - "print(test_data_location)\n", - "print(\"Output: {}\".format(output_prefix))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'train_data_location' (str)\n", - "Stored 'test_data_location' (str)\n" - ] - } - ], - "source": [ - "%store train_data_location\n", - "%store test_data_location" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the next notebook we will explore training and tuning." - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb similarity index 68% rename from use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb rename to use-cases/retail_recommend/retail_recommend.ipynb index 3bb6535cf2..04b5d6df93 100644 --- a/use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 2. 
Train and Make Predictions\n", + "# Recommendation Engine for E-Commerce Sales\n", "\n", "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", "\n", "## Dataset\n", "\n", "The dataset for this demo comes from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail). It contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. The following attributes are included in our dataset:\n", - "\n", "+ InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.\n", "+ StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.\n", "+ Description: Product (item) name. Nominal.\n", @@ -28,9 +27,129 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Solution Architecture\n", + "## Part 1: Data Preparation\n", "----\n", - "![Architecture](./images/retail_rec_train_reg_deploy.png)" + "The first part of the notebook will focus on preparing the data for training.\n", + "\n", + "### Solution Architecture\n", + "![Architecture](./images/retail_rec_dataprep.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade sagemaker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import sagemaker.amazon.common as smac\n", + "import boto3\n", + "\n", + "import io\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from scipy.sparse import csr_matrix, hstack, save_npz\n", + "from sklearn.preprocessing import 
OneHotEncoder\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert sagemaker.__version__ >= \"2.21.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "region = boto3.Session().region_name\n", + "boto3.setup_default_session(region_name=region)\n", + "boto_session = boto3.Session(region_name=region)\n", + "\n", + "s3_client = boto3.client(\"s3\", region_name=region)\n", + "\n", + "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", + "sagemaker_session = sagemaker.session.Session(\n", + " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", + ")\n", + "sagemaker_role = sagemaker.get_execution_role()\n", + "\n", + "bucket = sagemaker_session.default_bucket()\n", + "print(f\"using bucket{bucket} in region {region} \\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/Online Retail.csv\")\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preprocessing\n", + "\n", + "First, we check for any null (i.e. missing) values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop any records with a missing CustomerID. If we do not know who the customer is, then it is not helpful to us when we make recommendations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", + "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", + "print(df.shape)" ] }, { @@ -38,22 +157,171 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "plt.figure(figsize=(10, 5))\n", + "sns.distplot(df[\"Quantity\"], kde=True)\n", + "plt.title(\"Distribution of Quantity\")\n", + "plt.xlabel(\"Quantity\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most of our quantities are relatively small (positive) numbers, but there are also some negative quantities as well as extreme outliers (both positive and negative outliers). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 5))\n", + "sns.distplot(df[\"UnitPrice\"], kde=True)\n", + "plt.title(\"Distribution of Unit Prices\")\n", + "plt.xlabel(\"Price\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are no negative prices, which is good, but we can see some extreme outliers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", + " \"Quantity\"\n", + "].sum()\n", + "df = df.loc[df > 0].reset_index()\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def loadDataset(dataframe):\n", + " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", + " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", + " ohe_output = enc.fit_transform(dataframe[onehot_cols])\n", + "\n", + " vectorizer = TfidfVectorizer(min_df=2)\n", + " unique_descriptions = dataframe[\"Description\"].unique()\n", + " vectorizer.fit(unique_descriptions)\n", + " tfidf_output = vectorizer.transform(dataframe[\"Description\"])\n", + "\n", + " row = range(len(dataframe))\n", + " col = [0] * len(dataframe)\n", + " unit_price = csr_matrix((dataframe[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", + "\n", + " X = hstack([ohe_output, tfidf_output, unit_price], format=\"csr\", dtype=\"float32\")\n", + "\n", + " y = dataframe[\"Quantity\"].values.astype(\"float32\")\n", + "\n", + " return X, y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = loadDataset(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# display sparsity\n", + "total_cells = X.shape[0] * X.shape[1]\n", + "(total_cells - X.nnz) / total_cells" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our data is over 99.9% sparse. 
Because of this high sparsity, the sparse matrix data type allows us to represent our data using only a small fraction of the memory that a dense matrix would require." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Data For Modeling\n", + "\n", + "+ Split the data into training and testing sets\n", + "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save numpy arrays to local storage in /data folder\n" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df.to_csv(\"data/online_retail_preprocessed.csv\", index=False)\n", + "save_npz(\"data/X_train.npz\", X_train)\n", + "save_npz(\"data/X_test.npz\", X_test)\n", + "np.savez(\"data/y_train.npz\", y_train)\n", + "np.savez(\"data/y_test.npz\", y_test)" + ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "!pip install -Uq sagemaker boto3" + "prefix = \"personalization\"\n", + "\n", + "train_key = \"train.protobuf\"\n", + "train_prefix = f\"{prefix}/train\"\n", + "\n", + "test_key = \"test.protobuf\"\n", + "test_prefix = f\"{prefix}/test\"\n", + "\n", + "output_prefix = f\"s3://{bucket}/{prefix}/output\"" ] }, { @@ -62,13 +330,38 @@ "metadata": {}, "outputs": [], "source": [ - "%store -r\n", - "%store" + "def writeDatasetToProtobuf(X, y, bucket, prefix, key):\n", + " buf = io.BytesIO()\n", + " smac.write_spmatrix_to_sparse_tensor(buf, X, y)\n", + " buf.seek(0)\n", + " obj = 
\"{}/{}\".format(prefix, key)\n", + " boto3.resource(\"s3\").Bucket(bucket).Object(obj).upload_fileobj(buf)\n", + " return \"s3://{}/{}\".format(bucket, obj)\n", + "\n", + "\n", + "train_data_location = writeDatasetToProtobuf(X_train, y_train, bucket, train_prefix, train_key)\n", + "test_data_location = writeDatasetToProtobuf(X_test, y_test, bucket, test_prefix, test_key)\n", + "\n", + "print(train_data_location)\n", + "print(test_data_location)\n", + "print(\"Output: {}\".format(output_prefix))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2: Train, Tune, and Deploy Model\n", + "----\n", + "This second part will focus on training, tuning, and deploying a model trained on the data prepared in part 1.\n", + "\n", + "### Solution Architecture\n", + "![Architecture](./images/retail_rec_train_reg_deploy.png)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -90,194 +383,283 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "assert sagemaker.__version__ >= \"2.21.0\"" + "region = boto3.Session().region_name\n", + "boto3.setup_default_session(region_name=region)\n", + "boto_session = boto3.Session(region_name=region)\n", + "\n", + "s3_client = boto3.client(\"s3\", region_name=region)\n", + "\n", + "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", + "sagemaker_session = sagemaker.session.Session(\n", + " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", + ")\n", + "sagemaker_role = sagemaker.get_execution_role()\n", + "\n", + "bucket = sagemaker_session.default_bucket()\n", + "\n", + "prefix = \"personalization\"\n", + "\n", + "output_prefix = f\"s3://{bucket}/{prefix}/output\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Data For Modeling\n", + "\n", + "+ Split the data into training and testing sets\n", + "+ 
Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load array\n", + "X_train = load_npz(\"./data/X_train.npz\")\n", + "X_test = load_npz(\"./data/X_test.npz\")\n", + "y_train_npzfile = np.load(\"./data/y_train.npz\")\n", + "y_test_npzfile = np.load(\"./data/y_test.npz\")\n", + "y_train = y_train_npzfile.f.arr_0\n", + "y_test = y_test_npzfile.f.arr_0\n", + "\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "region = boto3.Session().region_name\n", - "boto3.setup_default_session(region_name=region)\n", - "boto_session = boto3.Session(region_name=region)\n", + "input_dims = X_train.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container = sagemaker.image_uris.retrieve(\"factorization-machines\", region=boto_session.region_name)\n", + "\n", + "fm = sagemaker.estimator.Estimator(\n", + " container,\n", + " sagemaker_role,\n", + " instance_count=1,\n", + " instance_type=\"ml.c5.xlarge\",\n", + " output_path=output_prefix,\n", + " sagemaker_session=sagemaker_session,\n", + ")\n", + "\n", + "fm.set_hyperparameters(\n", + " feature_dim=input_dims,\n", + " predictor_type=\"regressor\",\n", + " mini_batch_size=1000,\n", + " num_factors=64,\n", + " epochs=20,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if 'training_job_name' not in locals():\n", + " \n", + " fm.fit({'train': train_data_location, 'test': test_data_location})\n", + " training_job_name = fm.latest_training_job.job_name\n", + " \n", + "else:\n", + " print(f'Using previous training job: 
{training_job_name}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make Predictions\n", + "\n", + "Now that we've trained our model, we can deploy it behind an Amazon SageMaker real-time hosted endpoint. This will allow us to make predictions (or inference) from the model dynamically.\n", + "\n", + "Note, Amazon SageMaker allows you the flexibility of importing models trained elsewhere, as well as the choice of not importing models if the target of model creation is AWS Lambda, AWS Greengrass, Amazon Redshift, Amazon Athena, or other deployment target.\n", + "\n", + "Here we will take the top customer, the customer who spent the most money, and try to find which items to recommend to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.deserializers import JSONDeserializer\n", + "from sagemaker.serializers import JSONSerializer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class FMSerializer(JSONSerializer):\n", + " def serialize(self, data):\n", + " js = {\"instances\": []}\n", + " for row in data:\n", + " js[\"instances\"].append({\"features\": row.tolist()})\n", + " return json.dumps(js)\n", + "\n", + "\n", + "fm_predictor = fm.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + " serializer=FMSerializer(),\n", + " deserializer=JSONDeserializer(),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# find customer who spent the most money\n", + "df = pd.read_csv(\"data/online_retail_preprocessed.csv\")\n", + "\n", + "df[\"invoice_amount\"] = df[\"Quantity\"] * df[\"UnitPrice\"]\n", + "top_customer = (\n", + " df.groupby(\"CustomerID\").sum()[\"invoice_amount\"].sort_values(ascending=False).index[0]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_recommendations(df, customer_id, n_recommendations, n_ranks=100):\n", + " popular_items = (\n", + " df.groupby([\"StockCode\", \"UnitPrice\"])\n", + " .nunique()[\"CustomerID\"]\n", + " .sort_values(ascending=False)\n", + " .reset_index()\n", + " )\n", + " top_n_items = popular_items[\"StockCode\"].iloc[:n_ranks].values\n", + " top_n_prices = popular_items[\"UnitPrice\"].iloc[:n_ranks].values\n", + "\n", + " # stock codes can have multiple descriptions, so we will choose whichever description is most common\n", + " item_map = df.groupby(\"StockCode\").agg(lambda x: x.value_counts().index[0])[\"Description\"]\n", + "\n", + " # find customer's country\n", + " df_subset = df.loc[df[\"CustomerID\"] == customer_id]\n", + " country = df_subset[\"Country\"].value_counts().index[0]\n", + "\n", + " data = {\n", + " \"StockCode\": top_n_items,\n", + " \"Description\": [item_map[i] for i in top_n_items],\n", + " \"CustomerID\": customer_id,\n", + " \"Country\": country,\n", + " \"UnitPrice\": top_n_prices,\n", + " }\n", "\n", - "s3_client = boto3.client(\"s3\", region_name=region)\n", + " df_inference = pd.DataFrame(data)\n", "\n", - "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", - ")\n", - "sagemaker_role = sagemaker.get_execution_role()\n", + " # we need to build the data set similar to how we built it for training\n", + " # it should have the same number of features as the training data\n", + " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", + " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", + " enc.fit(df[onehot_cols])\n", + " onehot_output = enc.transform(df_inference[onehot_cols])\n", "\n", - "bucket = sagemaker_session.default_bucket()\n", + " vectorizer = TfidfVectorizer(min_df=2)\n", + " unique_descriptions = df[\"Description\"].unique()\n", 
+ " vectorizer.fit(unique_descriptions)\n", + " tfidf_output = vectorizer.transform(df_inference[\"Description\"])\n", "\n", - "prefix = \"personalization\"\n", + " row = range(len(df_inference))\n", + " col = [0] * len(df_inference)\n", + " unit_price = csr_matrix((df_inference[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", "\n", - "output_prefix = f\"s3://{bucket}/{prefix}/output\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read the data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Data For Modeling\n", + " X_inference = hstack([onehot_output, tfidf_output, unit_price], format=\"csr\")\n", "\n", - "+ Split the data into training and testing sets\n", - "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." + " result = fm_predictor.predict(X_inference.toarray())\n", + " preds = [i[\"score\"] for i in result[\"predictions\"]]\n", + " index_array = np.array(preds).argsort()\n", + " items = enc.inverse_transform(onehot_output)[:, 0]\n", + " top_recs = np.take_along_axis(items, index_array, axis=0)[: -n_recommendations - 1 : -1]\n", + " recommendations = [[i, item_map[i]] for i in top_recs]\n", + " return recommendations" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# load array\n", - "X_train = load_npz(\"./data/X_train.npz\")\n", - "X_test = load_npz(\"./data/X_test.npz\")\n", - "y_train_npzfile = np.load(\"./data/y_train.npz\")\n", - "y_test_npzfile = np.load(\"./data/y_test.npz\")\n", - "y_train = y_train_npzfile.f.arr_0\n", - "y_test = y_test_npzfile.f.arr_0" + "print(\"Top 5 recommended products:\")\n", + "get_recommendations(df, top_customer, n_recommendations=5, n_ranks=100)" ] }, { - "cell_type": "code", - "execution_count": 7, + "cell_type": "markdown", "metadata": {}, - 
"outputs": [ - { - "data": { - "text/plain": [ - "((219519, 9284), (54880, 9284), (219519,), (54880,))" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + "Once you are done with the endpoint, you should delete the endpoint to save cost and free resources." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'input_dims' (int)\n" - ] - } - ], + "outputs": [], "source": [ - "input_dims = X_train.shape[1]\n", - "%store input_dims" + "fm_predictor.delete_model()\n", + "fm_predictor.delete_endpoint()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Train the factorization machine model\n", - "\n", - "Once we have the data preprocessed and available in the correct format for training, the next step is to actually train the model using the data. \n", + "## Optional Part: Registering the Model in SageMaker Model Registry\n", "\n", - "We'll use the Amazon SageMaker Python SDK to kick off training and monitor status until it is completed. In this example that takes only a few minutes. Despite the model only need 1-2 minutes to train, there is some extra time required upfront to provision hardware and load the algorithm container.\n", - "\n", - "First, let's specify our containers. To find the rigth container, we'll create a small lookup. More details on algorithm containers can be found in [AWS documentation.](https://docs-aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html)" + "Once a useful model has been trained, you have the option to register the model for future reference and possible deployment. To do so, we must first properly associate the artifacts of the model." 
] }, { - "cell_type": "code", - "execution_count": 9, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "container = sagemaker.image_uris.retrieve(\"factorization-machines\", region=boto_session.region_name)\n", - "\n", - "fm = sagemaker.estimator.Estimator(\n", - " container,\n", - " sagemaker_role,\n", - " instance_count=1,\n", - " instance_type=\"ml.c5.xlarge\",\n", - " output_path=output_prefix,\n", - " sagemaker_session=sagemaker_session,\n", - ")\n", - "\n", - "fm.set_hyperparameters(\n", - " feature_dim=input_dims,\n", - " predictor_type=\"regressor\",\n", - " mini_batch_size=1000,\n", - " num_factors=64,\n", - " epochs=20,\n", - ")" + "### Training data artifact" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if 'training_job_name' not in locals():\n", - " \n", - " fm.fit({'train': train_data_location, 'test': test_data_location})\n", - " training_job_name = fm.latest_training_job.job_name\n", - " %store training_job_name\n", - " \n", - "else:\n", - " print(f'Using previous training job: {training_job_name}')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "training_job_info = sagemaker_boto_client.describe_training_job(TrainingJobName=training_job_name)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training data artifact" - ] - }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using existing artifact: arn:aws:sagemaker:us-east-2:645431112437:artifact/cdd7fbecb4eefa22c43b2ad48140acc2\n" - ] - } - ], + "outputs": [], "source": [ "training_data_s3_uri = training_job_info[\"InputDataConfig\"][0][\"DataSource\"][\"S3DataSource\"][\n", " \"S3Uri\"\n", @@ -318,17 +700,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using existing artifact: arn:aws:sagemaker:us-east-2:645431112437:artifact/3acde2fc029adeff9c767be68feac3a7\n" - ] - } - ], + "outputs": [], "source": [ "trained_model_s3_uri = training_job_info[\"ModelArtifacts\"][\"S3ModelArtifacts\"]\n", "\n", @@ -358,7 +732,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -377,18 +751,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Association already exists with DataSet\n", - "Association with Model: SUCCEESFUL\n" - ] - } - ], + "outputs": [], "source": [ "artifact_list = [[training_data_artifact, \"ContributedTo\"], [model_artifact, \"Produced\"]]\n", "\n", @@ -430,41 +795,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## SageMaker Model Registry\n", - "\n", - "Once a useful model has been trained and its artifacts properly associated, the next step is to register the model for future reference and possible deployment.\n", - "\n", "### Create Model Package Group\n", "\n", - "A Model Package Groups holds multiple versions or iterations of a model. Though it is not required to create them for every model in the registry, they help organize various models which all have the same purpose and provide autiomatic versioning." + "After associating all the relevant artifacts, the Model Package Group can now be created. A Model Package Groups holds multiple versions or iterations of a model. Though it is not required to create them for every model in the registry, they help organize various models which all have the same purpose and provide autiomatic versioning." 
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'mpg_name' (str)\n", - "Model Package Group name: retail-recommendation-2021-03-01-21-41\n" - ] - } - ], + "outputs": [], "source": [ "if 'mpg_name' not in locals():\n", " timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", " mpg_name = f'retail-recommendation-{timestamp}'\n", - " %store mpg_name\n", "\n", "print(f'Model Package Group name: {mpg_name}')" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -493,7 +844,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -519,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -546,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -562,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -586,17 +937,9 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model package status: Completed\n" - ] - } - ], + "outputs": [], "source": [ "mp_info = sagemaker_boto_client.describe_model_package(\n", " ModelPackageName=mp_response[\"ModelPackageArn\"]\n", @@ -615,7 +958,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -630,277 +973,25 @@ "update_response = sagemaker_boto_client.update_model_package(**model_package_update)" ] }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Name/SourceDirectionTypeAssociation TypeLineage Type
0s3://...1-03-01-21-36-56-437/output/model.tar.gzInputModelProducedartifact
1s3://...12437/personalization/test/test.protobufInputDataSetContributedToartifact
2s3://...437/personalization/train/train.protobufInputDataSetContributedToartifact
340461...2.amazonaws.com/factorization-machines:1InputImageContributedToartifact
4s3://...1-03-01-21-36-56-437/output/model.tar.gzOutputModelProducedartifact
\n", - "
" - ], - "text/plain": [ - " Name/Source Direction Type \\\n", - "0 s3://...1-03-01-21-36-56-437/output/model.tar.gz Input Model \n", - "1 s3://...12437/personalization/test/test.protobuf Input DataSet \n", - "2 s3://...437/personalization/train/train.protobuf Input DataSet \n", - "3 40461...2.amazonaws.com/factorization-machines:1 Input Image \n", - "4 s3://...1-03-01-21-36-56-437/output/model.tar.gz Output Model \n", - "\n", - " Association Type Lineage Type \n", - "0 Produced artifact \n", - "1 ContributedTo artifact \n", - "2 ContributedTo artifact \n", - "3 ContributedTo artifact \n", - "4 Produced artifact " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from sagemaker.lineage.visualizer import LineageTableVisualizer\n", - "\n", - "viz = LineageTableVisualizer(sagemaker_session)\n", - "display(viz.show(training_job_name=training_job_name))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Make Predictions\n", - "\n", - "Now that we've trained our model, we can deploy it behind an Amazon SageMaker real-time hosted endpoint. This will allow out to make predictions (or inference) from the model dyanamically.\n", - "\n", - "Note, Amazon SageMaker allows you the flexibility of importing models trained elsewhere, as well as the choice of not importing models if the target of model creation is AWS Lambda, AWS Greengrass, Amazon Redshift, Amazon Athena, or other deployment target.\n", - "\n", - "Here we will take the top customer, the customer who spent the most money, and try to find which items to recommend to them." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.deserializers import JSONDeserializer\n", - "from sagemaker.serializers import JSONSerializer" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "class FMSerializer(JSONSerializer):\n", - " def serialize(self, data):\n", - " js = {\"instances\": []}\n", - " for row in data:\n", - " js[\"instances\"].append({\"features\": row.tolist()})\n", - " return json.dumps(js)\n", - "\n", - "\n", - "fm_predictor = fm.deploy(\n", - " initial_instance_count=1,\n", - " instance_type=\"ml.m4.xlarge\",\n", - " serializer=FMSerializer(),\n", - " deserializer=JSONDeserializer(),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "# find customer who spent the most money\n", - "df = pd.read_csv(\"data/online_retail_preprocessed.csv\")\n", - "\n", - "df[\"invoice_amount\"] = df[\"Quantity\"] * df[\"UnitPrice\"]\n", - "top_customer = (\n", - " df.groupby(\"CustomerID\").sum()[\"invoice_amount\"].sort_values(ascending=False).index[0]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "def get_recommendations(df, customer_id, n_recommendations, n_ranks=100):\n", - " popular_items = (\n", - " df.groupby([\"StockCode\", \"UnitPrice\"])\n", - " .nunique()[\"CustomerID\"]\n", - " .sort_values(ascending=False)\n", - " .reset_index()\n", - " )\n", - " top_n_items = popular_items[\"StockCode\"].iloc[:n_ranks].values\n", - " top_n_prices = popular_items[\"UnitPrice\"].iloc[:n_ranks].values\n", - "\n", - " # stock codes can have multiple descriptions, so we will choose whichever description is most common\n", - " item_map = df.groupby(\"StockCode\").agg(lambda x: x.value_counts().index[0])[\"Description\"]\n", - "\n", - " # find customer's country\n", - " df_subset = 
df.loc[df[\"CustomerID\"] == customer_id]\n", - " country = df_subset[\"Country\"].value_counts().index[0]\n", - "\n", - " data = {\n", - " \"StockCode\": top_n_items,\n", - " \"Description\": [item_map[i] for i in top_n_items],\n", - " \"CustomerID\": customer_id,\n", - " \"Country\": country,\n", - " \"UnitPrice\": top_n_prices,\n", - " }\n", - "\n", - " df_inference = pd.DataFrame(data)\n", - "\n", - " # we need to build the data set similar to how we built it for training\n", - " # it should have the same number of features as the training data\n", - " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", - " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", - " enc.fit(df[onehot_cols])\n", - " onehot_output = enc.transform(df_inference[onehot_cols])\n", - "\n", - " vectorizer = TfidfVectorizer(min_df=2)\n", - " unique_descriptions = df[\"Description\"].unique()\n", - " vectorizer.fit(unique_descriptions)\n", - " tfidf_output = vectorizer.transform(df_inference[\"Description\"])\n", - "\n", - " row = range(len(df_inference))\n", - " col = [0] * len(df_inference)\n", - " unit_price = csr_matrix((df_inference[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", - "\n", - " X_inference = hstack([onehot_output, tfidf_output, unit_price], format=\"csr\")\n", + "from sagemaker.lineage.visualizer import LineageTableVisualizer\n", "\n", - " result = fm_predictor.predict(X_inference.toarray())\n", - " preds = [i[\"score\"] for i in result[\"predictions\"]]\n", - " index_array = np.array(preds).argsort()\n", - " items = enc.inverse_transform(onehot_output)[:, 0]\n", - " top_recs = np.take_along_axis(items, index_array, axis=0)[: -n_recommendations - 1 : -1]\n", - " recommendations = [[i, item_map[i]] for i in top_recs]\n", - " return recommendations" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Top 5 recommended products:\n" - ] - }, - { - 
"data": { - "text/plain": [ - "[['22423', 'REGENCY CAKESTAND 3 TIER'],\n", - " ['22776', 'SWEETHEART CAKESTAND 3 TIER'],\n", - " ['22624', 'IVORY KITCHEN SCALES'],\n", - " ['85123A', 'WHITE HANGING HEART T-LIGHT HOLDER'],\n", - " ['85099B', 'JUMBO BAG RED RETROSPOT']]" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(\"Top 5 recommended products:\")\n", - "get_recommendations(df, top_customer, n_recommendations=5, n_ranks=100)" + "viz = LineageTableVisualizer(sagemaker_session)\n", + "display(viz.show(training_job_name=training_job_name))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3", + "display_name": "conda_python3", "language": "python", - "name": "python3" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -912,7 +1003,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb similarity index 96% rename from use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb rename to use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 5d9e3b09ae..81f1a3b945 100644 --- a/use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 3. 
Build Pipeline\n", + "# Recommendation Engine for E-Commerce Sales - Pipeline Mode\n", "\n", "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", "\n", @@ -32,28 +32,18 @@ "![Architecture](./images/retail_rec_pipeline.png)" ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -Uq sagemaker boto3" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "%store -r\n", - "%store" + "! pip install --upgrade sagemaker" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -63,13 +53,16 @@ "from sagemaker.workflow.step_collections import RegisterModel\n", "from sagemaker.sklearn.processing import SKLearnProcessor\n", "from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString\n", + "import datetime\n", "import boto3\n", - "import time" + "import time\n", + "import pandas as pd\n", + "from preprocessing import loadDataset" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -78,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -108,9 +101,41 @@ "## Define Estimator" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, the number of feature dimensions must be calculated as it is a hyperparameter of the estimator. The feature dimensions are calculated by looking at the dataset, cleaning and preprocessing it as defined in the first part of [Recommendation Engine for E-Commerce Sales](retail_recommend.ipynb), and then counting the number of feature dimensions in the processed dataset."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/Online Retail.csv\")\n", + "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", + "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", + "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", + " \"Quantity\"\n", + "].sum()\n", + "df = df.loc[df > 0].reset_index()\n", + "X, y = loadDataset(df)\n", + "input_dims = X.shape[1]\n", + "input_dims" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After calculating all the hyperparameters that are needed, the estimator is created." + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -145,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -159,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -170,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -209,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -233,13 +258,22 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_step.properties.AlgorithmSpecification.TrainingImage._path" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = sagemaker.model.Model(\n", " name=\"retail-personalization-factorization-machine\",\n", - " image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,\n", + " image_uri=container,#train_step.properties.AlgorithmSpecification.TrainingImage,\n", " 
model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", " role=sagemaker_role,\n", @@ -252,10 +286,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", + "mpg_name = f'retail-recommendation-{timestamp}'\n", + "\n", "register_step = RegisterModel(\n", " name=\"RegisterModel\",\n", " estimator=fm,\n", @@ -271,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -306,20 +343,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'pipeline_name' (str)\n" - ] - } - ], + "outputs": [], "source": [ "pipeline_name = f\"PersonalizationDemo\"\n", - "%store pipeline_name\n", "\n", "pipeline = Pipeline(\n", " name=pipeline_name,\n", @@ -376,21 +404,14 @@ " display(viz.show(pipeline_execution_step=execution_step))\n", " time.sleep(5)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3", + "display_name": "conda_python3", "language": "python", - "name": "python3" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -402,7 +423,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.6.13" } }, "nbformat": 4, From a3c42d585357ef1b0d8b6ef7d90ab8665c6b854f Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 28 Apr 2022 20:20:39 +0000 Subject: [PATCH 2/8] cleanup --- .../retail_recommend/retail_recommend_pipeline.ipynb | 9 --------- 1 file changed, 9 deletions(-) diff --git 
a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 81f1a3b945..261616f913 100644 --- a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -256,15 +256,6 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_step.properties.AlgorithmSpecification.TrainingImage._path" - ] - }, { "cell_type": "code", "execution_count": null, From 790beddd64d61f5363b25f01ad2a7e16d2b07e0e Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 28 Apr 2022 20:36:02 +0000 Subject: [PATCH 3/8] reformat --- .../retail_recommend/retail_recommend.ipynb | 18 +++++++++--------- .../retail_recommend_pipeline.ipynb | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/use-cases/retail_recommend/retail_recommend.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb index 04b5d6df93..abe3fa0ea5 100644 --- a/use-cases/retail_recommend/retail_recommend.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -474,13 +474,13 @@ "metadata": {}, "outputs": [], "source": [ - "if 'training_job_name' not in locals():\n", - " \n", - " fm.fit({'train': train_data_location, 'test': test_data_location})\n", + "if \"training_job_name\" not in locals():\n", + "\n", + " fm.fit({\"train\": train_data_location, \"test\": test_data_location})\n", " training_job_name = fm.latest_training_job.job_name\n", - " \n", + "\n", "else:\n", - " print(f'Using previous training job: {training_job_name}')" + " print(f\"Using previous training job: {training_job_name}\")" ] }, { @@ -806,11 +806,11 @@ "metadata": {}, "outputs": [], "source": [ - "if 'mpg_name' not in locals():\n", - " timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", - " mpg_name = f'retail-recommendation-{timestamp}'\n", + "if \"mpg_name\" not in locals():\n", + " timestamp = 
datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M\")\n", + " mpg_name = f\"retail-recommendation-{timestamp}\"\n", "\n", - "print(f'Model Package Group name: {mpg_name}')" + "print(f\"Model Package Group name: {mpg_name}\")" ] }, { diff --git a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 261616f913..a8a1b23605 100644 --- a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -264,7 +264,7 @@ "source": [ "model = sagemaker.model.Model(\n", " name=\"retail-personalization-factorization-machine\",\n", - " image_uri=container,#train_step.properties.AlgorithmSpecification.TrainingImage,\n", + " image_uri=container, # train_step.properties.AlgorithmSpecification.TrainingImage,\n", " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", " role=sagemaker_role,\n", @@ -281,8 +281,8 @@ "metadata": {}, "outputs": [], "source": [ - "timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", - "mpg_name = f'retail-recommendation-{timestamp}'\n", + "timestamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M\")\n", + "mpg_name = f\"retail-recommendation-{timestamp}\"\n", "\n", "register_step = RegisterModel(\n", " name=\"RegisterModel\",\n", From ced1ef28215bc9017c440e487dfa6037e832fec6 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 28 Apr 2022 23:46:51 +0000 Subject: [PATCH 4/8] make pandas version compatible --- .../retail_recommend/retail_recommend.ipynb | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/use-cases/retail_recommend/retail_recommend.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb index abe3fa0ea5..6bb3adbe38 100644 --- a/use-cases/retail_recommend/retail_recommend.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -565,14 +565,17 @@ " # find customer's country\n", " df_subset = 
df.loc[df[\"CustomerID\"] == customer_id]\n", " country = df_subset[\"Country\"].value_counts().index[0]\n", - "\n", - " data = {\n", - " \"StockCode\": top_n_items,\n", - " \"Description\": [item_map[i] for i in top_n_items],\n", - " \"CustomerID\": customer_id,\n", - " \"Country\": country,\n", - " \"UnitPrice\": top_n_prices,\n", - " }\n", + " \n", + " data = []\n", + " flattened_item_map = [item_map[i] for i in top_n_items]\n", + " for idx in range(len(top_n_items)):\n", + " data.append({\n", + " \"StockCode\": top_n_items[idx],\n", + " \"Description\": flattened_item_map[idx],\n", + " \"CustomerID\": customer_id,\n", + " \"Country\": country,\n", + " \"UnitPrice\": top_n_prices[idx],\n", + " })\n", "\n", " df_inference = pd.DataFrame(data)\n", "\n", @@ -987,7 +990,6 @@ } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "conda_python3", "language": "python", From 873eefae97ef9e1b81afeaed23666ea925cce0a6 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 28 Apr 2022 23:47:58 +0000 Subject: [PATCH 5/8] reformat --- .../retail_recommend/retail_recommend.ipynb | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/use-cases/retail_recommend/retail_recommend.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb index 6bb3adbe38..ef3775439e 100644 --- a/use-cases/retail_recommend/retail_recommend.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -565,17 +565,19 @@ " # find customer's country\n", " df_subset = df.loc[df[\"CustomerID\"] == customer_id]\n", " country = df_subset[\"Country\"].value_counts().index[0]\n", - " \n", + "\n", " data = []\n", " flattened_item_map = [item_map[i] for i in top_n_items]\n", " for idx in range(len(top_n_items)):\n", - " data.append({\n", - " \"StockCode\": top_n_items[idx],\n", - " \"Description\": flattened_item_map[idx],\n", - " \"CustomerID\": customer_id,\n", - " \"Country\": country,\n", - " \"UnitPrice\": top_n_prices[idx],\n", - " })\n", + " 
data.append(\n", + " {\n", + " \"StockCode\": top_n_items[idx],\n", + " \"Description\": flattened_item_map[idx],\n", + " \"CustomerID\": customer_id,\n", + " \"Country\": country,\n", + " \"UnitPrice\": top_n_prices[idx],\n", + " }\n", + " )\n", "\n", " df_inference = pd.DataFrame(data)\n", "\n", From aff2a74e1d99860331652c4fa479d14bf8ccac15 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 28 Apr 2022 23:49:50 +0000 Subject: [PATCH 6/8] cleanup --- use-cases/retail_recommend/retail_recommend_pipeline.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb index a8a1b23605..72bfcfa4e9 100644 --- a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -264,7 +264,7 @@ "source": [ "model = sagemaker.model.Model(\n", " name=\"retail-personalization-factorization-machine\",\n", - " image_uri=container, # train_step.properties.AlgorithmSpecification.TrainingImage,\n", + " image_uri=container,\n", " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", " role=sagemaker_role,\n", From 3238674b098813487a33f65375bbfbcbd53327c4 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 28 Apr 2022 23:59:47 +0000 Subject: [PATCH 7/8] dleete instance type --- use-cases/retail_recommend/retail_recommend_pipeline.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 72bfcfa4e9..3e5bc3b221 100644 --- a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -398,7 +398,6 @@ } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "conda_python3", "language": "python", From 
c79501da119cd86f0e9769461d2bbbc2bbc08b6d Mon Sep 17 00:00:00 2001 From: atqy Date: Fri, 29 Apr 2022 00:56:44 +0000 Subject: [PATCH 8/8] edit links --- use-cases/index.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/use-cases/index.rst b/use-cases/index.rst index 9ab084bbf6..5f406a8668 100644 --- a/use-cases/index.rst +++ b/use-cases/index.rst @@ -27,9 +27,8 @@ E-Commerce Personalization .. toctree:: :maxdepth: 1 - retail_recommend/1_retail_recommend_dataprep - retail_recommend/2_retail_recommend_train_tune - retail_recommend/3_retail_recommend_pipeline + retail_recommend/retail_recommend + retail_recommend/retail_recommend_pipeline Computer Vision for Medical Imaging