From fe77875262a6599101a01d0fdc208aa25de855b5 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 28 Apr 2022 19:58:12 +0000 Subject: [PATCH 01/27] refactor sequential notebooks into independent notebooks --- .../1_retail_recommend_dataprep.ipynb | 715 ------------- ...rain_tune.ipynb => retail_recommend.ipynb} | 995 ++++++++++-------- ....ipynb => retail_recommend_pipeline.ipynb} | 113 +- 3 files changed, 610 insertions(+), 1213 deletions(-) delete mode 100644 use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb rename use-cases/retail_recommend/{2_retail_recommend_train_tune.ipynb => retail_recommend.ipynb} (68%) rename use-cases/retail_recommend/{3_retail_recommend_pipeline.ipynb => retail_recommend_pipeline.ipynb} (96%) diff --git a/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb b/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb deleted file mode 100644 index 6b4c30b5e4..0000000000 --- a/use-cases/retail_recommend/1_retail_recommend_dataprep.ipynb +++ /dev/null @@ -1,715 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 1. Data Preparation\n", - "\n", - "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", - "\n", - "## Dataset\n", - "\n", - "The dataset for this demo comes from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail). It contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. The following attributes are included in our dataset:\n", - "+ InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.\n", - "+ StockCode: Product (item) code. 
Nominal, a 5-digit integral number uniquely assigned to each distinct product.\n", - "+ Description: Product (item) name. Nominal.\n", - "+ Quantity: The quantities of each product (item) per transaction. Numeric.\n", - "+ InvoiceDate: Invice Date and time. Numeric, the day and time when each transaction was generated.\n", - "+ UnitPrice: Unit price. Numeric, Product price per unit in sterling.\n", - "+ CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer.\n", - "+ Country: Country name. Nominal, the name of the country where each customer resides. \n", - "\n", - "Citation: Daqing Chen, Sai Liang Sain, and Kun Guo, Data mining for the online retail industry: A case study of RFM model-based customer segmentation using data mining, Journal of Database Marketing and Customer Strategy Management, Vol. 19, No. 3, pp. 197–208, 2012 (Published online before print: 27 August 2012. doi: 10.1057/dbm.2012.17)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Solution Architecture\n", - "----\n", - "![Architecture](./images/retail_rec_dataprep.png)" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -Uq sagemaker boto3" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored variables and their in-db values:\n" - ] - } - ], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import sagemaker\n", - "import sagemaker.amazon.common as smac\n", - "import boto3\n", - "\n", - "import io\n", - "import json\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "\n", - "from scipy.sparse import csr_matrix, hstack, save_npz\n", - "from sklearn.preprocessing 
import OneHotEncoder\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.model_selection import train_test_split" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "assert sagemaker.__version__ >= \"2.21.0\"" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "region = boto3.Session().region_name\n", - "boto3.setup_default_session(region_name=region)\n", - "boto_session = boto3.Session(region_name=region)\n", - "\n", - "s3_client = boto3.client(\"s3\", region_name=region)\n", - "\n", - "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", - ")\n", - "sagemaker_role = sagemaker.get_execution_role()\n", - "\n", - "bucket = sagemaker_session.default_bucket()\n", - "print(f\"using bucket{bucket} in region {region} \\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read the data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(541909, 8)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
053636585123AWHITE HANGING HEART T-LIGHT HOLDER62010-12-01 08:26:002.5517850.0United Kingdom
153636571053WHITE METAL LANTERN62010-12-01 08:26:003.3917850.0United Kingdom
253636584406BCREAM CUPID HEARTS COAT HANGER82010-12-01 08:26:002.7517850.0United Kingdom
353636584029GKNITTED UNION FLAG HOT WATER BOTTLE62010-12-01 08:26:003.3917850.0United Kingdom
453636584029ERED WOOLLY HOTTIE WHITE HEART.62010-12-01 08:26:003.3917850.0United Kingdom
\n", - "
" - ], - "text/plain": [ - " InvoiceNo StockCode Description Quantity \\\n", - "0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \n", - "1 536365 71053 WHITE METAL LANTERN 6 \n", - "2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n", - "3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n", - "4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n", - "\n", - " InvoiceDate UnitPrice CustomerID Country \n", - "0 2010-12-01 08:26:00 2.55 17850.0 United Kingdom \n", - "1 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "2 2010-12-01 08:26:00 2.75 17850.0 United Kingdom \n", - "3 2010-12-01 08:26:00 3.39 17850.0 United Kingdom \n", - "4 2010-12-01 08:26:00 3.39 17850.0 United Kingdom " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = pd.read_csv(\"data/Online Retail.csv\")\n", - "print(df.shape)\n", - "df.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preprocessing\n", - "\n", - "First, we check for any null (i.e. missing) values." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "InvoiceNo 0\n", - "StockCode 0\n", - "Description 1454\n", - "Quantity 0\n", - "InvoiceDate 0\n", - "UnitPrice 0\n", - "CustomerID 135080\n", - "Country 0\n", - "dtype: int64" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Drop any records with a missing CustomerID. If we do not know who the customer is, then it is not helpful to us when we make recommendations." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(406829, 8)\n" - ] - } - ], - "source": [ - "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", - "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", - "print(df.shape)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(10, 5))\n", - "sns.distplot(df[\"Quantity\"], kde=True)\n", - "plt.title(\"Distribution of Quantity\")\n", - "plt.xlabel(\"Quantity\");" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Most of our quantities are realteively small (positive) numbers, but there are also some negative quantities as well as extreme outliers (both postiive and negative outliers). " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": { - "needs_background": "light" - }, - "output_type": "display_data" - } - ], - "source": [ - "plt.figure(figsize=(10, 5))\n", - "sns.distplot(df[\"UnitPrice\"], kde=True)\n", - "plt.title(\"Distribution of Unit Prices\")\n", - "plt.xlabel(\"Price\");" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There are no negative prices, which is good, but we can see some extreme outliers." - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
QuantityUnitPriceCustomerID
count406829.000000406829.000000406829.000000
mean12.0613033.46047115287.690570
std248.69337069.3151621713.600303
min-80995.0000000.00000012346.000000
25%2.0000001.25000013953.000000
50%5.0000001.95000015152.000000
75%12.0000003.75000016791.000000
max80995.00000038970.00000018287.000000
\n", - "
" - ], - "text/plain": [ - " Quantity UnitPrice CustomerID\n", - "count 406829.000000 406829.000000 406829.000000\n", - "mean 12.061303 3.460471 15287.690570\n", - "std 248.693370 69.315162 1713.600303\n", - "min -80995.000000 0.000000 12346.000000\n", - "25% 2.000000 1.250000 13953.000000\n", - "50% 5.000000 1.950000 15152.000000\n", - "75% 12.000000 3.750000 16791.000000\n", - "max 80995.000000 38970.000000 18287.000000" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(274399, 6)" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", - " \"Quantity\"\n", - "].sum()\n", - "df = df.loc[df > 0].reset_index()\n", - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def loadDataset(dataframe):\n", - " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", - " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", - " ohe_output = enc.fit_transform(dataframe[onehot_cols])\n", - "\n", - " vectorizer = TfidfVectorizer(min_df=2)\n", - " unique_descriptions = dataframe[\"Description\"].unique()\n", - " vectorizer.fit(unique_descriptions)\n", - " tfidf_output = vectorizer.transform(dataframe[\"Description\"])\n", - "\n", - " row = range(len(dataframe))\n", - " col = [0] * len(dataframe)\n", - " unit_price = csr_matrix((dataframe[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", - "\n", - " X = hstack([ohe_output, tfidf_output, unit_price], format=\"csr\", dtype=\"float32\")\n", - "\n", - " y = dataframe[\"Quantity\"].values.astype(\"float32\")\n", - "\n", - " return X, y" - ] - }, - { - "cell_type": "code", - "execution_count": 
14, - "metadata": {}, - "outputs": [], - "source": [ - "X, y = loadDataset(df)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.9991284988048746" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# display sparsity\n", - "total_cells = X.shape[0] * X.shape[1]\n", - "(total_cells - X.nnz) / total_cells" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Our data is over 99.9% sparse. Because of this high sparsity, the sparse matrix data type allows us to represent our data using only a small fraction of the memory that a dense matrix would require." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Data For Modeling\n", - "\n", - "+ Split the data into training and testing sets\n", - "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." 
- ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "((219519, 9284), (54880, 9284), (219519,), (54880,))" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", - "\n", - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save numpy arrays to local storage in /data folder\n" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "df.to_csv(\"data/online_retail_preprocessed.csv\", index=False)\n", - "save_npz(\"data/X_train.npz\", X_train)\n", - "save_npz(\"data/X_test.npz\", X_test)\n", - "np.savez(\"data/y_train.npz\", y_train)\n", - "np.savez(\"data/y_test.npz\", y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "prefix = \"personalization\"\n", - "\n", - "train_key = \"train.protobuf\"\n", - "train_prefix = f\"{prefix}/train\"\n", - "\n", - "test_key = \"test.protobuf\"\n", - "test_prefix = f\"{prefix}/test\"\n", - "\n", - "output_prefix = f\"s3://{bucket}/{prefix}/output\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def writeDatasetToProtobuf(X, y, bucket, prefix, key):\n", - " buf = io.BytesIO()\n", - " smac.write_spmatrix_to_sparse_tensor(buf, X, y)\n", - " buf.seek(0)\n", - " obj = \"{}/{}\".format(prefix, key)\n", - " boto3.resource(\"s3\").Bucket(bucket).Object(obj).upload_fileobj(buf)\n", - " return \"s3://{}/{}\".format(bucket, obj)\n", - "\n", - "\n", - "train_data_location = writeDatasetToProtobuf(X_train, y_train, bucket, train_prefix, train_key)\n", - "test_data_location = writeDatasetToProtobuf(X_test, y_test, bucket, test_prefix, test_key)\n", - 
"\n", - "print(train_data_location)\n", - "print(test_data_location)\n", - "print(\"Output: {}\".format(output_prefix))" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'train_data_location' (str)\n", - "Stored 'test_data_location' (str)\n" - ] - } - ], - "source": [ - "%store train_data_location\n", - "%store test_data_location" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the next notebook we will explore training and tuning." - ] - } - ], - "metadata": { - "instance_type": "ml.t3.medium", - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb similarity index 68% rename from use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb rename to use-cases/retail_recommend/retail_recommend.ipynb index 3bb6535cf2..04b5d6df93 100644 --- a/use-cases/retail_recommend/2_retail_recommend_train_tune.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -4,14 +4,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 2. 
Train and Make Predictions\n", + "# Recommendation Engine for E-Commerce Sales\n", "\n", "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", "\n", "## Dataset\n", "\n", "The dataset for this demo comes from the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Online+Retail). It contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail. The company mainly sells unique all-occasion gifts. The following attributes are included in our dataset:\n", - "\n", "+ InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation.\n", "+ StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product.\n", "+ Description: Product (item) name. Nominal.\n", @@ -28,9 +27,129 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Solution Architecture\n", + "## Part 1: Data Preparation\n", "----\n", - "![Architecture](./images/retail_rec_train_reg_deploy.png)" + "The first part of the notebook will focus on preparing the data for training.\n", + "\n", + "### Solution Architecture\n", + "![Architecture](./images/retail_rec_dataprep.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade sagemaker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sagemaker\n", + "import sagemaker.amazon.common as smac\n", + "import boto3\n", + "\n", + "import io\n", + "import json\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "\n", + "from scipy.sparse import csr_matrix, hstack, save_npz\n", + "from sklearn.preprocessing import 
OneHotEncoder\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert sagemaker.__version__ >= \"2.21.0\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "region = boto3.Session().region_name\n", + "boto3.setup_default_session(region_name=region)\n", + "boto_session = boto3.Session(region_name=region)\n", + "\n", + "s3_client = boto3.client(\"s3\", region_name=region)\n", + "\n", + "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", + "sagemaker_session = sagemaker.session.Session(\n", + " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", + ")\n", + "sagemaker_role = sagemaker.get_execution_role()\n", + "\n", + "bucket = sagemaker_session.default_bucket()\n", + "print(f\"using bucket{bucket} in region {region} \\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/Online Retail.csv\")\n", + "print(df.shape)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Preprocessing\n", + "\n", + "First, we check for any null (i.e. missing) values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Drop any records with a missing CustomerID. If we do not know who the customer is, then it is not helpful to us when we make recommendations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", + "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", + "print(df.shape)" ] }, { @@ -38,22 +157,171 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "plt.figure(figsize=(10, 5))\n", + "sns.distplot(df[\"Quantity\"], kde=True)\n", + "plt.title(\"Distribution of Quantity\")\n", + "plt.xlabel(\"Quantity\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Most of our quantities are relatively small (positive) numbers, but there are also some negative quantities as well as extreme outliers (both positive and negative outliers). " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 5))\n", + "sns.distplot(df[\"UnitPrice\"], kde=True)\n", + "plt.title(\"Distribution of Unit Prices\")\n", + "plt.xlabel(\"Price\");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are no negative prices, which is good, but we can see some extreme outliers." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", + " \"Quantity\"\n", + "].sum()\n", + "df = df.loc[df > 0].reset_index()\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def loadDataset(dataframe):\n", + " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", + " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", + " ohe_output = enc.fit_transform(dataframe[onehot_cols])\n", + "\n", + " vectorizer = TfidfVectorizer(min_df=2)\n", + " unique_descriptions = dataframe[\"Description\"].unique()\n", + " vectorizer.fit(unique_descriptions)\n", + " tfidf_output = vectorizer.transform(dataframe[\"Description\"])\n", + "\n", + " row = range(len(dataframe))\n", + " col = [0] * len(dataframe)\n", + " unit_price = csr_matrix((dataframe[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", + "\n", + " X = hstack([ohe_output, tfidf_output, unit_price], format=\"csr\", dtype=\"float32\")\n", + "\n", + " y = dataframe[\"Quantity\"].values.astype(\"float32\")\n", + "\n", + " return X, y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X, y = loadDataset(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# display sparsity\n", + "total_cells = X.shape[0] * X.shape[1]\n", + "(total_cells - X.nnz) / total_cells" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Our data is over 99.9% sparse. 
Because of this high sparsity, the sparse matrix data type allows us to represent our data using only a small fraction of the memory that a dense matrix would require." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Data For Modeling\n", + "\n", + "+ Split the data into training and testing sets\n", + "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Save numpy arrays to local storage in /data folder\n" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df.to_csv(\"data/online_retail_preprocessed.csv\", index=False)\n", + "save_npz(\"data/X_train.npz\", X_train)\n", + "save_npz(\"data/X_test.npz\", X_test)\n", + "np.savez(\"data/y_train.npz\", y_train)\n", + "np.savez(\"data/y_test.npz\", y_test)" + ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "!pip install -Uq sagemaker boto3" + "prefix = \"personalization\"\n", + "\n", + "train_key = \"train.protobuf\"\n", + "train_prefix = f\"{prefix}/train\"\n", + "\n", + "test_key = \"test.protobuf\"\n", + "test_prefix = f\"{prefix}/test\"\n", + "\n", + "output_prefix = f\"s3://{bucket}/{prefix}/output\"" ] }, { @@ -62,13 +330,38 @@ "metadata": {}, "outputs": [], "source": [ - "%store -r\n", - "%store" + "def writeDatasetToProtobuf(X, y, bucket, prefix, key):\n", + " buf = io.BytesIO()\n", + " smac.write_spmatrix_to_sparse_tensor(buf, X, y)\n", + " buf.seek(0)\n", + " obj = 
\"{}/{}\".format(prefix, key)\n", + " boto3.resource(\"s3\").Bucket(bucket).Object(obj).upload_fileobj(buf)\n", + " return \"s3://{}/{}\".format(bucket, obj)\n", + "\n", + "\n", + "train_data_location = writeDatasetToProtobuf(X_train, y_train, bucket, train_prefix, train_key)\n", + "test_data_location = writeDatasetToProtobuf(X_test, y_test, bucket, test_prefix, test_key)\n", + "\n", + "print(train_data_location)\n", + "print(test_data_location)\n", + "print(\"Output: {}\".format(output_prefix))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2: Train, Tune, and Deploy Model\n", + "----\n", + "This second part will focus on training, tuning, and deploying a model trained on the data prepared in part 1.\n", + "\n", + "### Solution Architecture\n", + "![Architecture](./images/retail_rec_train_reg_deploy.png)" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -90,194 +383,283 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "assert sagemaker.__version__ >= \"2.21.0\"" + "region = boto3.Session().region_name\n", + "boto3.setup_default_session(region_name=region)\n", + "boto_session = boto3.Session(region_name=region)\n", + "\n", + "s3_client = boto3.client(\"s3\", region_name=region)\n", + "\n", + "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", + "sagemaker_session = sagemaker.session.Session(\n", + " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", + ")\n", + "sagemaker_role = sagemaker.get_execution_role()\n", + "\n", + "bucket = sagemaker_session.default_bucket()\n", + "\n", + "prefix = \"personalization\"\n", + "\n", + "output_prefix = f\"s3://{bucket}/{prefix}/output\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Data For Modeling\n", + "\n", + "+ Split the data into training and testing sets\n", + "+ 
Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# load array\n", + "X_train = load_npz(\"./data/X_train.npz\")\n", + "X_test = load_npz(\"./data/X_test.npz\")\n", + "y_train_npzfile = np.load(\"./data/y_train.npz\")\n", + "y_test_npzfile = np.load(\"./data/y_test.npz\")\n", + "y_train = y_train_npzfile.f.arr_0\n", + "y_test = y_test_npzfile.f.arr_0\n", + "\n", + "X_train.shape, X_test.shape, y_train.shape, y_test.shape" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "region = boto3.Session().region_name\n", - "boto3.setup_default_session(region_name=region)\n", - "boto_session = boto3.Session(region_name=region)\n", + "input_dims = X_train.shape[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "container = sagemaker.image_uris.retrieve(\"factorization-machines\", region=boto_session.region_name)\n", + "\n", + "fm = sagemaker.estimator.Estimator(\n", + " container,\n", + " sagemaker_role,\n", + " instance_count=1,\n", + " instance_type=\"ml.c5.xlarge\",\n", + " output_path=output_prefix,\n", + " sagemaker_session=sagemaker_session,\n", + ")\n", + "\n", + "fm.set_hyperparameters(\n", + " feature_dim=input_dims,\n", + " predictor_type=\"regressor\",\n", + " mini_batch_size=1000,\n", + " num_factors=64,\n", + " epochs=20,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if 'training_job_name' not in locals():\n", + " \n", + " fm.fit({'train': train_data_location, 'test': test_data_location})\n", + " training_job_name = fm.latest_training_job.job_name\n", + " \n", + "else:\n", + " print(f'Using previous training job: 
{training_job_name}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Make Predictions\n", + "\n", + "Now that we've trained our model, we can deploy it behind an Amazon SageMaker real-time hosted endpoint. This will allow us to make predictions (or inference) from the model dynamically.\n", + "\n", + "Note, Amazon SageMaker allows you the flexibility of importing models trained elsewhere, as well as the choice of not importing models if the target of model creation is AWS Lambda, AWS Greengrass, Amazon Redshift, Amazon Athena, or other deployment target.\n", + "\n", + "Here we will take the top customer, the customer who spent the most money, and try to find which items to recommend to them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.deserializers import JSONDeserializer\n", + "from sagemaker.serializers import JSONSerializer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class FMSerializer(JSONSerializer):\n", + " def serialize(self, data):\n", + " js = {\"instances\": []}\n", + " for row in data:\n", + " js[\"instances\"].append({\"features\": row.tolist()})\n", + " return json.dumps(js)\n", + "\n", + "\n", + "fm_predictor = fm.deploy(\n", + " initial_instance_count=1,\n", + " instance_type=\"ml.m4.xlarge\",\n", + " serializer=FMSerializer(),\n", + " deserializer=JSONDeserializer(),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# find customer who spent the most money\n", + "df = pd.read_csv(\"data/online_retail_preprocessed.csv\")\n", + "\n", + "df[\"invoice_amount\"] = df[\"Quantity\"] * df[\"UnitPrice\"]\n", + "top_customer = (\n", + " df.groupby(\"CustomerID\").sum()[\"invoice_amount\"].sort_values(ascending=False).index[0]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_recommendations(df, customer_id, n_recommendations, n_ranks=100):\n", + " popular_items = (\n", + " df.groupby([\"StockCode\", \"UnitPrice\"])\n", + " .nunique()[\"CustomerID\"]\n", + " .sort_values(ascending=False)\n", + " .reset_index()\n", + " )\n", + " top_n_items = popular_items[\"StockCode\"].iloc[:n_ranks].values\n", + " top_n_prices = popular_items[\"UnitPrice\"].iloc[:n_ranks].values\n", + "\n", + " # stock codes can have multiple descriptions, so we will choose whichever description is most common\n", + " item_map = df.groupby(\"StockCode\").agg(lambda x: x.value_counts().index[0])[\"Description\"]\n", + "\n", + " # find customer's country\n", + " df_subset = df.loc[df[\"CustomerID\"] == customer_id]\n", + " country = df_subset[\"Country\"].value_counts().index[0]\n", + "\n", + " data = {\n", + " \"StockCode\": top_n_items,\n", + " \"Description\": [item_map[i] for i in top_n_items],\n", + " \"CustomerID\": customer_id,\n", + " \"Country\": country,\n", + " \"UnitPrice\": top_n_prices,\n", + " }\n", "\n", - "s3_client = boto3.client(\"s3\", region_name=region)\n", + " df_inference = pd.DataFrame(data)\n", "\n", - "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n", - "sagemaker_session = sagemaker.session.Session(\n", - " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n", - ")\n", - "sagemaker_role = sagemaker.get_execution_role()\n", + " # we need to build the data set similar to how we built it for training\n", + " # it should have the same number of features as the training data\n", + " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", + " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", + " enc.fit(df[onehot_cols])\n", + " onehot_output = enc.transform(df_inference[onehot_cols])\n", "\n", - "bucket = sagemaker_session.default_bucket()\n", + " vectorizer = TfidfVectorizer(min_df=2)\n", + " unique_descriptions = df[\"Description\"].unique()\n", 
+ " vectorizer.fit(unique_descriptions)\n", + " tfidf_output = vectorizer.transform(df_inference[\"Description\"])\n", "\n", - "prefix = \"personalization\"\n", + " row = range(len(df_inference))\n", + " col = [0] * len(df_inference)\n", + " unit_price = csr_matrix((df_inference[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", "\n", - "output_prefix = f\"s3://{bucket}/{prefix}/output\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Read the data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Prepare Data For Modeling\n", + " X_inference = hstack([onehot_output, tfidf_output, unit_price], format=\"csr\")\n", "\n", - "+ Split the data into training and testing sets\n", - "+ Write the data to protobuf recordIO format for Pipe mode. [Read more](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html) about protobuf recordIO format." + " result = fm_predictor.predict(X_inference.toarray())\n", + " preds = [i[\"score\"] for i in result[\"predictions\"]]\n", + " index_array = np.array(preds).argsort()\n", + " items = enc.inverse_transform(onehot_output)[:, 0]\n", + " top_recs = np.take_along_axis(items, index_array, axis=0)[: -n_recommendations - 1 : -1]\n", + " recommendations = [[i, item_map[i]] for i in top_recs]\n", + " return recommendations" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "# load array\n", - "X_train = load_npz(\"./data/X_train.npz\")\n", - "X_test = load_npz(\"./data/X_test.npz\")\n", - "y_train_npzfile = np.load(\"./data/y_train.npz\")\n", - "y_test_npzfile = np.load(\"./data/y_test.npz\")\n", - "y_train = y_train_npzfile.f.arr_0\n", - "y_test = y_test_npzfile.f.arr_0" + "print(\"Top 5 recommended products:\")\n", + "get_recommendations(df, top_customer, n_recommendations=5, n_ranks=100)" ] }, { - "cell_type": "code", - "execution_count": 7, + "cell_type": "markdown", "metadata": {}, - 
"outputs": [ - { - "data": { - "text/plain": [ - "((219519, 9284), (54880, 9284), (219519,), (54880,))" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "X_train.shape, X_test.shape, y_train.shape, y_test.shape" + "Once you are done with the endpoint, you should delete the endpoint to save cost and free resources." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'input_dims' (int)\n" - ] - } - ], + "outputs": [], "source": [ - "input_dims = X_train.shape[1]\n", - "%store input_dims" + "fm_predictor.delete_model()\n", + "fm_predictor.delete_endpoint()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Train the factorization machine model\n", - "\n", - "Once we have the data preprocessed and available in the correct format for training, the next step is to actually train the model using the data. \n", + "## Optional Part: Registering the Model in SageMaker Model Registry\n", "\n", - "We'll use the Amazon SageMaker Python SDK to kick off training and monitor status until it is completed. In this example that takes only a few minutes. Despite the model only need 1-2 minutes to train, there is some extra time required upfront to provision hardware and load the algorithm container.\n", - "\n", - "First, let's specify our containers. To find the rigth container, we'll create a small lookup. More details on algorithm containers can be found in [AWS documentation.](https://docs-aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html)" + "Once a useful model has been trained, you have the option to register the model for future reference and possible deployment. To do so, we must first properly associate the artifacts of the model." 
] }, { - "cell_type": "code", - "execution_count": 9, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "container = sagemaker.image_uris.retrieve(\"factorization-machines\", region=boto_session.region_name)\n", - "\n", - "fm = sagemaker.estimator.Estimator(\n", - " container,\n", - " sagemaker_role,\n", - " instance_count=1,\n", - " instance_type=\"ml.c5.xlarge\",\n", - " output_path=output_prefix,\n", - " sagemaker_session=sagemaker_session,\n", - ")\n", - "\n", - "fm.set_hyperparameters(\n", - " feature_dim=input_dims,\n", - " predictor_type=\"regressor\",\n", - " mini_batch_size=1000,\n", - " num_factors=64,\n", - " epochs=20,\n", - ")" + "### Training data artifact" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "if 'training_job_name' not in locals():\n", - " \n", - " fm.fit({'train': train_data_location, 'test': test_data_location})\n", - " training_job_name = fm.latest_training_job.job_name\n", - " %store training_job_name\n", - " \n", - "else:\n", - " print(f'Using previous training job: {training_job_name}')" - ] - }, - { - "cell_type": "code", - "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "training_job_info = sagemaker_boto_client.describe_training_job(TrainingJobName=training_job_name)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Training data artifact" - ] - }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using existing artifact: arn:aws:sagemaker:us-east-2:645431112437:artifact/cdd7fbecb4eefa22c43b2ad48140acc2\n" - ] - } - ], + "outputs": [], "source": [ "training_data_s3_uri = training_job_info[\"InputDataConfig\"][0][\"DataSource\"][\"S3DataSource\"][\n", " \"S3Uri\"\n", @@ -318,17 +700,9 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, 
"metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using existing artifact: arn:aws:sagemaker:us-east-2:645431112437:artifact/3acde2fc029adeff9c767be68feac3a7\n" - ] - } - ], + "outputs": [], "source": [ "trained_model_s3_uri = training_job_info[\"ModelArtifacts\"][\"S3ModelArtifacts\"]\n", "\n", @@ -358,7 +732,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -377,18 +751,9 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Association already exists with DataSet\n", - "Association with Model: SUCCEESFUL\n" - ] - } - ], + "outputs": [], "source": [ "artifact_list = [[training_data_artifact, \"ContributedTo\"], [model_artifact, \"Produced\"]]\n", "\n", @@ -430,41 +795,27 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## SageMaker Model Registry\n", - "\n", - "Once a useful model has been trained and its artifacts properly associated, the next step is to register the model for future reference and possible deployment.\n", - "\n", "### Create Model Package Group\n", "\n", - "A Model Package Groups holds multiple versions or iterations of a model. Though it is not required to create them for every model in the registry, they help organize various models which all have the same purpose and provide autiomatic versioning." + "After associating all the relevant artifacts, the Model Package Group can now be created. A Model Package Groups holds multiple versions or iterations of a model. Though it is not required to create them for every model in the registry, they help organize various models which all have the same purpose and provide autiomatic versioning." 
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'mpg_name' (str)\n", - "Model Package Group name: retail-recommendation-2021-03-01-21-41\n" - ] - } - ], + "outputs": [], "source": [ "if 'mpg_name' not in locals():\n", " timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", " mpg_name = f'retail-recommendation-{timestamp}'\n", - " %store mpg_name\n", "\n", "print(f'Model Package Group name: {mpg_name}')" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -493,7 +844,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -519,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -546,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -562,7 +913,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -586,17 +937,9 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "model package status: Completed\n" - ] - } - ], + "outputs": [], "source": [ "mp_info = sagemaker_boto_client.describe_model_package(\n", " ModelPackageName=mp_response[\"ModelPackageArn\"]\n", @@ -615,7 +958,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -630,277 +973,25 @@ "update_response = sagemaker_boto_client.update_model_package(**model_package_update)" ] }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - 
"text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Name/SourceDirectionTypeAssociation TypeLineage Type
0s3://...1-03-01-21-36-56-437/output/model.tar.gzInputModelProducedartifact
1s3://...12437/personalization/test/test.protobufInputDataSetContributedToartifact
2s3://...437/personalization/train/train.protobufInputDataSetContributedToartifact
340461...2.amazonaws.com/factorization-machines:1InputImageContributedToartifact
4s3://...1-03-01-21-36-56-437/output/model.tar.gzOutputModelProducedartifact
\n", - "
" - ], - "text/plain": [ - " Name/Source Direction Type \\\n", - "0 s3://...1-03-01-21-36-56-437/output/model.tar.gz Input Model \n", - "1 s3://...12437/personalization/test/test.protobuf Input DataSet \n", - "2 s3://...437/personalization/train/train.protobuf Input DataSet \n", - "3 40461...2.amazonaws.com/factorization-machines:1 Input Image \n", - "4 s3://...1-03-01-21-36-56-437/output/model.tar.gz Output Model \n", - "\n", - " Association Type Lineage Type \n", - "0 Produced artifact \n", - "1 ContributedTo artifact \n", - "2 ContributedTo artifact \n", - "3 ContributedTo artifact \n", - "4 Produced artifact " - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from sagemaker.lineage.visualizer import LineageTableVisualizer\n", - "\n", - "viz = LineageTableVisualizer(sagemaker_session)\n", - "display(viz.show(training_job_name=training_job_name))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Make Predictions\n", - "\n", - "Now that we've trained our model, we can deploy it behind an Amazon SageMaker real-time hosted endpoint. This will allow out to make predictions (or inference) from the model dyanamically.\n", - "\n", - "Note, Amazon SageMaker allows you the flexibility of importing models trained elsewhere, as well as the choice of not importing models if the target of model creation is AWS Lambda, AWS Greengrass, Amazon Redshift, Amazon Athena, or other deployment target.\n", - "\n", - "Here we will take the top customer, the customer who spent the most money, and try to find which items to recommend to them." 
- ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.deserializers import JSONDeserializer\n", - "from sagemaker.serializers import JSONSerializer" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "class FMSerializer(JSONSerializer):\n", - " def serialize(self, data):\n", - " js = {\"instances\": []}\n", - " for row in data:\n", - " js[\"instances\"].append({\"features\": row.tolist()})\n", - " return json.dumps(js)\n", - "\n", - "\n", - "fm_predictor = fm.deploy(\n", - " initial_instance_count=1,\n", - " instance_type=\"ml.m4.xlarge\",\n", - " serializer=FMSerializer(),\n", - " deserializer=JSONDeserializer(),\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "# find customer who spent the most money\n", - "df = pd.read_csv(\"data/online_retail_preprocessed.csv\")\n", - "\n", - "df[\"invoice_amount\"] = df[\"Quantity\"] * df[\"UnitPrice\"]\n", - "top_customer = (\n", - " df.groupby(\"CustomerID\").sum()[\"invoice_amount\"].sort_values(ascending=False).index[0]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "def get_recommendations(df, customer_id, n_recommendations, n_ranks=100):\n", - " popular_items = (\n", - " df.groupby([\"StockCode\", \"UnitPrice\"])\n", - " .nunique()[\"CustomerID\"]\n", - " .sort_values(ascending=False)\n", - " .reset_index()\n", - " )\n", - " top_n_items = popular_items[\"StockCode\"].iloc[:n_ranks].values\n", - " top_n_prices = popular_items[\"UnitPrice\"].iloc[:n_ranks].values\n", - "\n", - " # stock codes can have multiple descriptions, so we will choose whichever description is most common\n", - " item_map = df.groupby(\"StockCode\").agg(lambda x: x.value_counts().index[0])[\"Description\"]\n", - "\n", - " # find customer's country\n", - " df_subset = 
df.loc[df[\"CustomerID\"] == customer_id]\n", - " country = df_subset[\"Country\"].value_counts().index[0]\n", - "\n", - " data = {\n", - " \"StockCode\": top_n_items,\n", - " \"Description\": [item_map[i] for i in top_n_items],\n", - " \"CustomerID\": customer_id,\n", - " \"Country\": country,\n", - " \"UnitPrice\": top_n_prices,\n", - " }\n", - "\n", - " df_inference = pd.DataFrame(data)\n", - "\n", - " # we need to build the data set similar to how we built it for training\n", - " # it should have the same number of features as the training data\n", - " enc = OneHotEncoder(handle_unknown=\"ignore\")\n", - " onehot_cols = [\"StockCode\", \"CustomerID\", \"Country\"]\n", - " enc.fit(df[onehot_cols])\n", - " onehot_output = enc.transform(df_inference[onehot_cols])\n", - "\n", - " vectorizer = TfidfVectorizer(min_df=2)\n", - " unique_descriptions = df[\"Description\"].unique()\n", - " vectorizer.fit(unique_descriptions)\n", - " tfidf_output = vectorizer.transform(df_inference[\"Description\"])\n", - "\n", - " row = range(len(df_inference))\n", - " col = [0] * len(df_inference)\n", - " unit_price = csr_matrix((df_inference[\"UnitPrice\"].values, (row, col)), dtype=\"float32\")\n", - "\n", - " X_inference = hstack([onehot_output, tfidf_output, unit_price], format=\"csr\")\n", + "from sagemaker.lineage.visualizer import LineageTableVisualizer\n", "\n", - " result = fm_predictor.predict(X_inference.toarray())\n", - " preds = [i[\"score\"] for i in result[\"predictions\"]]\n", - " index_array = np.array(preds).argsort()\n", - " items = enc.inverse_transform(onehot_output)[:, 0]\n", - " top_recs = np.take_along_axis(items, index_array, axis=0)[: -n_recommendations - 1 : -1]\n", - " recommendations = [[i, item_map[i]] for i in top_recs]\n", - " return recommendations" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Top 5 recommended products:\n" - ] - }, - { - 
"data": { - "text/plain": [ - "[['22423', 'REGENCY CAKESTAND 3 TIER'],\n", - " ['22776', 'SWEETHEART CAKESTAND 3 TIER'],\n", - " ['22624', 'IVORY KITCHEN SCALES'],\n", - " ['85123A', 'WHITE HANGING HEART T-LIGHT HOLDER'],\n", - " ['85099B', 'JUMBO BAG RED RETROSPOT']]" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(\"Top 5 recommended products:\")\n", - "get_recommendations(df, top_customer, n_recommendations=5, n_ranks=100)" + "viz = LineageTableVisualizer(sagemaker_session)\n", + "display(viz.show(training_job_name=training_job_name))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3", + "display_name": "conda_python3", "language": "python", - "name": "python3" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -912,7 +1003,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb similarity index 96% rename from use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb rename to use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 5d9e3b09ae..81f1a3b945 100644 --- a/use-cases/retail_recommend/3_retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Recommendation Engine for E-Commerce Sales: Part 3. 
Build Pipeline\n", + "# Recommendation Engine for E-Commerce Sales - Pipeline Mode\n", "\n", "This notebook gives an overview of techniques and services offer by SageMaker to build and deploy a personalized recommendation engine.\n", "\n", @@ -32,28 +32,18 @@ "![Architecture](./images/retail_rec_pipeline.png)" ] }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install -Uq sagemaker boto3" - ] - }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "%store -r\n", - "%store" + "! pip install --upgrade sagemaker" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -63,13 +53,16 @@ "from sagemaker.workflow.step_collections import RegisterModel\n", "from sagemaker.sklearn.processing import SKLearnProcessor\n", "from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString\n", + "import datetime\n", "import boto3\n", - "import time" + "import time\n", + "import pandas as pd\n", + "from preprocessing import loadDataset" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -78,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -108,9 +101,41 @@ "## Define Estimator" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, the number of feature dimensions must be calculated as it is a hyperparameter of the estimator. The feature dimensions are calculated by looking at the dataset, cleaning and preprocessing it as defined in the first part of [Recommendation Engine for E-Commerce Sales](retail_recommend.ipynb), and then counting the number of feature dimensions are in the processed dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv(\"data/Online Retail.csv\")\n", + "df.dropna(subset=[\"CustomerID\"], inplace=True)\n", + "df[\"Description\"] = df[\"Description\"].apply(lambda x: x.strip())\n", + "df = df.groupby([\"StockCode\", \"Description\", \"CustomerID\", \"Country\", \"UnitPrice\"])[\n", + " \"Quantity\"\n", + "].sum()\n", + "df = df.loc[df > 0].reset_index()\n", + "X, y = loadDataset(df)\n", + "input_dims = X.shape[1]\n", + "input_dims" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After calculating all the hyperparameters that are needed, the estimator is created." + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -145,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -159,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -170,7 +195,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -209,7 +234,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -233,13 +258,22 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_step.properties.AlgorithmSpecification.TrainingImage._path" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model = sagemaker.model.Model(\n", " name=\"retail-personalization-factorization-machine\",\n", - " image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,\n", + " image_uri=container,#train_step.properties.AlgorithmSpecification.TrainingImage,\n", " 
model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", " role=sagemaker_role,\n", @@ -252,10 +286,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ + "timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", + "mpg_name = f'retail-recommendation-{timestamp}'\n", + "\n", "register_step = RegisterModel(\n", " name=\"RegisterModel\",\n", " estimator=fm,\n", @@ -271,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -306,20 +343,11 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'pipeline_name' (str)\n" - ] - } - ], + "outputs": [], "source": [ "pipeline_name = f\"PersonalizationDemo\"\n", - "%store pipeline_name\n", "\n", "pipeline = Pipeline(\n", " name=pipeline_name,\n", @@ -376,21 +404,14 @@ " display(viz.show(pipeline_execution_step=execution_step))\n", " time.sleep(5)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3", + "display_name": "conda_python3", "language": "python", - "name": "python3" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -402,7 +423,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.6.13" } }, "nbformat": 4, From a3c42d585357ef1b0d8b6ef7d90ab8665c6b854f Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 28 Apr 2022 20:20:39 +0000 Subject: [PATCH 02/27] cleanup --- .../retail_recommend/retail_recommend_pipeline.ipynb | 9 --------- 1 file changed, 9 deletions(-) diff --git 
a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 81f1a3b945..261616f913 100644 --- a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -256,15 +256,6 @@ ")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_step.properties.AlgorithmSpecification.TrainingImage._path" - ] - }, { "cell_type": "code", "execution_count": null, From 790beddd64d61f5363b25f01ad2a7e16d2b07e0e Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 28 Apr 2022 20:36:02 +0000 Subject: [PATCH 03/27] reformat --- .../retail_recommend/retail_recommend.ipynb | 18 +++++++++--------- .../retail_recommend_pipeline.ipynb | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/use-cases/retail_recommend/retail_recommend.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb index 04b5d6df93..abe3fa0ea5 100644 --- a/use-cases/retail_recommend/retail_recommend.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -474,13 +474,13 @@ "metadata": {}, "outputs": [], "source": [ - "if 'training_job_name' not in locals():\n", - " \n", - " fm.fit({'train': train_data_location, 'test': test_data_location})\n", + "if \"training_job_name\" not in locals():\n", + "\n", + " fm.fit({\"train\": train_data_location, \"test\": test_data_location})\n", " training_job_name = fm.latest_training_job.job_name\n", - " \n", + "\n", "else:\n", - " print(f'Using previous training job: {training_job_name}')" + " print(f\"Using previous training job: {training_job_name}\")" ] }, { @@ -806,11 +806,11 @@ "metadata": {}, "outputs": [], "source": [ - "if 'mpg_name' not in locals():\n", - " timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", - " mpg_name = f'retail-recommendation-{timestamp}'\n", + "if \"mpg_name\" not in locals():\n", + " timestamp = 
datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M\")\n", + " mpg_name = f\"retail-recommendation-{timestamp}\"\n", "\n", - "print(f'Model Package Group name: {mpg_name}')" + "print(f\"Model Package Group name: {mpg_name}\")" ] }, { diff --git a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 261616f913..a8a1b23605 100644 --- a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -264,7 +264,7 @@ "source": [ "model = sagemaker.model.Model(\n", " name=\"retail-personalization-factorization-machine\",\n", - " image_uri=container,#train_step.properties.AlgorithmSpecification.TrainingImage,\n", + " image_uri=container, # train_step.properties.AlgorithmSpecification.TrainingImage,\n", " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", " role=sagemaker_role,\n", @@ -281,8 +281,8 @@ "metadata": {}, "outputs": [], "source": [ - "timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M')\n", - "mpg_name = f'retail-recommendation-{timestamp}'\n", + "timestamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H-%M\")\n", + "mpg_name = f\"retail-recommendation-{timestamp}\"\n", "\n", "register_step = RegisterModel(\n", " name=\"RegisterModel\",\n", From ced1ef28215bc9017c440e487dfa6037e832fec6 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 28 Apr 2022 23:46:51 +0000 Subject: [PATCH 04/27] make pandas version compatible --- .../retail_recommend/retail_recommend.ipynb | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/use-cases/retail_recommend/retail_recommend.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb index abe3fa0ea5..6bb3adbe38 100644 --- a/use-cases/retail_recommend/retail_recommend.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -565,14 +565,17 @@ " # find customer's country\n", " df_subset = 
df.loc[df[\"CustomerID\"] == customer_id]\n", " country = df_subset[\"Country\"].value_counts().index[0]\n", - "\n", - " data = {\n", - " \"StockCode\": top_n_items,\n", - " \"Description\": [item_map[i] for i in top_n_items],\n", - " \"CustomerID\": customer_id,\n", - " \"Country\": country,\n", - " \"UnitPrice\": top_n_prices,\n", - " }\n", + " \n", + " data = []\n", + " flattened_item_map = [item_map[i] for i in top_n_items]\n", + " for idx in range(len(top_n_items)):\n", + " data.append({\n", + " \"StockCode\": top_n_items[idx],\n", + " \"Description\": flattened_item_map[idx],\n", + " \"CustomerID\": customer_id,\n", + " \"Country\": country,\n", + " \"UnitPrice\": top_n_prices[idx],\n", + " })\n", "\n", " df_inference = pd.DataFrame(data)\n", "\n", @@ -987,7 +990,6 @@ } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "conda_python3", "language": "python", From 873eefae97ef9e1b81afeaed23666ea925cce0a6 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 28 Apr 2022 23:47:58 +0000 Subject: [PATCH 05/27] reformat --- .../retail_recommend/retail_recommend.ipynb | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/use-cases/retail_recommend/retail_recommend.ipynb b/use-cases/retail_recommend/retail_recommend.ipynb index 6bb3adbe38..ef3775439e 100644 --- a/use-cases/retail_recommend/retail_recommend.ipynb +++ b/use-cases/retail_recommend/retail_recommend.ipynb @@ -565,17 +565,19 @@ " # find customer's country\n", " df_subset = df.loc[df[\"CustomerID\"] == customer_id]\n", " country = df_subset[\"Country\"].value_counts().index[0]\n", - " \n", + "\n", " data = []\n", " flattened_item_map = [item_map[i] for i in top_n_items]\n", " for idx in range(len(top_n_items)):\n", - " data.append({\n", - " \"StockCode\": top_n_items[idx],\n", - " \"Description\": flattened_item_map[idx],\n", - " \"CustomerID\": customer_id,\n", - " \"Country\": country,\n", - " \"UnitPrice\": top_n_prices[idx],\n", - " })\n", + " 
data.append(\n", + " {\n", + " \"StockCode\": top_n_items[idx],\n", + " \"Description\": flattened_item_map[idx],\n", + " \"CustomerID\": customer_id,\n", + " \"Country\": country,\n", + " \"UnitPrice\": top_n_prices[idx],\n", + " }\n", + " )\n", "\n", " df_inference = pd.DataFrame(data)\n", "\n", From aff2a74e1d99860331652c4fa479d14bf8ccac15 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 28 Apr 2022 23:49:50 +0000 Subject: [PATCH 06/27] cleanup --- use-cases/retail_recommend/retail_recommend_pipeline.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb index a8a1b23605..72bfcfa4e9 100644 --- a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -264,7 +264,7 @@ "source": [ "model = sagemaker.model.Model(\n", " name=\"retail-personalization-factorization-machine\",\n", - " image_uri=container, # train_step.properties.AlgorithmSpecification.TrainingImage,\n", + " image_uri=container,\n", " model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n", " sagemaker_session=sagemaker_session,\n", " role=sagemaker_role,\n", From 3238674b098813487a33f65375bbfbcbd53327c4 Mon Sep 17 00:00:00 2001 From: atqy Date: Thu, 28 Apr 2022 23:59:47 +0000 Subject: [PATCH 07/27] dleete instance type --- use-cases/retail_recommend/retail_recommend_pipeline.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb index 72bfcfa4e9..3e5bc3b221 100644 --- a/use-cases/retail_recommend/retail_recommend_pipeline.ipynb +++ b/use-cases/retail_recommend/retail_recommend_pipeline.ipynb @@ -398,7 +398,6 @@ } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "conda_python3", "language": "python", From 
c79501da119cd86f0e9769461d2bbbc2bbc08b6d Mon Sep 17 00:00:00 2001 From: atqy Date: Fri, 29 Apr 2022 00:56:44 +0000 Subject: [PATCH 08/27] edit links --- use-cases/index.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/use-cases/index.rst b/use-cases/index.rst index 9ab084bbf6..5f406a8668 100644 --- a/use-cases/index.rst +++ b/use-cases/index.rst @@ -27,9 +27,8 @@ E-Commerce Personalization .. toctree:: :maxdepth: 1 - retail_recommend/1_retail_recommend_dataprep - retail_recommend/2_retail_recommend_train_tune - retail_recommend/3_retail_recommend_pipeline + retail_recommend/retail_recommend + retail_recommend/retail_recommend_pipeline Computer Vision for Medical Imaging From 116be4d87b3cc6ac84f52ba5edd6f0a902f5e27a Mon Sep 17 00:00:00 2001 From: atqy Date: Fri, 29 Apr 2022 23:07:12 +0000 Subject: [PATCH 09/27] refactor sequential notebooks --- .../0_cust_churn_overview_dw.ipynb | 529 +++++++++- .../2_cust_churn_train_deploy_infer.ipynb | 985 ++++++++++++++++-- 2 files changed, 1394 insertions(+), 120 deletions(-) diff --git a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb b/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb index 2f2a2f6e51..0d270b8a89 100644 --- a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb +++ b/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb @@ -164,7 +164,8 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install -q 'sagemaker==2.19.0' 'botocore == 1.19.4' 's3fs==0.4.2' 'sagemaker-experiments' 'boto3 == 1.16.4'\n", + "!pip install -q 's3fs==0.4.2' 'sagemaker-experiments'\n", + "!pip install --upgrade sagemaker boto3\n", "# s3fs is needed for pandas to read files from S3" ] }, @@ -207,26 +208,6 @@ "prefix = \"music-streaming\"" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store 
bucket\n", - "%store prefix" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -937,8 +918,9 @@ "outputs": [], "source": [ "processing_output_filename = f\"{processing_output_path}/{final_features_filename}\"\n", - "%store processing_output_filename\n", - "%store -r" + "# %store processing_output_filename\n", + "# %store -r\n", + "processing_output_filename" ] }, { @@ -975,8 +957,365 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Citation\n", - "The data used in this notebook is simulated using the [EventSim](https://github.com/Interana/eventsim)." + "### Preprocess the Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sagemaker.sklearn.processing import SKLearnProcessor\n", + "\n", + "sklearn_processor = SKLearnProcessor(\n", + " # framework_version='0.20.0',\n", + " framework_version=\"0.23-1\",\n", + " role=role,\n", + " instance_type=\"ml.m5.xlarge\",\n", + " instance_count=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "### SAVE THE OUTPUT FILE NAME FROM PROCESSING JOB\n", + "processing_job_output_name = 'processing_job_output.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile preprocessing.py\n", + "\n", + "import os\n", + "import warnings\n", + "import time\n", + "import argparse\n", + "import subprocess\n", + "import sys\n", + "\n", + "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pandas\"])\n", + "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"awswrangler\"])\n", + "import pandas as pd\n", + "import awswrangler as wr\n", + "\n", + "start_time = time.time()\n", + "\n", + "if __name__ == \"__main__\":\n", + " parser = argparse.ArgumentParser()\n", + " parser.add_argument(\"--dw-output-path\")\n", + " 
parser.add_argument(\"--processing-output-filename\")\n", + "\n", + " args, _ = parser.parse_known_args()\n", + " print(\"Received arguments {}\".format(args))\n", + "\n", + " data_s3_uri = args.dw_output_path\n", + " output_filename = args.processing_output_filename\n", + "\n", + " # data_path = os.path.join('/opt/ml/processing/input', dw_output_name)\n", + " # df = pd.read_csv(data_path)\n", + " df = wr.s3.read_csv(path=data_s3_uri, dataset=True)\n", + " ## convert to time\n", + " df[\"date\"] = pd.to_datetime(df[\"ts\"], unit=\"ms\")\n", + " df[\"ts_dow\"] = df[\"date\"].dt.weekday\n", + " df[\"ts_date_day\"] = df[\"date\"].dt.date\n", + " df[\"ts_is_weekday\"] = [1 if x in [0, 1, 2, 3, 4] else 0 for x in df[\"ts_dow\"]]\n", + " df[\"registration_ts\"] = pd.to_datetime(df[\"registration\"], unit=\"ms\").dt.date\n", + " ## add labels\n", + " df[\"churned_event\"] = [1 if x == \"Cancellation Confirmation\" else 0 for x in df[\"page\"]]\n", + " df[\"user_churned\"] = df.groupby(\"userId\")[\"churned_event\"].transform(\"max\")\n", + "\n", + " ## convert pages categorical variables to numerical\n", + " events_list = [\n", + " \"NextSong\",\n", + " \"Thumbs Down\",\n", + " \"Thumbs Up\",\n", + " \"Add to Playlist\",\n", + " \"Roll Advert\",\n", + " \"Add Friend\",\n", + " \"Downgrade\",\n", + " \"Upgrade\",\n", + " \"Error\",\n", + " ]\n", + " usage_column_name = []\n", + " for event in events_list:\n", + " event_name = \"_\".join(event.split()).lower()\n", + " usage_column_name.append(event_name)\n", + " df[event_name] = [1 if x == event else 0 for x in df[\"page\"]]\n", + " ## feature engineering\n", + " # average_events_weekday (numerical): average number of events per day during weekday\n", + " # average_events_weekend (numerical): average number of events per day during the weekend\n", + " base_df = (\n", + " df.groupby([\"userId\", \"ts_date_day\", \"ts_is_weekday\"])\n", + " .agg({\"page\": \"count\"})\n", + " .groupby([\"userId\", 
\"ts_is_weekday\"])[\"page\"]\n", + " .mean()\n", + " .unstack(fill_value=0)\n", + " .reset_index()\n", + " .rename(columns={0: \"average_events_weekend\", 1: \"average_events_weekday\"})\n", + " )\n", + "\n", + " # num_ads_7d, num_songs_played_7d, num_songs_played_30d, num_songs_played_90d, num_ads_7d, num_error_7d\n", + " base_df_daily = (\n", + " df.groupby([\"userId\", \"ts_date_day\"])\n", + " .agg({\"page\": \"count\", \"nextsong\": \"sum\", \"roll_advert\": \"sum\", \"error\": \"sum\"})\n", + " .reset_index()\n", + " )\n", + " feature34 = (\n", + " base_df_daily.groupby([\"userId\", \"ts_date_day\"])\n", + " .tail(7)\n", + " .groupby([\"userId\"])\n", + " .agg({\"nextsong\": \"sum\", \"roll_advert\": \"sum\", \"error\": \"sum\"})\n", + " .reset_index()\n", + " .rename(\n", + " columns={\n", + " \"nextsong\": \"num_songs_played_7d\",\n", + " \"roll_advert\": \"num_ads_7d\",\n", + " \"error\": \"num_error_7d\",\n", + " }\n", + " )\n", + " )\n", + " feature5 = (\n", + " base_df_daily.groupby([\"userId\", \"ts_date_day\"])\n", + " .tail(30)\n", + " .groupby([\"userId\"])\n", + " .agg({\"nextsong\": \"sum\"})\n", + " .reset_index()\n", + " .rename(columns={\"nextsong\": \"num_songs_played_30d\"})\n", + " )\n", + " feature6 = (\n", + " base_df_daily.groupby([\"userId\", \"ts_date_day\"])\n", + " .tail(90)\n", + " .groupby([\"userId\"])\n", + " .agg({\"nextsong\": \"sum\"})\n", + " .reset_index()\n", + " .rename(columns={\"nextsong\": \"num_songs_played_90d\"})\n", + " )\n", + " # num_artists, num_songs, num_ads, num_thumbsup, num_thumbsdown, num_playlist, num_addfriend, num_error, user_downgrade,\n", + " # user_upgrade, percentage_ad, days_since_active\n", + " base_df_user = (\n", + " df.groupby([\"userId\"])\n", + " .agg(\n", + " {\n", + " \"page\": \"count\",\n", + " \"nextsong\": \"sum\",\n", + " \"artist\": \"nunique\",\n", + " \"song\": \"nunique\",\n", + " \"thumbs_down\": \"sum\",\n", + " \"thumbs_up\": \"sum\",\n", + " \"add_to_playlist\": \"sum\",\n", + " 
\"roll_advert\": \"sum\",\n", + " \"add_friend\": \"sum\",\n", + " \"downgrade\": \"max\",\n", + " \"upgrade\": \"max\",\n", + " \"error\": \"sum\",\n", + " \"ts_date_day\": \"max\",\n", + " \"registration_ts\": \"min\",\n", + " \"user_churned\": \"max\",\n", + " }\n", + " )\n", + " .reset_index()\n", + " )\n", + " base_df_user[\"percentage_ad\"] = base_df_user[\"roll_advert\"] / base_df_user[\"page\"]\n", + " base_df_user[\"days_since_active\"] = (\n", + " base_df_user[\"ts_date_day\"] - base_df_user[\"registration_ts\"]\n", + " ).dt.days\n", + " # repeats ratio\n", + " base_df_user[\"repeats_ratio\"] = 1 - base_df_user[\"song\"] / base_df_user[\"nextsong\"]\n", + "\n", + " # num_sessions, avg_time_per_session, avg_events_per_session,\n", + " base_df_session = (\n", + " df.groupby([\"userId\", \"sessionId\"])\n", + " .agg({\"length\": \"sum\", \"page\": \"count\", \"date\": \"min\"})\n", + " .reset_index()\n", + " )\n", + " base_df_session[\"prev_session_ts\"] = base_df_session.groupby([\"userId\"])[\"date\"].shift(1)\n", + " base_df_session[\"gap_session\"] = (\n", + " base_df_session[\"date\"] - base_df_session[\"prev_session_ts\"]\n", + " ).dt.days\n", + " user_sessions = (\n", + " base_df_session.groupby(\"userId\")\n", + " .agg({\"sessionId\": \"count\", \"length\": \"mean\", \"page\": \"mean\", \"gap_session\": \"mean\"})\n", + " .reset_index()\n", + " .rename(\n", + " columns={\n", + " \"sessionId\": \"num_sessions\",\n", + " \"length\": \"avg_time_per_session\",\n", + " \"page\": \"avg_events_per_session\",\n", + " \"gap_session\": \"avg_gap_between_session\",\n", + " }\n", + " )\n", + " )\n", + "\n", + " # merge features together\n", + " base_df[\"userId\"] = base_df[\"userId\"].astype(\"int\")\n", + " final_feature_df = base_df.merge(feature34, how=\"left\", on=\"userId\")\n", + " final_feature_df = final_feature_df.merge(feature5, how=\"left\", on=\"userId\")\n", + " final_feature_df = final_feature_df.merge(feature6, how=\"left\", on=\"userId\")\n", + 
" final_feature_df = final_feature_df.merge(user_sessions, how=\"left\", on=\"userId\")\n", + " final_feature_df = final_feature_df.merge(base_df_user, how=\"left\", on=\"userId\")\n", + "\n", + " final_feature_df = final_feature_df.fillna(0)\n", + " # renaming columns\n", + " final_feature_df.columns = [\n", + " \"userId\",\n", + " \"average_events_weekend\",\n", + " \"average_events_weekday\",\n", + " \"num_songs_played_7d\",\n", + " \"num_ads_7d\",\n", + " \"num_error_7d\",\n", + " \"num_songs_played_30d\",\n", + " \"num_songs_played_90d\",\n", + " \"num_sessions\",\n", + " \"avg_time_per_session\",\n", + " \"avg_events_per_session\",\n", + " \"avg_gap_between_session\",\n", + " \"num_events\",\n", + " \"num_songs\",\n", + " \"num_artists\",\n", + " \"num_unique_songs\",\n", + " \"num_thumbs_down\",\n", + " \"num_thumbs_up\",\n", + " \"num_add_to_playlist\",\n", + " \"num_ads\",\n", + " \"num_add_friend\",\n", + " \"num_downgrade\",\n", + " \"num_upgrade\",\n", + " \"num_error\",\n", + " \"ts_date_day\",\n", + " \"registration_ts\",\n", + " \"user_churned\",\n", + " \"percentage_ad\",\n", + " \"days_since_active\",\n", + " \"repeats_ratio\",\n", + " ]\n", + " # only keep created feature columns\n", + " final_feature_df = final_feature_df[\n", + " [\n", + " \"userId\",\n", + " \"user_churned\",\n", + " \"average_events_weekend\",\n", + " \"average_events_weekday\",\n", + " \"num_songs_played_7d\",\n", + " \"num_ads_7d\",\n", + " \"num_error_7d\",\n", + " \"num_songs_played_30d\",\n", + " \"num_songs_played_90d\",\n", + " \"num_sessions\",\n", + " \"avg_time_per_session\",\n", + " \"avg_events_per_session\",\n", + " \"avg_gap_between_session\",\n", + " \"num_events\",\n", + " \"num_songs\",\n", + " \"num_artists\",\n", + " \"num_thumbs_down\",\n", + " \"num_thumbs_up\",\n", + " \"num_add_to_playlist\",\n", + " \"num_ads\",\n", + " \"num_add_friend\",\n", + " \"num_downgrade\",\n", + " \"num_upgrade\",\n", + " \"num_error\",\n", + " \"percentage_ad\",\n", + " 
\"days_since_active\",\n", + " \"repeats_ratio\",\n", + " ]\n", + " ]\n", + "\n", + " print(\"shape of file to append:\\t\\t{}\".format(final_feature_df.shape))\n", + " iter_end_time = time.time()\n", + " end_time = time.time()\n", + " print(\"minutes elapsed: {}\".format(str((end_time - start_time) / 60)))\n", + "\n", + " final_features_output_path = os.path.join(\"/opt/ml/processing/output\", output_filename)\n", + " print(\"Saving processed data to {}\".format(final_features_output_path))\n", + " final_feature_df.to_csv(final_features_output_path, header=True, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "output_path = processing_output_filename" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "%%time\n", + "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", + "\n", + "processing_job_output_path = f\"s3://{bucket}/{prefix}/data/processing\"\n", + "\n", + "sklearn_processor.run(\n", + " code=\"preprocessing.py\",\n", + " outputs=[\n", + " ProcessingOutput(\n", + " output_name=\"processed_data\",\n", + " source=\"/opt/ml/processing/output\",\n", + " destination=processing_job_output_path,\n", + " )\n", + " ],\n", + " arguments=[\n", + " \"--dw-output-path\",\n", + " processing_job_output_path,\n", + " \"--processing-output-filename\",\n", + " processing_job_output_name,\n", + " ],\n", + ")\n", + "\n", + "preprocessing_job_description = sklearn_processor.jobs[-1].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "preprocessing_job_description" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Congratulations! You have preprocessed the data. You can proceed to modelling." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Data Splitting\n", + "\n", + "You formulated the use case as a classification problem on user level, so you can randomly split your data from last step into train/validation/test. If you want to predict \"will user X churn in the next Y days\" on per user per day level, you should think about spliting data in chronological order instead of random. \n", + "\n", + "You should split the data and make sure that data of both classes exist in your train, validation and test sets, to make sure both classes are represented in your data. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Find the output of Processing Job" ] }, { @@ -984,7 +1323,143 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "processing_job_output_uri = f\"{processing_job_output_path}/{processing_job_output_name}\"\n", + "processing_job_output_uri" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp $processing_job_output_uri ./data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processed_data = pd.read_csv(processing_job_output_uri)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional: you can also load the processed data from the provided feature set\n", + "# processed_data = pd.read_csv('./data/full_feature_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processed_data.head(4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Split data to train/validation/test by 70/20/10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data = 
processed_data.sample(frac=1, random_state=1729)\n", + "grouped_df = data.groupby(\"user_churned\")\n", + "arr_list = [np.split(g, [int(0.7 * len(g)), int(0.9 * len(g))]) for i, g in grouped_df]\n", + "\n", + "train_data = pd.concat([t[0] for t in arr_list])\n", + "validation_data = pd.concat([t[1] for t in arr_list])\n", + "test_data = pd.concat([v[2] for v in arr_list])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def process_data(data, name, header=False):\n", + " data = data.drop(columns=[\"userId\"])\n", + " data = pd.concat([data[\"user_churned\"], data.drop([\"user_churned\"], axis=1)], axis=1)\n", + " data.to_csv(name, header=header, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "process_data(train_data, \"data/train_updated.csv\")\n", + "process_data(validation_data, \"data/validation_updated.csv\")\n", + "process_data(test_data, \"data/test_updated.csv\")\n", + "\n", + "process_data(train_data, \"data/train_w_header.csv\", header=True)\n", + "process_data(validation_data, \"data/validation_w_header.csv\", header=True)\n", + "process_data(test_data, \"data/test_w_header.csv\", header=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Save splitted data to S3\n", + "The splitted data is provided in the /data folder. You can also upload the provided files (`data/train_updated.csv`,`data/validation_updated.csv`, `data/test_updated.csv`) and proceed to the next step. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "s3_input_train = (\n", + " boto3.Session()\n", + " .resource(\"s3\")\n", + " .Bucket(bucket)\n", + " .Object(os.path.join(prefix, \"train/train.csv\"))\n", + " .upload_file(\"data/train_updated.csv\")\n", + ")\n", + "s3_input_validation = (\n", + " boto3.Session()\n", + " .resource(\"s3\")\n", + " .Bucket(bucket)\n", + " .Object(os.path.join(prefix, \"validation/validation.csv\"))\n", + " .upload_file(\"data/validation_updated.csv\")\n", + ")\n", + "s3_input_validation = (\n", + " boto3.Session()\n", + " .resource(\"s3\")\n", + " .Bucket(bucket)\n", + " .Object(os.path.join(prefix, \"test/test_labeled.csv\"))\n", + " .upload_file(\"data/test_updated.csv\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Citation\n", + "The data used in this notebook is simulated using the [EventSim](https://github.com/Interana/eventsim)." 
+ ] } ], "metadata": { @@ -992,7 +1467,7 @@ "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" }, "language_info": { "codemirror_mode": { diff --git a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb index b459fd1f29..604dd393bb 100644 --- a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb +++ b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb @@ -123,17 +123,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -142,12 +132,13 @@ "import pandas as pd\n", "import glob\n", "import s3fs\n", - "import boto3" + "import boto3\n", + "from datetime import datetime" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -156,12 +147,14 @@ "\n", "region = boto3.Session().region_name\n", "role = sagemaker.get_execution_role()\n", - "smclient = boto3.Session().client(\"sagemaker\")" + "smclient = boto3.Session().client(\"sagemaker\")\n", + "bucket = sagemaker_session.default_bucket()\n", + "prefix = \"music-streaming\"" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -179,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -205,15 +198,15 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "CPU times: 
user 66 µs, sys: 0 ns, total: 66 µs\n", - "Wall time: 68.7 µs\n" + "CPU times: user 102 µs, sys: 0 ns, total: 102 µs\n", + "Wall time: 309 µs\n" ] } ], @@ -235,7 +228,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -252,11 +245,114 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2022-04-29 21:58:04 Starting - Starting the training job...\n", + "2022-04-29 21:58:28 Starting - Preparing the instances for trainingProfilerReport-1651269483: InProgress\n", + ".........\n", + "2022-04-29 21:59:56 Downloading - Downloading input data...\n", + "2022-04-29 22:00:31 Training - Downloading the training image......\n", + "2022-04-29 22:01:27 Training - Training image download completed. Training in progress..\u001b[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.\u001b[0m\n", + "\u001b[34mReturning the value itself\u001b[0m\n", + "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.\u001b[0m\n", + "\u001b[34mReturning the value itself\u001b[0m\n", + "\u001b[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[22:01:32] 708x25 matrix with 17700 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,\u001b[0m\n", + 
"\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[22:01:32] 204x25 matrix with 5100 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,\u001b[0m\n", + "\u001b[34mINFO:root:Single node training.\u001b[0m\n", + "\u001b[34mINFO:root:Train matrix has 708 rows\u001b[0m\n", + "\u001b[34mINFO:root:Validation matrix has 204 rows\u001b[0m\n", + "\u001b[34m[0]#011train-auc:0.91768#011validation-auc:0.94514\u001b[0m\n", + "\u001b[34m[1]#011train-auc:0.92026#011validation-auc:0.95180\u001b[0m\n", + "\u001b[34m[2]#011train-auc:0.93830#011validation-auc:0.95534\u001b[0m\n", + "\u001b[34m[3]#011train-auc:0.93852#011validation-auc:0.95507\u001b[0m\n", + "\u001b[34m[4]#011train-auc:0.95391#011validation-auc:0.96667\u001b[0m\n", + "\u001b[34m[5]#011train-auc:0.95654#011validation-auc:0.96758\u001b[0m\n", + "\u001b[34m[6]#011train-auc:0.95694#011validation-auc:0.96468\u001b[0m\n", + "\u001b[34m[7]#011train-auc:0.96200#011validation-auc:0.96473\u001b[0m\n", + "\u001b[34m[8]#011train-auc:0.96468#011validation-auc:0.96720\u001b[0m\n", + "\u001b[34m[9]#011train-auc:0.96311#011validation-auc:0.96699\u001b[0m\n", + "\u001b[34m[10]#011train-auc:0.96290#011validation-auc:0.96871\u001b[0m\n", + "\u001b[34m[11]#011train-auc:0.96521#011validation-auc:0.97434\u001b[0m\n", + "\u001b[34m[12]#011train-auc:0.96481#011validation-auc:0.97182\u001b[0m\n", + "\u001b[34m[13]#011train-auc:0.96483#011validation-auc:0.97386\u001b[0m\n", + "\u001b[34m[14]#011train-auc:0.96442#011validation-auc:0.97375\u001b[0m\n", + "\u001b[34m[15]#011train-auc:0.96458#011validation-auc:0.97418\u001b[0m\n", + "\u001b[34m[16]#011train-auc:0.96498#011validation-auc:0.97563\u001b[0m\n", + "\u001b[34m[17]#011train-auc:0.96619#011validation-auc:0.97724\u001b[0m\n", + "\u001b[34m[18]#011train-auc:0.96492#011validation-auc:0.97681\u001b[0m\n", + "\u001b[34m[19]#011train-auc:0.96406#011validation-auc:0.97584\u001b[0m\n", + 
"\u001b[34m[20]#011train-auc:0.96365#011validation-auc:0.97584\u001b[0m\n", + "\u001b[34m[21]#011train-auc:0.96428#011validation-auc:0.97381\u001b[0m\n", + "\u001b[34m[22]#011train-auc:0.96540#011validation-auc:0.97348\u001b[0m\n", + "\u001b[34m[23]#011train-auc:0.96511#011validation-auc:0.97445\u001b[0m\n", + "\u001b[34m[24]#011train-auc:0.96481#011validation-auc:0.97450\u001b[0m\n", + "\u001b[34m[25]#011train-auc:0.96465#011validation-auc:0.97504\u001b[0m\n", + "\u001b[34m[26]#011train-auc:0.96503#011validation-auc:0.97461\u001b[0m\n", + "\u001b[34m[27]#011train-auc:0.96627#011validation-auc:0.97364\u001b[0m\n", + "\u001b[34m[28]#011train-auc:0.96733#011validation-auc:0.97289\u001b[0m\n", + "\u001b[34m[29]#011train-auc:0.96781#011validation-auc:0.97354\u001b[0m\n", + "\u001b[34m[30]#011train-auc:0.96757#011validation-auc:0.97300\u001b[0m\n", + "\u001b[34m[31]#011train-auc:0.96827#011validation-auc:0.97300\u001b[0m\n", + "\u001b[34m[32]#011train-auc:0.96887#011validation-auc:0.97332\u001b[0m\n", + "\u001b[34m[33]#011train-auc:0.96900#011validation-auc:0.97354\u001b[0m\n", + "\u001b[34m[34]#011train-auc:0.96905#011validation-auc:0.97332\u001b[0m\n", + "\u001b[34m[35]#011train-auc:0.96980#011validation-auc:0.97440\u001b[0m\n", + "\u001b[34m[36]#011train-auc:0.96945#011validation-auc:0.97354\u001b[0m\n", + "\u001b[34m[37]#011train-auc:0.96924#011validation-auc:0.97354\u001b[0m\n", + "\u001b[34m[38]#011train-auc:0.96936#011validation-auc:0.97418\u001b[0m\n", + "\u001b[34m[39]#011train-auc:0.96936#011validation-auc:0.97418\u001b[0m\n", + "\u001b[34m[40]#011train-auc:0.96933#011validation-auc:0.97407\u001b[0m\n", + "\u001b[34m[41]#011train-auc:0.96896#011validation-auc:0.97343\u001b[0m\n", + "\u001b[34m[42]#011train-auc:0.96899#011validation-auc:0.97348\u001b[0m\n", + "\u001b[34m[43]#011train-auc:0.96945#011validation-auc:0.97359\u001b[0m\n", + "\u001b[34m[44]#011train-auc:0.96924#011validation-auc:0.97391\u001b[0m\n", + 
"\u001b[34m[45]#011train-auc:0.96974#011validation-auc:0.97423\u001b[0m\n", + "\u001b[34m[46]#011train-auc:0.97061#011validation-auc:0.97477\u001b[0m\n", + "\u001b[34m[47]#011train-auc:0.97083#011validation-auc:0.97467\u001b[0m\n", + "\u001b[34m[48]#011train-auc:0.97080#011validation-auc:0.97467\u001b[0m\n", + "\u001b[34m[49]#011train-auc:0.97067#011validation-auc:0.97456\u001b[0m\n", + "\u001b[34m[50]#011train-auc:0.97121#011validation-auc:0.97456\u001b[0m\n", + "\u001b[34m[51]#011train-auc:0.97121#011validation-auc:0.97456\u001b[0m\n", + "\u001b[34m[52]#011train-auc:0.97181#011validation-auc:0.97461\u001b[0m\n", + "\u001b[34m[53]#011train-auc:0.97159#011validation-auc:0.97461\u001b[0m\n", + "\u001b[34m[54]#011train-auc:0.97159#011validation-auc:0.97461\u001b[0m\n", + "\u001b[34m[55]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", + "\u001b[34m[56]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", + "\u001b[34m[57]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", + "\u001b[34m[58]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", + "\u001b[34m[59]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", + "\u001b[34m[60]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", + "\u001b[34m[61]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", + "\u001b[34m[62]#011train-auc:0.97333#011validation-auc:0.97515\u001b[0m\n", + "\u001b[34m[63]#011train-auc:0.97349#011validation-auc:0.97515\u001b[0m\n", + "\u001b[34m[64]#011train-auc:0.97359#011validation-auc:0.97515\u001b[0m\n", + "\u001b[34m[65]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", + "\u001b[34m[66]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", + "\u001b[34m[67]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", + "\n", + "2022-04-29 22:01:57 Uploading - Uploading generated training model\n", + "2022-04-29 22:01:57 Completed - Training job completed\n", + "Training seconds: 113\n", + "Billable seconds: 113\n", + 
"CPU times: user 465 ms, sys: 20.3 ms, total: 485 ms\n", + "Wall time: 4min 12s\n" + ] + } + ], "source": [ "%%time\n", "xgb.fit(inputs={\"train\": train_input, \"validation\": validation_input}, wait=True)" @@ -271,20 +367,252 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# custom trial name\n", - "experiment_name = \"music-streaming-churn-exp\"\n", + "experiment_name = \"music-streaming-churn-exp-{}\".format(datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\"))\n", "trial_name_xgb = \"xgboost\"" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Experiment creation music-streaming-churn-exp: SUCCESS\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.\n", + "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n", + "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2022-04-29-22-02-17-343\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Create trial xgboost: SUCCESSFUL\n", + "2022-04-29 22:02:17 Starting - Starting the training job...\n", + "2022-04-29 22:02:44 Starting - Preparing the instances for trainingProfilerReport-1651269737: InProgress\n", + ".........\n", + "2022-04-29 22:04:14 Downloading - Downloading input data...\n", + "2022-04-29 22:04:45 Training - Downloading the training image......\n", + "2022-04-29 22:05:46 Training - Training image download completed. 
Training in progress...\u001b[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\u001b[0m\n", + "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.\u001b[0m\n", + "\u001b[34mReturning the value itself\u001b[0m\n", + "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.\u001b[0m\n", + "\u001b[34mReturning the value itself\u001b[0m\n", + "\u001b[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", + "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[22:05:50] 708x25 matrix with 17700 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,\u001b[0m\n", + "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m[22:05:50] 204x25 matrix with 5100 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,\u001b[0m\n", + "\u001b[34mINFO:root:Single node training.\u001b[0m\n", + "\u001b[34mINFO:root:Train matrix has 708 rows\u001b[0m\n", + "\u001b[34mINFO:root:Validation matrix has 204 rows\u001b[0m\n", + "\u001b[34m[0]#011train-auc:0.91768#011validation-auc:0.94514\u001b[0m\n", + "\u001b[34m[1]#011train-auc:0.92026#011validation-auc:0.95180\u001b[0m\n", + "\u001b[34m[2]#011train-auc:0.93830#011validation-auc:0.95534\u001b[0m\n", + "\u001b[34m[3]#011train-auc:0.93852#011validation-auc:0.95507\u001b[0m\n", + "\u001b[34m[4]#011train-auc:0.95391#011validation-auc:0.96667\u001b[0m\n", + "\u001b[34m[5]#011train-auc:0.95654#011validation-auc:0.96758\u001b[0m\n", + 
"\u001b[34m[6]#011train-auc:0.95694#011validation-auc:0.96468\u001b[0m\n", + "\u001b[34m[7]#011train-auc:0.96200#011validation-auc:0.96473\u001b[0m\n", + "\u001b[34m[8]#011train-auc:0.96468#011validation-auc:0.96720\u001b[0m\n", + "\u001b[34m[9]#011train-auc:0.96311#011validation-auc:0.96699\u001b[0m\n", + "\u001b[34m[10]#011train-auc:0.96290#011validation-auc:0.96871\u001b[0m\n", + "\u001b[34m[11]#011train-auc:0.96521#011validation-auc:0.97434\u001b[0m\n", + "\u001b[34m[12]#011train-auc:0.96481#011validation-auc:0.97182\u001b[0m\n", + "\u001b[34m[13]#011train-auc:0.96483#011validation-auc:0.97386\u001b[0m\n", + "\u001b[34m[14]#011train-auc:0.96442#011validation-auc:0.97375\u001b[0m\n", + "\u001b[34m[15]#011train-auc:0.96458#011validation-auc:0.97418\u001b[0m\n", + "\u001b[34m[16]#011train-auc:0.96498#011validation-auc:0.97563\u001b[0m\n", + "\u001b[34m[17]#011train-auc:0.96619#011validation-auc:0.97724\u001b[0m\n", + "\u001b[34m[18]#011train-auc:0.96492#011validation-auc:0.97681\u001b[0m\n", + "\u001b[34m[19]#011train-auc:0.96406#011validation-auc:0.97584\u001b[0m\n", + "\u001b[34m[20]#011train-auc:0.96365#011validation-auc:0.97584\u001b[0m\n", + "\u001b[34m[21]#011train-auc:0.96428#011validation-auc:0.97381\u001b[0m\n", + "\u001b[34m[22]#011train-auc:0.96540#011validation-auc:0.97348\u001b[0m\n", + "\u001b[34m[23]#011train-auc:0.96511#011validation-auc:0.97445\u001b[0m\n", + "\u001b[34m[24]#011train-auc:0.96481#011validation-auc:0.97450\u001b[0m\n", + "\u001b[34m[25]#011train-auc:0.96465#011validation-auc:0.97504\u001b[0m\n", + "\u001b[34m[26]#011train-auc:0.96503#011validation-auc:0.97461\u001b[0m\n", + "\u001b[34m[27]#011train-auc:0.96627#011validation-auc:0.97364\u001b[0m\n", + "\u001b[34m[28]#011train-auc:0.96733#011validation-auc:0.97289\u001b[0m\n", + "\u001b[34m[29]#011train-auc:0.96781#011validation-auc:0.97354\u001b[0m\n", + "\u001b[34m[30]#011train-auc:0.96757#011validation-auc:0.97300\u001b[0m\n", + 
"\u001b[34m[31]#011train-auc:0.96827#011validation-auc:0.97300\u001b[0m\n", + "\u001b[34m[32]#011train-auc:0.96887#011validation-auc:0.97332\u001b[0m\n", + "\u001b[34m[33]#011train-auc:0.96900#011validation-auc:0.97354\u001b[0m\n", + "\u001b[34m[34]#011train-auc:0.96905#011validation-auc:0.97332\u001b[0m\n", + "\u001b[34m[35]#011train-auc:0.96980#011validation-auc:0.97440\u001b[0m\n", + "\u001b[34m[36]#011train-auc:0.96945#011validation-auc:0.97354\u001b[0m\n", + "\u001b[34m[37]#011train-auc:0.96924#011validation-auc:0.97354\u001b[0m\n", + "\u001b[34m[38]#011train-auc:0.96936#011validation-auc:0.97418\u001b[0m\n", + "\u001b[34m[39]#011train-auc:0.96936#011validation-auc:0.97418\u001b[0m\n", + "\u001b[34m[40]#011train-auc:0.96933#011validation-auc:0.97407\u001b[0m\n", + "\u001b[34m[41]#011train-auc:0.96896#011validation-auc:0.97343\u001b[0m\n", + "\u001b[34m[42]#011train-auc:0.96899#011validation-auc:0.97348\u001b[0m\n", + "\u001b[34m[43]#011train-auc:0.96945#011validation-auc:0.97359\u001b[0m\n", + "\u001b[34m[44]#011train-auc:0.96924#011validation-auc:0.97391\u001b[0m\n", + "\u001b[34m[45]#011train-auc:0.96974#011validation-auc:0.97423\u001b[0m\n", + "\u001b[34m[46]#011train-auc:0.97061#011validation-auc:0.97477\u001b[0m\n", + "\u001b[34m[47]#011train-auc:0.97083#011validation-auc:0.97467\u001b[0m\n", + "\u001b[34m[48]#011train-auc:0.97080#011validation-auc:0.97467\u001b[0m\n", + "\u001b[34m[49]#011train-auc:0.97067#011validation-auc:0.97456\u001b[0m\n", + "\u001b[34m[50]#011train-auc:0.97121#011validation-auc:0.97456\u001b[0m\n", + "\u001b[34m[51]#011train-auc:0.97121#011validation-auc:0.97456\u001b[0m\n", + "\u001b[34m[52]#011train-auc:0.97181#011validation-auc:0.97461\u001b[0m\n", + "\u001b[34m[53]#011train-auc:0.97159#011validation-auc:0.97461\u001b[0m\n", + "\u001b[34m[54]#011train-auc:0.97159#011validation-auc:0.97461\u001b[0m\n", + "\u001b[34m[55]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", + 
"\u001b[34m[56]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", + "\u001b[34m[57]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", + "\u001b[34m[58]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", + "\u001b[34m[59]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", + "\u001b[34m[60]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", + "\u001b[34m[61]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", + "\u001b[34m[62]#011train-auc:0.97333#011validation-auc:0.97515\u001b[0m\n", + "\u001b[34m[63]#011train-auc:0.97349#011validation-auc:0.97515\u001b[0m\n", + "\u001b[34m[64]#011train-auc:0.97359#011validation-auc:0.97515\u001b[0m\n", + "\u001b[34m[65]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", + "\u001b[34m[66]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", + "\u001b[34m[67]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", + "\n", + "2022-04-29 22:06:07 Uploading - Uploading generated training model\n", + "2022-04-29 22:06:07 Completed - Training job completed\n", + "Training seconds: 113\n", + "Billable seconds: 113\n", + "CPU times: user 994 ms, sys: 74.8 ms, total: 1.07 s\n", + "Wall time: 4min 13s\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TrialComponentNameDisplayNameSourceArnSageMaker.ImageUriSageMaker.InstanceCountSageMaker.InstanceTypeSageMaker.VolumeSizeInGBearly_stopping_roundsetaeval_metric...train:auc - Lasttrain:auc - Counttrain - MediaTypetrain - Valuevalidation - MediaTypevalidation - ValueSageMaker.ModelArtifact - MediaTypeSageMaker.ModelArtifact - ValueTrialsExperiments
0sagemaker-xgboost-2022-04-29-22-02-17-343-aws-...churn-xgboostarn:aws:sagemaker:us-west-2:688520471316:train...246618743249.dkr.ecr.us-west-2.amazonaws.com/s...1.0ml.m4.xlarge30.050.00.08auc...0.9690535csvs3://sagemaker-us-west-2-688520471316/music-st...csvs3://sagemaker-us-west-2-688520471316/music-st...Nones3://sagemaker-us-west-2-688520471316/music-st...[xgboost][music-streaming-churn-exp]
\n", + "

1 rows × 36 columns

\n", + "
" + ], + "text/plain": [ + " TrialComponentName DisplayName \\\n", + "0 sagemaker-xgboost-2022-04-29-22-02-17-343-aws-... churn-xgboost \n", + "\n", + " SourceArn \\\n", + "0 arn:aws:sagemaker:us-west-2:688520471316:train... \n", + "\n", + " SageMaker.ImageUri SageMaker.InstanceCount \\\n", + "0 246618743249.dkr.ecr.us-west-2.amazonaws.com/s... 1.0 \n", + "\n", + " SageMaker.InstanceType SageMaker.VolumeSizeInGB early_stopping_rounds \\\n", + "0 ml.m4.xlarge 30.0 50.0 \n", + "\n", + " eta eval_metric ... train:auc - Last train:auc - Count \\\n", + "0 0.08 auc ... 0.96905 35 \n", + "\n", + " train - MediaType train - Value \\\n", + "0 csv s3://sagemaker-us-west-2-688520471316/music-st... \n", + "\n", + " validation - MediaType validation - Value \\\n", + "0 csv s3://sagemaker-us-west-2-688520471316/music-st... \n", + "\n", + " SageMaker.ModelArtifact - MediaType \\\n", + "0 None \n", + "\n", + " SageMaker.ModelArtifact - Value Trials \\\n", + "0 s3://sagemaker-us-west-2-688520471316/music-st... 
[xgboost] \n", + "\n", + " Experiments \n", + "0 [music-streaming-churn-exp] \n", + "\n", + "[1 rows x 36 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "%%time\n", "from smexperiments import experiment, trial\n", @@ -350,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -382,7 +710,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -442,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -455,13 +783,12 @@ ], "source": [ "# custom a tuner job name\n", - "tuning_job_name = \"ChurnPrediction-Tuning-Job\"\n", - "%store tuning_job_name" + "tuning_job_name = \"ChurnPrediction-Tuning-Job-{}\".format(datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\"))" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -472,16 +799,35 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'HyperParameterTuningJobSummaries': [],\n", + " 'NextToken': 'cIws2QhTXUIa8bi8W47LEKvF+FCR8eCxw7lm05/6M4GEnbWgUtoUJdlQBSv7kOsUKyeD3vlHXjc+jwuuBpymNWHzVbJTQRgpv3gKfmL4gypEQUDRwvEqhJzEDswtvI3HovY77Q4w795ItXG+PyA0eT/CNgcnCrkGC1ZBCjvUDG3ik8HgfI2+WPs8rSJrNtI86VXlB+tKqBzfn6e0wkIVyMjnAtA653gJLJ6HYJjCA4wq7Q5HqeZyUP62UPhU2KKXNbvdlD2x/3WC9Z37Re53/rYLhSnzqCBH0BVz1OS0vsRuL4QUzHmrVw/b6rngygpW57lbB2WQkZJqB9yyBXjOO/G3BELqDX7SKGDYEQw6j3jklpEwBM//HEqMOppRWmDr7bpGrVFs1aWy/a79jjTWTMe2916jd/I5RWvegPXL1o5E6lfkb+7ZbMelxH2Idtj8LF6B38/DNdYEDXnjeNoXRTjUTPBb5ay0ExcwPqHQs3wSax6Js7KazMxNQBDSVOcFJ7FfjGA/CTd71ya/S6l23g5PtLj8bPbn97oJn2Xej6tvFumWLATRDxWFTQIgE9mZylxrQEYM3kVymvSvzVg42WJdbtFzikOsFPjzyaO/T7lll9K2XUY7SWsybD+NQ/JNBSimd73sbOfy',\n", + " 
'ResponseMetadata': {'RequestId': '86f3a2ff-4ab7-40af-9262-0c8034a1347f',\n", + " 'HTTPStatusCode': 200,\n", + " 'HTTPHeaders': {'x-amzn-requestid': '86f3a2ff-4ab7-40af-9262-0c8034a1347f',\n", + " 'content-type': 'application/x-amz-json-1.1',\n", + " 'content-length': '706',\n", + " 'date': 'Fri, 29 Apr 2022 22:06:30 GMT'},\n", + " 'RetryAttempts': 0}}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "smclient.list_hyper_parameter_tuning_jobs(NameContains=tuning_job_name)" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -535,9 +881,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "InProgress\n", + "Completed\n", + "CPU times: user 556 ms, sys: 53.8 ms, total: 610 ms\n", + "Wall time: 25min 4s\n" + ] + } + ], "source": [ "%%time\n", "# check status\n", @@ -559,9 +940,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ChurnPrediction-Tuning-Job-010-f4b35971\n", + "\n", + "2022-04-29 22:19:24 Starting - Preparing the instances for training\n", + "2022-04-29 22:19:24 Downloading - Downloading input data\n", + "2022-04-29 22:19:24 Training - Training image download completed. 
Training in progress.\n", + "2022-04-29 22:19:24 Uploading - Uploading generated training model\n", + "2022-04-29 22:19:24 Completed - Training job completed\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker:Creating model with name: sagemaker-xgboost-2022-04-29-22-31-36-166\n", + "INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2022-04-29-22-31-36-813\n", + "INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2022-04-29-22-31-36-813\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-----!CPU times: user 306 ms, sys: 12.4 ms, total: 318 ms\n", + "Wall time: 2min 32s\n" + ] + } + ], "source": [ "%%time\n", "# Attach to an existing hyperparameter tuning job.\n", @@ -600,7 +1012,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -614,7 +1026,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -639,11 +1051,213 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker:Creating model with name: sagemaker-xgboost-2022-04-29-22-34-07-950\n", + "INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2022-04-29-22-34-08-567\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".................................\n", + ".\u001b[34m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:30:INFO] nginx config: \u001b[0m\n", + "\u001b[34mworker_processes auto;\u001b[0m\n", + "\u001b[34mdaemon off;\u001b[0m\n", + "\u001b[34mpid /tmp/nginx.pid;\u001b[0m\n", + 
"\u001b[34merror_log /dev/stderr;\u001b[0m\n", + "\u001b[34mworker_rlimit_nofile 4096;\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:30:INFO] nginx config: \u001b[0m\n", + "\u001b[35mworker_processes auto;\u001b[0m\n", + "\u001b[35mdaemon off;\u001b[0m\n", + "\u001b[35mpid /tmp/nginx.pid;\u001b[0m\n", + "\u001b[35merror_log /dev/stderr;\u001b[0m\n", + "\u001b[35mworker_rlimit_nofile 4096;\u001b[0m\n", + "\u001b[34mevents {\n", + " worker_connections 2048;\u001b[0m\n", + "\u001b[34m}\u001b[0m\n", + "\u001b[34mhttp {\n", + " include /etc/nginx/mime.types;\n", + " default_type application/octet-stream;\n", + " access_log /dev/stdout combined;\n", + " upstream gunicorn {\n", + " server unix:/tmp/gunicorn.sock;\n", + " }\n", + " server {\n", + " listen 8080 deferred;\n", + " client_max_body_size 0;\n", + " keepalive_timeout 3;\n", + " location ~ ^/(ping|invocations|execution-parameters) {\n", + " proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n", + " proxy_set_header Host $http_host;\n", + " proxy_redirect off;\n", + " proxy_read_timeout 60s;\n", + " proxy_pass http://gunicorn;\n", + " }\n", + " location / {\n", + " return 404 \"{}\";\n", + " }\n", + " }\u001b[0m\n", + "\u001b[34m}\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Starting gunicorn 19.10.0\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Listening at: unix:/tmp/gunicorn.sock (18)\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Using worker: gevent\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [25] [INFO] Booting worker with pid: 25\u001b[0m\n", + "\u001b[35mevents {\n", + " worker_connections 2048;\u001b[0m\n", + "\u001b[35m}\u001b[0m\n", + "\u001b[35mhttp {\n", + " include /etc/nginx/mime.types;\n", + " default_type 
application/octet-stream;\n", + " access_log /dev/stdout combined;\n", + " upstream gunicorn {\n", + " server unix:/tmp/gunicorn.sock;\n", + " }\n", + " server {\n", + " listen 8080 deferred;\n", + " client_max_body_size 0;\n", + " keepalive_timeout 3;\n", + " location ~ ^/(ping|invocations|execution-parameters) {\n", + " proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n", + " proxy_set_header Host $http_host;\n", + " proxy_redirect off;\n", + " proxy_read_timeout 60s;\n", + " proxy_pass http://gunicorn;\n", + " }\n", + " location / {\n", + " return 404 \"{}\";\n", + " }\n", + " }\u001b[0m\n", + "\u001b[35m}\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Starting gunicorn 19.10.0\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Listening at: unix:/tmp/gunicorn.sock (18)\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Using worker: gevent\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [25] [INFO] Booting worker with pid: 25\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [26] [INFO] Booting worker with pid: 26\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [31] [INFO] Booting worker with pid: 31\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [30] [INFO] Booting worker with pid: 30\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [26] [INFO] Booting worker with pid: 26\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [31] [INFO] Booting worker with pid: 31\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [30] [INFO] Booting worker with pid: 30\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /ping HTTP/1.1\" 200 0 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /ping 
HTTP/1.1\" 200 0 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /execution-parameters HTTP/1.1\" 200 84 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"POST /invocations HTTP/1.1\" 200 2006 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /execution-parameters HTTP/1.1\" 200 84 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"POST /invocations HTTP/1.1\" 200 2006 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[32m2022-04-29T22:39:37.156:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:30:INFO] nginx config: \u001b[0m\n", + "\u001b[34mworker_processes auto;\u001b[0m\n", + "\u001b[34mdaemon off;\u001b[0m\n", + "\u001b[34mpid /tmp/nginx.pid;\u001b[0m\n", + "\u001b[34merror_log /dev/stderr;\u001b[0m\n", + "\u001b[34mworker_rlimit_nofile 4096;\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:30:INFO] nginx config: \u001b[0m\n", + "\u001b[35mworker_processes auto;\u001b[0m\n", + "\u001b[35mdaemon off;\u001b[0m\n", + "\u001b[35mpid /tmp/nginx.pid;\u001b[0m\n", + "\u001b[35merror_log /dev/stderr;\u001b[0m\n", + "\u001b[35mworker_rlimit_nofile 4096;\u001b[0m\n", + 
"\u001b[34mevents {\n", + " worker_connections 2048;\u001b[0m\n", + "\u001b[34m}\u001b[0m\n", + "\u001b[34mhttp {\n", + " include /etc/nginx/mime.types;\n", + " default_type application/octet-stream;\n", + " access_log /dev/stdout combined;\n", + " upstream gunicorn {\n", + " server unix:/tmp/gunicorn.sock;\n", + " }\n", + " server {\n", + " listen 8080 deferred;\n", + " client_max_body_size 0;\n", + " keepalive_timeout 3;\n", + " location ~ ^/(ping|invocations|execution-parameters) {\n", + " proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n", + " proxy_set_header Host $http_host;\n", + " proxy_redirect off;\n", + " proxy_read_timeout 60s;\n", + " proxy_pass http://gunicorn;\n", + " }\n", + " location / {\n", + " return 404 \"{}\";\n", + " }\n", + " }\u001b[0m\n", + "\u001b[34m}\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Starting gunicorn 19.10.0\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Listening at: unix:/tmp/gunicorn.sock (18)\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Using worker: gevent\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [25] [INFO] Booting worker with pid: 25\u001b[0m\n", + "\u001b[35mevents {\n", + " worker_connections 2048;\u001b[0m\n", + "\u001b[35m}\u001b[0m\n", + "\u001b[35mhttp {\n", + " include /etc/nginx/mime.types;\n", + " default_type application/octet-stream;\n", + " access_log /dev/stdout combined;\n", + " upstream gunicorn {\n", + " server unix:/tmp/gunicorn.sock;\n", + " }\n", + " server {\n", + " listen 8080 deferred;\n", + " client_max_body_size 0;\n", + " keepalive_timeout 3;\n", + " location ~ ^/(ping|invocations|execution-parameters) {\n", + " proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n", + " proxy_set_header Host $http_host;\n", + " proxy_redirect off;\n", + " proxy_read_timeout 60s;\n", + " proxy_pass http://gunicorn;\n", + " }\n", + " location / {\n", + " return 404 \"{}\";\n", + " }\n", + " }\u001b[0m\n", + 
"\u001b[35m}\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Starting gunicorn 19.10.0\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Listening at: unix:/tmp/gunicorn.sock (18)\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Using worker: gevent\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [25] [INFO] Booting worker with pid: 25\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [26] [INFO] Booting worker with pid: 26\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [31] [INFO] Booting worker with pid: 31\u001b[0m\n", + "\u001b[34m[2022-04-29 22:39:31 +0000] [30] [INFO] Booting worker with pid: 30\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [26] [INFO] Booting worker with pid: 26\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [31] [INFO] Booting worker with pid: 31\u001b[0m\n", + "\u001b[35m[2022-04-29 22:39:31 +0000] [30] [INFO] Booting worker with pid: 30\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /ping HTTP/1.1\" 200 0 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[35m[2022-04-29:22:39:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", + "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /ping HTTP/1.1\" 200 0 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /execution-parameters HTTP/1.1\" 200 84 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[34m[2022-04-29:22:39:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"POST /invocations HTTP/1.1\" 200 2006 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /execution-parameters HTTP/1.1\" 200 84 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + 
"\u001b[35m[2022-04-29:22:39:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", + "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"POST /invocations HTTP/1.1\" 200 2006 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", + "\u001b[32m2022-04-29T22:39:37.156:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n" + ] + } + ], "source": [ "transformer = best_model.transformer(\n", " instance_count=1, instance_type=\"ml.m4.xlarge\", output_path=batch_output\n", @@ -663,7 +1277,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -672,7 +1286,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -694,7 +1308,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -727,31 +1341,31 @@ " \n", " 0\n", " 0.0\n", - " 0.124609\n", + " 0.069090\n", " 0\n", " \n", " \n", " 1\n", " 0.0\n", - " 0.124609\n", + " 0.026726\n", " 0\n", " \n", " \n", " 2\n", " 0.0\n", - " 0.199627\n", + " 0.027686\n", " 0\n", " \n", " \n", " 3\n", " 0.0\n", - " 0.261825\n", + " 0.080675\n", " 0\n", " \n", " \n", " 4\n", " 0.0\n", - " 0.251063\n", + " 0.017445\n", " 0\n", " \n", " \n", @@ -763,31 +1377,31 @@ " \n", " 97\n", " 1.0\n", - " 0.880904\n", + " 0.986643\n", " 1\n", " \n", " \n", " 98\n", " 1.0\n", - " 0.879375\n", + " 0.964494\n", " 1\n", " \n", " \n", " 99\n", " 1.0\n", - " 0.135027\n", + " 0.106448\n", " 0\n", " \n", " \n", " 100\n", " 1.0\n", - " 0.898226\n", + " 0.975653\n", " 1\n", " \n", " \n", " 101\n", " 1.0\n", - " 0.886231\n", + " 0.989780\n", " 1\n", " \n", " \n", @@ -797,22 +1411,22 @@ ], "text/plain": [ " user_churned predicted_results predicted_binary\n", - "0 0.0 0.124609 0\n", - "1 0.0 0.124609 0\n", - "2 0.0 0.199627 0\n", - "3 0.0 0.261825 0\n", - "4 0.0 0.251063 0\n", + "0 0.0 0.069090 0\n", + "1 0.0 0.026726 
0\n", + "2 0.0 0.027686 0\n", + "3 0.0 0.080675 0\n", + "4 0.0 0.017445 0\n", ".. ... ... ...\n", - "97 1.0 0.880904 1\n", - "98 1.0 0.879375 1\n", - "99 1.0 0.135027 0\n", - "100 1.0 0.898226 1\n", - "101 1.0 0.886231 1\n", + "97 1.0 0.986643 1\n", + "98 1.0 0.964494 1\n", + "99 1.0 0.106448 0\n", + "100 1.0 0.975653 1\n", + "101 1.0 0.989780 1\n", "\n", "[102 rows x 3 columns]" ] }, - "execution_count": 24, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -833,7 +1447,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -841,9 +1455,9 @@ "output_type": "stream", "text": [ "Test Evaluation: \n", - "Average F1 Score: 0.8736913204998312\n", - "Precision Score: 0.9285714285714286\n", - "Recall Score: 0.7428571428571429\n" + "Average F1 Score: 0.8861607142857144\n", + "Precision Score: 0.9310344827586207\n", + "Recall Score: 0.7714285714285715\n" ] } ], @@ -878,9 +1492,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.\n", + "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n" + ] + } + ], "source": [ "from sagemaker import clarify\n", "\n", @@ -891,7 +1514,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -904,7 +1527,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -915,7 +1538,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -924,7 +1547,7 @@ "{'predicted_binary', 'predicted_results', 'user_churned'}" ] }, - "execution_count": 29, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } 
@@ -935,9 +1558,18 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:sagemaker.deprecations:DataConfig will be deprecated on 15 Mar 2022.s3_data_distribution_type parameter will no longer be supported. Everything else will remain as is in sagemaker>=2.\n", + "See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.\n" + ] + } + ], "source": [ "shap_config = clarify.SHAPConfig(\n", " baseline=[test_set.iloc[0].values.tolist()], num_samples=100, agg_method=\"mean_abs\"\n", @@ -963,11 +1595,151 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:sagemaker:Creating processing-job with name Clarify-Explainability-2022-04-29-22-39-55-756\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Job Name: Clarify-Explainability-2022-04-29-22-39-55-756\n", + "Inputs: [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/train/train.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/clarify-explainability/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/clarify-explainability', 'LocalPath': 
'/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]\n", + ".....................................\u001b[34m2022-04-29 22:45:55,125 logging.conf not found when configuring logging, using default logging configuration.\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,125 Starting SageMaker Clarify Processing job\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,126 Analysis config path: /opt/ml/processing/input/config/analysis_config.json\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,126 Analysis result path: /opt/ml/processing/output\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,127 This host is algo-1.\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,127 This host is the leader.\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,127 Number of hosts in the cluster is 1.\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,287 Running Python / Pandas based analyzer.\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,288 Dataset type: text/csv uri: /opt/ml/processing/input/data\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,296 Loading dataset...\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,314 Loaded dataset. 
Dataset info:\u001b[0m\n", + "\u001b[34m\u001b[0m\n", + "\u001b[34mRangeIndex: 708 entries, 0 to 707\u001b[0m\n", + "\u001b[34mData columns (total 25 columns):\n", + " # Column Non-Null Count Dtype \u001b[0m\n", + "\u001b[34m--- ------ -------------- ----- \n", + " 0 average_events_weekend 708 non-null float64\n", + " 1 average_events_weekday 708 non-null float64\n", + " 2 num_songs_played_7d 708 non-null int64 \n", + " 3 num_ads_7d 708 non-null int64 \n", + " 4 num_error_7d 708 non-null int64 \n", + " 5 num_songs_played_30d 708 non-null int64 \n", + " 6 num_songs_played_90d 708 non-null int64 \n", + " 7 num_sessions 708 non-null int64 \n", + " 8 avg_time_per_session 708 non-null float64\n", + " 9 avg_events_per_session 708 non-null float64\n", + " 10 avg_gap_between_session 708 non-null float64\n", + " 11 num_events 708 non-null int64 \n", + " 12 num_songs 708 non-null int64 \n", + " 13 num_artists 708 non-null int64 \n", + " 14 num_thumbs_down 708 non-null int64 \n", + " 15 num_thumbs_up 708 non-null int64 \n", + " 16 num_add_to_playlist 708 non-null int64 \n", + " 17 num_ads 708 non-null int64 \n", + " 18 num_add_friend 708 non-null int64 \n", + " 19 num_downgrade 708 non-null int64 \n", + " 20 num_upgrade 708 non-null int64 \n", + " 21 num_error 708 non-null int64 \n", + " 22 percentage_ad 708 non-null float64\n", + " 23 days_since_active 708 non-null int64 \n", + " 24 repeats_ratio 708 non-null float64\u001b[0m\n", + "\u001b[34mdtypes: float64(7), int64(18)\u001b[0m\n", + "\u001b[34mmemory usage: 138.4 KB\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,482 Spinning up shadow endpoint\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,483 Creating endpoint-config with name sm-clarify-config-1651272355-5750\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,567 Creating endpoint: 'sm-clarify-sagemaker-xgboost-2022-04-29-22-34-0-1651272355-86d1'\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,831 Using endpoint name: 
sm-clarify-sagemaker-xgboost-2022-04-29-22-34-0-1651272355-86d1\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,832 Waiting for endpoint ...\u001b[0m\n", + "\u001b[34m2022-04-29 22:45:55,832 Checking endpoint status:\u001b[0m\n", + "\u001b[34mLegend:\u001b[0m\n", + "\u001b[34m(OutOfService: x, Creating: -, Updating: -, InService: !, RollingBack: <, Deleting: o, Failed: *)\u001b[0m\n", + "\u001b[34m2022-04-29 22:49:56,323 Endpoint is in service after 240 seconds\u001b[0m\n", + "\u001b[34m2022-04-29 22:49:56,323 Endpoint ready.\u001b[0m\n", + "\u001b[34m2022-04-29 22:49:56,326 SHAP n_samples 100\u001b[0m\n", + "\u001b[34m2022-04-29 22:49:56,480 =====================================================\u001b[0m\n", + "\u001b[34m2022-04-29 22:49:56,480 Shap analyzer: explaining 708 rows, 25 columns...\u001b[0m\n", + "\u001b[34m2022-04-29 22:49:56,480 =====================================================\n", + " 0% (0 of 708) | | Elapsed Time: 0:00:00 ETA: --:--:--\u001b[0m\n", + "\u001b[34m100% (708 of 708) |######################| Elapsed Time: 0:00:16 Time: 0:00:16\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,201 getting explanations took 16.72 seconds.\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,201 ===================================================\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,271 converting explanations to tabular took 0.07 seconds.\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,271 ===================================================\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,274 Wrote baseline used to compute explanations to: /opt/ml/processing/output/explanations_shap/baseline.csv\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,298 Wrote 708 local explanations to: /opt/ml/processing/output/explanations_shap/out.csv\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,299 writing local explanations took 0.03 seconds.\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,299 ===================================================\u001b[0m\n", + "\u001b[34m2022-04-29 
22:50:13,301 aggregating local explanations took 0.00 seconds.\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,301 ===================================================\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,301 Shap analysis finished.\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,302 Stop using endpoint: sm-clarify-sagemaker-xgboost-2022-04-29-22-34-0-1651272355-86d1\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,302 Deleting endpoint configuration with name: sm-clarify-config-1651272355-5750\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,407 Deleting endpoint with name: sm-clarify-sagemaker-xgboost-2022-04-29-22-34-0-1651272355-86d1\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:13,528 Model endpoint delivered 41.81825 requests per second and a total of 710 requests over 17 seconds\u001b[0m\n", + "\u001b[34m2022-04-29 22:50:17,557 Stop using endpoint: None\u001b[0m\n", + "\n", + "\u001b[34m2022-04-29 22:51:24,009 jupyter nbconvert --to html --output /opt/ml/processing/output/report.html /opt/ml/processing/output/report.ipynb --template sagemaker-xai\u001b[0m\n", + "\u001b[34m[NbConvertApp] Converting notebook /opt/ml/processing/output/report.ipynb to html\u001b[0m\n", + "\u001b[34m[NbConvertApp] Writing 534585 bytes to /opt/ml/processing/output/report.html\u001b[0m\n", + "\u001b[34m2022-04-29 22:51:25,148 HTML report '/opt/ml/processing/output/report.html' generated successfully.\u001b[0m\n", + "\u001b[34m2022-04-29 22:51:25,148 wkhtmltopdf -q /opt/ml/processing/output/report.html /opt/ml/processing/output/report.pdf\u001b[0m\n", + "\u001b[34m2022-04-29 22:51:25,809 PDF report '/opt/ml/processing/output/report.pdf' generated successfully.\u001b[0m\n", + "\u001b[34m2022-04-29 22:51:25,810 Collected analyses: \u001b[0m\n", + "\u001b[34m{\n", + " \"version\": \"1.0\",\n", + " \"explanations\": {\n", + " \"kernel_shap\": {\n", + " \"label0\": {\n", + " \"global_shap_values\": {\n", + " \"average_events_weekend\": 0.0021496041898171037,\n", + " 
\"average_events_weekday\": 0.016587702435219898,\n", + " \"num_songs_played_7d\": 0.0017402863783950045,\n", + " \"num_ads_7d\": 0.00868551809925556,\n", + " \"num_error_7d\": 0.0027734162552485904,\n", + " \"num_songs_played_30d\": 0.0011722262443900576,\n", + " \"num_songs_played_90d\": 0.0012286071144514484,\n", + " \"num_sessions\": 0.0011579425530513692,\n", + " \"avg_time_per_session\": 0.010029593923214982,\n", + " \"avg_events_per_session\": 0.003897350916100629,\n", + " \"avg_gap_between_session\": 0.0448815105461223,\n", + " \"num_events\": 0.0010721354650824332,\n", + " \"num_songs\": 0.0010824064811915818,\n", + " \"num_artists\": 0.0011435948826397841,\n", + " \"num_thumbs_down\": 0.0022934530809684047,\n", + " \"num_thumbs_up\": 0.0010034211293590313,\n", + " \"num_add_to_playlist\": 0.001163952744967679,\n", + " \"num_ads\": 0.0010789208860291896,\n", + " \"num_add_friend\": 0.005465492600682657,\n", + " \"num_downgrade\": 0.001048377920399092,\n", + " \"num_upgrade\": 0.002673292801161779,\n", + " \"num_error\": 0.0011517907319294842,\n", + " \"percentage_ad\": 0.02861671713801823,\n", + " \"days_since_active\": 0.29080317686448776,\n", + " \"repeats_ratio\": 0.001010326307543725\n", + " },\n", + " \"expected_value\": 0.06909029185771942\n", + " }\n", + " }\n", + " }\u001b[0m\n", + "\u001b[34m}\u001b[0m\n", + "\u001b[34m2022-04-29 22:51:25,810 exit_message: Completed: SageMaker XAI Analyzer ran successfully\u001b[0m\n", + "\u001b[34m----!\u001b[0m\n" + ] + } + ], "source": [ "clarify_processor.run_explainability(\n", " data_config=explainability_data_config,\n", @@ -997,7 +1769,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -1005,14 +1777,34 @@ "validation_data = pd.read_csv(\"data/validation_w_header.csv\")\n", "\n", "data_for_experiment = pd.concat([train_data, validation_data])\n", - "data_for_experiment.to_csv(\"full_feature_data.csv\", index=False)\n", - 
"s3_input_full_set = (\n", - " boto3.Session()\n", - " .resource(\"s3\")\n", - " .Bucket(bucket)\n", - " .Object(os.path.join(prefix, \"full/fullset.csv\"))\n", - " .upload_file(\"full_feature_data.csv\")\n", - ")" + "data_for_experiment.to_csv(\"full_feature_data_temp.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modeling and Reference\n", + "\n", + "Now that you have created the complete feature set, you can start to explore and find a best-working model for your churn use case. By the end of part 2, you will select an algorithm, find the best sets of hyperparameter for the model, examine how well the model performs, and finally find the top influential features.\n", + "\n", + "To start with Part 2, you can either read in data from the output of your Part 1 results, or use the provided 'data/full_feature_data.csv' as the input (variable dataframe `processed_data`) for the next steps. " + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "# pd.read(\"full_feature_data.csv\")\n", + "# s3_input_full_set = (\n", + "# boto3.Session()\n", + "# .resource(\"s3\")\n", + "# .Bucket(bucket)\n", + "# .Object(os.path.join(prefix, \"full/fullset.csv\"))\n", + "# .upload_file(\"full_feature_data.csv\")\n", + "# )" ] }, { @@ -1062,14 +1854,21 @@ "\n", "The data used in this notebook is simulated using the [EventSim](https://github.com/Interana/eventsim)." 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "conda_python3", + "display_name": "Python 3 (Data Science)", "language": "python", - "name": "conda_python3" + "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" }, "language_info": { "codemirror_mode": { @@ -1081,7 +1880,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.7.10" } }, "nbformat": 4, From 787d8a1e4ce98d583e90fef693ff12305269263a Mon Sep 17 00:00:00 2001 From: atqy Date: Sat, 30 Apr 2022 00:18:19 +0000 Subject: [PATCH 10/27] refactor sequential noteboks 1 --- .../0_cust_churn_overview_dw.ipynb | 12 +- .../2_cust_churn_train_deploy_infer.ipynb | 1149 +++-------------- 2 files changed, 150 insertions(+), 1011 deletions(-) diff --git a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb b/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb index 0d270b8a89..a00ba25e52 100644 --- a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb +++ b/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb @@ -918,8 +918,6 @@ "outputs": [], "source": [ "processing_output_filename = f\"{processing_output_path}/{final_features_filename}\"\n", - "# %store processing_output_filename\n", - "# %store -r\n", "processing_output_filename" ] }, @@ -941,15 +939,7 @@ " ] = processing_output_filename\n", "\n", "with open(\"dw_example.flow\", \"w\") as f:\n", - " json.dump(flow, f)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + " json.dump(flow, f)\n", "flow" ] }, diff --git a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb index 604dd393bb..0fe4474302 100644 --- 
a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb +++ b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb @@ -123,7 +123,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -133,12 +133,13 @@ "import glob\n", "import s3fs\n", "import boto3\n", - "from datetime import datetime" + "from datetime import datetime\n", + "import os" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -152,14 +153,104 @@ "prefix = \"music-streaming\"" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "\n", + "### Download Data and Upload to S3\n", + "\n", + "We ingest the simulated data from the public SageMaker S3 training database. If you want to see how the train, test, and validation datasets are created in detail, look at [Build a Customer Churn Model for Music Streaming App Users: Overview and Data Preparation](0_cust_churn_overview_dw.ipynb)" + ] + }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "container = sagemaker.image_uris.retrieve(\n", - " \"xgboost\", region, version=\"1.0-1\", instance_type=\"ml.m4.xlarge\"\n", + "##### Alternative: copy data from a public S3 bucket to your own bucket\n", + "##### data file should include full_data.csv and sample.json\n", + "#### cell 5 - 7 is not needed; the processing job before data wrangler screenshots is not needed\n", + "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data.zip ./data/raw/customer-churn-data.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!unzip -o ./data/raw/customer-churn-data.zip -d ./data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# unzip the partitioned data 
files into the same folder\n", + "!unzip -o data/simu-1.zip -d data/raw\n", + "!unzip -o data/simu-2.zip -d data/raw\n", + "!unzip -o data/simu-3.zip -d data/raw\n", + "!unzip -o data/simu-4.zip -d data/raw" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm ./data/raw/*.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!unzip -o data/sample.zip -d data/raw" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!aws s3 cp ./data/raw s3://$bucket/$prefix/data/json/ --recursive" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "s3_input_train = (\n", + " boto3.Session()\n", + " .resource(\"s3\")\n", + " .Bucket(bucket)\n", + " .Object(os.path.join(prefix, \"train/train.csv\"))\n", + " .upload_file(\"data/train_updated.csv\")\n", + ")\n", + "s3_input_validation = (\n", + " boto3.Session()\n", + " .resource(\"s3\")\n", + " .Bucket(bucket)\n", + " .Object(os.path.join(prefix, \"validation/validation.csv\"))\n", + " .upload_file(\"data/validation_updated.csv\")\n", + ")\n", + "s3_input_validation = (\n", + " boto3.Session()\n", + " .resource(\"s3\")\n", + " .Bucket(bucket)\n", + " .Object(os.path.join(prefix, \"test/test_labeled.csv\"))\n", + " .upload_file(\"data/test_updated.csv\")\n", ")" ] }, @@ -172,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -198,22 +289,18 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "CPU times: user 102 µs, sys: 0 ns, total: 102 µs\n", - "Wall time: 309 µs\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "from time import gmtime, strftime\n", "\n", + "container 
= sagemaker.image_uris.retrieve(\n", + " \"xgboost\", region, version=\"1.0-1\", instance_type=\"ml.m4.xlarge\"\n", + ")\n", + "\n", + "\n", "xgb = sagemaker.estimator.Estimator(\n", " container,\n", " role,\n", @@ -228,7 +315,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -245,114 +332,11 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-04-29 21:58:04 Starting - Starting the training job...\n", - "2022-04-29 21:58:28 Starting - Preparing the instances for trainingProfilerReport-1651269483: InProgress\n", - ".........\n", - "2022-04-29 21:59:56 Downloading - Downloading input data...\n", - "2022-04-29 22:00:31 Training - Downloading the training image......\n", - "2022-04-29 22:01:27 Training - Training image download completed. Training in progress..\u001b[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.\u001b[0m\n", - "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.\u001b[0m\n", - "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[22:01:32] 708x25 matrix with 17700 entries loaded from 
/opt/ml/input/data/train?format=csv&label_column=0&delimiter=,\u001b[0m\n", - "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[22:01:32] 204x25 matrix with 5100 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,\u001b[0m\n", - "\u001b[34mINFO:root:Single node training.\u001b[0m\n", - "\u001b[34mINFO:root:Train matrix has 708 rows\u001b[0m\n", - "\u001b[34mINFO:root:Validation matrix has 204 rows\u001b[0m\n", - "\u001b[34m[0]#011train-auc:0.91768#011validation-auc:0.94514\u001b[0m\n", - "\u001b[34m[1]#011train-auc:0.92026#011validation-auc:0.95180\u001b[0m\n", - "\u001b[34m[2]#011train-auc:0.93830#011validation-auc:0.95534\u001b[0m\n", - "\u001b[34m[3]#011train-auc:0.93852#011validation-auc:0.95507\u001b[0m\n", - "\u001b[34m[4]#011train-auc:0.95391#011validation-auc:0.96667\u001b[0m\n", - "\u001b[34m[5]#011train-auc:0.95654#011validation-auc:0.96758\u001b[0m\n", - "\u001b[34m[6]#011train-auc:0.95694#011validation-auc:0.96468\u001b[0m\n", - "\u001b[34m[7]#011train-auc:0.96200#011validation-auc:0.96473\u001b[0m\n", - "\u001b[34m[8]#011train-auc:0.96468#011validation-auc:0.96720\u001b[0m\n", - "\u001b[34m[9]#011train-auc:0.96311#011validation-auc:0.96699\u001b[0m\n", - "\u001b[34m[10]#011train-auc:0.96290#011validation-auc:0.96871\u001b[0m\n", - "\u001b[34m[11]#011train-auc:0.96521#011validation-auc:0.97434\u001b[0m\n", - "\u001b[34m[12]#011train-auc:0.96481#011validation-auc:0.97182\u001b[0m\n", - "\u001b[34m[13]#011train-auc:0.96483#011validation-auc:0.97386\u001b[0m\n", - "\u001b[34m[14]#011train-auc:0.96442#011validation-auc:0.97375\u001b[0m\n", - "\u001b[34m[15]#011train-auc:0.96458#011validation-auc:0.97418\u001b[0m\n", - "\u001b[34m[16]#011train-auc:0.96498#011validation-auc:0.97563\u001b[0m\n", - "\u001b[34m[17]#011train-auc:0.96619#011validation-auc:0.97724\u001b[0m\n", - "\u001b[34m[18]#011train-auc:0.96492#011validation-auc:0.97681\u001b[0m\n", - 
"\u001b[34m[19]#011train-auc:0.96406#011validation-auc:0.97584\u001b[0m\n", - "\u001b[34m[20]#011train-auc:0.96365#011validation-auc:0.97584\u001b[0m\n", - "\u001b[34m[21]#011train-auc:0.96428#011validation-auc:0.97381\u001b[0m\n", - "\u001b[34m[22]#011train-auc:0.96540#011validation-auc:0.97348\u001b[0m\n", - "\u001b[34m[23]#011train-auc:0.96511#011validation-auc:0.97445\u001b[0m\n", - "\u001b[34m[24]#011train-auc:0.96481#011validation-auc:0.97450\u001b[0m\n", - "\u001b[34m[25]#011train-auc:0.96465#011validation-auc:0.97504\u001b[0m\n", - "\u001b[34m[26]#011train-auc:0.96503#011validation-auc:0.97461\u001b[0m\n", - "\u001b[34m[27]#011train-auc:0.96627#011validation-auc:0.97364\u001b[0m\n", - "\u001b[34m[28]#011train-auc:0.96733#011validation-auc:0.97289\u001b[0m\n", - "\u001b[34m[29]#011train-auc:0.96781#011validation-auc:0.97354\u001b[0m\n", - "\u001b[34m[30]#011train-auc:0.96757#011validation-auc:0.97300\u001b[0m\n", - "\u001b[34m[31]#011train-auc:0.96827#011validation-auc:0.97300\u001b[0m\n", - "\u001b[34m[32]#011train-auc:0.96887#011validation-auc:0.97332\u001b[0m\n", - "\u001b[34m[33]#011train-auc:0.96900#011validation-auc:0.97354\u001b[0m\n", - "\u001b[34m[34]#011train-auc:0.96905#011validation-auc:0.97332\u001b[0m\n", - "\u001b[34m[35]#011train-auc:0.96980#011validation-auc:0.97440\u001b[0m\n", - "\u001b[34m[36]#011train-auc:0.96945#011validation-auc:0.97354\u001b[0m\n", - "\u001b[34m[37]#011train-auc:0.96924#011validation-auc:0.97354\u001b[0m\n", - "\u001b[34m[38]#011train-auc:0.96936#011validation-auc:0.97418\u001b[0m\n", - "\u001b[34m[39]#011train-auc:0.96936#011validation-auc:0.97418\u001b[0m\n", - "\u001b[34m[40]#011train-auc:0.96933#011validation-auc:0.97407\u001b[0m\n", - "\u001b[34m[41]#011train-auc:0.96896#011validation-auc:0.97343\u001b[0m\n", - "\u001b[34m[42]#011train-auc:0.96899#011validation-auc:0.97348\u001b[0m\n", - "\u001b[34m[43]#011train-auc:0.96945#011validation-auc:0.97359\u001b[0m\n", - 
"\u001b[34m[44]#011train-auc:0.96924#011validation-auc:0.97391\u001b[0m\n", - "\u001b[34m[45]#011train-auc:0.96974#011validation-auc:0.97423\u001b[0m\n", - "\u001b[34m[46]#011train-auc:0.97061#011validation-auc:0.97477\u001b[0m\n", - "\u001b[34m[47]#011train-auc:0.97083#011validation-auc:0.97467\u001b[0m\n", - "\u001b[34m[48]#011train-auc:0.97080#011validation-auc:0.97467\u001b[0m\n", - "\u001b[34m[49]#011train-auc:0.97067#011validation-auc:0.97456\u001b[0m\n", - "\u001b[34m[50]#011train-auc:0.97121#011validation-auc:0.97456\u001b[0m\n", - "\u001b[34m[51]#011train-auc:0.97121#011validation-auc:0.97456\u001b[0m\n", - "\u001b[34m[52]#011train-auc:0.97181#011validation-auc:0.97461\u001b[0m\n", - "\u001b[34m[53]#011train-auc:0.97159#011validation-auc:0.97461\u001b[0m\n", - "\u001b[34m[54]#011train-auc:0.97159#011validation-auc:0.97461\u001b[0m\n", - "\u001b[34m[55]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", - "\u001b[34m[56]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", - "\u001b[34m[57]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", - "\u001b[34m[58]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", - "\u001b[34m[59]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", - "\u001b[34m[60]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", - "\u001b[34m[61]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", - "\u001b[34m[62]#011train-auc:0.97333#011validation-auc:0.97515\u001b[0m\n", - "\u001b[34m[63]#011train-auc:0.97349#011validation-auc:0.97515\u001b[0m\n", - "\u001b[34m[64]#011train-auc:0.97359#011validation-auc:0.97515\u001b[0m\n", - "\u001b[34m[65]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", - "\u001b[34m[66]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", - "\u001b[34m[67]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", - "\n", - "2022-04-29 22:01:57 Uploading - Uploading generated training model\n", - "2022-04-29 22:01:57 Completed - Training job 
completed\n", - "Training seconds: 113\n", - "Billable seconds: 113\n", - "CPU times: user 465 ms, sys: 20.3 ms, total: 485 ms\n", - "Wall time: 4min 12s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "xgb.fit(inputs={\"train\": train_input, \"validation\": validation_input}, wait=True)" @@ -367,252 +351,20 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# custom trial name\n", - "experiment_name = \"music-streaming-churn-exp-{}\".format(datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\"))\n", - "trial_name_xgb = \"xgboost\"" + "experiment_name = \"music-streaming-churn-exp-{}\".format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n", + "trial_name_xgb = \"xgboost-{}\".format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Experiment creation music-streaming-churn-exp: SUCCESS\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.\n", - "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n", - "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2022-04-29-22-02-17-343\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Create trial xgboost: SUCCESSFUL\n", - "2022-04-29 22:02:17 Starting - Starting the training job...\n", - "2022-04-29 22:02:44 Starting - Preparing the instances for trainingProfilerReport-1651269737: InProgress\n", - ".........\n", - "2022-04-29 22:04:14 Downloading - Downloading input data...\n", - "2022-04-29 22:04:45 Training - Downloading the training image......\n", - "2022-04-29 22:05:46 Training - Training image download completed. 
Training in progress...\u001b[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training\u001b[0m\n", - "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter eval_metric value auc to Json.\u001b[0m\n", - "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.\u001b[0m\n", - "\u001b[34mReturning the value itself\u001b[0m\n", - "\u001b[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode\u001b[0m\n", - "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[22:05:50] 708x25 matrix with 17700 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,\u001b[0m\n", - "\u001b[34mINFO:root:Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m[22:05:50] 204x25 matrix with 5100 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,\u001b[0m\n", - "\u001b[34mINFO:root:Single node training.\u001b[0m\n", - "\u001b[34mINFO:root:Train matrix has 708 rows\u001b[0m\n", - "\u001b[34mINFO:root:Validation matrix has 204 rows\u001b[0m\n", - "\u001b[34m[0]#011train-auc:0.91768#011validation-auc:0.94514\u001b[0m\n", - "\u001b[34m[1]#011train-auc:0.92026#011validation-auc:0.95180\u001b[0m\n", - "\u001b[34m[2]#011train-auc:0.93830#011validation-auc:0.95534\u001b[0m\n", - "\u001b[34m[3]#011train-auc:0.93852#011validation-auc:0.95507\u001b[0m\n", - "\u001b[34m[4]#011train-auc:0.95391#011validation-auc:0.96667\u001b[0m\n", - "\u001b[34m[5]#011train-auc:0.95654#011validation-auc:0.96758\u001b[0m\n", - 
"\u001b[34m[6]#011train-auc:0.95694#011validation-auc:0.96468\u001b[0m\n", - "\u001b[34m[7]#011train-auc:0.96200#011validation-auc:0.96473\u001b[0m\n", - "\u001b[34m[8]#011train-auc:0.96468#011validation-auc:0.96720\u001b[0m\n", - "\u001b[34m[9]#011train-auc:0.96311#011validation-auc:0.96699\u001b[0m\n", - "\u001b[34m[10]#011train-auc:0.96290#011validation-auc:0.96871\u001b[0m\n", - "\u001b[34m[11]#011train-auc:0.96521#011validation-auc:0.97434\u001b[0m\n", - "\u001b[34m[12]#011train-auc:0.96481#011validation-auc:0.97182\u001b[0m\n", - "\u001b[34m[13]#011train-auc:0.96483#011validation-auc:0.97386\u001b[0m\n", - "\u001b[34m[14]#011train-auc:0.96442#011validation-auc:0.97375\u001b[0m\n", - "\u001b[34m[15]#011train-auc:0.96458#011validation-auc:0.97418\u001b[0m\n", - "\u001b[34m[16]#011train-auc:0.96498#011validation-auc:0.97563\u001b[0m\n", - "\u001b[34m[17]#011train-auc:0.96619#011validation-auc:0.97724\u001b[0m\n", - "\u001b[34m[18]#011train-auc:0.96492#011validation-auc:0.97681\u001b[0m\n", - "\u001b[34m[19]#011train-auc:0.96406#011validation-auc:0.97584\u001b[0m\n", - "\u001b[34m[20]#011train-auc:0.96365#011validation-auc:0.97584\u001b[0m\n", - "\u001b[34m[21]#011train-auc:0.96428#011validation-auc:0.97381\u001b[0m\n", - "\u001b[34m[22]#011train-auc:0.96540#011validation-auc:0.97348\u001b[0m\n", - "\u001b[34m[23]#011train-auc:0.96511#011validation-auc:0.97445\u001b[0m\n", - "\u001b[34m[24]#011train-auc:0.96481#011validation-auc:0.97450\u001b[0m\n", - "\u001b[34m[25]#011train-auc:0.96465#011validation-auc:0.97504\u001b[0m\n", - "\u001b[34m[26]#011train-auc:0.96503#011validation-auc:0.97461\u001b[0m\n", - "\u001b[34m[27]#011train-auc:0.96627#011validation-auc:0.97364\u001b[0m\n", - "\u001b[34m[28]#011train-auc:0.96733#011validation-auc:0.97289\u001b[0m\n", - "\u001b[34m[29]#011train-auc:0.96781#011validation-auc:0.97354\u001b[0m\n", - "\u001b[34m[30]#011train-auc:0.96757#011validation-auc:0.97300\u001b[0m\n", - 
"\u001b[34m[31]#011train-auc:0.96827#011validation-auc:0.97300\u001b[0m\n", - "\u001b[34m[32]#011train-auc:0.96887#011validation-auc:0.97332\u001b[0m\n", - "\u001b[34m[33]#011train-auc:0.96900#011validation-auc:0.97354\u001b[0m\n", - "\u001b[34m[34]#011train-auc:0.96905#011validation-auc:0.97332\u001b[0m\n", - "\u001b[34m[35]#011train-auc:0.96980#011validation-auc:0.97440\u001b[0m\n", - "\u001b[34m[36]#011train-auc:0.96945#011validation-auc:0.97354\u001b[0m\n", - "\u001b[34m[37]#011train-auc:0.96924#011validation-auc:0.97354\u001b[0m\n", - "\u001b[34m[38]#011train-auc:0.96936#011validation-auc:0.97418\u001b[0m\n", - "\u001b[34m[39]#011train-auc:0.96936#011validation-auc:0.97418\u001b[0m\n", - "\u001b[34m[40]#011train-auc:0.96933#011validation-auc:0.97407\u001b[0m\n", - "\u001b[34m[41]#011train-auc:0.96896#011validation-auc:0.97343\u001b[0m\n", - "\u001b[34m[42]#011train-auc:0.96899#011validation-auc:0.97348\u001b[0m\n", - "\u001b[34m[43]#011train-auc:0.96945#011validation-auc:0.97359\u001b[0m\n", - "\u001b[34m[44]#011train-auc:0.96924#011validation-auc:0.97391\u001b[0m\n", - "\u001b[34m[45]#011train-auc:0.96974#011validation-auc:0.97423\u001b[0m\n", - "\u001b[34m[46]#011train-auc:0.97061#011validation-auc:0.97477\u001b[0m\n", - "\u001b[34m[47]#011train-auc:0.97083#011validation-auc:0.97467\u001b[0m\n", - "\u001b[34m[48]#011train-auc:0.97080#011validation-auc:0.97467\u001b[0m\n", - "\u001b[34m[49]#011train-auc:0.97067#011validation-auc:0.97456\u001b[0m\n", - "\u001b[34m[50]#011train-auc:0.97121#011validation-auc:0.97456\u001b[0m\n", - "\u001b[34m[51]#011train-auc:0.97121#011validation-auc:0.97456\u001b[0m\n", - "\u001b[34m[52]#011train-auc:0.97181#011validation-auc:0.97461\u001b[0m\n", - "\u001b[34m[53]#011train-auc:0.97159#011validation-auc:0.97461\u001b[0m\n", - "\u001b[34m[54]#011train-auc:0.97159#011validation-auc:0.97461\u001b[0m\n", - "\u001b[34m[55]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", - 
"\u001b[34m[56]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", - "\u001b[34m[57]#011train-auc:0.97246#011validation-auc:0.97504\u001b[0m\n", - "\u001b[34m[58]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", - "\u001b[34m[59]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", - "\u001b[34m[60]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", - "\u001b[34m[61]#011train-auc:0.97323#011validation-auc:0.97493\u001b[0m\n", - "\u001b[34m[62]#011train-auc:0.97333#011validation-auc:0.97515\u001b[0m\n", - "\u001b[34m[63]#011train-auc:0.97349#011validation-auc:0.97515\u001b[0m\n", - "\u001b[34m[64]#011train-auc:0.97359#011validation-auc:0.97515\u001b[0m\n", - "\u001b[34m[65]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", - "\u001b[34m[66]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", - "\u001b[34m[67]#011train-auc:0.97353#011validation-auc:0.97525\u001b[0m\n", - "\n", - "2022-04-29 22:06:07 Uploading - Uploading generated training model\n", - "2022-04-29 22:06:07 Completed - Training job completed\n", - "Training seconds: 113\n", - "Billable seconds: 113\n", - "CPU times: user 994 ms, sys: 74.8 ms, total: 1.07 s\n", - "Wall time: 4min 13s\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
TrialComponentNameDisplayNameSourceArnSageMaker.ImageUriSageMaker.InstanceCountSageMaker.InstanceTypeSageMaker.VolumeSizeInGBearly_stopping_roundsetaeval_metric...train:auc - Lasttrain:auc - Counttrain - MediaTypetrain - Valuevalidation - MediaTypevalidation - ValueSageMaker.ModelArtifact - MediaTypeSageMaker.ModelArtifact - ValueTrialsExperiments
0sagemaker-xgboost-2022-04-29-22-02-17-343-aws-...churn-xgboostarn:aws:sagemaker:us-west-2:688520471316:train...246618743249.dkr.ecr.us-west-2.amazonaws.com/s...1.0ml.m4.xlarge30.050.00.08auc...0.9690535csvs3://sagemaker-us-west-2-688520471316/music-st...csvs3://sagemaker-us-west-2-688520471316/music-st...Nones3://sagemaker-us-west-2-688520471316/music-st...[xgboost][music-streaming-churn-exp]
\n", - "

1 rows × 36 columns

\n", - "
" - ], - "text/plain": [ - " TrialComponentName DisplayName \\\n", - "0 sagemaker-xgboost-2022-04-29-22-02-17-343-aws-... churn-xgboost \n", - "\n", - " SourceArn \\\n", - "0 arn:aws:sagemaker:us-west-2:688520471316:train... \n", - "\n", - " SageMaker.ImageUri SageMaker.InstanceCount \\\n", - "0 246618743249.dkr.ecr.us-west-2.amazonaws.com/s... 1.0 \n", - "\n", - " SageMaker.InstanceType SageMaker.VolumeSizeInGB early_stopping_rounds \\\n", - "0 ml.m4.xlarge 30.0 50.0 \n", - "\n", - " eta eval_metric ... train:auc - Last train:auc - Count \\\n", - "0 0.08 auc ... 0.96905 35 \n", - "\n", - " train - MediaType train - Value \\\n", - "0 csv s3://sagemaker-us-west-2-688520471316/music-st... \n", - "\n", - " validation - MediaType validation - Value \\\n", - "0 csv s3://sagemaker-us-west-2-688520471316/music-st... \n", - "\n", - " SageMaker.ModelArtifact - MediaType \\\n", - "0 None \n", - "\n", - " SageMaker.ModelArtifact - Value Trials \\\n", - "0 s3://sagemaker-us-west-2-688520471316/music-st... 
[xgboost] \n", - "\n", - " Experiments \n", - "0 [music-streaming-churn-exp] \n", - "\n", - "[1 rows x 36 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "%%time\n", "from smexperiments import experiment, trial\n", @@ -678,7 +430,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -710,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -770,25 +522,17 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stored 'tuning_job_name' (str)\n" - ] - } - ], + "outputs": [], "source": [ "# custom a tuner job name\n", - "tuning_job_name = \"ChurnPrediction-Tuning-Job-{}\".format(datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\"))" + "tuning_job_name = \"ChurnPredictTune-{}\".format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -799,45 +543,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'HyperParameterTuningJobSummaries': [],\n", - " 'NextToken': 
'cIws2QhTXUIa8bi8W47LEKvF+FCR8eCxw7lm05/6M4GEnbWgUtoUJdlQBSv7kOsUKyeD3vlHXjc+jwuuBpymNWHzVbJTQRgpv3gKfmL4gypEQUDRwvEqhJzEDswtvI3HovY77Q4w795ItXG+PyA0eT/CNgcnCrkGC1ZBCjvUDG3ik8HgfI2+WPs8rSJrNtI86VXlB+tKqBzfn6e0wkIVyMjnAtA653gJLJ6HYJjCA4wq7Q5HqeZyUP62UPhU2KKXNbvdlD2x/3WC9Z37Re53/rYLhSnzqCBH0BVz1OS0vsRuL4QUzHmrVw/b6rngygpW57lbB2WQkZJqB9yyBXjOO/G3BELqDX7SKGDYEQw6j3jklpEwBM//HEqMOppRWmDr7bpGrVFs1aWy/a79jjTWTMe2916jd/I5RWvegPXL1o5E6lfkb+7ZbMelxH2Idtj8LF6B38/DNdYEDXnjeNoXRTjUTPBb5ay0ExcwPqHQs3wSax6Js7KazMxNQBDSVOcFJ7FfjGA/CTd71ya/S6l23g5PtLj8bPbn97oJn2Xej6tvFumWLATRDxWFTQIgE9mZylxrQEYM3kVymvSvzVg42WJdbtFzikOsFPjzyaO/T7lll9K2XUY7SWsybD+NQ/JNBSimd73sbOfy',\n", - " 'ResponseMetadata': {'RequestId': '86f3a2ff-4ab7-40af-9262-0c8034a1347f',\n", - " 'HTTPStatusCode': 200,\n", - " 'HTTPHeaders': {'x-amzn-requestid': '86f3a2ff-4ab7-40af-9262-0c8034a1347f',\n", - " 'content-type': 'application/x-amz-json-1.1',\n", - " 'content-length': '706',\n", - " 'date': 'Fri, 29 Apr 2022 22:06:30 GMT'},\n", - " 'RetryAttempts': 0}}" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "smclient.list_hyper_parameter_tuning_jobs(NameContains=tuning_job_name)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Create tuning job ChurnPrediction-Tuning-Job: SUCCESSFUL\n" - ] - } - ], + "outputs": [], "source": [ "from sagemaker.tuner import HyperparameterTuner\n", "\n", @@ -881,44 +598,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - 
"InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "InProgress\n", - "Completed\n", - "CPU times: user 556 ms, sys: 53.8 ms, total: 610 ms\n", - "Wall time: 25min 4s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "# check status\n", @@ -940,40 +622,9 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ChurnPrediction-Tuning-Job-010-f4b35971\n", - "\n", - "2022-04-29 22:19:24 Starting - Preparing the instances for training\n", - "2022-04-29 22:19:24 Downloading - Downloading input data\n", - "2022-04-29 22:19:24 Training - Training image download completed. Training in progress.\n", - "2022-04-29 22:19:24 Uploading - Uploading generated training model\n", - "2022-04-29 22:19:24 Completed - Training job completed\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker:Creating model with name: sagemaker-xgboost-2022-04-29-22-31-36-166\n", - "INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2022-04-29-22-31-36-813\n", - "INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2022-04-29-22-31-36-813\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "-----!CPU times: user 306 ms, sys: 12.4 ms, total: 318 ms\n", - "Wall time: 2min 32s\n" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "# Attach to an existing hyperparameter tuning job.\n", @@ -1012,7 +663,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1026,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1051,213 +702,11 @@ }, { "cell_type": "code", - "execution_count": 25, + 
"execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker:Creating model with name: sagemaker-xgboost-2022-04-29-22-34-07-950\n", - "INFO:sagemaker:Creating transform job with name: sagemaker-xgboost-2022-04-29-22-34-08-567\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ".................................\n", - ".\u001b[34m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:30:INFO] nginx config: \u001b[0m\n", - "\u001b[34mworker_processes auto;\u001b[0m\n", - "\u001b[34mdaemon off;\u001b[0m\n", - "\u001b[34mpid /tmp/nginx.pid;\u001b[0m\n", - "\u001b[34merror_log /dev/stderr;\u001b[0m\n", - "\u001b[34mworker_rlimit_nofile 4096;\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:30:INFO] nginx config: \u001b[0m\n", - "\u001b[35mworker_processes auto;\u001b[0m\n", - "\u001b[35mdaemon off;\u001b[0m\n", - "\u001b[35mpid /tmp/nginx.pid;\u001b[0m\n", - "\u001b[35merror_log /dev/stderr;\u001b[0m\n", - "\u001b[35mworker_rlimit_nofile 4096;\u001b[0m\n", - "\u001b[34mevents {\n", - " worker_connections 2048;\u001b[0m\n", - "\u001b[34m}\u001b[0m\n", - "\u001b[34mhttp {\n", - " include /etc/nginx/mime.types;\n", - " default_type application/octet-stream;\n", - " access_log /dev/stdout combined;\n", - " upstream gunicorn {\n", - " server unix:/tmp/gunicorn.sock;\n", - " }\n", - " server {\n", - " listen 8080 deferred;\n", - " client_max_body_size 0;\n", - " keepalive_timeout 3;\n", - " location ~ ^/(ping|invocations|execution-parameters) {\n", - " proxy_set_header X-Forwarded-For 
$proxy_add_x_forwarded_for;\n", - " proxy_set_header Host $http_host;\n", - " proxy_redirect off;\n", - " proxy_read_timeout 60s;\n", - " proxy_pass http://gunicorn;\n", - " }\n", - " location / {\n", - " return 404 \"{}\";\n", - " }\n", - " }\u001b[0m\n", - "\u001b[34m}\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Starting gunicorn 19.10.0\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Listening at: unix:/tmp/gunicorn.sock (18)\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Using worker: gevent\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [25] [INFO] Booting worker with pid: 25\u001b[0m\n", - "\u001b[35mevents {\n", - " worker_connections 2048;\u001b[0m\n", - "\u001b[35m}\u001b[0m\n", - "\u001b[35mhttp {\n", - " include /etc/nginx/mime.types;\n", - " default_type application/octet-stream;\n", - " access_log /dev/stdout combined;\n", - " upstream gunicorn {\n", - " server unix:/tmp/gunicorn.sock;\n", - " }\n", - " server {\n", - " listen 8080 deferred;\n", - " client_max_body_size 0;\n", - " keepalive_timeout 3;\n", - " location ~ ^/(ping|invocations|execution-parameters) {\n", - " proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n", - " proxy_set_header Host $http_host;\n", - " proxy_redirect off;\n", - " proxy_read_timeout 60s;\n", - " proxy_pass http://gunicorn;\n", - " }\n", - " location / {\n", - " return 404 \"{}\";\n", - " }\n", - " }\u001b[0m\n", - "\u001b[35m}\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Starting gunicorn 19.10.0\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Listening at: unix:/tmp/gunicorn.sock (18)\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Using worker: gevent\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [25] [INFO] Booting worker with pid: 25\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [26] [INFO] Booting worker with pid: 26\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 
+0000] [31] [INFO] Booting worker with pid: 31\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [30] [INFO] Booting worker with pid: 30\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [26] [INFO] Booting worker with pid: 26\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [31] [INFO] Booting worker with pid: 31\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [30] [INFO] Booting worker with pid: 30\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /ping HTTP/1.1\" 200 0 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /ping HTTP/1.1\" 200 0 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /execution-parameters HTTP/1.1\" 200 84 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"POST /invocations HTTP/1.1\" 200 2006 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /execution-parameters HTTP/1.1\" 200 84 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"POST /invocations HTTP/1.1\" 200 2006 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[32m2022-04-29T22:39:37.156:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus 
installed)\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:30:INFO] nginx config: \u001b[0m\n", - "\u001b[34mworker_processes auto;\u001b[0m\n", - "\u001b[34mdaemon off;\u001b[0m\n", - "\u001b[34mpid /tmp/nginx.pid;\u001b[0m\n", - "\u001b[34merror_log /dev/stderr;\u001b[0m\n", - "\u001b[34mworker_rlimit_nofile 4096;\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:30:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:30:INFO] nginx config: \u001b[0m\n", - "\u001b[35mworker_processes auto;\u001b[0m\n", - "\u001b[35mdaemon off;\u001b[0m\n", - "\u001b[35mpid /tmp/nginx.pid;\u001b[0m\n", - "\u001b[35merror_log /dev/stderr;\u001b[0m\n", - "\u001b[35mworker_rlimit_nofile 4096;\u001b[0m\n", - "\u001b[34mevents {\n", - " worker_connections 2048;\u001b[0m\n", - "\u001b[34m}\u001b[0m\n", - "\u001b[34mhttp {\n", - " include /etc/nginx/mime.types;\n", - " default_type application/octet-stream;\n", - " access_log /dev/stdout combined;\n", - " upstream gunicorn {\n", - " server unix:/tmp/gunicorn.sock;\n", - " }\n", - " server {\n", - " listen 8080 deferred;\n", - " client_max_body_size 0;\n", - " keepalive_timeout 3;\n", - " location ~ ^/(ping|invocations|execution-parameters) {\n", - " proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n", - " proxy_set_header Host $http_host;\n", - " proxy_redirect off;\n", - " proxy_read_timeout 60s;\n", - " proxy_pass http://gunicorn;\n", - " }\n", - " location / {\n", - " return 404 \"{}\";\n", - " }\n", - " }\u001b[0m\n", - "\u001b[34m}\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Starting gunicorn 19.10.0\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Listening at: unix:/tmp/gunicorn.sock (18)\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [18] [INFO] Using worker: gevent\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [25] [INFO] Booting 
worker with pid: 25\u001b[0m\n", - "\u001b[35mevents {\n", - " worker_connections 2048;\u001b[0m\n", - "\u001b[35m}\u001b[0m\n", - "\u001b[35mhttp {\n", - " include /etc/nginx/mime.types;\n", - " default_type application/octet-stream;\n", - " access_log /dev/stdout combined;\n", - " upstream gunicorn {\n", - " server unix:/tmp/gunicorn.sock;\n", - " }\n", - " server {\n", - " listen 8080 deferred;\n", - " client_max_body_size 0;\n", - " keepalive_timeout 3;\n", - " location ~ ^/(ping|invocations|execution-parameters) {\n", - " proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;\n", - " proxy_set_header Host $http_host;\n", - " proxy_redirect off;\n", - " proxy_read_timeout 60s;\n", - " proxy_pass http://gunicorn;\n", - " }\n", - " location / {\n", - " return 404 \"{}\";\n", - " }\n", - " }\u001b[0m\n", - "\u001b[35m}\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Starting gunicorn 19.10.0\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Listening at: unix:/tmp/gunicorn.sock (18)\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [18] [INFO] Using worker: gevent\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [25] [INFO] Booting worker with pid: 25\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [26] [INFO] Booting worker with pid: 26\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [31] [INFO] Booting worker with pid: 31\u001b[0m\n", - "\u001b[34m[2022-04-29 22:39:31 +0000] [30] [INFO] Booting worker with pid: 30\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [26] [INFO] Booting worker with pid: 26\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [31] [INFO] Booting worker with pid: 31\u001b[0m\n", - "\u001b[35m[2022-04-29 22:39:31 +0000] [30] [INFO] Booting worker with pid: 30\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /ping HTTP/1.1\" 200 0 \"-\" 
\"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:37:INFO] No GPUs detected (normal if no gpus installed)\u001b[0m\n", - "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /ping HTTP/1.1\" 200 0 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /execution-parameters HTTP/1.1\" 200 84 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[34m[2022-04-29:22:39:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[34m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"POST /invocations HTTP/1.1\" 200 2006 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"GET /execution-parameters HTTP/1.1\" 200 84 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[35m[2022-04-29:22:39:37:INFO] Determined delimiter of CSV input is ','\u001b[0m\n", - "\u001b[35m169.254.255.130 - - [29/Apr/2022:22:39:37 +0000] \"POST /invocations HTTP/1.1\" 200 2006 \"-\" \"Go-http-client/1.1\"\u001b[0m\n", - "\u001b[32m2022-04-29T22:39:37.156:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "transformer = best_model.transformer(\n", " instance_count=1, instance_type=\"ml.m4.xlarge\", output_path=batch_output\n", @@ -1277,7 +726,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1286,7 +735,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1308,129 +757,9 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_churnedpredicted_resultspredicted_binary
00.00.0690900
10.00.0267260
20.00.0276860
30.00.0806750
40.00.0174450
............
971.00.9866431
981.00.9644941
991.00.1064480
1001.00.9756531
1011.00.9897801
\n", - "

102 rows × 3 columns

\n", - "
" - ], - "text/plain": [ - " user_churned predicted_results predicted_binary\n", - "0 0.0 0.069090 0\n", - "1 0.0 0.026726 0\n", - "2 0.0 0.027686 0\n", - "3 0.0 0.080675 0\n", - "4 0.0 0.017445 0\n", - ".. ... ... ...\n", - "97 1.0 0.986643 1\n", - "98 1.0 0.964494 1\n", - "99 1.0 0.106448 0\n", - "100 1.0 0.975653 1\n", - "101 1.0 0.989780 1\n", - "\n", - "[102 rows x 3 columns]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "test_data[\"predicted_results\"] = pd.to_numeric(results)\n", "# define a threshold to convert probability to class, you can set as 0.5 by default\n", @@ -1447,20 +776,9 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test Evaluation: \n", - "Average F1 Score: 0.8861607142857144\n", - "Precision Score: 0.9310344827586207\n", - "Recall Score: 0.7714285714285715\n" - ] - } - ], + "outputs": [], "source": [ "from sklearn import metrics\n", "\n", @@ -1492,18 +810,9 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.\n", - "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n" - ] - } - ], + "outputs": [], "source": [ "from sagemaker import clarify\n", "\n", @@ -1514,7 +823,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1527,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1538,38 +847,18 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - 
"{'predicted_binary', 'predicted_results', 'user_churned'}" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "set(test_data.columns) - set(test_set.columns)" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:sagemaker.deprecations:DataConfig will be deprecated on 15 Mar 2022.s3_data_distribution_type parameter will no longer be supported. Everything else will remain as is in sagemaker>=2.\n", - "See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.\n" - ] - } - ], + "outputs": [], "source": [ "shap_config = clarify.SHAPConfig(\n", " baseline=[test_set.iloc[0].values.tolist()], num_samples=100, agg_method=\"mean_abs\"\n", @@ -1595,151 +884,11 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:sagemaker:Creating processing-job with name Clarify-Explainability-2022-04-29-22-39-55-756\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Job Name: Clarify-Explainability-2022-04-29-22-39-55-756\n", - "Inputs: [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/train/train.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/clarify-explainability/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", - 
"Outputs: [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/clarify-explainability', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]\n", - ".....................................\u001b[34m2022-04-29 22:45:55,125 logging.conf not found when configuring logging, using default logging configuration.\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,125 Starting SageMaker Clarify Processing job\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,126 Analysis config path: /opt/ml/processing/input/config/analysis_config.json\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,126 Analysis result path: /opt/ml/processing/output\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,127 This host is algo-1.\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,127 This host is the leader.\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,127 Number of hosts in the cluster is 1.\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,287 Running Python / Pandas based analyzer.\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,288 Dataset type: text/csv uri: /opt/ml/processing/input/data\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,296 Loading dataset...\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,314 Loaded dataset. 
Dataset info:\u001b[0m\n", - "\u001b[34m\u001b[0m\n", - "\u001b[34mRangeIndex: 708 entries, 0 to 707\u001b[0m\n", - "\u001b[34mData columns (total 25 columns):\n", - " # Column Non-Null Count Dtype \u001b[0m\n", - "\u001b[34m--- ------ -------------- ----- \n", - " 0 average_events_weekend 708 non-null float64\n", - " 1 average_events_weekday 708 non-null float64\n", - " 2 num_songs_played_7d 708 non-null int64 \n", - " 3 num_ads_7d 708 non-null int64 \n", - " 4 num_error_7d 708 non-null int64 \n", - " 5 num_songs_played_30d 708 non-null int64 \n", - " 6 num_songs_played_90d 708 non-null int64 \n", - " 7 num_sessions 708 non-null int64 \n", - " 8 avg_time_per_session 708 non-null float64\n", - " 9 avg_events_per_session 708 non-null float64\n", - " 10 avg_gap_between_session 708 non-null float64\n", - " 11 num_events 708 non-null int64 \n", - " 12 num_songs 708 non-null int64 \n", - " 13 num_artists 708 non-null int64 \n", - " 14 num_thumbs_down 708 non-null int64 \n", - " 15 num_thumbs_up 708 non-null int64 \n", - " 16 num_add_to_playlist 708 non-null int64 \n", - " 17 num_ads 708 non-null int64 \n", - " 18 num_add_friend 708 non-null int64 \n", - " 19 num_downgrade 708 non-null int64 \n", - " 20 num_upgrade 708 non-null int64 \n", - " 21 num_error 708 non-null int64 \n", - " 22 percentage_ad 708 non-null float64\n", - " 23 days_since_active 708 non-null int64 \n", - " 24 repeats_ratio 708 non-null float64\u001b[0m\n", - "\u001b[34mdtypes: float64(7), int64(18)\u001b[0m\n", - "\u001b[34mmemory usage: 138.4 KB\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,482 Spinning up shadow endpoint\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,483 Creating endpoint-config with name sm-clarify-config-1651272355-5750\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,567 Creating endpoint: 'sm-clarify-sagemaker-xgboost-2022-04-29-22-34-0-1651272355-86d1'\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,831 Using endpoint name: 
sm-clarify-sagemaker-xgboost-2022-04-29-22-34-0-1651272355-86d1\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,832 Waiting for endpoint ...\u001b[0m\n", - "\u001b[34m2022-04-29 22:45:55,832 Checking endpoint status:\u001b[0m\n", - "\u001b[34mLegend:\u001b[0m\n", - "\u001b[34m(OutOfService: x, Creating: -, Updating: -, InService: !, RollingBack: <, Deleting: o, Failed: *)\u001b[0m\n", - "\u001b[34m2022-04-29 22:49:56,323 Endpoint is in service after 240 seconds\u001b[0m\n", - "\u001b[34m2022-04-29 22:49:56,323 Endpoint ready.\u001b[0m\n", - "\u001b[34m2022-04-29 22:49:56,326 SHAP n_samples 100\u001b[0m\n", - "\u001b[34m2022-04-29 22:49:56,480 =====================================================\u001b[0m\n", - "\u001b[34m2022-04-29 22:49:56,480 Shap analyzer: explaining 708 rows, 25 columns...\u001b[0m\n", - "\u001b[34m2022-04-29 22:49:56,480 =====================================================\n", - " 0% (0 of 708) | | Elapsed Time: 0:00:00 ETA: --:--:--\u001b[0m\n", - "\u001b[34m100% (708 of 708) |######################| Elapsed Time: 0:00:16 Time: 0:00:16\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,201 getting explanations took 16.72 seconds.\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,201 ===================================================\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,271 converting explanations to tabular took 0.07 seconds.\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,271 ===================================================\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,274 Wrote baseline used to compute explanations to: /opt/ml/processing/output/explanations_shap/baseline.csv\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,298 Wrote 708 local explanations to: /opt/ml/processing/output/explanations_shap/out.csv\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,299 writing local explanations took 0.03 seconds.\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,299 ===================================================\u001b[0m\n", - "\u001b[34m2022-04-29 
22:50:13,301 aggregating local explanations took 0.00 seconds.\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,301 ===================================================\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,301 Shap analysis finished.\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,302 Stop using endpoint: sm-clarify-sagemaker-xgboost-2022-04-29-22-34-0-1651272355-86d1\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,302 Deleting endpoint configuration with name: sm-clarify-config-1651272355-5750\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,407 Deleting endpoint with name: sm-clarify-sagemaker-xgboost-2022-04-29-22-34-0-1651272355-86d1\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:13,528 Model endpoint delivered 41.81825 requests per second and a total of 710 requests over 17 seconds\u001b[0m\n", - "\u001b[34m2022-04-29 22:50:17,557 Stop using endpoint: None\u001b[0m\n", - "\n", - "\u001b[34m2022-04-29 22:51:24,009 jupyter nbconvert --to html --output /opt/ml/processing/output/report.html /opt/ml/processing/output/report.ipynb --template sagemaker-xai\u001b[0m\n", - "\u001b[34m[NbConvertApp] Converting notebook /opt/ml/processing/output/report.ipynb to html\u001b[0m\n", - "\u001b[34m[NbConvertApp] Writing 534585 bytes to /opt/ml/processing/output/report.html\u001b[0m\n", - "\u001b[34m2022-04-29 22:51:25,148 HTML report '/opt/ml/processing/output/report.html' generated successfully.\u001b[0m\n", - "\u001b[34m2022-04-29 22:51:25,148 wkhtmltopdf -q /opt/ml/processing/output/report.html /opt/ml/processing/output/report.pdf\u001b[0m\n", - "\u001b[34m2022-04-29 22:51:25,809 PDF report '/opt/ml/processing/output/report.pdf' generated successfully.\u001b[0m\n", - "\u001b[34m2022-04-29 22:51:25,810 Collected analyses: \u001b[0m\n", - "\u001b[34m{\n", - " \"version\": \"1.0\",\n", - " \"explanations\": {\n", - " \"kernel_shap\": {\n", - " \"label0\": {\n", - " \"global_shap_values\": {\n", - " \"average_events_weekend\": 0.0021496041898171037,\n", - " 
\"average_events_weekday\": 0.016587702435219898,\n", - " \"num_songs_played_7d\": 0.0017402863783950045,\n", - " \"num_ads_7d\": 0.00868551809925556,\n", - " \"num_error_7d\": 0.0027734162552485904,\n", - " \"num_songs_played_30d\": 0.0011722262443900576,\n", - " \"num_songs_played_90d\": 0.0012286071144514484,\n", - " \"num_sessions\": 0.0011579425530513692,\n", - " \"avg_time_per_session\": 0.010029593923214982,\n", - " \"avg_events_per_session\": 0.003897350916100629,\n", - " \"avg_gap_between_session\": 0.0448815105461223,\n", - " \"num_events\": 0.0010721354650824332,\n", - " \"num_songs\": 0.0010824064811915818,\n", - " \"num_artists\": 0.0011435948826397841,\n", - " \"num_thumbs_down\": 0.0022934530809684047,\n", - " \"num_thumbs_up\": 0.0010034211293590313,\n", - " \"num_add_to_playlist\": 0.001163952744967679,\n", - " \"num_ads\": 0.0010789208860291896,\n", - " \"num_add_friend\": 0.005465492600682657,\n", - " \"num_downgrade\": 0.001048377920399092,\n", - " \"num_upgrade\": 0.002673292801161779,\n", - " \"num_error\": 0.0011517907319294842,\n", - " \"percentage_ad\": 0.02861671713801823,\n", - " \"days_since_active\": 0.29080317686448776,\n", - " \"repeats_ratio\": 0.001010326307543725\n", - " },\n", - " \"expected_value\": 0.06909029185771942\n", - " }\n", - " }\n", - " }\u001b[0m\n", - "\u001b[34m}\u001b[0m\n", - "\u001b[34m2022-04-29 22:51:25,810 exit_message: Completed: SageMaker XAI Analyzer ran successfully\u001b[0m\n", - "\u001b[34m----!\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "clarify_processor.run_explainability(\n", " data_config=explainability_data_config,\n", @@ -1769,7 +918,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1793,7 +942,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ From 2cbe3579c28b8a547d29e257a2e1ddef89a5e9eb Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 
2 May 2022 16:18:21 +0000 Subject: [PATCH 11/27] notebook edits --- .../0_cust_churn_overview_dw.ipynb | 1454 +++++++++++++++-- 1 file changed, 1353 insertions(+), 101 deletions(-) diff --git a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb b/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb index a00ba25e52..b7c984fb4d 100644 --- a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb +++ b/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb @@ -160,9 +160,52 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.7/site-packages/secretstorage/dhcrypto.py:16: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", + " from cryptography.utils import int_from_bytes\n", + "/opt/conda/lib/python3.7/site-packages/secretstorage/util.py:25: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", + " from cryptography.utils import int_from_bytes\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m/opt/conda/lib/python3.7/site-packages/secretstorage/dhcrypto.py:16: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", + " from cryptography.utils import int_from_bytes\n", + "/opt/conda/lib/python3.7/site-packages/secretstorage/util.py:25: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", + " from cryptography.utils import int_from_bytes\n", + "Requirement already satisfied: sagemaker in /opt/conda/lib/python3.7/site-packages (2.88.1)\n", + "Requirement already satisfied: boto3 in /opt/conda/lib/python3.7/site-packages (1.22.4)\n", + "Requirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (from sagemaker) (1.0.1)\n", + "Requirement already satisfied: google-pasta in /opt/conda/lib/python3.7/site-packages (from sagemaker) (0.2.0)\n", + "Requirement already satisfied: attrs==20.3.0 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (20.3.0)\n", + "Requirement already satisfied: smdebug-rulesconfig==1.0.1 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (1.0.1)\n", + "Requirement already satisfied: importlib-metadata>=1.4.0 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (1.5.0)\n", + "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (20.1)\n", + "Requirement already satisfied: pathos in /opt/conda/lib/python3.7/site-packages (from sagemaker) (0.2.8)\n", + "Requirement already satisfied: protobuf>=3.1 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (3.20.0)\n", + "Requirement already satisfied: numpy>=1.9.0 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (1.21.5)\n", + "Requirement already satisfied: protobuf3-to-dict>=0.1.5 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (0.1.5)\n", + "Requirement already satisfied: 
s3transfer<0.6.0,>=0.5.0 in /opt/conda/lib/python3.7/site-packages (from boto3) (0.5.2)\n", + "Requirement already satisfied: botocore<1.26.0,>=1.25.4 in /opt/conda/lib/python3.7/site-packages (from boto3) (1.25.4)\n", + "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.7/site-packages (from boto3) (1.0.0)\n", + "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /opt/conda/lib/python3.7/site-packages (from botocore<1.26.0,>=1.25.4->boto3) (1.26.9)\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.7/site-packages (from botocore<1.26.0,>=1.25.4->boto3) (2.8.1)\n", + "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata>=1.4.0->sagemaker) (2.2.0)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /opt/conda/lib/python3.7/site-packages (from packaging>=20.0->sagemaker) (2.4.6)\n", + "Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from packaging>=20.0->sagemaker) (1.14.0)\n", + "Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.7/site-packages (from pandas->sagemaker) (2019.3)\n", + "Requirement already satisfied: dill>=0.3.4 in /opt/conda/lib/python3.7/site-packages (from pathos->sagemaker) (0.3.4)\n", + "Requirement already satisfied: ppft>=1.6.6.4 in /opt/conda/lib/python3.7/site-packages (from pathos->sagemaker) (1.6.6.4)\n", + "Requirement already satisfied: multiprocess>=0.70.12 in /opt/conda/lib/python3.7/site-packages (from pathos->sagemaker) (0.70.12.2)\n", + "Requirement already satisfied: pox>=0.3.0 in /opt/conda/lib/python3.7/site-packages (from pathos->sagemaker) (0.3.0)\n", + "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], "source": [ "!pip install -q 's3fs==0.4.2' 'sagemaker-experiments'\n", "!pip install --upgrade sagemaker boto3\n", @@ -171,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +223,8 @@ "import pandas as pd\n", "import glob\n", "import s3fs\n", - "import boto3" + "import boto3\n", + "import numpy as np" ] }, { @@ -193,7 +237,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -221,9 +265,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "download: s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data.zip to data/raw/customer-churn-data.zip\n" + ] + } + ], "source": [ "##### Alternative: copy data from a public S3 bucket to your own bucket\n", "##### data file should include full_data.csv and sample.json\n", @@ -233,18 +285,53 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: ./data/raw/customer-churn-data.zip\n", + " inflating: ./data/data_wrangler_output.csv \n", + " inflating: ./data/full_feature_data.csv \n", + " inflating: ./data/sample.csv \n", + " extracting: ./data/sample.zip \n", + " extracting: ./data/simu-1.zip \n", + " extracting: ./data/simu-2.zip \n", + " extracting: ./data/simu-3.zip \n", + " extracting: ./data/simu-4.zip \n", + " inflating: ./data/test.csv \n", + " inflating: ./data/test_updated.csv \n", + " inflating: ./data/train_updated.csv \n", + " inflating: ./data/validation_updated.csv 
\n" + ] + } + ], "source": [ "!unzip -o ./data/raw/customer-churn-data.zip -d ./data" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: data/simu-1.zip\n", + " inflating: data/raw/simu-1.json \n", + "Archive: data/simu-2.zip\n", + " inflating: data/raw/simu-2.json \n", + "Archive: data/simu-3.zip\n", + " inflating: data/raw/simu-3.json \n", + "Archive: data/simu-4.zip\n", + " inflating: data/raw/simu-4.json \n" + ] + } + ], "source": [ "# unzip the partitioned data files into the same folder\n", "!unzip -o data/simu-1.zip -d data/raw\n", @@ -255,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -264,18 +351,39 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Archive: data/sample.zip\n", + " inflating: data/raw/sample.json \n" + ] + } + ], "source": [ "!unzip -o data/sample.zip -d data/raw" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "upload: data/raw/simu-1.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-1.json\n", + "upload: data/raw/sample.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/sample.json\n", + "upload: data/raw/simu-2.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-2.json\n", + "upload: data/raw/simu-4.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-4.json\n", + "upload: data/raw/simu-3.json to 
s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-3.json\n" + ] + } + ], "source": [ "!aws s3 cp ./data/raw s3://$bucket/$prefix/data/json/ --recursive" ] @@ -293,7 +401,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -310,7 +418,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -322,9 +430,124 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tsuserIdsessionIdpageauthmethodstatuslevelitemInSessionlocationuserAgentlastNamefirstNameregistrationgenderartistsonglength
0159214626773112065118NextSongLogged InPUT200paid0Richmond, VA\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...DavisBristol1.591971e+12MPeter ToshWanted Dread And Alive (2002 Digital Remaster)267.85914
1159214626873112065118Thumbs DownLogged InPUT307paid1Richmond, VA\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...DavisBristol1.591971e+12MNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " ts userId sessionId page auth method status \\\n", + "0 1592146267731 12065 118 NextSong Logged In PUT 200 \n", + "1 1592146268731 12065 118 Thumbs Down Logged In PUT 307 \n", + "\n", + " level itemInSession location \\\n", + "0 paid 0 Richmond, VA \n", + "1 paid 1 Richmond, VA \n", + "\n", + " userAgent lastName firstName \\\n", + "0 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... Davis Bristol \n", + "1 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... Davis Bristol \n", + "\n", + " registration gender artist \\\n", + "0 1.591971e+12 M Peter Tosh \n", + "1 1.591971e+12 M NaN \n", + "\n", + " song length \n", + "0 Wanted Dread And Alive (2002 Digital Remaster) 267.85914 \n", + "1 NaN NaN " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sample.head(2)" ] @@ -340,7 +563,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -359,9 +582,41 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "percentage of the value missing in each column is: \n" + ] + }, + { + "data": { + "text/plain": [ + "ts 0.000000\n", + "userId 0.000000\n", + "sessionId 0.000000\n", + "page 0.000000\n", + "auth 0.000000\n", + "level 0.000000\n", + "itemInSession 0.000000\n", + "location 0.025447\n", + "userAgent 0.025447\n", + "registration 0.025447\n", + "gender 0.025447\n", + "artist 0.210330\n", + "song 0.210330\n", + "length 0.210330\n", + "dtype: float64" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "print(\"percentage of the value missing in each column is: \")\n", "sample.isnull().sum() / len(sample)" @@ -369,7 +624,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, 
"metadata": {}, "outputs": [], "source": [ @@ -388,9 +643,28 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The unique values in column page are: ['NextSong' 'Thumbs Down' 'Home' 'Settings' 'Thumbs Up' 'Add to Playlist'\n", + " 'Roll Advert' 'Save Settings' 'Help' 'Logout' 'Add Friend' 'Downgrade'\n", + " 'About' 'Upgrade' 'Error' 'Submit Upgrade' 'Submit Downgrade' 'Cancel'\n", + " 'Cancellation Confirmation']\n", + "The unique values in column auth are: ['Logged In' 'Cancelled']\n", + "The unique values in column level are: ['paid' 'free']\n", + "The unique values in column gender are: ['M' 'F']\n", + "There are 72 unique values in column location\n", + "There are 37 unique values in column userAgent\n", + "There are 16207 unique values in column artist\n", + "There are 51447 unique values in column song\n", + "There are 101 unique values in column userId\n" + ] + } + ], "source": [ "cat_columns = [\"page\", \"auth\", \"level\", \"gender\"]\n", "cat_columns_long = [\"location\", \"userAgent\", \"artist\", \"song\", \"userId\"]\n", @@ -422,7 +696,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -439,7 +713,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -466,9 +740,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 12.87% of users churned in this dataset\n" + ] + } + ], "source": [ "print(\n", " \"There are {:.2f}% of users churned in this dataset\".format(\n", @@ -490,7 +772,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, 
"metadata": {}, "outputs": [], "source": [ @@ -544,9 +826,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", @@ -595,9 +888,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "events_per_hour_per_user = (\n", " sample.groupby([\"userId\", \"ts_date_day\", \"ts_hour\", \"user_churned\"])\n", @@ -637,9 +941,83 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average total: number of sessions, App usage length, number of songs listened, number of artists listened per user, days active: \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_churnedsessionIdsongartistlengthts_date_day
002044.6363641434.1022731067.602273412310.6420962044.636364
113260.3846152173.1538461493.230769656340.5865223260.384615
\n", + "
" + ], + "text/plain": [ + " user_churned sessionId song artist length \\\n", + "0 0 2044.636364 1434.102273 1067.602273 412310.642096 \n", + "1 1 3260.384615 2173.153846 1493.230769 656340.586522 \n", + "\n", + " ts_date_day \n", + "0 2044.636364 \n", + "1 3260.384615 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "stats_per_user = (\n", " sample.groupby([\"userId\", \"user_churned\"])\n", @@ -676,9 +1054,76 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average daily: number of sessions, App usage length, number of songs listened, number of artists listened per user: \n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_churnedsessionIdsongartistlength
0076.59770161.30183159.32311615446.290551
1192.74617174.29102871.50109418670.519967
\n", + "
" + ], + "text/plain": [ + " user_churned sessionId song artist length\n", + "0 0 76.597701 61.301831 59.323116 15446.290551\n", + "1 1 92.746171 74.291028 71.501094 18670.519967" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "stats_per_user = (\n", " sample.groupby([\"userId\", \"ts_date_day\", \"user_churned\"])\n", @@ -707,7 +1152,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -731,7 +1176,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -740,9 +1185,88 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_churnednextsongthumbs_downthumbs_upadd_to_playlistroll_advertadd_frienddowngradeupgradeerror
001656.20454516.988636150.47727350.8636367.61363629.1818189.5681821.9545452.193182
112645.53846228.076923239.61538580.84615410.92307748.92307712.6153852.4615383.461538
\n", + "
" + ], + "text/plain": [ + " user_churned nextsong thumbs_down thumbs_up add_to_playlist \\\n", + "0 0 1656.204545 16.988636 150.477273 50.863636 \n", + "1 1 2645.538462 28.076923 239.615385 80.846154 \n", + "\n", + " roll_advert add_friend downgrade upgrade error \n", + "0 7.613636 29.181818 9.568182 1.954545 2.193182 \n", + "1 10.923077 48.923077 12.615385 2.461538 3.461538 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "app_use_group = app_use_per_user.groupby([\"user_churned\"])[usage_column_name].mean().reset_index()\n", "app_use_group" @@ -764,9 +1288,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting preprocessing_predw.py\n" + ] + } + ], "source": [ "%%writefile preprocessing_predw.py\n", "\n", @@ -807,7 +1339,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -820,9 +1352,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/sample.json',\n", + " 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-1.json',\n", + " 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-2.json',\n", + " 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-3.json',\n", + " 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-4.json']" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "s3_client = boto3.client(\"s3\")\n", "list_response = s3_client.list_objects_v2(Bucket=bucket, Prefix=f\"{prefix}/data/json\")\n", @@ -832,7 +1379,7 @@ }, { 
"cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -849,9 +1396,30 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Job Name: sagemaker-scikit-learn-2022-04-30-01-59-49-481\n", + "Inputs: [{'InputName': 'sample', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/sample.json', 'LocalPath': '/opt/ml/processing/input/data/sample', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'simu-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-1.json', 'LocalPath': '/opt/ml/processing/input/data/simu-1', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'simu-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-2.json', 'LocalPath': '/opt/ml/processing/input/data/simu-2', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'simu-3', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-3.json', 'LocalPath': '/opt/ml/processing/input/data/simu-3', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'simu-4', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-4.json', 'LocalPath': '/opt/ml/processing/input/data/simu-4', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 
'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/sagemaker-scikit-learn-2022-04-30-01-59-49-481/input/code/preprocessing_predw.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'processed_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/processing', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]\n", + "...........................\u001b[34mReceived arguments Namespace(processing_output_filename='full_data.csv')\u001b[0m\n", + "\u001b[34mStarting file: /opt/ml/processing/input/data/simu-3/simu-3.json\u001b[0m\n", + "\u001b[34mStarting file: /opt/ml/processing/input/data/simu-1/simu-1.json\u001b[0m\n", + "\u001b[34mStarting file: /opt/ml/processing/input/data/sample/sample.json\u001b[0m\n", + "\u001b[34mStarting file: /opt/ml/processing/input/data/simu-4/simu-4.json\u001b[0m\n", + "\u001b[34mStarting file: /opt/ml/processing/input/data/simu-2/simu-2.json\u001b[0m\n", + "\u001b[34mSaving processed data to /opt/ml/processing/output/full_data.csv\u001b[0m\n", + "\n", + "CPU times: user 932 ms, sys: 81.8 ms, total: 1.01 s\n", + "Wall time: 8min 26s\n" + ] + } + ], "source": [ "%%time\n", "processing_output_path = f\"s3://{bucket}/{prefix}/data/processing\"\n", @@ -913,9 +1481,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'s3://sagemaker-us-west-2-688520471316/music-streaming/data/processing/full_data.csv'" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "processing_output_filename = 
f\"{processing_output_path}/{final_features_filename}\"\n", "processing_output_filename" @@ -923,9 +1502,139 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'metadata': {'version': 1},\n", + " 'nodes': [{'node_id': '660c3ee3-5207-4ded-b92f-7059831a7aed',\n", + " 'type': 'SOURCE',\n", + " 'operator': 'sagemaker.s3_source_0.1',\n", + " 'parameters': {'dataset_definition': {'__typename': 'S3CreateDatasetDefinitionOutput',\n", + " 'datasetSourceType': 'S3',\n", + " 'name': 'full_data.csv',\n", + " 'description': None,\n", + " 's3ExecutionContext': {'__typename': 'S3ExecutionContext',\n", + " 's3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/processing/full_data.csv',\n", + " 's3ContentType': 'csv',\n", + " 's3HasHeader': True}}},\n", + " 'inputs': [],\n", + " 'outputs': [{'name': 'default',\n", + " 'sampling': {'sampling_method': 'sample_by_ratio',\n", + " 'sample_ratio': 0.06460757939298588}}]},\n", + " {'node_id': 'd04eac2a-92a9-4539-b22f-f0f30aa29877',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.infer_and_cast_type_0.1',\n", + " 'parameters': {},\n", + " 'trained_parameters': {'schema': {'ts': 'long',\n", + " 'userId': 'long',\n", + " 'sessionId': 'long',\n", + " 'page': 'string',\n", + " 'auth': 'string',\n", + " 'method': 'string',\n", + " 'status': 'long',\n", + " 'level': 'string',\n", + " 'itemInSession': 'long',\n", + " 'location': 'string',\n", + " 'userAgent': 'string',\n", + " 'lastName': 'string',\n", + " 'firstName': 'string',\n", + " 'registration': 'float',\n", + " 'gender': 'string',\n", + " 'artist': 'string',\n", + " 'song': 'string',\n", + " 'length': 'long'}},\n", + " 'inputs': [{'name': 'default',\n", + " 'node_id': '660c3ee3-5207-4ded-b92f-7059831a7aed',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]},\n", + " {'node_id': 
'd1b462ec-bbae-466d-afbd-39e5eab8dcc9',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", + " 'parameters': {'operator': 'Drop column',\n", + " 'drop_column_parameters': {'column_to_drop': 'method'}},\n", + " 'inputs': [{'name': 'df',\n", + " 'node_id': 'd04eac2a-92a9-4539-b22f-f0f30aa29877',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]},\n", + " {'node_id': '4dfd1354-1904-4fa4-bff7-56a9e0e50d0a',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", + " 'parameters': {'operator': 'Drop column',\n", + " 'drop_column_parameters': {'column_to_drop': 'status'}},\n", + " 'inputs': [{'name': 'df',\n", + " 'node_id': 'd1b462ec-bbae-466d-afbd-39e5eab8dcc9',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]},\n", + " {'node_id': '92ac4b28-bfb1-47bf-848a-de23735a2570',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", + " 'parameters': {'operator': 'Drop column',\n", + " 'drop_column_parameters': {'column_to_drop': 'location'}},\n", + " 'inputs': [{'name': 'df',\n", + " 'node_id': '4dfd1354-1904-4fa4-bff7-56a9e0e50d0a',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]},\n", + " {'node_id': 'e1fd74c7-8240-4e99-876e-73b42a063e65',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", + " 'parameters': {'operator': 'Drop column',\n", + " 'drop_column_parameters': {'column_to_drop': 'userAgent'}},\n", + " 'inputs': [{'name': 'df',\n", + " 'node_id': '92ac4b28-bfb1-47bf-848a-de23735a2570',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]},\n", + " {'node_id': '1550cb2f-c734-46f8-bfdc-4f8614c30c09',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", + " 'parameters': {'operator': 'Drop column',\n", + " 'drop_column_parameters': {'column_to_drop': 'lastName'}},\n", + " 
'inputs': [{'name': 'df',\n", + " 'node_id': 'e1fd74c7-8240-4e99-876e-73b42a063e65',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]},\n", + " {'node_id': '32405a27-8e85-4c9b-8142-dd75d56fa75d',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", + " 'parameters': {'operator': 'Drop column',\n", + " 'drop_column_parameters': {'column_to_drop': 'firstName'}},\n", + " 'inputs': [{'name': 'df',\n", + " 'node_id': '1550cb2f-c734-46f8-bfdc-4f8614c30c09',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]},\n", + " {'node_id': '7b74dbbc-6f7e-4656-8f78-25272604bc45',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.handle_missing_0.1',\n", + " 'parameters': {'operator': 'Drop missing',\n", + " 'drop_missing_parameters': {'dimension': 'Drop Rows',\n", + " 'drop_rows_parameters': {'input_column': 'userId'}},\n", + " 'impute_parameters': {'column_type': 'Numeric',\n", + " 'numeric_parameters': {'strategy': 'Approximate Median'}}},\n", + " 'inputs': [{'name': 'df',\n", + " 'node_id': '32405a27-8e85-4c9b-8142-dd75d56fa75d',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]},\n", + " {'node_id': '82cb5ad3-3b9c-428d-9260-ce6efcd4c4f8',\n", + " 'type': 'TRANSFORM',\n", + " 'operator': 'sagemaker.spark.handle_missing_0.1',\n", + " 'parameters': {'operator': 'Drop missing',\n", + " 'drop_missing_parameters': {'dimension': 'Drop Rows',\n", + " 'drop_rows_parameters': {'input_column': 'registration'}},\n", + " 'impute_parameters': {'column_type': 'Numeric',\n", + " 'numeric_parameters': {'strategy': 'Approximate Median'}}},\n", + " 'inputs': [{'name': 'df',\n", + " 'node_id': '7b74dbbc-6f7e-4656-8f78-25272604bc45',\n", + " 'output_name': 'default'}],\n", + " 'outputs': [{'name': 'default'}]}]}" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "flow_file = \"dw_example.flow\"\n", "\n", @@ 
-952,7 +1661,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -969,7 +1678,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -979,9 +1688,17 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting preprocessing.py\n" + ] + } + ], "source": [ "%%writefile preprocessing.py\n", "\n", @@ -1232,7 +1949,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, "outputs": [], "source": [ @@ -1241,11 +1958,256 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Job Name: sagemaker-scikit-learn-2022-04-30-02-08-16-353\n", + "Inputs: [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/sagemaker-scikit-learn-2022-04-30-02-08-16-353/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", + "Outputs: [{'OutputName': 'processed_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/processing', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]\n", + "...........................\u001b[34mRequirement already satisfied: pandas in /miniconda3/lib/python3.7/site-packages (1.1.3)\u001b[0m\n", + "\u001b[34mCollecting pandas\n", + " Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.3/11.3 MB 
87.5 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mRequirement already satisfied: numpy>=1.17.3 in /miniconda3/lib/python3.7/site-packages (from pandas) (1.21.0)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: python-dateutil>=2.7.3 in /miniconda3/lib/python3.7/site-packages (from pandas) (2.8.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: pytz>=2017.3 in /miniconda3/lib/python3.7/site-packages (from pandas) (2022.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: six>=1.5 in /miniconda3/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\u001b[0m\n", + "\u001b[34mInstalling collected packages: pandas\n", + " Attempting uninstall: pandas\n", + " Found existing installation: pandas 1.1.3\n", + " Uninstalling pandas-1.1.3:\n", + " Successfully uninstalled pandas-1.1.3\u001b[0m\n", + "\u001b[34mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\u001b[0m\n", + "\u001b[34msagemaker-sklearn-container 2.0 requires pandas==1.1.3, but you have pandas 1.3.5 which is incompatible.\u001b[0m\n", + "\u001b[34mSuccessfully installed pandas-1.3.5\u001b[0m\n", + "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", + "\u001b[34mCollecting awswrangler\n", + " Downloading awswrangler-2.15.1-py3-none-any.whl (239 kB)\u001b[0m\n", + "\u001b[34m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 239.6/239.6 KB 8.2 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting openpyxl<3.1.0,>=3.0.0\n", + " Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.2/242.2 KB 33.7 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting pyarrow<7.1.0,>=2.0.0\n", + " Downloading pyarrow-7.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7/26.7 MB 52.3 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting redshift-connector<2.1.0,>=2.0.889\n", + " Downloading redshift_connector-2.0.906-py3-none-any.whl (109 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 109.8/109.8 KB 19.9 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting botocore<2.0.0,>=1.23.17\n", + " Downloading botocore-1.25.4-py3-none-any.whl (8.7 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.7/8.7 MB 96.3 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting opensearch-py<2.0.0,>=1.0.0\n", + " Downloading opensearch_py-1.1.0-py2.py3-none-any.whl (207 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.5/207.5 KB 30.5 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting progressbar2<5.0.0,>=4.0.0\n", + " Downloading progressbar2-4.0.0-py2.py3-none-any.whl (26 kB)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: pandas<2.0.0,>=1.2.0 in /miniconda3/lib/python3.7/site-packages (from awswrangler) (1.3.5)\u001b[0m\n", + "\u001b[34mCollecting backoff<2.0.0,>=1.11.1\n", + " Downloading backoff-1.11.1-py2.py3-none-any.whl (13 kB)\u001b[0m\n", + "\u001b[34mCollecting gremlinpython<4.0.0,>=3.5.2\n", + " Downloading gremlinpython-3.6.0-py2.py3-none-any.whl (72 kB)\n", + " 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 72.8/72.8 KB 14.3 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mRequirement already satisfied: numpy<2.0.0,>=1.21.0 in /miniconda3/lib/python3.7/site-packages (from awswrangler) (1.21.0)\u001b[0m\n", + "\u001b[34mCollecting pg8000<2.0.0,>=1.20.0\n", + " Downloading pg8000-1.26.1-py3-none-any.whl (33 kB)\u001b[0m\n", + "\u001b[34mCollecting pymysql<2.0.0,>=1.0.0\n", + " Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.8/43.8 KB 7.2 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting jsonpath-ng<2.0.0,>=1.5.3\n", + " Downloading jsonpath_ng-1.5.3-py3-none-any.whl (29 kB)\u001b[0m\n", + "\u001b[34mCollecting boto3<2.0.0,>=1.20.17\n", + " Downloading boto3-1.22.4-py3-none-any.whl (132 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.5/132.5 KB 20.2 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting requests-aws4auth<2.0.0,>=1.1.1\n", + " Downloading requests_aws4auth-1.1.2-py2.py3-none-any.whl (24 kB)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: jmespath<2.0.0,>=0.7.1 in /miniconda3/lib/python3.7/site-packages (from boto3<2.0.0,>=1.20.17->awswrangler) (0.10.0)\u001b[0m\n", + "\u001b[34mCollecting s3transfer<0.6.0,>=0.5.0\n", + " Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.5/79.5 KB 19.0 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /miniconda3/lib/python3.7/site-packages (from botocore<2.0.0,>=1.23.17->awswrangler) (2.8.1)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: urllib3<1.27,>=1.25.4 in /miniconda3/lib/python3.7/site-packages (from botocore<2.0.0,>=1.23.17->awswrangler) (1.25.11)\u001b[0m\n", + "\u001b[34mCollecting nest-asyncio\n", + " Downloading nest_asyncio-1.5.5-py3-none-any.whl (5.2 kB)\u001b[0m\n", + "\u001b[34mCollecting isodate<1.0.0,>=0.6.0\n", + " Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n", + " 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.7/41.7 KB 7.8 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting aiohttp<=3.8.1,>=3.8.0\n", + " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 74.4 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting aenum<4.0.0,>=1.4.5\n", + " Downloading aenum-3.1.11-py3-none-any.whl (131 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.5/131.5 KB 26.2 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting decorator\n", + " Downloading decorator-5.1.1-py3-none-any.whl (9.1 kB)\u001b[0m\n", + "\u001b[34mCollecting ply\n", + " Downloading ply-3.11-py2.py3-none-any.whl (49 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 49.6/49.6 KB 9.7 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mRequirement already satisfied: six in /miniconda3/lib/python3.7/site-packages (from jsonpath-ng<2.0.0,>=1.5.3->awswrangler) (1.15.0)\u001b[0m\n", + "\u001b[34mCollecting et-xmlfile\n", + " Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: certifi in /miniconda3/lib/python3.7/site-packages (from opensearch-py<2.0.0,>=1.0.0->awswrangler) (2021.10.8)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: pytz>=2017.3 in /miniconda3/lib/python3.7/site-packages (from pandas<2.0.0,>=1.2.0->awswrangler) (2022.1)\u001b[0m\n", + "\u001b[34mCollecting scramp>=1.4.1\n", + " Downloading scramp-1.4.1-py3-none-any.whl (8.5 kB)\u001b[0m\n", + "\u001b[34mCollecting python-utils>=3.0.0\n", + " Downloading python_utils-3.1.0-py2.py3-none-any.whl (19 kB)\u001b[0m\n", + "\u001b[34mCollecting lxml>=4.6.5\n", + " Downloading lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 103.0 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting packaging\n", + " 
Downloading packaging-21.3-py3-none-any.whl (40 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.8/40.8 KB 6.9 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mRequirement already satisfied: requests<2.27.2,>=2.23.0 in /miniconda3/lib/python3.7/site-packages (from redshift-connector<2.1.0,>=2.0.889->awswrangler) (2.27.1)\u001b[0m\n", + "\u001b[34mCollecting beautifulsoup4<5.0.0,>=4.7.0\n", + " Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 128.2/128.2 KB 21.6 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting yarl<2.0,>=1.0\n", + " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 271.8/271.8 KB 35.1 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting asynctest==0.13.0\n", + " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\u001b[0m\n", + "\u001b[34mCollecting aiosignal>=1.1.2\n", + " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: charset-normalizer<3.0,>=2.0 in /miniconda3/lib/python3.7/site-packages (from aiohttp<=3.8.1,>=3.8.0->gremlinpython<4.0.0,>=3.5.2->awswrangler) (2.0.4)\u001b[0m\n", + "\u001b[34mCollecting attrs>=17.3.0\n", + " Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.6/60.6 KB 13.3 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting multidict<7.0,>=4.5\n", + " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 94.8/94.8 KB 15.3 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting frozenlist>=1.1.1\n", + " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.8/144.8 KB 15.0 MB/s eta 0:00:00\u001b[0m\n", + 
"\u001b[34mRequirement already satisfied: typing-extensions>=3.7.4 in /miniconda3/lib/python3.7/site-packages (from aiohttp<=3.8.1,>=3.8.0->gremlinpython<4.0.0,>=3.5.2->awswrangler) (4.1.1)\u001b[0m\n", + "\u001b[34mCollecting async-timeout<5.0,>=4.0.0a3\n", + " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\u001b[0m\n", + "\u001b[34mCollecting soupsieve>1.2\n", + " Downloading soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)\u001b[0m\n", + "\u001b[34mRequirement already satisfied: idna<4,>=2.5 in /miniconda3/lib/python3.7/site-packages (from requests<2.27.2,>=2.23.0->redshift-connector<2.1.0,>=2.0.889->awswrangler) (3.3)\u001b[0m\n", + "\u001b[34mCollecting asn1crypto>=1.4.0\n", + " Downloading asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.0/105.0 KB 11.9 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mCollecting pyparsing!=3.0.5,>=2.0.2\n", + " Downloading pyparsing-3.0.8-py3-none-any.whl (98 kB)\n", + " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.5/98.5 KB 11.6 MB/s eta 0:00:00\u001b[0m\n", + "\u001b[34mInstalling collected packages: ply, asn1crypto, aenum, soupsieve, scramp, python-utils, pyparsing, pymysql, pyarrow, opensearch-py, nest-asyncio, multidict, lxml, isodate, frozenlist, et-xmlfile, decorator, backoff, attrs, asynctest, async-timeout, yarl, requests-aws4auth, progressbar2, pg8000, packaging, openpyxl, jsonpath-ng, botocore, beautifulsoup4, aiosignal, s3transfer, aiohttp, gremlinpython, boto3, redshift-connector, awswrangler\u001b[0m\n", + "\u001b[34m Attempting uninstall: pyarrow\n", + " Found existing installation: pyarrow 0.16.0\n", + " Uninstalling pyarrow-0.16.0:\n", + " Successfully uninstalled pyarrow-0.16.0\u001b[0m\n", + "\u001b[34m Attempting uninstall: botocore\n", + " Found existing installation: botocore 1.19.4\n", + " Uninstalling botocore-1.19.4:\n", + " Successfully uninstalled botocore-1.19.4\u001b[0m\n", + "\u001b[34m Attempting uninstall: s3transfer\n", + " Found existing 
installation: s3transfer 0.3.7\n", + " Uninstalling s3transfer-0.3.7:\n", + " Successfully uninstalled s3transfer-0.3.7\n", + " Attempting uninstall: boto3\n", + " Found existing installation: boto3 1.16.4\n", + " Uninstalling boto3-1.16.4:\n", + " Successfully uninstalled boto3-1.16.4\u001b[0m\n", + "\u001b[34mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\u001b[0m\n", + "\u001b[34msagemaker-sklearn-container 2.0 requires boto3==1.16.4, but you have boto3 1.22.4 which is incompatible.\u001b[0m\n", + "\u001b[34msagemaker-sklearn-container 2.0 requires botocore==1.19.4, but you have botocore 1.25.4 which is incompatible.\u001b[0m\n", + "\u001b[34msagemaker-sklearn-container 2.0 requires pandas==1.1.3, but you have pandas 1.3.5 which is incompatible.\u001b[0m\n", + "\u001b[34mSuccessfully installed aenum-3.1.11 aiohttp-3.8.1 aiosignal-1.2.0 asn1crypto-1.5.1 async-timeout-4.0.2 asynctest-0.13.0 attrs-21.4.0 awswrangler-2.15.1 backoff-1.11.1 beautifulsoup4-4.11.1 boto3-1.22.4 botocore-1.25.4 decorator-5.1.1 et-xmlfile-1.1.0 frozenlist-1.3.0 gremlinpython-3.6.0 isodate-0.6.1 jsonpath-ng-1.5.3 lxml-4.8.0 multidict-6.0.2 nest-asyncio-1.5.5 openpyxl-3.0.9 opensearch-py-1.1.0 packaging-21.3 pg8000-1.26.1 ply-3.11 progressbar2-4.0.0 pyarrow-7.0.0 pymysql-1.0.2 pyparsing-3.0.8 python-utils-3.1.0 redshift-connector-2.0.906 requests-aws4auth-1.1.2 s3transfer-0.5.2 scramp-1.4.1 soupsieve-2.3.2.post1 yarl-1.7.2\u001b[0m\n", + "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", + "\u001b[34mReceived arguments Namespace(dw_output_path='s3://sagemaker-us-west-2-688520471316/music-streaming/data/processing', processing_output_filename='processing_job_output.csv')\u001b[0m\n", + "\n", + "\u001b[34mTraceback (most recent call last):\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 341, in array_func\n", + " \"aggregate\", values, how, axis=data.ndim - 1, min_count=min_count\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 1016, in _cython_operation\n", + " **kwargs,\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 677, in cython_operation\n", + " **kwargs,\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 508, in _cython_op_ndim_compat\n", + " **kwargs,\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 563, in _call_cython_op\n", + " func, values = self.get_cython_func_and_vals(values, is_numeric)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 205, in get_cython_func_and_vals\n", + " func = self._get_cython_function(kind, how, values.dtype, is_numeric)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 171, in _get_cython_function\n", + " f\"function is not implemented for this dtype: \"\u001b[0m\n", + "\u001b[34mNotImplementedError: function is not implemented for this dtype: [how->max,dtype->object]\u001b[0m\n", + "\u001b[34mDuring handling of the above exception, another exception occurred:\u001b[0m\n", + "\u001b[34mTraceback (most recent call last):\n", + " File \"/opt/ml/processing/input/code/preprocessing.py\", line 126, in \n", + " \"user_churned\": \"max\",\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 
979, in aggregate\n", + " result = op.agg()\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/apply.py\", line 161, in agg\n", + " return self.agg_dict_like()\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/apply.py\", line 436, in agg_dict_like\n", + " key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/apply.py\", line 436, in \n", + " key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 243, in aggregate\n", + " return getattr(self, func)(*args, **kwargs)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py\", line 1880, in max\n", + " numeric_only=numeric_only, min_count=min_count, alias=\"max\", npfunc=np.max\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py\", line 1368, in _agg_general\n", + " min_count=min_count,\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 352, in _cython_agg_general\n", + " result = array_func(objvals)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 348, in array_func\n", + " result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py\", line 1398, in _agg_py_fallback\n", + " res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 1060, in agg_series\n", + " result = self._aggregate_series_fast(obj, func)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 1085, in _aggregate_series_fast\n", + " result, _ = sgrouper.get_result()\n", + " File \"pandas/_libs/reduction.pyx\", line 281, in 
pandas._libs.reduction.SeriesGrouper.get_result\n", + " File \"pandas/_libs/reduction.pyx\", line 88, in pandas._libs.reduction._BaseGrouper._apply_to_group\n", + " File \"<__array_function__ internals>\", line 6, in amax\n", + " File \"/miniconda3/lib/python3.7/site-packages/numpy/core/fromnumeric.py\", line 2755, in amax\n", + " keepdims=keepdims, initial=initial, where=where)\n", + " File \"/miniconda3/lib/python3.7/site-packages/numpy/core/fromnumeric.py\", line 84, in _wrapreduction\n", + " return reduction(axis=axis, out=out, **passkwargs)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/generic.py\", line 10819, in max\u001b[0m\n", + "\u001b[34m return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/generic.py\", line 10365, in max\n", + " \"max\", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/generic.py\", line 10355, in _stat_function\n", + " func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/series.py\", line 4392, in _reduce\n", + " return op(delegate, skipna=skipna, **kwds)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/nanops.py\", line 156, in f\n", + " result = alt(values, axis=axis, skipna=skipna, **kwds)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/nanops.py\", line 411, in new_func\n", + " result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)\n", + " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/nanops.py\", line 1018, in reduction\n", + " result = getattr(values, meth)(axis)\n", + " File \"/miniconda3/lib/python3.7/site-packages/numpy/core/_methods.py\", line 40, in _amax\n", + " return umr_maximum(a, axis, None, out, keepdims, initial, where)\u001b[0m\n", + "\u001b[34mTypeError: '>=' not supported between instances 
of 'datetime.date' and 'float'\u001b[0m\n" + ] + }, + { + "ename": "UnexpectedStatusException", + "evalue": "Error for Processing job sagemaker-scikit-learn-2022-04-30-02-08-16-353: Failed. Reason: AlgorithmError: See job logs for more information", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/sagemaker/processing.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, code, inputs, outputs, arguments, wait, logs, job_name, experiment_config, kms_key)\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_job\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 559\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_include_code_in_inputs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mkms_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/sagemaker/processing.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, logs)\u001b[0m\n\u001b[1;32m 966\u001b[0m \"\"\"\n\u001b[1;32m 967\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlogs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs_for_processing_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_processing_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mlogs_for_processing_job\u001b[0;34m(self, job_name, wait, poll)\u001b[0m\n\u001b[1;32m 3885\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3886\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3887\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mdescription\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"ProcessingJobStatus\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3888\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdot\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3889\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[0;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[1;32m 3337\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3338\u001b[0m \u001b[0mallowed_statuses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Completed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Stopped\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3339\u001b[0;31m \u001b[0mactual_status\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3340\u001b[0m )\n\u001b[1;32m 3341\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mUnexpectedStatusException\u001b[0m: Error for Processing job sagemaker-scikit-learn-2022-04-30-02-08-16-353: Failed. 
Reason: AlgorithmError: See job logs for more information" + ] + } + ], "source": [ "%%time\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", @@ -1274,9 +2236,95 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'ProcessingInputs': [{'InputName': 'sample',\n", + " 'AppManaged': False,\n", + " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/sample.json',\n", + " 'LocalPath': '/opt/ml/processing/input/data/sample',\n", + " 'S3DataType': 'S3Prefix',\n", + " 'S3InputMode': 'File',\n", + " 'S3DataDistributionType': 'FullyReplicated',\n", + " 'S3CompressionType': 'None'}},\n", + " {'InputName': 'simu-1',\n", + " 'AppManaged': False,\n", + " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-1.json',\n", + " 'LocalPath': '/opt/ml/processing/input/data/simu-1',\n", + " 'S3DataType': 'S3Prefix',\n", + " 'S3InputMode': 'File',\n", + " 'S3DataDistributionType': 'FullyReplicated',\n", + " 'S3CompressionType': 'None'}},\n", + " {'InputName': 'simu-2',\n", + " 'AppManaged': False,\n", + " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-2.json',\n", + " 'LocalPath': '/opt/ml/processing/input/data/simu-2',\n", + " 'S3DataType': 'S3Prefix',\n", + " 'S3InputMode': 'File',\n", + " 'S3DataDistributionType': 'FullyReplicated',\n", + " 'S3CompressionType': 'None'}},\n", + " {'InputName': 'simu-3',\n", + " 'AppManaged': False,\n", + " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-3.json',\n", + " 'LocalPath': '/opt/ml/processing/input/data/simu-3',\n", + " 'S3DataType': 'S3Prefix',\n", + " 'S3InputMode': 'File',\n", + " 'S3DataDistributionType': 'FullyReplicated',\n", + " 'S3CompressionType': 'None'}},\n", + " {'InputName': 'simu-4',\n", + " 'AppManaged': 
False,\n", + " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-4.json',\n", + " 'LocalPath': '/opt/ml/processing/input/data/simu-4',\n", + " 'S3DataType': 'S3Prefix',\n", + " 'S3InputMode': 'File',\n", + " 'S3DataDistributionType': 'FullyReplicated',\n", + " 'S3CompressionType': 'None'}},\n", + " {'InputName': 'code',\n", + " 'AppManaged': False,\n", + " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/sagemaker-scikit-learn-2022-04-30-01-59-49-481/input/code/preprocessing_predw.py',\n", + " 'LocalPath': '/opt/ml/processing/input/code',\n", + " 'S3DataType': 'S3Prefix',\n", + " 'S3InputMode': 'File',\n", + " 'S3DataDistributionType': 'FullyReplicated',\n", + " 'S3CompressionType': 'None'}}],\n", + " 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'processed_data',\n", + " 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/processing',\n", + " 'LocalPath': '/opt/ml/processing/output',\n", + " 'S3UploadMode': 'EndOfJob'},\n", + " 'AppManaged': False}]},\n", + " 'ProcessingJobName': 'sagemaker-scikit-learn-2022-04-30-01-59-49-481',\n", + " 'ProcessingResources': {'ClusterConfig': {'InstanceCount': 1,\n", + " 'InstanceType': 'ml.m5.xlarge',\n", + " 'VolumeSizeInGB': 30}},\n", + " 'StoppingCondition': {'MaxRuntimeInSeconds': 86400},\n", + " 'AppSpecification': {'ImageUri': '246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',\n", + " 'ContainerEntrypoint': ['python3',\n", + " '/opt/ml/processing/input/code/preprocessing_predw.py'],\n", + " 'ContainerArguments': ['--processing-output-filename', 'full_data.csv']},\n", + " 'RoleArn': 'arn:aws:iam::688520471316:role/service-role/AmazonSageMaker-ExecutionRole-20211229T100947',\n", + " 'ProcessingJobArn': 'arn:aws:sagemaker:us-west-2:688520471316:processing-job/sagemaker-scikit-learn-2022-04-30-01-59-49-481',\n", + " 'ProcessingJobStatus': 'Completed',\n", + " 'ProcessingEndTime': datetime.datetime(2022, 
4, 30, 2, 7, 45, 433000, tzinfo=tzlocal()),\n", + " 'ProcessingStartTime': datetime.datetime(2022, 4, 30, 2, 3, 35, 10000, tzinfo=tzlocal()),\n", + " 'LastModifiedTime': datetime.datetime(2022, 4, 30, 2, 7, 45, 721000, tzinfo=tzlocal()),\n", + " 'CreationTime': datetime.datetime(2022, 4, 30, 1, 59, 49, 808000, tzinfo=tzlocal()),\n", + " 'ResponseMetadata': {'RequestId': 'cb40881c-69d9-4c7f-9a9c-d97153d589a5',\n", + " 'HTTPStatusCode': 200,\n", + " 'HTTPHeaders': {'x-amzn-requestid': 'cb40881c-69d9-4c7f-9a9c-d97153d589a5',\n", + " 'content-type': 'application/x-amz-json-1.1',\n", + " 'content-length': '3223',\n", + " 'date': 'Sat, 30 Apr 2022 02:08:14 GMT'},\n", + " 'RetryAttempts': 0}}" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "preprocessing_job_description" ] @@ -1310,9 +2358,20 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'s3://sagemaker-us-west-2-688520471316/music-streaming/data/processing/processing_job_output.csv'" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "processing_job_output_uri = f\"{processing_job_output_path}/{processing_job_output_name}\"\n", "processing_job_output_uri" @@ -1320,16 +2379,24 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "download: s3://sagemaker-us-west-2-688520471316/music-streaming/data/processing/processing_job_output.csv to data/processing_job_output.csv\n" + ] + } + ], "source": [ "!aws s3 cp $processing_job_output_uri ./data" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -1338,7 +2405,7 @@ }, { "cell_type": "code", - 
"execution_count": null, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1348,9 +2415,194 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIduser_churnedaverage_events_weekendaverage_events_weekdaynum_songs_played_7dnum_ads_7dnum_error_7dnum_songs_played_30dnum_songs_played_90dnum_sessions...num_thumbs_upnum_add_to_playlistnum_adsnum_add_friendnum_downgradenum_upgradenum_errorpercentage_addays_since_activerepeats_ratio
0110010.094.937576.3043484135714135413551...2931407811110.0013923590.179444
1110020.070.500076.666667476104764767...41161141000.0016642650.052521
2110031.098.7500120.87500038671293867386737...27210312691190.002576660.175330
3110041.070.0000120.444444108421108410847...6830291010.001546480.076568
\n", + "

4 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " userId user_churned average_events_weekend average_events_weekday \\\n", + "0 11001 0.0 94.9375 76.304348 \n", + "1 11002 0.0 70.5000 76.666667 \n", + "2 11003 1.0 98.7500 120.875000 \n", + "3 11004 1.0 70.0000 120.444444 \n", + "\n", + " num_songs_played_7d num_ads_7d num_error_7d num_songs_played_30d \\\n", + "0 4135 7 1 4135 \n", + "1 476 1 0 476 \n", + "2 3867 12 9 3867 \n", + "3 1084 2 1 1084 \n", + "\n", + " num_songs_played_90d num_sessions ... num_thumbs_up \\\n", + "0 4135 51 ... 293 \n", + "1 476 7 ... 41 \n", + "2 3867 37 ... 272 \n", + "3 1084 7 ... 68 \n", + "\n", + " num_add_to_playlist num_ads num_add_friend num_downgrade num_upgrade \\\n", + "0 140 7 81 1 1 \n", + "1 16 1 14 1 0 \n", + "2 103 12 69 1 1 \n", + "3 30 2 9 1 0 \n", + "\n", + " num_error percentage_ad days_since_active repeats_ratio \n", + "0 1 0.001392 359 0.179444 \n", + "1 0 0.001664 265 0.052521 \n", + "2 9 0.002576 66 0.175330 \n", + "3 1 0.001546 48 0.076568 \n", + "\n", + "[4 rows x 27 columns]" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "processed_data.head(4)" ] @@ -1364,7 +2616,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -1379,7 +2631,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -1391,7 +2643,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -1414,7 +2666,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ From 4ba8ff661183d5cfa159e97571277ca0139ed1b1 Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 2 May 2022 16:20:09 +0000 Subject: [PATCH 12/27] clear outputs --- .../0_cust_churn_overview_dw.ipynb | 1451 ++--------------- 1 file changed, 100 insertions(+), 1351 
deletions(-) diff --git a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb b/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb index b7c984fb4d..d4e6436eed 100644 --- a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb +++ b/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb @@ -160,52 +160,9 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.7/site-packages/secretstorage/dhcrypto.py:16: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", - " from cryptography.utils import int_from_bytes\n", - "/opt/conda/lib/python3.7/site-packages/secretstorage/util.py:25: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", - " from cryptography.utils import int_from_bytes\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m/opt/conda/lib/python3.7/site-packages/secretstorage/dhcrypto.py:16: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", - " from cryptography.utils import int_from_bytes\n", - "/opt/conda/lib/python3.7/site-packages/secretstorage/util.py:25: CryptographyDeprecationWarning: int_from_bytes is deprecated, use int.from_bytes instead\n", - " from cryptography.utils import int_from_bytes\n", - "Requirement already satisfied: sagemaker in /opt/conda/lib/python3.7/site-packages (2.88.1)\n", - "Requirement already satisfied: boto3 in /opt/conda/lib/python3.7/site-packages (1.22.4)\n", - "Requirement already satisfied: pandas in /opt/conda/lib/python3.7/site-packages (from sagemaker) (1.0.1)\n", - "Requirement already satisfied: google-pasta in /opt/conda/lib/python3.7/site-packages (from sagemaker) (0.2.0)\n", - "Requirement already satisfied: attrs==20.3.0 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (20.3.0)\n", - "Requirement already satisfied: smdebug-rulesconfig==1.0.1 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (1.0.1)\n", - "Requirement already satisfied: importlib-metadata>=1.4.0 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (1.5.0)\n", - "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (20.1)\n", - "Requirement already satisfied: pathos in /opt/conda/lib/python3.7/site-packages (from sagemaker) (0.2.8)\n", - "Requirement already satisfied: protobuf>=3.1 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (3.20.0)\n", - "Requirement already satisfied: numpy>=1.9.0 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (1.21.5)\n", - "Requirement already satisfied: protobuf3-to-dict>=0.1.5 in /opt/conda/lib/python3.7/site-packages (from sagemaker) (0.1.5)\n", - "Requirement already satisfied: 
s3transfer<0.6.0,>=0.5.0 in /opt/conda/lib/python3.7/site-packages (from boto3) (0.5.2)\n", - "Requirement already satisfied: botocore<1.26.0,>=1.25.4 in /opt/conda/lib/python3.7/site-packages (from boto3) (1.25.4)\n", - "Requirement already satisfied: jmespath<2.0.0,>=0.7.1 in /opt/conda/lib/python3.7/site-packages (from boto3) (1.0.0)\n", - "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /opt/conda/lib/python3.7/site-packages (from botocore<1.26.0,>=1.25.4->boto3) (1.26.9)\n", - "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /opt/conda/lib/python3.7/site-packages (from botocore<1.26.0,>=1.25.4->boto3) (2.8.1)\n", - "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata>=1.4.0->sagemaker) (2.2.0)\n", - "Requirement already satisfied: pyparsing>=2.0.2 in /opt/conda/lib/python3.7/site-packages (from packaging>=20.0->sagemaker) (2.4.6)\n", - "Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from packaging>=20.0->sagemaker) (1.14.0)\n", - "Requirement already satisfied: pytz>=2017.2 in /opt/conda/lib/python3.7/site-packages (from pandas->sagemaker) (2019.3)\n", - "Requirement already satisfied: dill>=0.3.4 in /opt/conda/lib/python3.7/site-packages (from pathos->sagemaker) (0.3.4)\n", - "Requirement already satisfied: ppft>=1.6.6.4 in /opt/conda/lib/python3.7/site-packages (from pathos->sagemaker) (1.6.6.4)\n", - "Requirement already satisfied: multiprocess>=0.70.12 in /opt/conda/lib/python3.7/site-packages (from pathos->sagemaker) (0.70.12.2)\n", - "Requirement already satisfied: pox>=0.3.0 in /opt/conda/lib/python3.7/site-packages (from pathos->sagemaker) (0.3.0)\n", - "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!pip install -q 's3fs==0.4.2' 'sagemaker-experiments'\n", "!pip install --upgrade sagemaker boto3\n", @@ -214,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -237,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -265,17 +222,9 @@ }, { "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "download: s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data.zip to data/raw/customer-churn-data.zip\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "##### Alternative: copy data from a public S3 bucket to your own bucket\n", "##### data file should include full_data.csv and sample.json\n", @@ -285,53 +234,18 @@ }, { "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Archive: ./data/raw/customer-churn-data.zip\n", - " inflating: ./data/data_wrangler_output.csv \n", - " inflating: ./data/full_feature_data.csv \n", - " inflating: ./data/sample.csv \n", - " extracting: ./data/sample.zip \n", - " extracting: ./data/simu-1.zip \n", - " extracting: ./data/simu-2.zip \n", - " extracting: ./data/simu-3.zip \n", - " extracting: ./data/simu-4.zip \n", - " inflating: ./data/test.csv \n", - " inflating: ./data/test_updated.csv \n", - " inflating: ./data/train_updated.csv \n", - " inflating: ./data/validation_updated.csv \n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!unzip -o ./data/raw/customer-churn-data.zip -d 
./data" ] }, { "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Archive: data/simu-1.zip\n", - " inflating: data/raw/simu-1.json \n", - "Archive: data/simu-2.zip\n", - " inflating: data/raw/simu-2.json \n", - "Archive: data/simu-3.zip\n", - " inflating: data/raw/simu-3.json \n", - "Archive: data/simu-4.zip\n", - " inflating: data/raw/simu-4.json \n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# unzip the partitioned data files into the same folder\n", "!unzip -o data/simu-1.zip -d data/raw\n", @@ -342,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -351,39 +265,18 @@ }, { "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Archive: data/sample.zip\n", - " inflating: data/raw/sample.json \n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!unzip -o data/sample.zip -d data/raw" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "upload: data/raw/simu-1.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-1.json\n", - "upload: data/raw/sample.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/sample.json\n", - "upload: data/raw/simu-2.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-2.json\n", - "upload: data/raw/simu-4.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-4.json\n", - "upload: data/raw/simu-3.json to s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-3.json\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!aws s3 cp ./data/raw 
s3://$bucket/$prefix/data/json/ --recursive" ] @@ -401,7 +294,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -418,7 +311,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -430,124 +323,9 @@ }, { "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
tsuserIdsessionIdpageauthmethodstatuslevelitemInSessionlocationuserAgentlastNamefirstNameregistrationgenderartistsonglength
0159214626773112065118NextSongLogged InPUT200paid0Richmond, VA\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...DavisBristol1.591971e+12MPeter ToshWanted Dread And Alive (2002 Digital Remaster)267.85914
1159214626873112065118Thumbs DownLogged InPUT307paid1Richmond, VA\"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...DavisBristol1.591971e+12MNaNNaNNaN
\n", - "
" - ], - "text/plain": [ - " ts userId sessionId page auth method status \\\n", - "0 1592146267731 12065 118 NextSong Logged In PUT 200 \n", - "1 1592146268731 12065 118 Thumbs Down Logged In PUT 307 \n", - "\n", - " level itemInSession location \\\n", - "0 paid 0 Richmond, VA \n", - "1 paid 1 Richmond, VA \n", - "\n", - " userAgent lastName firstName \\\n", - "0 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... Davis Bristol \n", - "1 \"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK... Davis Bristol \n", - "\n", - " registration gender artist \\\n", - "0 1.591971e+12 M Peter Tosh \n", - "1 1.591971e+12 M NaN \n", - "\n", - " song length \n", - "0 Wanted Dread And Alive (2002 Digital Remaster) 267.85914 \n", - "1 NaN NaN " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "sample.head(2)" ] @@ -563,7 +341,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -582,41 +360,9 @@ }, { "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "percentage of the value missing in each column is: \n" - ] - }, - { - "data": { - "text/plain": [ - "ts 0.000000\n", - "userId 0.000000\n", - "sessionId 0.000000\n", - "page 0.000000\n", - "auth 0.000000\n", - "level 0.000000\n", - "itemInSession 0.000000\n", - "location 0.025447\n", - "userAgent 0.025447\n", - "registration 0.025447\n", - "gender 0.025447\n", - "artist 0.210330\n", - "song 0.210330\n", - "length 0.210330\n", - "dtype: float64" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\"percentage of the value missing in each column is: \")\n", "sample.isnull().sum() / len(sample)" @@ -624,7 +370,7 @@ }, { "cell_type": 
"code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -643,28 +389,9 @@ }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The unique values in column page are: ['NextSong' 'Thumbs Down' 'Home' 'Settings' 'Thumbs Up' 'Add to Playlist'\n", - " 'Roll Advert' 'Save Settings' 'Help' 'Logout' 'Add Friend' 'Downgrade'\n", - " 'About' 'Upgrade' 'Error' 'Submit Upgrade' 'Submit Downgrade' 'Cancel'\n", - " 'Cancellation Confirmation']\n", - "The unique values in column auth are: ['Logged In' 'Cancelled']\n", - "The unique values in column level are: ['paid' 'free']\n", - "The unique values in column gender are: ['M' 'F']\n", - "There are 72 unique values in column location\n", - "There are 37 unique values in column userAgent\n", - "There are 16207 unique values in column artist\n", - "There are 51447 unique values in column song\n", - "There are 101 unique values in column userId\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "cat_columns = [\"page\", \"auth\", \"level\", \"gender\"]\n", "cat_columns_long = [\"location\", \"userAgent\", \"artist\", \"song\", \"userId\"]\n", @@ -696,7 +423,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -713,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -740,17 +467,9 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There are 12.87% of users churned in this dataset\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(\n", " \"There are {:.2f}% of users churned in this dataset\".format(\n", @@ -772,7 +491,7 @@ }, { "cell_type": "code", - 
"execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -826,20 +545,9 @@ }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", @@ -888,20 +596,9 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "events_per_hour_per_user = (\n", " sample.groupby([\"userId\", \"ts_date_day\", \"ts_hour\", \"user_churned\"])\n", @@ -941,83 +638,9 @@ }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Average total: number of sessions, App usage length, number of songs listened, number of artists listened per user, days active: \n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_churnedsessionIdsongartistlengthts_date_day
002044.6363641434.1022731067.602273412310.6420962044.636364
113260.3846152173.1538461493.230769656340.5865223260.384615
\n", - "
" - ], - "text/plain": [ - " user_churned sessionId song artist length \\\n", - "0 0 2044.636364 1434.102273 1067.602273 412310.642096 \n", - "1 1 3260.384615 2173.153846 1493.230769 656340.586522 \n", - "\n", - " ts_date_day \n", - "0 2044.636364 \n", - "1 3260.384615 " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "stats_per_user = (\n", " sample.groupby([\"userId\", \"user_churned\"])\n", @@ -1054,76 +677,9 @@ }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Average daily: number of sessions, App usage length, number of songs listened, number of artists listened per user: \n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_churnedsessionIdsongartistlength
0076.59770161.30183159.32311615446.290551
1192.74617174.29102871.50109418670.519967
\n", - "
" - ], - "text/plain": [ - " user_churned sessionId song artist length\n", - "0 0 76.597701 61.301831 59.323116 15446.290551\n", - "1 1 92.746171 74.291028 71.501094 18670.519967" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "stats_per_user = (\n", " sample.groupby([\"userId\", \"ts_date_day\", \"user_churned\"])\n", @@ -1152,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1176,7 +732,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1185,88 +741,9 @@ }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
user_churnednextsongthumbs_downthumbs_upadd_to_playlistroll_advertadd_frienddowngradeupgradeerror
001656.20454516.988636150.47727350.8636367.61363629.1818189.5681821.9545452.193182
112645.53846228.076923239.61538580.84615410.92307748.92307712.6153852.4615383.461538
\n", - "
" - ], - "text/plain": [ - " user_churned nextsong thumbs_down thumbs_up add_to_playlist \\\n", - "0 0 1656.204545 16.988636 150.477273 50.863636 \n", - "1 1 2645.538462 28.076923 239.615385 80.846154 \n", - "\n", - " roll_advert add_friend downgrade upgrade error \n", - "0 7.613636 29.181818 9.568182 1.954545 2.193182 \n", - "1 10.923077 48.923077 12.615385 2.461538 3.461538 " - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "app_use_group = app_use_per_user.groupby([\"user_churned\"])[usage_column_name].mean().reset_index()\n", "app_use_group" @@ -1288,17 +765,9 @@ }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting preprocessing_predw.py\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "%%writefile preprocessing_predw.py\n", "\n", @@ -1339,7 +808,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1352,24 +821,9 @@ }, { "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['s3://sagemaker-us-west-2-688520471316/music-streaming/data/json/sample.json',\n", - " 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-1.json',\n", - " 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-2.json',\n", - " 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-3.json',\n", - " 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-4.json']" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "s3_client = boto3.client(\"s3\")\n", "list_response = s3_client.list_objects_v2(Bucket=bucket, 
Prefix=f\"{prefix}/data/json\")\n", @@ -1379,7 +833,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1396,30 +850,9 @@ }, { "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Job Name: sagemaker-scikit-learn-2022-04-30-01-59-49-481\n", - "Inputs: [{'InputName': 'sample', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/sample.json', 'LocalPath': '/opt/ml/processing/input/data/sample', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'simu-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-1.json', 'LocalPath': '/opt/ml/processing/input/data/simu-1', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'simu-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-2.json', 'LocalPath': '/opt/ml/processing/input/data/simu-2', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'simu-3', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-3.json', 'LocalPath': '/opt/ml/processing/input/data/simu-3', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'simu-4', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-4.json', 'LocalPath': '/opt/ml/processing/input/data/simu-4', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 
'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/sagemaker-scikit-learn-2022-04-30-01-59-49-481/input/code/preprocessing_predw.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", - "Outputs: [{'OutputName': 'processed_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/processing', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]\n", - "...........................\u001b[34mReceived arguments Namespace(processing_output_filename='full_data.csv')\u001b[0m\n", - "\u001b[34mStarting file: /opt/ml/processing/input/data/simu-3/simu-3.json\u001b[0m\n", - "\u001b[34mStarting file: /opt/ml/processing/input/data/simu-1/simu-1.json\u001b[0m\n", - "\u001b[34mStarting file: /opt/ml/processing/input/data/sample/sample.json\u001b[0m\n", - "\u001b[34mStarting file: /opt/ml/processing/input/data/simu-4/simu-4.json\u001b[0m\n", - "\u001b[34mStarting file: /opt/ml/processing/input/data/simu-2/simu-2.json\u001b[0m\n", - "\u001b[34mSaving processed data to /opt/ml/processing/output/full_data.csv\u001b[0m\n", - "\n", - "CPU times: user 932 ms, sys: 81.8 ms, total: 1.01 s\n", - "Wall time: 8min 26s\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "%%time\n", "processing_output_path = f\"s3://{bucket}/{prefix}/data/processing\"\n", @@ -1481,20 +914,9 @@ }, { "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'s3://sagemaker-us-west-2-688520471316/music-streaming/data/processing/full_data.csv'" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ 
"processing_output_filename = f\"{processing_output_path}/{final_features_filename}\"\n", "processing_output_filename" @@ -1502,139 +924,9 @@ }, { "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'metadata': {'version': 1},\n", - " 'nodes': [{'node_id': '660c3ee3-5207-4ded-b92f-7059831a7aed',\n", - " 'type': 'SOURCE',\n", - " 'operator': 'sagemaker.s3_source_0.1',\n", - " 'parameters': {'dataset_definition': {'__typename': 'S3CreateDatasetDefinitionOutput',\n", - " 'datasetSourceType': 'S3',\n", - " 'name': 'full_data.csv',\n", - " 'description': None,\n", - " 's3ExecutionContext': {'__typename': 'S3ExecutionContext',\n", - " 's3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/processing/full_data.csv',\n", - " 's3ContentType': 'csv',\n", - " 's3HasHeader': True}}},\n", - " 'inputs': [],\n", - " 'outputs': [{'name': 'default',\n", - " 'sampling': {'sampling_method': 'sample_by_ratio',\n", - " 'sample_ratio': 0.06460757939298588}}]},\n", - " {'node_id': 'd04eac2a-92a9-4539-b22f-f0f30aa29877',\n", - " 'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.infer_and_cast_type_0.1',\n", - " 'parameters': {},\n", - " 'trained_parameters': {'schema': {'ts': 'long',\n", - " 'userId': 'long',\n", - " 'sessionId': 'long',\n", - " 'page': 'string',\n", - " 'auth': 'string',\n", - " 'method': 'string',\n", - " 'status': 'long',\n", - " 'level': 'string',\n", - " 'itemInSession': 'long',\n", - " 'location': 'string',\n", - " 'userAgent': 'string',\n", - " 'lastName': 'string',\n", - " 'firstName': 'string',\n", - " 'registration': 'float',\n", - " 'gender': 'string',\n", - " 'artist': 'string',\n", - " 'song': 'string',\n", - " 'length': 'long'}},\n", - " 'inputs': [{'name': 'default',\n", - " 'node_id': '660c3ee3-5207-4ded-b92f-7059831a7aed',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]},\n", - " {'node_id': 'd1b462ec-bbae-466d-afbd-39e5eab8dcc9',\n", - " 
'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", - " 'parameters': {'operator': 'Drop column',\n", - " 'drop_column_parameters': {'column_to_drop': 'method'}},\n", - " 'inputs': [{'name': 'df',\n", - " 'node_id': 'd04eac2a-92a9-4539-b22f-f0f30aa29877',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]},\n", - " {'node_id': '4dfd1354-1904-4fa4-bff7-56a9e0e50d0a',\n", - " 'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", - " 'parameters': {'operator': 'Drop column',\n", - " 'drop_column_parameters': {'column_to_drop': 'status'}},\n", - " 'inputs': [{'name': 'df',\n", - " 'node_id': 'd1b462ec-bbae-466d-afbd-39e5eab8dcc9',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]},\n", - " {'node_id': '92ac4b28-bfb1-47bf-848a-de23735a2570',\n", - " 'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", - " 'parameters': {'operator': 'Drop column',\n", - " 'drop_column_parameters': {'column_to_drop': 'location'}},\n", - " 'inputs': [{'name': 'df',\n", - " 'node_id': '4dfd1354-1904-4fa4-bff7-56a9e0e50d0a',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]},\n", - " {'node_id': 'e1fd74c7-8240-4e99-876e-73b42a063e65',\n", - " 'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", - " 'parameters': {'operator': 'Drop column',\n", - " 'drop_column_parameters': {'column_to_drop': 'userAgent'}},\n", - " 'inputs': [{'name': 'df',\n", - " 'node_id': '92ac4b28-bfb1-47bf-848a-de23735a2570',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]},\n", - " {'node_id': '1550cb2f-c734-46f8-bfdc-4f8614c30c09',\n", - " 'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", - " 'parameters': {'operator': 'Drop column',\n", - " 'drop_column_parameters': {'column_to_drop': 'lastName'}},\n", - " 'inputs': [{'name': 'df',\n", - " 'node_id': 
'e1fd74c7-8240-4e99-876e-73b42a063e65',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]},\n", - " {'node_id': '32405a27-8e85-4c9b-8142-dd75d56fa75d',\n", - " 'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.manage_columns_0.1',\n", - " 'parameters': {'operator': 'Drop column',\n", - " 'drop_column_parameters': {'column_to_drop': 'firstName'}},\n", - " 'inputs': [{'name': 'df',\n", - " 'node_id': '1550cb2f-c734-46f8-bfdc-4f8614c30c09',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]},\n", - " {'node_id': '7b74dbbc-6f7e-4656-8f78-25272604bc45',\n", - " 'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.handle_missing_0.1',\n", - " 'parameters': {'operator': 'Drop missing',\n", - " 'drop_missing_parameters': {'dimension': 'Drop Rows',\n", - " 'drop_rows_parameters': {'input_column': 'userId'}},\n", - " 'impute_parameters': {'column_type': 'Numeric',\n", - " 'numeric_parameters': {'strategy': 'Approximate Median'}}},\n", - " 'inputs': [{'name': 'df',\n", - " 'node_id': '32405a27-8e85-4c9b-8142-dd75d56fa75d',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]},\n", - " {'node_id': '82cb5ad3-3b9c-428d-9260-ce6efcd4c4f8',\n", - " 'type': 'TRANSFORM',\n", - " 'operator': 'sagemaker.spark.handle_missing_0.1',\n", - " 'parameters': {'operator': 'Drop missing',\n", - " 'drop_missing_parameters': {'dimension': 'Drop Rows',\n", - " 'drop_rows_parameters': {'input_column': 'registration'}},\n", - " 'impute_parameters': {'column_type': 'Numeric',\n", - " 'numeric_parameters': {'strategy': 'Approximate Median'}}},\n", - " 'inputs': [{'name': 'df',\n", - " 'node_id': '7b74dbbc-6f7e-4656-8f78-25272604bc45',\n", - " 'output_name': 'default'}],\n", - " 'outputs': [{'name': 'default'}]}]}" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "flow_file = 
\"dw_example.flow\"\n", "\n", @@ -1661,7 +953,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1678,7 +970,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1688,17 +980,9 @@ }, { "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting preprocessing.py\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "%%writefile preprocessing.py\n", "\n", @@ -1949,7 +1233,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1958,256 +1242,11 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Job Name: sagemaker-scikit-learn-2022-04-30-02-08-16-353\n", - "Inputs: [{'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/sagemaker-scikit-learn-2022-04-30-02-08-16-353/input/code/preprocessing.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n", - "Outputs: [{'OutputName': 'processed_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/processing', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]\n", - "...........................\u001b[34mRequirement already satisfied: pandas in /miniconda3/lib/python3.7/site-packages (1.1.3)\u001b[0m\n", - "\u001b[34mCollecting pandas\n", - " Downloading pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)\n", - " 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.3/11.3 MB 87.5 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mRequirement already satisfied: numpy>=1.17.3 in /miniconda3/lib/python3.7/site-packages (from pandas) (1.21.0)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: python-dateutil>=2.7.3 in /miniconda3/lib/python3.7/site-packages (from pandas) (2.8.1)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: pytz>=2017.3 in /miniconda3/lib/python3.7/site-packages (from pandas) (2022.1)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: six>=1.5 in /miniconda3/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\u001b[0m\n", - "\u001b[34mInstalling collected packages: pandas\n", - " Attempting uninstall: pandas\n", - " Found existing installation: pandas 1.1.3\n", - " Uninstalling pandas-1.1.3:\n", - " Successfully uninstalled pandas-1.1.3\u001b[0m\n", - "\u001b[34mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\u001b[0m\n", - "\u001b[34msagemaker-sklearn-container 2.0 requires pandas==1.1.3, but you have pandas 1.3.5 which is incompatible.\u001b[0m\n", - "\u001b[34mSuccessfully installed pandas-1.3.5\u001b[0m\n", - "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", - "\u001b[34mCollecting awswrangler\n", - " Downloading awswrangler-2.15.1-py3-none-any.whl (239 kB)\u001b[0m\n", - "\u001b[34m ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 239.6/239.6 KB 8.2 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting openpyxl<3.1.0,>=3.0.0\n", - " Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.2/242.2 KB 33.7 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting pyarrow<7.1.0,>=2.0.0\n", - " Downloading pyarrow-7.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 26.7/26.7 MB 52.3 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting redshift-connector<2.1.0,>=2.0.889\n", - " Downloading redshift_connector-2.0.906-py3-none-any.whl (109 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 109.8/109.8 KB 19.9 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting botocore<2.0.0,>=1.23.17\n", - " Downloading botocore-1.25.4-py3-none-any.whl (8.7 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 8.7/8.7 MB 96.3 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting opensearch-py<2.0.0,>=1.0.0\n", - " Downloading opensearch_py-1.1.0-py2.py3-none-any.whl (207 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.5/207.5 KB 30.5 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting progressbar2<5.0.0,>=4.0.0\n", - " Downloading progressbar2-4.0.0-py2.py3-none-any.whl (26 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: pandas<2.0.0,>=1.2.0 in /miniconda3/lib/python3.7/site-packages (from awswrangler) (1.3.5)\u001b[0m\n", - "\u001b[34mCollecting backoff<2.0.0,>=1.11.1\n", - " Downloading backoff-1.11.1-py2.py3-none-any.whl (13 kB)\u001b[0m\n", - "\u001b[34mCollecting gremlinpython<4.0.0,>=3.5.2\n", - " Downloading gremlinpython-3.6.0-py2.py3-none-any.whl (72 kB)\n", - " 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 72.8/72.8 KB 14.3 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mRequirement already satisfied: numpy<2.0.0,>=1.21.0 in /miniconda3/lib/python3.7/site-packages (from awswrangler) (1.21.0)\u001b[0m\n", - "\u001b[34mCollecting pg8000<2.0.0,>=1.20.0\n", - " Downloading pg8000-1.26.1-py3-none-any.whl (33 kB)\u001b[0m\n", - "\u001b[34mCollecting pymysql<2.0.0,>=1.0.0\n", - " Downloading PyMySQL-1.0.2-py3-none-any.whl (43 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.8/43.8 KB 7.2 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting jsonpath-ng<2.0.0,>=1.5.3\n", - " Downloading jsonpath_ng-1.5.3-py3-none-any.whl (29 kB)\u001b[0m\n", - "\u001b[34mCollecting boto3<2.0.0,>=1.20.17\n", - " Downloading boto3-1.22.4-py3-none-any.whl (132 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 132.5/132.5 KB 20.2 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting requests-aws4auth<2.0.0,>=1.1.1\n", - " Downloading requests_aws4auth-1.1.2-py2.py3-none-any.whl (24 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: jmespath<2.0.0,>=0.7.1 in /miniconda3/lib/python3.7/site-packages (from boto3<2.0.0,>=1.20.17->awswrangler) (0.10.0)\u001b[0m\n", - "\u001b[34mCollecting s3transfer<0.6.0,>=0.5.0\n", - " Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 79.5/79.5 KB 19.0 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /miniconda3/lib/python3.7/site-packages (from botocore<2.0.0,>=1.23.17->awswrangler) (2.8.1)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: urllib3<1.27,>=1.25.4 in /miniconda3/lib/python3.7/site-packages (from botocore<2.0.0,>=1.23.17->awswrangler) (1.25.11)\u001b[0m\n", - "\u001b[34mCollecting nest-asyncio\n", - " Downloading nest_asyncio-1.5.5-py3-none-any.whl (5.2 kB)\u001b[0m\n", - "\u001b[34mCollecting isodate<1.0.0,>=0.6.0\n", - " Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)\n", - " 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.7/41.7 KB 7.8 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting aiohttp<=3.8.1,>=3.8.0\n", - " Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.1/1.1 MB 74.4 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting aenum<4.0.0,>=1.4.5\n", - " Downloading aenum-3.1.11-py3-none-any.whl (131 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 131.5/131.5 KB 26.2 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting decorator\n", - " Downloading decorator-5.1.1-py3-none-any.whl (9.1 kB)\u001b[0m\n", - "\u001b[34mCollecting ply\n", - " Downloading ply-3.11-py2.py3-none-any.whl (49 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 49.6/49.6 KB 9.7 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mRequirement already satisfied: six in /miniconda3/lib/python3.7/site-packages (from jsonpath-ng<2.0.0,>=1.5.3->awswrangler) (1.15.0)\u001b[0m\n", - "\u001b[34mCollecting et-xmlfile\n", - " Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: certifi in /miniconda3/lib/python3.7/site-packages (from opensearch-py<2.0.0,>=1.0.0->awswrangler) (2021.10.8)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: pytz>=2017.3 in /miniconda3/lib/python3.7/site-packages (from pandas<2.0.0,>=1.2.0->awswrangler) (2022.1)\u001b[0m\n", - "\u001b[34mCollecting scramp>=1.4.1\n", - " Downloading scramp-1.4.1-py3-none-any.whl (8.5 kB)\u001b[0m\n", - "\u001b[34mCollecting python-utils>=3.0.0\n", - " Downloading python_utils-3.1.0-py2.py3-none-any.whl (19 kB)\u001b[0m\n", - "\u001b[34mCollecting lxml>=4.6.5\n", - " Downloading lxml-4.8.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (6.4 MB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.4/6.4 MB 103.0 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting packaging\n", - " 
Downloading packaging-21.3-py3-none-any.whl (40 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.8/40.8 KB 6.9 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mRequirement already satisfied: requests<2.27.2,>=2.23.0 in /miniconda3/lib/python3.7/site-packages (from redshift-connector<2.1.0,>=2.0.889->awswrangler) (2.27.1)\u001b[0m\n", - "\u001b[34mCollecting beautifulsoup4<5.0.0,>=4.7.0\n", - " Downloading beautifulsoup4-4.11.1-py3-none-any.whl (128 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 128.2/128.2 KB 21.6 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting yarl<2.0,>=1.0\n", - " Downloading yarl-1.7.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (271 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 271.8/271.8 KB 35.1 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting asynctest==0.13.0\n", - " Downloading asynctest-0.13.0-py3-none-any.whl (26 kB)\u001b[0m\n", - "\u001b[34mCollecting aiosignal>=1.1.2\n", - " Downloading aiosignal-1.2.0-py3-none-any.whl (8.2 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: charset-normalizer<3.0,>=2.0 in /miniconda3/lib/python3.7/site-packages (from aiohttp<=3.8.1,>=3.8.0->gremlinpython<4.0.0,>=3.5.2->awswrangler) (2.0.4)\u001b[0m\n", - "\u001b[34mCollecting attrs>=17.3.0\n", - " Downloading attrs-21.4.0-py2.py3-none-any.whl (60 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.6/60.6 KB 13.3 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting multidict<7.0,>=4.5\n", - " Downloading multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (94 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 94.8/94.8 KB 15.3 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting frozenlist>=1.1.1\n", - " Downloading frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 144.8/144.8 KB 15.0 MB/s eta 0:00:00\u001b[0m\n", - 
"\u001b[34mRequirement already satisfied: typing-extensions>=3.7.4 in /miniconda3/lib/python3.7/site-packages (from aiohttp<=3.8.1,>=3.8.0->gremlinpython<4.0.0,>=3.5.2->awswrangler) (4.1.1)\u001b[0m\n", - "\u001b[34mCollecting async-timeout<5.0,>=4.0.0a3\n", - " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\u001b[0m\n", - "\u001b[34mCollecting soupsieve>1.2\n", - " Downloading soupsieve-2.3.2.post1-py3-none-any.whl (37 kB)\u001b[0m\n", - "\u001b[34mRequirement already satisfied: idna<4,>=2.5 in /miniconda3/lib/python3.7/site-packages (from requests<2.27.2,>=2.23.0->redshift-connector<2.1.0,>=2.0.889->awswrangler) (3.3)\u001b[0m\n", - "\u001b[34mCollecting asn1crypto>=1.4.0\n", - " Downloading asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 105.0/105.0 KB 11.9 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mCollecting pyparsing!=3.0.5,>=2.0.2\n", - " Downloading pyparsing-3.0.8-py3-none-any.whl (98 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.5/98.5 KB 11.6 MB/s eta 0:00:00\u001b[0m\n", - "\u001b[34mInstalling collected packages: ply, asn1crypto, aenum, soupsieve, scramp, python-utils, pyparsing, pymysql, pyarrow, opensearch-py, nest-asyncio, multidict, lxml, isodate, frozenlist, et-xmlfile, decorator, backoff, attrs, asynctest, async-timeout, yarl, requests-aws4auth, progressbar2, pg8000, packaging, openpyxl, jsonpath-ng, botocore, beautifulsoup4, aiosignal, s3transfer, aiohttp, gremlinpython, boto3, redshift-connector, awswrangler\u001b[0m\n", - "\u001b[34m Attempting uninstall: pyarrow\n", - " Found existing installation: pyarrow 0.16.0\n", - " Uninstalling pyarrow-0.16.0:\n", - " Successfully uninstalled pyarrow-0.16.0\u001b[0m\n", - "\u001b[34m Attempting uninstall: botocore\n", - " Found existing installation: botocore 1.19.4\n", - " Uninstalling botocore-1.19.4:\n", - " Successfully uninstalled botocore-1.19.4\u001b[0m\n", - "\u001b[34m Attempting uninstall: s3transfer\n", - " Found existing 
installation: s3transfer 0.3.7\n", - " Uninstalling s3transfer-0.3.7:\n", - " Successfully uninstalled s3transfer-0.3.7\n", - " Attempting uninstall: boto3\n", - " Found existing installation: boto3 1.16.4\n", - " Uninstalling boto3-1.16.4:\n", - " Successfully uninstalled boto3-1.16.4\u001b[0m\n", - "\u001b[34mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\u001b[0m\n", - "\u001b[34msagemaker-sklearn-container 2.0 requires boto3==1.16.4, but you have boto3 1.22.4 which is incompatible.\u001b[0m\n", - "\u001b[34msagemaker-sklearn-container 2.0 requires botocore==1.19.4, but you have botocore 1.25.4 which is incompatible.\u001b[0m\n", - "\u001b[34msagemaker-sklearn-container 2.0 requires pandas==1.1.3, but you have pandas 1.3.5 which is incompatible.\u001b[0m\n", - "\u001b[34mSuccessfully installed aenum-3.1.11 aiohttp-3.8.1 aiosignal-1.2.0 asn1crypto-1.5.1 async-timeout-4.0.2 asynctest-0.13.0 attrs-21.4.0 awswrangler-2.15.1 backoff-1.11.1 beautifulsoup4-4.11.1 boto3-1.22.4 botocore-1.25.4 decorator-5.1.1 et-xmlfile-1.1.0 frozenlist-1.3.0 gremlinpython-3.6.0 isodate-0.6.1 jsonpath-ng-1.5.3 lxml-4.8.0 multidict-6.0.2 nest-asyncio-1.5.5 openpyxl-3.0.9 opensearch-py-1.1.0 packaging-21.3 pg8000-1.26.1 ply-3.11 progressbar2-4.0.0 pyarrow-7.0.0 pymysql-1.0.2 pyparsing-3.0.8 python-utils-3.1.0 redshift-connector-2.0.906 requests-aws4auth-1.1.2 s3transfer-0.5.2 scramp-1.4.1 soupsieve-2.3.2.post1 yarl-1.7.2\u001b[0m\n", - "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. 
It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", - "\u001b[34mReceived arguments Namespace(dw_output_path='s3://sagemaker-us-west-2-688520471316/music-streaming/data/processing', processing_output_filename='processing_job_output.csv')\u001b[0m\n", - "\n", - "\u001b[34mTraceback (most recent call last):\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 341, in array_func\n", - " \"aggregate\", values, how, axis=data.ndim - 1, min_count=min_count\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 1016, in _cython_operation\n", - " **kwargs,\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 677, in cython_operation\n", - " **kwargs,\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 508, in _cython_op_ndim_compat\n", - " **kwargs,\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 563, in _call_cython_op\n", - " func, values = self.get_cython_func_and_vals(values, is_numeric)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 205, in get_cython_func_and_vals\n", - " func = self._get_cython_function(kind, how, values.dtype, is_numeric)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 171, in _get_cython_function\n", - " f\"function is not implemented for this dtype: \"\u001b[0m\n", - "\u001b[34mNotImplementedError: function is not implemented for this dtype: [how->max,dtype->object]\u001b[0m\n", - "\u001b[34mDuring handling of the above exception, another exception occurred:\u001b[0m\n", - "\u001b[34mTraceback (most recent call last):\n", - " File \"/opt/ml/processing/input/code/preprocessing.py\", line 126, in \n", - " \"user_churned\": \"max\",\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 
979, in aggregate\n", - " result = op.agg()\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/apply.py\", line 161, in agg\n", - " return self.agg_dict_like()\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/apply.py\", line 436, in agg_dict_like\n", - " key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/apply.py\", line 436, in \n", - " key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items()\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 243, in aggregate\n", - " return getattr(self, func)(*args, **kwargs)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py\", line 1880, in max\n", - " numeric_only=numeric_only, min_count=min_count, alias=\"max\", npfunc=np.max\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py\", line 1368, in _agg_general\n", - " min_count=min_count,\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 352, in _cython_agg_general\n", - " result = array_func(objvals)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/generic.py\", line 348, in array_func\n", - " result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/groupby.py\", line 1398, in _agg_py_fallback\n", - " res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 1060, in agg_series\n", - " result = self._aggregate_series_fast(obj, func)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/groupby/ops.py\", line 1085, in _aggregate_series_fast\n", - " result, _ = sgrouper.get_result()\n", - " File \"pandas/_libs/reduction.pyx\", line 281, in 
pandas._libs.reduction.SeriesGrouper.get_result\n", - " File \"pandas/_libs/reduction.pyx\", line 88, in pandas._libs.reduction._BaseGrouper._apply_to_group\n", - " File \"<__array_function__ internals>\", line 6, in amax\n", - " File \"/miniconda3/lib/python3.7/site-packages/numpy/core/fromnumeric.py\", line 2755, in amax\n", - " keepdims=keepdims, initial=initial, where=where)\n", - " File \"/miniconda3/lib/python3.7/site-packages/numpy/core/fromnumeric.py\", line 84, in _wrapreduction\n", - " return reduction(axis=axis, out=out, **passkwargs)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/generic.py\", line 10819, in max\u001b[0m\n", - "\u001b[34m return NDFrame.max(self, axis, skipna, level, numeric_only, **kwargs)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/generic.py\", line 10365, in max\n", - " \"max\", nanops.nanmax, axis, skipna, level, numeric_only, **kwargs\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/generic.py\", line 10355, in _stat_function\n", - " func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/series.py\", line 4392, in _reduce\n", - " return op(delegate, skipna=skipna, **kwds)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/nanops.py\", line 156, in f\n", - " result = alt(values, axis=axis, skipna=skipna, **kwds)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/nanops.py\", line 411, in new_func\n", - " result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)\n", - " File \"/miniconda3/lib/python3.7/site-packages/pandas/core/nanops.py\", line 1018, in reduction\n", - " result = getattr(values, meth)(axis)\n", - " File \"/miniconda3/lib/python3.7/site-packages/numpy/core/_methods.py\", line 40, in _amax\n", - " return umr_maximum(a, axis, None, out, keepdims, initial, where)\u001b[0m\n", - "\u001b[34mTypeError: '>=' not supported between instances 
of 'datetime.date' and 'float'\u001b[0m\n" - ] - }, - { - "ename": "UnexpectedStatusException", - "evalue": "Error for Processing job sagemaker-scikit-learn-2022-04-30-02-08-16-353: Failed. Reason: AlgorithmError: See job logs for more information", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mUnexpectedStatusException\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/sagemaker/processing.py\u001b[0m in \u001b[0;36mrun\u001b[0;34m(self, code, inputs, outputs, arguments, wait, logs, job_name, experiment_config, kms_key)\u001b[0m\n\u001b[1;32m 557\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjobs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_job\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 559\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlatest_job\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 560\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 561\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_include_code_in_inputs\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcode\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mkms_key\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/sagemaker/processing.py\u001b[0m in \u001b[0;36mwait\u001b[0;34m(self, logs)\u001b[0m\n\u001b[1;32m 966\u001b[0m \"\"\"\n\u001b[1;32m 967\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlogs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogs_for_processing_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 970\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msagemaker_session\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwait_for_processing_job\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36mlogs_for_processing_job\u001b[0;34m(self, job_name, wait, poll)\u001b[0m\n\u001b[1;32m 3885\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3886\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mwait\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3887\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_check_job_status\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mjob_name\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mdescription\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"ProcessingJobStatus\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3888\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mdot\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3889\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/sagemaker/session.py\u001b[0m in \u001b[0;36m_check_job_status\u001b[0;34m(self, job, desc, status_key_name)\u001b[0m\n\u001b[1;32m 3337\u001b[0m \u001b[0mmessage\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmessage\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3338\u001b[0m \u001b[0mallowed_statuses\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"Completed\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Stopped\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3339\u001b[0;31m \u001b[0mactual_status\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mstatus\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3340\u001b[0m )\n\u001b[1;32m 3341\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mUnexpectedStatusException\u001b[0m: Error for Processing job sagemaker-scikit-learn-2022-04-30-02-08-16-353: Failed. 
Reason: AlgorithmError: See job logs for more information" - ] - } - ], + "outputs": [], "source": [ "%%time\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", @@ -2236,95 +1275,9 @@ }, { "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ProcessingInputs': [{'InputName': 'sample',\n", - " 'AppManaged': False,\n", - " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/sample.json',\n", - " 'LocalPath': '/opt/ml/processing/input/data/sample',\n", - " 'S3DataType': 'S3Prefix',\n", - " 'S3InputMode': 'File',\n", - " 'S3DataDistributionType': 'FullyReplicated',\n", - " 'S3CompressionType': 'None'}},\n", - " {'InputName': 'simu-1',\n", - " 'AppManaged': False,\n", - " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-1.json',\n", - " 'LocalPath': '/opt/ml/processing/input/data/simu-1',\n", - " 'S3DataType': 'S3Prefix',\n", - " 'S3InputMode': 'File',\n", - " 'S3DataDistributionType': 'FullyReplicated',\n", - " 'S3CompressionType': 'None'}},\n", - " {'InputName': 'simu-2',\n", - " 'AppManaged': False,\n", - " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-2.json',\n", - " 'LocalPath': '/opt/ml/processing/input/data/simu-2',\n", - " 'S3DataType': 'S3Prefix',\n", - " 'S3InputMode': 'File',\n", - " 'S3DataDistributionType': 'FullyReplicated',\n", - " 'S3CompressionType': 'None'}},\n", - " {'InputName': 'simu-3',\n", - " 'AppManaged': False,\n", - " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-3.json',\n", - " 'LocalPath': '/opt/ml/processing/input/data/simu-3',\n", - " 'S3DataType': 'S3Prefix',\n", - " 'S3InputMode': 'File',\n", - " 'S3DataDistributionType': 'FullyReplicated',\n", - " 'S3CompressionType': 'None'}},\n", - " {'InputName': 'simu-4',\n", - " 'AppManaged': False,\n", - " 'S3Input': {'S3Uri': 
's3://sagemaker-us-west-2-688520471316/music-streaming/data/json/simu-4.json',\n", - " 'LocalPath': '/opt/ml/processing/input/data/simu-4',\n", - " 'S3DataType': 'S3Prefix',\n", - " 'S3InputMode': 'File',\n", - " 'S3DataDistributionType': 'FullyReplicated',\n", - " 'S3CompressionType': 'None'}},\n", - " {'InputName': 'code',\n", - " 'AppManaged': False,\n", - " 'S3Input': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/sagemaker-scikit-learn-2022-04-30-01-59-49-481/input/code/preprocessing_predw.py',\n", - " 'LocalPath': '/opt/ml/processing/input/code',\n", - " 'S3DataType': 'S3Prefix',\n", - " 'S3InputMode': 'File',\n", - " 'S3DataDistributionType': 'FullyReplicated',\n", - " 'S3CompressionType': 'None'}}],\n", - " 'ProcessingOutputConfig': {'Outputs': [{'OutputName': 'processed_data',\n", - " 'S3Output': {'S3Uri': 's3://sagemaker-us-west-2-688520471316/music-streaming/data/processing',\n", - " 'LocalPath': '/opt/ml/processing/output',\n", - " 'S3UploadMode': 'EndOfJob'},\n", - " 'AppManaged': False}]},\n", - " 'ProcessingJobName': 'sagemaker-scikit-learn-2022-04-30-01-59-49-481',\n", - " 'ProcessingResources': {'ClusterConfig': {'InstanceCount': 1,\n", - " 'InstanceType': 'ml.m5.xlarge',\n", - " 'VolumeSizeInGB': 30}},\n", - " 'StoppingCondition': {'MaxRuntimeInSeconds': 86400},\n", - " 'AppSpecification': {'ImageUri': '246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:0.23-1-cpu-py3',\n", - " 'ContainerEntrypoint': ['python3',\n", - " '/opt/ml/processing/input/code/preprocessing_predw.py'],\n", - " 'ContainerArguments': ['--processing-output-filename', 'full_data.csv']},\n", - " 'RoleArn': 'arn:aws:iam::688520471316:role/service-role/AmazonSageMaker-ExecutionRole-20211229T100947',\n", - " 'ProcessingJobArn': 'arn:aws:sagemaker:us-west-2:688520471316:processing-job/sagemaker-scikit-learn-2022-04-30-01-59-49-481',\n", - " 'ProcessingJobStatus': 'Completed',\n", - " 'ProcessingEndTime': datetime.datetime(2022, 4, 30, 2, 7, 45, 433000, 
tzinfo=tzlocal()),\n", - " 'ProcessingStartTime': datetime.datetime(2022, 4, 30, 2, 3, 35, 10000, tzinfo=tzlocal()),\n", - " 'LastModifiedTime': datetime.datetime(2022, 4, 30, 2, 7, 45, 721000, tzinfo=tzlocal()),\n", - " 'CreationTime': datetime.datetime(2022, 4, 30, 1, 59, 49, 808000, tzinfo=tzlocal()),\n", - " 'ResponseMetadata': {'RequestId': 'cb40881c-69d9-4c7f-9a9c-d97153d589a5',\n", - " 'HTTPStatusCode': 200,\n", - " 'HTTPHeaders': {'x-amzn-requestid': 'cb40881c-69d9-4c7f-9a9c-d97153d589a5',\n", - " 'content-type': 'application/x-amz-json-1.1',\n", - " 'content-length': '3223',\n", - " 'date': 'Sat, 30 Apr 2022 02:08:14 GMT'},\n", - " 'RetryAttempts': 0}}" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "preprocessing_job_description" ] @@ -2358,20 +1311,9 @@ }, { "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'s3://sagemaker-us-west-2-688520471316/music-streaming/data/processing/processing_job_output.csv'" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "processing_job_output_uri = f\"{processing_job_output_path}/{processing_job_output_name}\"\n", "processing_job_output_uri" @@ -2379,24 +1321,16 @@ }, { "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "download: s3://sagemaker-us-west-2-688520471316/music-streaming/data/processing/processing_job_output.csv to data/processing_job_output.csv\n" - ] - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!aws s3 cp $processing_job_output_uri ./data" ] }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -2405,7 
+1339,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -2415,194 +1349,9 @@ }, { "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
userIduser_churnedaverage_events_weekendaverage_events_weekdaynum_songs_played_7dnum_ads_7dnum_error_7dnum_songs_played_30dnum_songs_played_90dnum_sessions...num_thumbs_upnum_add_to_playlistnum_adsnum_add_friendnum_downgradenum_upgradenum_errorpercentage_addays_since_activerepeats_ratio
0110010.094.937576.3043484135714135413551...2931407811110.0013923590.179444
1110020.070.500076.666667476104764767...41161141000.0016642650.052521
2110031.098.7500120.87500038671293867386737...27210312691190.002576660.175330
3110041.070.0000120.444444108421108410847...6830291010.001546480.076568
\n", - "

4 rows × 27 columns

\n", - "
" - ], - "text/plain": [ - " userId user_churned average_events_weekend average_events_weekday \\\n", - "0 11001 0.0 94.9375 76.304348 \n", - "1 11002 0.0 70.5000 76.666667 \n", - "2 11003 1.0 98.7500 120.875000 \n", - "3 11004 1.0 70.0000 120.444444 \n", - "\n", - " num_songs_played_7d num_ads_7d num_error_7d num_songs_played_30d \\\n", - "0 4135 7 1 4135 \n", - "1 476 1 0 476 \n", - "2 3867 12 9 3867 \n", - "3 1084 2 1 1084 \n", - "\n", - " num_songs_played_90d num_sessions ... num_thumbs_up \\\n", - "0 4135 51 ... 293 \n", - "1 476 7 ... 41 \n", - "2 3867 37 ... 272 \n", - "3 1084 7 ... 68 \n", - "\n", - " num_add_to_playlist num_ads num_add_friend num_downgrade num_upgrade \\\n", - "0 140 7 81 1 1 \n", - "1 16 1 14 1 0 \n", - "2 103 12 69 1 1 \n", - "3 30 2 9 1 0 \n", - "\n", - " num_error percentage_ad days_since_active repeats_ratio \n", - "0 1 0.001392 359 0.179444 \n", - "1 0 0.001664 265 0.052521 \n", - "2 9 0.002576 66 0.175330 \n", - "3 1 0.001546 48 0.076568 \n", - "\n", - "[4 rows x 27 columns]" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "processed_data.head(4)" ] @@ -2616,7 +1365,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -2631,7 +1380,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -2643,7 +1392,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -2666,7 +1415,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ From f2626fa62e6d1c0976476ba7a5b529af91d55f48 Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 2 May 2022 20:01:46 +0000 Subject: [PATCH 13/27] final cleanup and reformat --- ...b => 
1_cust_churn_overview_dataprep.ipynb} | 83 +++++++++++++------ .../2_cust_churn_train_deploy_infer.ipynb | 40 ++------- 2 files changed, 65 insertions(+), 58 deletions(-) rename use-cases/customer_churn/{0_cust_churn_overview_dw.ipynb => 1_cust_churn_overview_dataprep.ipynb} (91%) diff --git a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb similarity index 91% rename from use-cases/customer_churn/0_cust_churn_overview_dw.ipynb rename to use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb index d4e6436eed..76bff32faf 100644 --- a/use-cases/customer_churn/0_cust_churn_overview_dw.ipynb +++ b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb @@ -6,26 +6,24 @@ "source": [ "# Build a Customer Churn Model for Music Streaming App Users: Overview and Data Preparation\n", "\n", - "In this demo, you are going to learn how to use various SageMaker functionalities to build, train, and deploy the model from end to end, including data pre-processing steps like ingestion, cleaning and processing, feature engineering, training and hyperparameter tuning, model explainability, and eventually deploy the model. There are two parts of the demo: in part 1: Prepare Data, you will process the data with the help of Data Wrangler, then create features from the cleaned data. By the end of part 1, you will have a complete feature data set that contains all attributes built for each user, and it is ready for modeling. Then in part 2: Modeling and Reference, you will use the data set built from part 1 to find an optimal model for the use case, then test the model predictability with the test data. 
To start with Part 2, you can either read in data from the output of your Part 1 results, or use the provided 'data/full_feature_data.csv' as the input for the next steps.\n", + "## Background\n", "\n", + "This notebook is one of a sequence of notebooks that show you how to use various SageMaker functionalities to build, train, and deploy the model from end to end, including data pre-processing steps like ingestion, cleaning and processing, feature engineering, training and hyperparameter tuning, model explainability, and eventually deploy the model. There are two parts of the demo: \n", + "\n", + "1. Build a Customer Churn Model for Music Streaming App Users: Overview and Data Preparation (current notebook) - you will process the data with the help of Data Wrangler, then create features from the cleaned data. By the end of part 1, you will have a complete feature data set that contains all attributes built for each user, and it is ready for modeling.\n", + "1. Build a Customer Churn Model for Music Streaming App Users: Model Selection and Model Explainability - you will use the data set built from part 1 to find an optimal model for the use case, then test the model predictability with the test data. \n", "\n", "For how to set up the SageMaker Studio Notebook environment, please check the [onboarding video]( https://www.youtube.com/watch?v=wiDHCWVrjCU&feature=youtu.be). 
And for a list of services covered in the use case demo, please check the documentation linked in each section.\n", "\n", "\n", "## Content\n", "* [Overview](#Overview)\n", - "* [Data Selection](#2)\n", - "* [Ingest Data](#4)\n", - "* [Data Cleaning and Data Exploration](#5)\n", - "* [Pre-processing with SageMaker Data Wrangler](#7)\n", - "* [Feature Engineering with SageMaker Processing](#6)\n", - "* [Data Splitting](#8)\n", - "* [Model Selection](#9)\n", - "* [Training with SageMaker Estimator and Experiment](#10)\n", - "* [Hyperparameter Tuning with SageMaker Hyperparameter Tuning Job](#11)\n", - "* [Deploy the model with SageMaker Batch-transform](#12)\n", - "* [Model Explainability with SageMaker Clarify](#15)\n", - "* [Optional: Automate your training and model selection with SageMaker Autopilot (Console)](#13)" + "* [Data Selection](#Data-Selection)\n", + "* [Ingest Data](#Ingest-Data)\n", + "* [Data Cleaning and Data Exploration](#Data-Cleaning)\n", + "* [Pre-processing with SageMaker Data Wrangler](#Pre-processing-with-SageMaker-Data-Wrangler)\n", + "* [Feature Engineering with SageMaker Processing](#Feature-Engineering-with-SageMaker-Processing)\n", + "* [Data Splitting](#Data-Splitting)" ] }, { @@ -87,8 +85,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Data Selection\n", "\n", "You will use generated music streaming data that is simulated to imitate music streaming user behaviors. The data simulated contains 1100 users and their user behavior for one year (2019/10/28 - 2020/10/28). Data is simulated using the [EventSim](https://github.com/Interana/eventsim) and does not contain any real user data.\n", @@ -213,8 +209,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "### Ingest Data\n", "\n", "We ingest the simulated data from the public SageMaker S3 training database." 
@@ -285,8 +279,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "### Data Cleaning\n", "\n", "Due to the size of the data (~2GB), you will start exploring our data starting with a smaller sample, decide which pre-processing steps are necessary, and apply them to the whole dataset." @@ -753,8 +745,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Pre-processing with SageMaker Data Wrangler\n", "\n", "Now that you have a good understanding of your data and decided which steps are needed to pre-process your data, you can utilize the new Amazon SageMaker GUI tool **Data Wrangler**, without writing all the code for the SageMaker Processing Job.\n", @@ -948,7 +938,52 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Preprocess the Data" + "## Feature Engineering with SageMaker Processing\n", + "\n", + "\n", + "For user churn analysis, usually, you can consider build features from the following aspects:\n", + "\n", + "* Generate base features:\n", + " * user behavior features (listening behavior, app behavior).\n", + " * customer demographic features.\n", + " * customer support features (interactions, ratings, etc.)\n", + "* Formulate time series as features:\n", + " * construct streaming time as time series.\n", + " * build features in the different time windows (e.g. total songs listened in the last 7 days, 30 days, 180 days, etc.)\n", + " \n", + "For this use case, after exploring the data and with all the findings you gathered, now is the time to create features used for your model. Since the data set is time series, you can enrich your features by adding a time factor to it: e.g., for the total number of songs listened, you can create features like total songs listened in the last 7 days, last 30 days, last 90 days, last 180 days, etc. 
The features built for these use cases will be at the user level - each row represents one user, and will include the following:\n", + "\n", + "* daily features:\n", + " * average_events_weekday (numerical): average number of events per day during weekday\n", + " * average_events_weekend (numerical): average number of events per day during the weekend\n", + " * num_ads_7d: number of ads in last 7 days\n", + " * num_error_7d: total errors encountered in last 7 days\n", + " * num_songs_played_7d: total songs played in last 7 days\n", + " * num_songs_played_30d: total songs played in last 30 days\n", + " * num_songs_played_90d: total songs played in last 90 days\n", + "* user features:\n", + " * num_artists (numerical): number of artists the user has listened to\n", + " * num_songs (numerical): number of songs played\n", + " * num_ads (numerical): number of ads played\n", + " * num_thumbsup (numerical): number of times the user likes a song\n", + " * num_thumbsdown (numerical): number of times the user dislikes a song\n", + " * num_playlist (numerical): number of times user adds a song to a playlist\n", + " * num_addfriend (numerical): number of times user adds a friend\n", + " * num_error (numerical): number of times user encountered an error\n", + " * user_downgrade (binary): user has downgraded plan\n", + " * user_upgrade (binary): user has upgraded plan\n", + " * percentage_song: percentage of the user's action is 'NextSong' (only listens to songs) \n", + " * percentage_ad: percentage of the user's action is 'Roll Advert'\n", + " * repeats_ratio: percentage of total songs that are repeats\n", + " * days_since_active: days since the user registered and leave (if the user cancels)\n", + "* Session features:\n", + " * num_sessions: number of total sessions\n", + " * avg_time_per_session: average time spent per session\n", + " * avg_events_per_session: average number of events per session\n", + " * avg_gap_between_session: average time between sessions\n", + " \n", + 
"The following function will create the processing job with SageMaker Processing, a new Python SDK that lets data scientists and ML engineers easily run preprocessing, postprocessing and model evaluation workloads on Amazon SageMaker. This SDK uses SageMaker’s built-in container for scikit-learn, possibly the most popular library for data set transformation.\n", + "You can find a complete guide to the SageMaker Processing job in [this blog](https://aws.amazon.com/blogs/aws/amazon-sagemaker-processing-fully-managed-data-processing-and-model-evaluation/)." ] }, { @@ -975,7 +1010,7 @@ "outputs": [], "source": [ "### SAVE THE OUTPUT FILE NAME FROM PROCESSING JOB\n", - "processing_job_output_name = 'processing_job_output.csv'" + "processing_job_output_name = \"processing_job_output.csv\"" ] }, { @@ -1293,8 +1328,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "### Data Splitting\n", "\n", "You formulated the use case as a classification problem on user level, so you can randomly split your data from last step into train/validation/test. If you want to predict \"will user X churn in the next Y days\" on per user per day level, you should think about spliting data in chronological order instead of random. 
\n", diff --git a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb index 0fe4474302..f60cd4b063 100644 --- a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb +++ b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb @@ -13,19 +13,12 @@ "\n", "\n", "## Content\n", - "* [Overview](#Overview)\n", - "* [Data Selection](#2)\n", - "* [Ingest Data](#4)\n", - "* [Data Cleaning and Data Exploration](#5)\n", - "* [Pre-processing with SageMaker Data Wrangler](#7)\n", - "* [Feature Engineering with SageMaker Processing](#6)\n", - "* [Data Splitting](#8)\n", - "* [Model Selection](#9)\n", - "* [Training with SageMaker Estimator and Experiment](#10)\n", - "* [Hyperparameter Tuning with SageMaker Hyperparameter Tuning Job](#11)\n", - "* [Deploy the model with SageMaker Batch-transform](#12)\n", - "* [Model Explainability with SageMaker Clarify](#15)\n", - "* [Optional: Automate your training and model selection with SageMaker Autopilot (Console)](#13)" + "* [Model Selection](#Model-Selection)\n", + "* [Training with SageMaker Estimator and Experiment](#Training-with-SageMaker-Estimator-and-Experiment)\n", + "* [Hyperparameter Tuning with SageMaker Hyperparameter Tuning Job](#Hyperparameter-Tuning-with-SageMaker-Hyperparameter-Tuning-Job)\n", + "* [Deploy the model with SageMaker Batch-transform](#Deploy-the-model-with-SageMaker-Batch-transform)\n", + "* [Model Explainability with SageMaker Clarify](#Model-Explainability-with-SageMaker-Clarify)\n", + "* [Optional: Automate your training and model selection with SageMaker Autopilot (Console)](#Optional:-Automate-your-training-and-model-selection-with-SageMaker-Autopilot-(Console))" ] }, { @@ -86,8 +79,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Model Selection\n", "\n", "You can experiment with all your model choices and see which one gives better results. 
A few things to note when you choose algorithms:\n", @@ -104,8 +95,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Training with SageMaker Estimator and Experiment\n", "\n", "Once you decide on a range of models you want to experiment with, you can start training and comparing model results to choose the best one. A few things left for you to make a decision:\n", @@ -173,7 +162,7 @@ "##### Alternative: copy data from a public S3 bucket to your own bucket\n", "##### data file should include full_data.csv and sample.json\n", "#### cell 5 - 7 is not needed; the processing job before data wrangler screenshots is not needed\n", - "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data.zip ./data/raw/customer-churn-data.zip" + "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data-v1.zip ./data/raw/customer-churn-data.zip" ] }, { @@ -412,8 +401,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Hyperparameter Tuning with SageMaker Hyperparameter Tuning Job\n", "\n", "Now that you understand how training one model works and how to create a SageMaker experiment, and selected the XGBoost model as the final model, you will need to fine-tune the hyperparameters for the best model performances. For a xgboost model, you can start with defining ranges for the eta, alpha, min_child_weight, and max_depth. You can check the [documentation when considering what haperparameter to tune](https://docs.aws.amazon.com/sagemaker/latest/dg/automatic-model-tuning-considerations.html)." @@ -590,7 +577,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", "## Deploy the model with SageMaker Batch-transform\n", "\n", "You can directly deploy the best model from your hyperparameter tuning job by getting the best training job from your tuner." 
@@ -801,8 +787,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Model Explainability with SageMaker Clarify\n", "\n", "You can visualize which feature contributes most to your prediction results by using the new SageMaker feature SageMaker Clarify. It will provide SHAP values which measures the importance of a feature by replacing it with a dummy and seeing how it affects the prediciton. (In reality, SHAP is smart about the choice of dummy and also takes into account feature interactions.) For a more general overview of model interpretability, see [this post](https://towardsdatascience.com/guide-to-interpretable-machine-learning-d40e8a64b6cf). For other capabilities of SageMaker Clarify, please see the [documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-fairness-and-explainability.html) and the [example notebook](https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker_processing/fairness_and_explainability/fairness_and_explainability.ipynb)." @@ -901,8 +885,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\n", - "\n", "## Optional: Automate your training and model selection with SageMaker Autopilot (Console)\n", "\n", "With [SageMaker Autopilot](https://aws.amazon.com/blogs/aws/amazon-sagemaker-autopilot-fully-managed-automatic-machine-learning/), you can skip all the steps above and let it automatically tracks the inputs, parameters, configurations, and results of your iterations as trials. Go to SageMaker Experiments List on the left navigation pane, then choose **Create Experiment**. You will be directed to the experiment creating page. All you need to do is do give the Experiment job a name, specify your input and output data location, specify your target variable, and choose your ML problem type (classification or regression), or leave it as auto.\n", @@ -1003,17 +985,9 @@ "\n", "The data used in this notebook is simulated using the [EventSim](https://github.com/Interana/eventsim)." 
] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { - "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", From 55a4ac3924393d06347fd37b69ce837195420c15 Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 2 May 2022 20:04:15 +0000 Subject: [PATCH 14/27] delete file not needed anymore --- .../1_cust_churn_dataprep.ipynb | 1033 ----------------- 1 file changed, 1033 deletions(-) delete mode 100644 use-cases/customer_churn/1_cust_churn_dataprep.ipynb diff --git a/use-cases/customer_churn/1_cust_churn_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_dataprep.ipynb deleted file mode 100644 index 7221e8c63e..0000000000 --- a/use-cases/customer_churn/1_cust_churn_dataprep.ipynb +++ /dev/null @@ -1,1033 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Build a Customer Churn Model for Music Streaming App Users: Date Pre-processing with SageMaker Data Wrangler and Processing Job\n", - "\n", - "In this demo, you are going to learn how to use various SageMaker functionalities to build, train, and deploy the model from end to end, including data pre-processing steps like ingestion, cleaning and processing, feature engineering, training and hyperparameter tuning, model explainability, and eventually deploy the model. There are two parts of the demo: in part 1: Prepare Data, you will process the data with the help of Data Wrangler, then create features from the cleaned data. By the end of part 1, you will have a complete feature data set that contains all attributes built for each user, and it is ready for modeling. Then in part 2: Modeling and Reference, you will use the data set built from part 1 to find an optimal model for the use case, then test the model predictability with the test data. 
To start with Part 2, you can either read in data from the output of your Part 1 results, or use the provided 'data/full_feature_data.csv' as the input for the next steps.\n", - "\n", - "\n", - "For how to set up the SageMaker Studio Notebook environment, please check the [onboarding video]( https://www.youtube.com/watch?v=wiDHCWVrjCU&feature=youtu.be). And for a list of services covered in the use case demo, please check the documentation linked in each section.\n", - "\n", - "\n", - "## Content\n", - "\n", - "* [Overview](#Overview)\n", - "* [Data Selection](#2)\n", - "* [Ingest Data](#4)\n", - "* [Data Cleaning and Data Exploration](#5)\n", - "* [Pre-processing with SageMaker Data Wrangler](#7)\n", - "* [Feature Engineering with SageMaker Processing](#6)\n", - "* [Data Splitting](#8)\n", - "* [Model Selection](#9)\n", - "* [Training with SageMaker Estimator and Experiment](#10)\n", - "* [Hyperparameter Tuning with SageMaker Hyperparameter Tuning Job](#11)\n", - "* [Deploy the model with SageMaker Batch-transform](#12)\n", - "* [Model Explainability with SageMaker Clarify](#15)\n", - "* [Optional: Automate your training and model selection with SageMaker Autopilot (Console)](#13)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "\n", - "### What is Customer Churn and why is it important for businesses?\n", - "\n", - "Customer churn, or customer retention/attrition, means a customer has the tendency to leave and stop paying for a business. It is one of the primary metrics companies want to track to get a sense of their customer satisfaction, especially for a subscription-based business model. 
The company can track churn rate (defined as the percentage of customers churned during a period) as a health indicator for the business, but we would love to identify the at-risk customers before they churn and offer appropriate treatment to keep them with the business, and this is where machine learning comes into play.\n", - "\n", - "### Use Cases for Customer Churn\n", - "\n", - "Any subscription-based business would track customer churn as one of the most critical Key Performance Indicators (KPIs). Such companies and industries include Telecom companies (cable, cell phone, internet, etc.), digital subscriptions of media (news, forums, blogposts platforms, etc.), music and video streaming services, and other Software as a Service (SaaS) providers (e-commerce, CRM, Mar-Tech, cloud computing, video conference provider, and visualization and data science tools, etc.)\n", - "\n", - "### Define Business problem\n", - "\n", - "To start with, here are some common business problems to consider depending on your specific use cases and your focus:\n", - " * Will this customer churn (cancel the plan, cancel the subscription)?\n", - " * Will this customer downgrade a pricing plan?\n", - " * For a subscription business model, will a customer renew his/her subscription?\n", - "\n", - "### Machine learning problem formulation\n", - "\n", - "#### Classification: will this customer churn?\n", - "\n", - "To goal of classification is to identify the at-risk customers and sometimes their unusual behavior, such as: will this customer churn or downgrade their plan? Is there any unusual behavior for a customer? The latter question can be formulated as an anomaly detection problem.\n", - "\n", - "#### Time Series: will this customer churn in the next X months? 
When will this customer churn?\n", - "\n", - "You can further explore your users by formulating the problem as a time series one and detect when will the customer churn.\n", - "\n", - "### Data Requirements\n", - "\n", - "#### Data collection Sources\n", - "\n", - "Some most common data sources used to construct a data set for churn analysis are:\n", - "* Customer Relationship Management platform (CRM), \n", - "* engagement and usage data (analytics services), \n", - "* passive feedback (ratings based on your request), and active feedback (customer support request, feedback on social media and review platforms).\n", - "\n", - "#### Construct a Data Set for Churn Analysis\n", - "\n", - "Most raw data collected from the sources mentioned above are huge and often needs a lot of cleaning and pre-processing. For example, usage data is usually event-based log data and can be more than a few gigabytes every day; you can aggregate the data to user-level daily for further analysis. Feedback and review data are mostly text data, so you would need to clean and pre-process the natural language data to be normalized, machine-readable data. If you are joining multiple data sources (especially from different platforms) together, you would want to make sure all data points are consistent, and the user identity can be matched across different platforms.\n", - " \n", - "#### Challenges with Customer Churn\n", - "\n", - "* Business related\n", - " * Importance of domain knowledge: this is critical when you start building features for the machine learning model. 
It is important to understand the business enough to decide which features would trigger retention.\n", - "* Data issues\n", - " * fewer churn data available (imbalanced classes): data for churn analysis is often very imbalanced as most of the customers of a business are happy customers (usually).\n", - " * User identity mapping problem: if you are joining data from different platforms (CRM, email, feedback, mobile app, and website usage data), you would want to make sure user A is recognized as the same user across multiple platforms. There are third-party solutions that help you tackle this problem.\n", - " * Not collecting the right data for the use case or Lacking enough data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Use Case Study - Music Streaming User Churn Prediction\n", - "\n", - "\n", - "\n", - "## Data Selection\n", - "\n", - "You will use generated music streaming data that is simulated to imitate music streaming user behaviors. The data simulated contains 1100 users and their user behavior for one year (2019/10/28 - 2020/10/28). Data is simulated using the [EventSim](https://github.com/Interana/eventsim) and does not contain any real user data.\n", - "\n", - "* Observation window: you will use 1 year of data to generate predictions.\n", - "* Explanation of fields:\n", - " * `ts`: event UNIX timestamp\n", - " * `userId`: a randomly assigned unique user id\n", - " * `sessionId`: a randomly assigned session id unique to each user\n", - " * `page`: event taken by the user, e.g. 
\"next song\", \"upgrade\", \"cancel\"\n", - " * `auth`: whether the user is a logged-in user\n", - " * `method`: request method, GET or PUT\n", - " * `status`: request status\n", - " * `level`: if the user is a free or paid user\n", - " * `itemInSession`: event happened in the session\n", - " * `location`: location of the user's IP address\n", - " * `userAgent`: agent of the user's device\n", - " * `lastName`: user's last name\n", - " * `firstName`: user's first name\n", - " * `registration`: user's time of registration\n", - " * `gender`: gender of the user\n", - " * `artist`: artist of the song the user is playing at the event\n", - " * `song`: song title the user is playing at the event\n", - " * `length`: length of the session\n", - " \n", - " \n", - " * the data will be downloaded from Github and contained in an [_Amazon Simple Storage Service_](https://aws.amazon.com/s3/) (Amazon S3) bucket." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For this specific use case, you will focus on a solution to predict whether a customer will cancel the subscription. Some possible expansion of the work includes:\n", - "* predict plan downgrading\n", - "* when a user will churn\n", - "* add song attributes (genre, playlist, charts) and user attributes (demographics) to the data\n", - "* add user feedback and customer service requests to the data\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Architecture Diagram\n", - "\n", - "The services covered in the use case and an architecture diagram is shown below.\n", - "\n", - "
\n", - " \n", - "\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "## The output from Data Wrangler is also provided in the github repo (data/data_wrangler_output.csv).\n", - "## You can also read the provided csv directly." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Feature engineering with SageMaker Processing Job\n", - "\n", - "\n", - "For user churn analysis, usually, you can consider building features from the following aspects:\n", - "\n", - "* Generate base features:\n", - " * user behavior features (listening behavior, app behavior).\n", - " * customer demographic features.\n", - " * customer support features (interactions, ratings, etc.)\n", - "* Formulate time series as features:\n", - " * construct streaming time as time series.\n", - " * build features in the different time windows (e.g. total songs listened in the last 7 days, 30 days, 180 days, etc.)\n", - " \n", - "For this use case, after exploring the data and with all the findings you gathered, now is the time to create features used for your model. Since the data set is time series, you can enrich your features by adding a time factor to it: e.g., for the total number of songs listened, you can create features like total songs listened in the last 7 days, last 30 days, last 90 days, last 180 days, etc. 
The features built for these use cases will be at the user level - each row represents one user, and will include the following:\n", - "\n", - "* daily features:\n", - " * average_events_weekday (numerical): average number of events per day during weekday\n", - " * average_events_weekend (numerical): average number of events per day during the weekend\n", - " * num_ads_7d: number of ads in last 7 days\n", - " * num_error_7d: total errors encountered in last 7 days\n", - " * num_songs_played_7d: total songs played in last 7 days\n", - " * num_songs_played_30d: total songs played in last 30 days\n", - " * num_songs_played_90d: total songs played in last 90 days\n", - "* user features:\n", - " * num_artists (numerical): number of artists the user has listened to\n", - " * num_songs (numerical): number of songs played\n", - " * num_ads (numerical): number of ads played\n", - " * num_thumbsup (numerical): number of times the user likes a song\n", - " * num_thumbsdown (numerical): number of times the user dislikes a song\n", - " * num_playlist (numerical): number of times user adds a song to a playlist\n", - " * num_addfriend (numerical): number of times user adds a friend\n", - " * num_error (numerical): number of times user encountered an error\n", - " * user_downgrade (binary): user has downgraded plan\n", - " * user_upgrade (binary): user has upgraded plan\n", - " * percentage_song: percentage of the user's action is 'NextSong' (only listens to songs) \n", - " * percentage_ad: percentage of the user's action is 'Roll Advert'\n", - " * repeats_ratio: percentage of total songs that are repeats\n", - " * days_since_active: days since the user registered and leave (if the user cancels)\n", - "* Session features:\n", - " * num_sessions: number of total sessions\n", - " * avg_time_per_session: average time spent per session\n", - " * avg_events_per_session: average number of events per session\n", - " * avg_gap_between_session: average time between sessions\n", - " \n", - 
"The following function will create the processing job with SageMaker Processing, a new Python SDK that lets data scientists and ML engineers easily run preprocessing, postprocessing and model evaluation workloads on Amazon SageMaker. This SDK uses SageMaker’s built-in container for scikit-learn, possibly the most popular library for data set transformation.\n", - "You can find a complete guide to the SageMaker Processing job in [this blog](https://aws.amazon.com/blogs/aws/amazon-sagemaker-processing-fully-managed-data-processing-and-model-evaluation/)." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[31mERROR: pg8000 1.17.0 has requirement scramp==1.2.0, but you'll have scramp 1.2.2 which is incompatible.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install -q pandas=='1.1.5'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# !pip -uQ install s3fs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%store -r\n", - "%store" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "processing_output_filename" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "import sagemaker\n", - "import json\n", - "import pandas as pd\n", - "import numpy as np\n", - "import glob\n", - "import boto3" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "sagemaker_session = sagemaker.Session()\n", - "s3 = sagemaker_session.boto_session.resource(\"s3\")\n", - "\n", - "region = boto3.Session().region_name\n", - "role = sagemaker.get_execution_role()\n", - "smclient = boto3.Session().client(\"sagemaker\")\n", - "\n", - "output_path = 
f\"s3://{bucket}/{prefix}/data/processing/\"" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [], - "source": [ - "from sagemaker.sklearn.processing import SKLearnProcessor\n", - "\n", - "sklearn_processor = SKLearnProcessor(\n", - " # framework_version='0.20.0',\n", - " framework_version=\"0.23-1\",\n", - " role=role,\n", - " instance_type=\"ml.m5.xlarge\",\n", - " instance_count=1,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "### SAVE THE OUTPUT FILE NAME FROM PROCESSING JOB\n", - "processing_job_output_name = 'processing_job_output.csv'\n", - "%store processing_job_output_name" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting preprocessing.py\n" - ] - } - ], - "source": [ - "%%writefile preprocessing.py\n", - "\n", - "import os\n", - "import warnings\n", - "import time\n", - "import pandas as pd\n", - "import argparse\n", - "import subprocess\n", - "import sys\n", - "\n", - "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"awswrangler\"])\n", - "import awswrangler as wr\n", - "\n", - "start_time = time.time()\n", - "\n", - "if __name__ == \"__main__\":\n", - " parser = argparse.ArgumentParser()\n", - " parser.add_argument(\"--dw-output-path\")\n", - " parser.add_argument(\"--processing-output-filename\")\n", - "\n", - " args, _ = parser.parse_known_args()\n", - " print(\"Received arguments {}\".format(args))\n", - "\n", - " data_s3_uri = args.dw_output_path\n", - " output_filename = args.processing_output_filename\n", - "\n", - " # data_path = os.path.join('/opt/ml/processing/input', dw_output_name)\n", - " # df = pd.read_csv(data_path)\n", - " df = wr.s3.read_csv(path=data_s3_uri, dataset=True)\n", - " ## convert to time\n", - " df[\"date\"] = pd.to_datetime(df[\"ts\"], unit=\"ms\")\n", - " 
df[\"ts_dow\"] = df[\"date\"].dt.weekday\n", - " df[\"ts_date_day\"] = df[\"date\"].dt.date\n", - " df[\"ts_is_weekday\"] = [1 if x in [0, 1, 2, 3, 4] else 0 for x in df[\"ts_dow\"]]\n", - " df[\"registration_ts\"] = pd.to_datetime(df[\"registration\"], unit=\"ms\").dt.date\n", - " ## add labels\n", - " df[\"churned_event\"] = [1 if x == \"Cancellation Confirmation\" else 0 for x in df[\"page\"]]\n", - " df[\"user_churned\"] = df.groupby(\"userId\")[\"churned_event\"].transform(\"max\")\n", - "\n", - " ## convert pages categorical variables to numerical\n", - " events_list = [\n", - " \"NextSong\",\n", - " \"Thumbs Down\",\n", - " \"Thumbs Up\",\n", - " \"Add to Playlist\",\n", - " \"Roll Advert\",\n", - " \"Add Friend\",\n", - " \"Downgrade\",\n", - " \"Upgrade\",\n", - " \"Error\",\n", - " ]\n", - " usage_column_name = []\n", - " for event in events_list:\n", - " event_name = \"_\".join(event.split()).lower()\n", - " usage_column_name.append(event_name)\n", - " df[event_name] = [1 if x == event else 0 for x in df[\"page\"]]\n", - " ## feature engineering\n", - " # average_events_weekday (numerical): average number of events per day during weekday\n", - " # average_events_weekend (numerical): average number of events per day during the weekend\n", - " base_df = (\n", - " df.groupby([\"userId\", \"ts_date_day\", \"ts_is_weekday\"])\n", - " .agg({\"page\": \"count\"})\n", - " .groupby([\"userId\", \"ts_is_weekday\"])[\"page\"]\n", - " .mean()\n", - " .unstack(fill_value=0)\n", - " .reset_index()\n", - " .rename(columns={0: \"average_events_weekend\", 1: \"average_events_weekday\"})\n", - " )\n", - "\n", - " # num_ads_7d, num_songs_played_7d, num_songs_played_30d, num_songs_played_90d, num_ads_7d, num_error_7d\n", - " base_df_daily = (\n", - " df.groupby([\"userId\", \"ts_date_day\"])\n", - " .agg({\"page\": \"count\", \"nextsong\": \"sum\", \"roll_advert\": \"sum\", \"error\": \"sum\"})\n", - " .reset_index()\n", - " )\n", - " feature34 = (\n", - " 
base_df_daily.groupby([\"userId\", \"ts_date_day\"])\n", - " .tail(7)\n", - " .groupby([\"userId\"])\n", - " .agg({\"nextsong\": \"sum\", \"roll_advert\": \"sum\", \"error\": \"sum\"})\n", - " .reset_index()\n", - " .rename(\n", - " columns={\n", - " \"nextsong\": \"num_songs_played_7d\",\n", - " \"roll_advert\": \"num_ads_7d\",\n", - " \"error\": \"num_error_7d\",\n", - " }\n", - " )\n", - " )\n", - " feature5 = (\n", - " base_df_daily.groupby([\"userId\", \"ts_date_day\"])\n", - " .tail(30)\n", - " .groupby([\"userId\"])\n", - " .agg({\"nextsong\": \"sum\"})\n", - " .reset_index()\n", - " .rename(columns={\"nextsong\": \"num_songs_played_30d\"})\n", - " )\n", - " feature6 = (\n", - " base_df_daily.groupby([\"userId\", \"ts_date_day\"])\n", - " .tail(90)\n", - " .groupby([\"userId\"])\n", - " .agg({\"nextsong\": \"sum\"})\n", - " .reset_index()\n", - " .rename(columns={\"nextsong\": \"num_songs_played_90d\"})\n", - " )\n", - " # num_artists, num_songs, num_ads, num_thumbsup, num_thumbsdown, num_playlist, num_addfriend, num_error, user_downgrade,\n", - " # user_upgrade, percentage_ad, days_since_active\n", - " base_df_user = (\n", - " df.groupby([\"userId\"])\n", - " .agg(\n", - " {\n", - " \"page\": \"count\",\n", - " \"nextsong\": \"sum\",\n", - " \"artist\": \"nunique\",\n", - " \"song\": \"nunique\",\n", - " \"thumbs_down\": \"sum\",\n", - " \"thumbs_up\": \"sum\",\n", - " \"add_to_playlist\": \"sum\",\n", - " \"roll_advert\": \"sum\",\n", - " \"add_friend\": \"sum\",\n", - " \"downgrade\": \"max\",\n", - " \"upgrade\": \"max\",\n", - " \"error\": \"sum\",\n", - " \"ts_date_day\": \"max\",\n", - " \"registration_ts\": \"min\",\n", - " \"user_churned\": \"max\",\n", - " }\n", - " )\n", - " .reset_index()\n", - " )\n", - " base_df_user[\"percentage_ad\"] = base_df_user[\"roll_advert\"] / base_df_user[\"page\"]\n", - " base_df_user[\"days_since_active\"] = (\n", - " base_df_user[\"ts_date_day\"] - base_df_user[\"registration_ts\"]\n", - " ).dt.days\n", - " # 
repeats ratio\n", - " base_df_user[\"repeats_ratio\"] = 1 - base_df_user[\"song\"] / base_df_user[\"nextsong\"]\n", - "\n", - " # num_sessions, avg_time_per_session, avg_events_per_session,\n", - " base_df_session = (\n", - " df.groupby([\"userId\", \"sessionId\"])\n", - " .agg({\"length\": \"sum\", \"page\": \"count\", \"date\": \"min\"})\n", - " .reset_index()\n", - " )\n", - " base_df_session[\"prev_session_ts\"] = base_df_session.groupby([\"userId\"])[\"date\"].shift(1)\n", - " base_df_session[\"gap_session\"] = (\n", - " base_df_session[\"date\"] - base_df_session[\"prev_session_ts\"]\n", - " ).dt.days\n", - " user_sessions = (\n", - " base_df_session.groupby(\"userId\")\n", - " .agg({\"sessionId\": \"count\", \"length\": \"mean\", \"page\": \"mean\", \"gap_session\": \"mean\"})\n", - " .reset_index()\n", - " .rename(\n", - " columns={\n", - " \"sessionId\": \"num_sessions\",\n", - " \"length\": \"avg_time_per_session\",\n", - " \"page\": \"avg_events_per_session\",\n", - " \"gap_session\": \"avg_gap_between_session\",\n", - " }\n", - " )\n", - " )\n", - "\n", - " # merge features together\n", - " base_df[\"userId\"] = base_df[\"userId\"].astype(\"int\")\n", - " final_feature_df = base_df.merge(feature34, how=\"left\", on=\"userId\")\n", - " final_feature_df = final_feature_df.merge(feature5, how=\"left\", on=\"userId\")\n", - " final_feature_df = final_feature_df.merge(feature6, how=\"left\", on=\"userId\")\n", - " final_feature_df = final_feature_df.merge(user_sessions, how=\"left\", on=\"userId\")\n", - " final_feature_df = final_feature_df.merge(base_df_user, how=\"left\", on=\"userId\")\n", - "\n", - " final_feature_df = final_feature_df.fillna(0)\n", - " # renaming columns\n", - " final_feature_df.columns = [\n", - " \"userId\",\n", - " \"average_events_weekend\",\n", - " \"average_events_weekday\",\n", - " \"num_songs_played_7d\",\n", - " \"num_ads_7d\",\n", - " \"num_error_7d\",\n", - " \"num_songs_played_30d\",\n", - " \"num_songs_played_90d\",\n", - 
" \"num_sessions\",\n", - " \"avg_time_per_session\",\n", - " \"avg_events_per_session\",\n", - " \"avg_gap_between_session\",\n", - " \"num_events\",\n", - " \"num_songs\",\n", - " \"num_artists\",\n", - " \"num_unique_songs\",\n", - " \"num_thumbs_down\",\n", - " \"num_thumbs_up\",\n", - " \"num_add_to_playlist\",\n", - " \"num_ads\",\n", - " \"num_add_friend\",\n", - " \"num_downgrade\",\n", - " \"num_upgrade\",\n", - " \"num_error\",\n", - " \"ts_date_day\",\n", - " \"registration_ts\",\n", - " \"user_churned\",\n", - " \"percentage_ad\",\n", - " \"days_since_active\",\n", - " \"repeats_ratio\",\n", - " ]\n", - " # only keep created feature columns\n", - " final_feature_df = final_feature_df[\n", - " [\n", - " \"userId\",\n", - " \"user_churned\",\n", - " \"average_events_weekend\",\n", - " \"average_events_weekday\",\n", - " \"num_songs_played_7d\",\n", - " \"num_ads_7d\",\n", - " \"num_error_7d\",\n", - " \"num_songs_played_30d\",\n", - " \"num_songs_played_90d\",\n", - " \"num_sessions\",\n", - " \"avg_time_per_session\",\n", - " \"avg_events_per_session\",\n", - " \"avg_gap_between_session\",\n", - " \"num_events\",\n", - " \"num_songs\",\n", - " \"num_artists\",\n", - " \"num_thumbs_down\",\n", - " \"num_thumbs_up\",\n", - " \"num_add_to_playlist\",\n", - " \"num_ads\",\n", - " \"num_add_friend\",\n", - " \"num_downgrade\",\n", - " \"num_upgrade\",\n", - " \"num_error\",\n", - " \"percentage_ad\",\n", - " \"days_since_active\",\n", - " \"repeats_ratio\",\n", - " ]\n", - " ]\n", - "\n", - " print(\"shape of file to append:\\t\\t{}\".format(final_feature_df.shape))\n", - " iter_end_time = time.time()\n", - " end_time = time.time()\n", - " print(\"minutes elapsed: {}\".format(str((end_time - start_time) / 60)))\n", - "\n", - " final_features_output_path = os.path.join(\"/opt/ml/processing/output\", output_filename)\n", - " print(\"Saving processed data to {}\".format(final_features_output_path))\n", - " final_feature_df.to_csv(final_features_output_path, 
header=True, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "output_path = processing_output_filename" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "%%time\n", - "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", - "\n", - "processing_job_output_path = f\"s3://{bucket}/{prefix}/data/processing\"\n", - "\n", - "sklearn_processor.run(\n", - " code=\"preprocessing.py\",\n", - " outputs=[\n", - " ProcessingOutput(\n", - " output_name=\"processed_data\",\n", - " source=\"/opt/ml/processing/output\",\n", - " destination=processing_job_output_path,\n", - " )\n", - " ],\n", - " arguments=[\n", - " \"--dw-output-path\",\n", - " processing_job_output_path,\n", - " \"--processing-output-filename\",\n", - " processing_job_output_name,\n", - " ],\n", - ")\n", - "\n", - "preprocessing_job_description = sklearn_processor.jobs[-1].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "preprocessing_job_description" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Congratulations! You have completed Part1: Prepare the data, and now you should have created the complete feature set that is ready for modeling. You can proceed to Part2: modeling and Reference." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## PART 2: Modeling and Reference\n", - "\n", - "now that you have created the complete feature set, you can start to explore and find a best-working model for your churn use case. 
By the end of part 2, you will select an algorithm, find the best set of hyperparameters for the model, examine how well the model performs, and finally find the top influential features.\n", - "\n", - "To start with Part 2, you can either read in data from the output of your Part 1 results, or use the provided 'data/full_feature_data.csv' as the input (variable dataframe `processed_data`) for the next steps. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "### Data Splitting\n", - "\n", - "You formulated the use case as a classification problem on user level, so you can randomly split your data from last step into train/validation/test. If you want to predict \"will user X churn in the next Y days\" on per user per day level, you should think about splitting data in chronological order instead of random. \n", - "\n", - "You should split the data and make sure that data of both classes exist in your train, validation and test sets, to make sure both classes are represented in your data. 
" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Find the output of Processing Job" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "processing_job_output_uri = f\"{processing_job_output_path}/{processing_job_output_name}\"\n", - "processing_job_output_uri" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!aws s3 cp $processing_job_output_uri ./data" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "metadata": {}, - "outputs": [], - "source": [ - "processed_data = pd.read_csv(processing_job_output_uri)" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "# Optional: you can also load the processed data from the provided feature set\n", - "# processed_data = pd.read_csv('./data/full_feature_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
userIduser_churnedaverage_events_weekendaverage_events_weekdaynum_songs_played_7dnum_ads_7dnum_error_7dnum_songs_played_30dnum_songs_played_90dnum_sessions...num_thumbs_upnum_add_to_playlistnum_adsnum_add_friendnum_downgradenum_upgradenum_errorpercentage_addays_since_activerepeats_ratio
0110010.0189.875152.60869682701428270827051...586280141621120.0013923590.589722
1110020.0141.000153.333333952209529527...82322281000.0016642650.526261
2110031.0197.500241.750000773424187734773437...5442062413811180.002576660.587665
3110041.0140.000240.888889216842216821687...136604181020.001546480.538284
\n", - "

4 rows × 27 columns

\n", - "
" - ], - "text/plain": [ - " userId user_churned average_events_weekend average_events_weekday \\\n", - "0 11001 0.0 189.875 152.608696 \n", - "1 11002 0.0 141.000 153.333333 \n", - "2 11003 1.0 197.500 241.750000 \n", - "3 11004 1.0 140.000 240.888889 \n", - "\n", - " num_songs_played_7d num_ads_7d num_error_7d num_songs_played_30d \\\n", - "0 8270 14 2 8270 \n", - "1 952 2 0 952 \n", - "2 7734 24 18 7734 \n", - "3 2168 4 2 2168 \n", - "\n", - " num_songs_played_90d num_sessions ... num_thumbs_up \\\n", - "0 8270 51 ... 586 \n", - "1 952 7 ... 82 \n", - "2 7734 37 ... 544 \n", - "3 2168 7 ... 136 \n", - "\n", - " num_add_to_playlist num_ads num_add_friend num_downgrade num_upgrade \\\n", - "0 280 14 162 1 1 \n", - "1 32 2 28 1 0 \n", - "2 206 24 138 1 1 \n", - "3 60 4 18 1 0 \n", - "\n", - " num_error percentage_ad days_since_active repeats_ratio \n", - "0 2 0.001392 359 0.589722 \n", - "1 0 0.001664 265 0.526261 \n", - "2 18 0.002576 66 0.587665 \n", - "3 2 0.001546 48 0.538284 \n", - "\n", - "[4 rows x 27 columns]" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "processed_data.head(4)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Split data to train/validation/test by 70/20/10" - ] - }, - { - "cell_type": "code", - "execution_count": 50, - "metadata": {}, - "outputs": [], - "source": [ - "data = processed_data.sample(frac=1, random_state=1729)\n", - "grouped_df = data.groupby(\"user_churned\")\n", - "arr_list = [np.split(g, [int(0.7 * len(g)), int(0.9 * len(g))]) for i, g in grouped_df]\n", - "\n", - "train_data = pd.concat([t[0] for t in arr_list])\n", - "validation_data = pd.concat([t[1] for t in arr_list])\n", - "test_data = pd.concat([v[2] for v in arr_list])" - ] - }, - { - "cell_type": "code", - "execution_count": 51, - "metadata": {}, - "outputs": [], - "source": [ - "def process_data(data, name, header=False):\n", - " data = 
data.drop(columns=[\"userId\"])\n", - " data = pd.concat([data[\"user_churned\"], data.drop([\"user_churned\"], axis=1)], axis=1)\n", - " data.to_csv(name, header=header, index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "metadata": {}, - "outputs": [], - "source": [ - "process_data(train_data, \"data/train_updated.csv\")\n", - "process_data(validation_data, \"data/validation_updated.csv\")\n", - "process_data(test_data, \"data/test_updated.csv\")\n", - "\n", - "process_data(train_data, \"data/train_w_header.csv\", header=True)\n", - "process_data(validation_data, \"data/validation_w_header.csv\", header=True)\n", - "process_data(test_data, \"data/test_w_header.csv\", header=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Save split data to S3\n", - "The split data is provided in the /data folder. You can also upload the provided files (`data/train_updated.csv`,`data/validation_updated.csv`, `data/test_updated.csv`) and proceed to the next step. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "s3_input_train = (\n", - " boto3.Session()\n", - " .resource(\"s3\")\n", - " .Bucket(bucket)\n", - " .Object(os.path.join(prefix, \"train/train.csv\"))\n", - " .upload_file(\"data/train_updated.csv\")\n", - ")\n", - "s3_input_validation = (\n", - " boto3.Session()\n", - " .resource(\"s3\")\n", - " .Bucket(bucket)\n", - " .Object(os.path.join(prefix, \"validation/validation.csv\"))\n", - " .upload_file(\"data/validation_updated.csv\")\n", - ")\n", - "s3_input_validation = (\n", - " boto3.Session()\n", - " .resource(\"s3\")\n", - " .Bucket(bucket)\n", - " .Object(os.path.join(prefix, \"test/test_labeled.csv\"))\n", - " .upload_file(\"data/test_updated.csv\")\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Disclaimer\n", - "\n", - "The data used in this notebook is synthetic and does not contain real user data. The results (all the names, emails, IP addresses, and browser information) of this simulation are fake." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Citation\n", - "\n", - "The data used in this notebook is simulated using the [EventSim](https://github.com/Interana/eventsim)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "conda_python3", - "language": "python", - "name": "conda_python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 279ab68c15f4c91c40b2d19f947d0b91177f56ce Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 2 May 2022 20:15:29 +0000 Subject: [PATCH 15/27] add background and revert code in second notebook --- .../2_cust_churn_train_deploy_infer.ipynb | 40 +++++++++---------- 1 file changed, 18 insertions(+), 22 deletions(-) diff --git a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb index f60cd4b063..ef63fc15ba 100644 --- a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb +++ b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb @@ -6,8 +6,12 @@ "source": [ "# Build a Customer Churn Model for Music Streaming App Users: Model Selection and Model Explainability\n", "\n", - "In this demo, you are going to learn how to use various SageMaker functionalities to build, train, and deploy the model from end to end, including data pre-processing steps like ingestion, cleaning and processing, feature engineering, training and hyperparameter tuning, model explainability, and eventually deploy the model. There are two parts of the demo: in part 1: Prepare Data, you will process the data with the help of Data Wrangler, then create features from the cleaned data. By the end of part 1, you will have a complete feature data set that contains all attributes built for each user, and it is ready for modeling. 
Then in part 2: Modeling and Reference, you will use the data set built from part 1 to find an optimal model for the use case, then test the model predictability with the test data. To start with Part 2, you can either read in data from the output of your Part 1 results, or use the provided 'data/full_feature_data.csv' as the input for the next steps.\n", + "## Background\n", "\n", + "This notebook is one of a sequence of notebooks that show you how to use various SageMaker functionalities to build, train, and deploy the model from end to end, including data pre-processing steps like ingestion, cleaning and processing, feature engineering, training and hyperparameter tuning, model explainability, and eventually deploy the model. There are two parts of the demo: \n", + "\n", + "1. Build a Customer Churn Model for Music Streaming App Users: Overview and Data Preparation - you will process the data with the help of Data Wrangler, then create features from the cleaned data. By the end of part 1, you will have a complete feature data set that contains all attributes built for each user, and it is ready for modeling.\n", + "1. Build a Customer Churn Model for Music Streaming App Users: Model Selection and Model Explainability (current notebook) - you will use the data set built from part 1 to find an optimal model for the use case, then test the model predictability with the test data. \n", "\n", "For how to set up the SageMaker Studio Notebook environment, please check the [onboarding video]( https://www.youtube.com/watch?v=wiDHCWVrjCU&feature=youtu.be). 
And for a list of services covered in the use case demo, please check the documentation linked in each section.\n", "\n", @@ -898,19 +902,6 @@ "" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "train_data = pd.read_csv(\"data/train_w_header.csv\")\n", - "validation_data = pd.read_csv(\"data/validation_w_header.csv\")\n", - "\n", - "data_for_experiment = pd.concat([train_data, validation_data])\n", - "data_for_experiment.to_csv(\"full_feature_data_temp.csv\", index=False)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -928,14 +919,18 @@ "metadata": {}, "outputs": [], "source": [ - "# pd.read(\"full_feature_data.csv\")\n", - "# s3_input_full_set = (\n", - "# boto3.Session()\n", - "# .resource(\"s3\")\n", - "# .Bucket(bucket)\n", - "# .Object(os.path.join(prefix, \"full/fullset.csv\"))\n", - "# .upload_file(\"full_feature_data.csv\")\n", - "# )" + "train_data = pd.read_csv(\"data/train_w_header.csv\")\n", + "validation_data = pd.read_csv(\"data/validation_w_header.csv\")\n", + "\n", + "data_for_experiment = pd.concat([train_data, validation_data])\n", + "data_for_experiment.to_csv(\"full_feature_data.csv\", index=False)\n", + "s3_input_full_set = (\n", + " boto3.Session()\n", + " .resource(\"s3\")\n", + " .Bucket(bucket)\n", + " .Object(os.path.join(prefix, \"full/fullset.csv\"))\n", + " .upload_file(\"full_feature_data.csv\")\n", + ")" ] }, { @@ -988,6 +983,7 @@ } ], "metadata": { + "instance_type": "ml.t3.medium", "kernelspec": { "display_name": "Python 3 (Data Science)", "language": "python", From 6a510c7c103cdc1bdae6a25e50fce1198912d1fe Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 2 May 2022 20:19:58 +0000 Subject: [PATCH 16/27] change kernel --- .../customer_churn/1_cust_churn_overview_dataprep.ipynb | 6 +++--- .../customer_churn/2_cust_churn_train_deploy_infer.ipynb | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb index 76bff32faf..6ad512ecb2 100644 --- a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb +++ b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb @@ -1489,9 +1489,9 @@ "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3 (Data Science)", + "display_name": "conda_python3", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -1503,7 +1503,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.6.13" } }, "nbformat": 4, diff --git a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb index ef63fc15ba..ffb7d38a34 100644 --- a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb +++ b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb @@ -985,9 +985,9 @@ "metadata": { "instance_type": "ml.t3.medium", "kernelspec": { - "display_name": "Python 3 (Data Science)", + "display_name": "conda_python3", "language": "python", - "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0" + "name": "conda_python3" }, "language_info": { "codemirror_mode": { @@ -999,7 +999,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.10" + "version": "3.6.13" } }, "nbformat": 4, From 189d1cfb8d12289e0248fa6ea4f569e593fb06fa Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 2 May 2022 22:56:44 +0000 Subject: [PATCH 17/27] pip install --- .../customer_churn/2_cust_churn_train_deploy_infer.ipynb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb 
b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb index ef63fc15ba..16281ca458 100644 --- a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb +++ b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb @@ -114,6 +114,15 @@ "#### Get ECR image URIs for pre-built SageMaker Docker images" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install sagemaker-experiments" + ] + }, { "cell_type": "code", "execution_count": null, From 378ce4f64950c857122c4f3f5f71d6b12ab1b5bc Mon Sep 17 00:00:00 2001 From: atqy Date: Mon, 2 May 2022 23:55:20 +0000 Subject: [PATCH 18/27] change file name --- use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb index 6ad512ecb2..a061ad6498 100644 --- a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb +++ b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb @@ -223,7 +223,7 @@ "##### Alternative: copy data from a public S3 bucket to your own bucket\n", "##### data file should include full_data.csv and sample.json\n", "#### cell 5 - 7 is not needed; the processing job before data wrangler screenshots is not needed\n", - "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data.zip ./data/raw/customer-churn-data.zip" + "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data-v2.zip ./data/raw/customer-churn-data.zip" ] }, { From 877a26ff57b736a6e5d6fd936a51fd4bfb71ca52 Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 3 May 2022 17:53:36 +0000 Subject: [PATCH 19/27] correct file name --- use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb index 7e6229910f..5ffb2944bf 100644 --- a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb +++ b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb @@ -175,7 +175,7 @@ "##### Alternative: copy data from a public S3 bucket to your own bucket\n", "##### data file should include full_data.csv and sample.json\n", "#### cell 5 - 7 is not needed; the processing job before data wrangler screenshots is not needed\n", - "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data-v1.zip ./data/raw/customer-churn-data.zip" + "!aws s3 cp s3://sagemaker-sample-files/datasets/tabular/customer-churn/customer-churn-data-v2.zip ./data/raw/customer-churn-data.zip" ] }, { From a56ae247c092b0bbea912481f6fb1e1ef887204e Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 3 May 2022 20:00:54 +0000 Subject: [PATCH 20/27] delete cell --- .../2_cust_churn_train_deploy_infer.ipynb | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb index 5ffb2944bf..4d5a420400 100644 --- a/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb +++ b/use-cases/customer_churn/2_cust_churn_train_deploy_infer.ipynb @@ -911,17 +911,6 @@ "" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Modeling and Reference\n", - "\n", - "Now that you have created the complete feature set, you can start to explore and find a best-working model for your churn use case. 
By the end of part 2, you will select an algorithm, find the best sets of hyperparameter for the model, examine how well the model performs, and finally find the top influential features.\n", - "\n", - "To start with Part 2, you can either read in data from the output of your Part 1 results, or use the provided 'data/full_feature_data.csv' as the input (variable dataframe `processed_data`) for the next steps. " - ] - }, { "cell_type": "code", "execution_count": null, From 8c5bc19f1445a6fcedcedcd23912626a1361dffc Mon Sep 17 00:00:00 2001 From: atqy Date: Tue, 3 May 2022 20:04:24 +0000 Subject: [PATCH 21/27] move general information --- .../1_cust_churn_overview_dataprep.ipynb | 14 -------------- use-cases/customer_churn/README.md | 4 ++++ 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb index a061ad6498..94b2aacc3a 100644 --- a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb +++ b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb @@ -126,20 +126,6 @@ "* add user feedback and customer service requests to the data\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Architecture Diagram\n", - "\n", - "The services covered in the use case and an architecture diagram is shown below.\n", - "\n", - "
\n", - " \n", - "\n", - "
" - ] - }, { "cell_type": "markdown", "metadata": {}, diff --git a/use-cases/customer_churn/README.md b/use-cases/customer_churn/README.md index 7beb65418b..9ba85789dd 100644 --- a/use-cases/customer_churn/README.md +++ b/use-cases/customer_churn/README.md @@ -56,6 +56,10 @@ As part of the solution, the following services are used: * [Amazon SageMaker Studio Notebooks](https://aws.amazon.com/sagemaker/): Used to preprocess and visualize the data, and to train model. * [Amazon SageMaker Endpoint](https://aws.amazon.com/sagemaker/): Used to deploy the trained model. +The diagram below shows how each service is used in relation to other services in different stages of this use case. +
+ +
## Cleaning Up From 91fe4c7020ec5fd8a9424f11dd7574623a0b6569 Mon Sep 17 00:00:00 2001 From: atqy Date: Wed, 4 May 2022 01:47:52 +0000 Subject: [PATCH 22/27] upgrade pandas --- use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb index 94b2aacc3a..f4c7cc089b 100644 --- a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb +++ b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb @@ -147,7 +147,7 @@ "outputs": [], "source": [ "!pip install -q 's3fs==0.4.2' 'sagemaker-experiments'\n", - "!pip install --upgrade sagemaker boto3\n", + "!pip install --upgrade sagemaker boto3 pandas\n", "# s3fs is needed for pandas to read files from S3" ] }, From c76bd328f9847456ed21d828e9571ef1f90992e0 Mon Sep 17 00:00:00 2001 From: atqy Date: Wed, 4 May 2022 06:50:51 +0000 Subject: [PATCH 23/27] remove pandas upgarde --- .../customer_churn/1_cust_churn_overview_dataprep.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb index f4c7cc089b..205e84aa6e 100644 --- a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb +++ b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb @@ -147,7 +147,7 @@ "outputs": [], "source": [ "!pip install -q 's3fs==0.4.2' 'sagemaker-experiments'\n", - "!pip install --upgrade sagemaker boto3 pandas\n", + "!pip install --upgrade sagemaker boto3\n", "# s3fs is needed for pandas to read files from S3" ] }, @@ -1014,9 +1014,9 @@ "import subprocess\n", "import sys\n", "\n", - "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"--upgrade\", \"pandas\"])\n", - "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"awswrangler\"])\n", "import pandas 
as pd\n", + "\n", + "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"awswrangler\"])\n", "import awswrangler as wr\n", "\n", "start_time = time.time()\n", From 1a6bb8ab2517e309efdd605a6851710dbe8c3b09 Mon Sep 17 00:00:00 2001 From: atqy Date: Wed, 4 May 2022 16:24:48 +0000 Subject: [PATCH 24/27] change preprocessing.py --- .../1_cust_churn_overview_dataprep.ipynb | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb index 205e84aa6e..35be37cb9f 100644 --- a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb +++ b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb @@ -1007,18 +1007,16 @@ "source": [ "%%writefile preprocessing.py\n", "\n", + "import sys\n", + "import subprocess\n", + "\n", "import os\n", "import warnings\n", "import time\n", "import argparse\n", - "import subprocess\n", - "import sys\n", - "\n", + "import boto3\n", "import pandas as pd\n", "\n", - "subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", \"awswrangler\"])\n", - "import awswrangler as wr\n", - "\n", "start_time = time.time()\n", "\n", "if __name__ == \"__main__\":\n", @@ -1031,10 +1029,13 @@ "\n", " data_s3_uri = args.dw_output_path\n", " output_filename = args.processing_output_filename\n", - "\n", - " # data_path = os.path.join('/opt/ml/processing/input', dw_output_name)\n", - " # df = pd.read_csv(data_path)\n", - " df = wr.s3.read_csv(path=data_s3_uri, dataset=True)\n", + " \n", + " bucket = data_s3_uri.split(\"/\")[2]\n", + " key = '/'.join(data_s3_uri.split(\"/\")[3:] + [\"full_data.csv\"])\n", + " s3_client = boto3.client(\"s3\")\n", + " s3_client.download_file(bucket, key, \"full_data.csv\")\n", + " df = pd.read_csv(\"full_data.csv\")\n", + " \n", " ## convert to time\n", " df[\"date\"] = pd.to_datetime(df[\"ts\"], unit=\"ms\")\n", " df[\"ts_dow\"] = 
df[\"date\"].dt.weekday\n", From 18a01f9eb43a009bb8f994dd3ab077b91f0c700a Mon Sep 17 00:00:00 2001 From: atqy Date: Wed, 4 May 2022 10:50:04 -0700 Subject: [PATCH 25/27] reformat --- .../customer_churn/1_cust_churn_overview_dataprep.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb index 35be37cb9f..169b3f1168 100644 --- a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb +++ b/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb @@ -1029,13 +1029,13 @@ "\n", " data_s3_uri = args.dw_output_path\n", " output_filename = args.processing_output_filename\n", - " \n", + "\n", " bucket = data_s3_uri.split(\"/\")[2]\n", - " key = '/'.join(data_s3_uri.split(\"/\")[3:] + [\"full_data.csv\"])\n", + " key = \"/\".join(data_s3_uri.split(\"/\")[3:] + [\"full_data.csv\"])\n", " s3_client = boto3.client(\"s3\")\n", " s3_client.download_file(bucket, key, \"full_data.csv\")\n", " df = pd.read_csv(\"full_data.csv\")\n", - " \n", + "\n", " ## convert to time\n", " df[\"date\"] = pd.to_datetime(df[\"ts\"], unit=\"ms\")\n", " df[\"ts_dow\"] = df[\"date\"].dt.weekday\n", From ba6ab570f293fc34affe90c8ac3ad95b38cbb316 Mon Sep 17 00:00:00 2001 From: atqy Date: Wed, 4 May 2022 10:51:44 -0700 Subject: [PATCH 26/27] rename dataprep notebook --- ..._churn_overview_dataprep.ipynb => 1_cust_churn_dataprep.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename use-cases/customer_churn/{1_cust_churn_overview_dataprep.ipynb => 1_cust_churn_dataprep.ipynb} (100%) diff --git a/use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb b/use-cases/customer_churn/1_cust_churn_dataprep.ipynb similarity index 100% rename from use-cases/customer_churn/1_cust_churn_overview_dataprep.ipynb rename to use-cases/customer_churn/1_cust_churn_dataprep.ipynb From ae6cdd70b1414b0589843a2298204c03e351f9c2 Mon Sep 17 00:00:00 2001 
From: atqy Date: Wed, 4 May 2022 10:53:18 -0700 Subject: [PATCH 27/27] change rst links --- use-cases/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/use-cases/index.rst b/use-cases/index.rst index 8e2cae295e..9ecbaea207 100644 --- a/use-cases/index.rst +++ b/use-cases/index.rst @@ -4,7 +4,6 @@ Music Streaming Service: Customer Churn Detection .. toctree:: :maxdepth: 1 - customer_churn/0_cust_churn_overview_dw customer_churn/1_cust_churn_dataprep customer_churn/2_cust_churn_train_deploy_infer