diff --git a/end_to_end/music_recommendation/00_overview_arch_data.ipynb b/end_to_end/music_recommendation/00_overview_arch_data.ipynb
deleted file mode 100644
index 89e1474f31..0000000000
--- a/end_to_end/music_recommendation/00_overview_arch_data.ipynb
+++ /dev/null
@@ -1,419 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "# Architect and Build a Music Recommender System across the Entire ML-Lifecycle with Amazon SageMaker\n",
- "\n",
- "## Overview\n",
- "\n",
- "----\n",
- "\n",
- "Welcome of the Music Recommender use-case with Amazon SageMaker. In this series of notebooks we will go through the ML Lifecycle and show how we can build a Music Recommender System using a combination of SageMaker Services and features. IN each phase, we will have relevant notebooks that show you how easy it is to implement that phase of the lifecycle.\n",
- "\n",
- "\n",
- "----\n",
- "\n",
- "### Contents\n",
- "\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- " - [Architecture](#arch-overview)\n",
- " - [Get the Data](#get-the-data)\n",
- " - [Update the data sources](#update-data-sources)\n",
- " - [Explore the Data](#explore-data)\n",
- "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Architecture\n",
- "\n",
- "Let's look at the overall solution architecure for this use case. We will start by doing each of these tasks within the exploratoyr phase of the ML Lifecycle, then when we are done with Experimentation and Trials, we can develop an automated pipeline such as the one depicted here to prepare data, deposit in feature store, train and tune the model, deposit it in the registry, then deploy it to a SageMaker hosted endpoint, and run Monitoring on it.\n",
- "\n",
- "##### [back to top](#00-nb)\n",
- "\n",
- "----\n",
- "\n",
- ""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sys\n",
- "import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
- "\n",
- "ps = ParameterStore()\n",
- "ps.create(namespace='music-rec')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# update pandas to avoid data type issues in older 1.0 version\n",
- "!pip install pandas --upgrade --quiet\n",
- "import pandas as pd\n",
- "print(pd.__version__)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# create data folder\n",
- "!mkdir data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "import matplotlib.pyplot as plt\n",
- "%matplotlib inline\n",
- "\n",
- "import json\n",
- "import sagemaker \n",
- "import boto3\n",
- "import os\n",
- "from awscli.customizations.s3.utils import split_s3_bucket_key\n",
- "\n",
- "# Sagemaker session\n",
- "sess = sagemaker.Session()\n",
- "# get session bucket name\n",
- "bucket = sess.default_bucket()\n",
- "# bucket prefix or the subfolder for everything we produce\n",
- "prefix='music-recommendation'\n",
- "# s3 client\n",
- "s3_client = boto3.client(\"s3\")\n",
- "\n",
- "print(f\"this is your default SageMaker Studio bucket name: {bucket}\") \n",
- "\n",
- "ps.add({'bucket': bucket, 'prefix': prefix}, namespace='music-rec')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "def get_data(public_s3_data, to_bucket, sample_data=1):\n",
- " new_paths = []\n",
- " for f in public_s3_data:\n",
- " bucket_name, key_name = split_s3_bucket_key(f)\n",
- " filename = f.split('/')[-1]\n",
- " new_path = \"s3://{}/{}/{}\".format(to_bucket, prefix, filename)\n",
- " new_paths.append(new_path)\n",
- " \n",
- " # only download if not already downloaded\n",
- " if not os.path.exists('./data/{}'.format(filename)):\n",
- " # download s3 data\n",
- " print(\"Downloading file from {}\".format(f))\n",
- " s3_client.download_file(bucket_name, key_name, './data/{}'.format(filename))\n",
- " \n",
- " # subsample the data to create a smaller datatset for this demo\n",
- " new_df = pd.read_csv('./data/{}'.format(filename))\n",
- " new_df = new_df.sample(frac=sample_data)\n",
- " new_df.to_csv('./data/{}'.format(filename), index=False)\n",
- " \n",
- " # upload s3 data to our default s3 bucket for SageMaker Studio\n",
- " print(\"Uploading {} to {}\\n\".format(filename, new_path))\n",
- " s3_client.upload_file('./data/{}'.format(filename), to_bucket, os.path.join(prefix,filename))\n",
- " \n",
- " return new_paths\n",
- "\n",
- "\n",
- "def get_model(model_path, to_bucket):\n",
- " # upload model to our default s3 bucket for SageMaker Studio\n",
- " filename = model_path.split('/')[-1]\n",
- " print(\"Uploading {} to {}\\n\".format(model_path, os.path.join(to_bucket,prefix,filename)))\n",
- " s3_client.upload_file(model_path, to_bucket, os.path.join(prefix,filename))\n",
- " return \"s://{}\".format(os.path.join(to_bucket,prefix,filename))\n",
- " \n",
- "\n",
- "def update_data_sources(flow_path, tracks_data_source, ratings_data_source):\n",
- " with open(flow_path) as flowf:\n",
- " flow = json.load(flowf)\n",
- " \n",
- " for node in flow['nodes']:\n",
- " # if the key exists for our s3 endpoint\n",
- " try:\n",
- " if node['parameters']['dataset_definition']['name'] == 'tracks.csv':\n",
- " # reset the s3 data source for tracks data\n",
- " old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']\n",
- " print(\"Changed {} to {}\".format(old_source, tracks_data_source))\n",
- " node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = tracks_data_source\n",
- " elif node['parameters']['dataset_definition']['name'] == 'ratings.csv':\n",
- " # reset the s3 data source for ratings data\n",
- " old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']\n",
- " print(\"Changed {} to {}\".format(old_source, ratings_data_source))\n",
- " node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = ratings_data_source\n",
- " except:\n",
- " continue\n",
- " # write out the updated json flow file\n",
- " with open(flow_path, 'w') as outfile:\n",
- " json.dump(flow, outfile)\n",
- " \n",
- " return flow"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Prereqs: Get Data \n",
- "\n",
- "##### [back to top](#00-nb)\n",
- "\n",
- "----\n",
- "\n",
- "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# public S3 bucket that contains our music data\n",
- "s3_bucket_music_data = \"s3://sagemaker-sample-files/datasets/tabular/synthetic-music\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "new_data_paths = get_data([f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"], bucket, sample_data=0.70)\n",
- "print(new_data_paths)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n",
- "tracks_data_source = f's3://{bucket}/{prefix}/tracks.csv'\n",
- "ratings_data_source = f's3://{bucket}/{prefix}/ratings.csv'\n",
- "\n",
- "ps.add({'tracks_data_source': tracks_data_source, 'ratings_data_source': ratings_data_source}, namespace='music-rec')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Upload pretrained model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "pretrained_model_path = get_model('./model/model.tar.gz', bucket)\n",
- "\n",
- "ps.add({'pretrained_model_path': pretrained_model_path}, namespace='music-rec')\n",
- "ps.store()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Update the data source in the `.flow` file\n",
- "\n",
- "##### [back to top](#00-nb)\n",
- "\n",
- "----\n",
- "The `01_music_datapred.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n",
- "\n",
- "Make sure the `.flow` file is closed before running this next step or it won't update the new s3 file locations in the file"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "update_data_sources('01_music_dataprep.flow', tracks_data_source, ratings_data_source)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Explore the Data\n",
- "\n",
- "\n",
- "##### [back to top](#00-nb)\n",
- "\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tracks = pd.read_csv('./data/tracks.csv')\n",
- "ratings = pd.read_csv('./data/ratings.csv')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tracks.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ratings.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(\"{:,} different songs/tracks\".format(tracks['trackId'].nunique()))\n",
- "print(\"{:,} users\".format(ratings['userId'].nunique()))\n",
- "print(\"{:,} user rating events\".format(ratings['ratingEventId'].nunique()))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tracks.groupby('genre')['genre'].count().plot.bar(title=\"Tracks by Genre\");"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ratings[['ratingEventId','userId']].plot.hist(by='userId', bins=50, title=\"Distribution of # of Ratings by User\");"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create some new data to ingest later"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "tracks_new = tracks[:300]\n",
- "ratings_new = ratings[:1000]\n",
- "\n",
- "# export dataframes to csv\n",
- "tracks_new.to_csv('./data/tracks_new.csv', index=False)\n",
- "ratings_new.to_csv('./data/ratings_new.csv', index=False)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "s3_client.upload_file(Filename=\"./data/tracks_new.csv\", Bucket=bucket, Key=f'{prefix}/data/tracks_new.csv')\n",
- "s3_client.upload_file(Filename=\"./data/ratings_new.csv\", Bucket=bucket, Key=f'{prefix}/data/ratings_new.csv')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "----\n",
- "\n",
- "# Music Recommender Part 1: Data Prep using Data Wrangler\n",
- "\n",
- "After you completed running this notebook, you can open the Data Wrangler file `01_music_dataprep.flow`."
- ]
- }
- ],
- "metadata": {
- "instance_type": "ml.t3.medium",
- "kernelspec": {
- "display_name": "Python 3 (Data Science)",
- "language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/end_to_end/music_recommendation/01_data_exploration.ipynb b/end_to_end/music_recommendation/01_data_exploration.ipynb
new file mode 100644
index 0000000000..9d85607008
--- /dev/null
+++ b/end_to_end/music_recommendation/01_data_exploration.ipynb
@@ -0,0 +1,301 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Music Recommender Data Exploration"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "----\n",
+ "\n",
+ "## Background\n",
+ "\n",
+ "This notebook is part of a notebook series that goes through the ML lifecycle and shows how we can build a Music Recommender System using a combination of SageMaker services and features. In this notebook, we will be focusing on exploring the data. It is the first notebook in a series of notebooks. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n",
+ "\n",
+ "1. [Music Recommender Data Exploration](01_data_exploration.ipynb) (current notebook)\n",
+ "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)\n",
+ "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n",
+ "\n",
+ "----\n",
+ "\n",
+ "## Contents\n",
+ "1. [Prereqs: Get Data](#Prereqs:-Get-Data)\n",
+ "1. [Update the Data Source in the .flow File](#Update-the-Data-Source-in-the-.flow-File)\n",
+ "1. [Explore the Data](#Explore-the-Data)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "import pprint\n",
+ "\n",
+ "sys.path.insert(1, \"./code\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# update pandas to avoid data type issues in older 1.0 version\n",
+ "!pip install pandas --upgrade --quiet\n",
+ "import pandas as pd\n",
+ "\n",
+ "print(pd.__version__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create data folder\n",
+ "!mkdir data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "%matplotlib inline\n",
+ "\n",
+ "import json\n",
+ "import sagemaker\n",
+ "import boto3\n",
+ "import os\n",
+ "\n",
+ "# Sagemaker session\n",
+ "sess = sagemaker.Session()\n",
+ "# get session bucket name\n",
+ "bucket = sess.default_bucket()\n",
+ "# bucket prefix or the subfolder for everything we produce\n",
+ "prefix = \"music-recommendation\"\n",
+ "# s3 client\n",
+ "s3_client = boto3.client(\"s3\")\n",
+ "\n",
+ "print(f\"this is your default SageMaker Studio bucket name: {bucket}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prereqs: Get Data \n",
+ "\n",
+ "----\n",
+ "\n",
+ "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from demo_helpers import get_data, get_model, update_data_sources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# public S3 bucket that contains our music data\n",
+ "s3_bucket_music_data = \"s3://sagemaker-sample-files/datasets/tabular/synthetic-music\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "new_data_paths = get_data(\n",
+ " s3_client,\n",
+ " [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"],\n",
+ " bucket,\n",
+ " prefix,\n",
+ " sample_data=0.70,\n",
+ ")\n",
+ "print(new_data_paths)"
+ ]
+ },
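+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `get_data` helper imported from `./code` wraps the steps above: it downloads each public CSV, optionally subsamples it, and re-uploads it to your default bucket. A minimal, illustrative sketch of that logic (not the exact implementation):\n",
+ "\n",
+ "```python\n",
+ "import os\n",
+ "import pandas as pd\n",
+ "from awscli.customizations.s3.utils import split_s3_bucket_key\n",
+ "\n",
+ "\n",
+ "def get_data_sketch(s3_client, public_s3_data, to_bucket, prefix, sample_data=1):\n",
+ "    \"\"\"Download each public CSV, optionally subsample it, and upload it to your bucket.\"\"\"\n",
+ "    new_paths = []\n",
+ "    for f in public_s3_data:\n",
+ "        bucket_name, key_name = split_s3_bucket_key(f)\n",
+ "        filename = f.split(\"/\")[-1]\n",
+ "        s3_client.download_file(bucket_name, key_name, f\"./data/{filename}\")\n",
+ "        # subsample so the demo stays small\n",
+ "        pd.read_csv(f\"./data/{filename}\").sample(frac=sample_data).to_csv(\n",
+ "            f\"./data/{filename}\", index=False\n",
+ "        )\n",
+ "        s3_client.upload_file(f\"./data/{filename}\", to_bucket, os.path.join(prefix, filename))\n",
+ "        new_paths.append(f\"s3://{to_bucket}/{prefix}/{filename}\")\n",
+ "    return new_paths\n",
+ "```"
+ ]
+ },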
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n",
+ "tracks_data_source = f\"s3://{bucket}/{prefix}/tracks.csv\"\n",
+ "ratings_data_source = f\"s3://{bucket}/{prefix}/ratings.csv\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Update the Data Source in the .flow File\n",
+ "\n",
+ "----\n",
+ "\n",
+ "The `01_music_dataprep.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n",
+ "\n",
+ "Make sure the `.flow` file is closed before running this next step or it won't update the new s3 file locations in the file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "update_data_sources(\"01_music_dataprep.flow\", tracks_data_source, ratings_data_source)"
+ ]
+ },
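+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For reference, `update_data_sources` simply walks the nodes of the flow JSON and swaps the `s3Uri` of the `tracks.csv` and `ratings.csv` dataset definitions for your own S3 paths. A rough sketch of the idea:\n",
+ "\n",
+ "```python\n",
+ "import json\n",
+ "\n",
+ "\n",
+ "def update_data_sources_sketch(flow_path, tracks_data_source, ratings_data_source):\n",
+ "    \"\"\"Point the tracks.csv and ratings.csv data sources in the flow at your bucket.\"\"\"\n",
+ "    with open(flow_path) as f:\n",
+ "        flow = json.load(f)\n",
+ "    for node in flow[\"nodes\"]:\n",
+ "        definition = node.get(\"parameters\", {}).get(\"dataset_definition\", {})\n",
+ "        if definition.get(\"name\") == \"tracks.csv\":\n",
+ "            definition[\"s3ExecutionContext\"][\"s3Uri\"] = tracks_data_source\n",
+ "        elif definition.get(\"name\") == \"ratings.csv\":\n",
+ "            definition[\"s3ExecutionContext\"][\"s3Uri\"] = ratings_data_source\n",
+ "    with open(flow_path, \"w\") as f:\n",
+ "        json.dump(flow, f)\n",
+ "    return flow\n",
+ "```"
+ ]
+ },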
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explore the Data\n",
+ "\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tracks = pd.read_csv(\"./data/tracks.csv\")\n",
+ "ratings = pd.read_csv(\"./data/ratings.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tracks.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ratings.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"{:,} different songs/tracks\".format(tracks[\"trackId\"].nunique()))\n",
+ "print(\"{:,} users\".format(ratings[\"userId\"].nunique()))\n",
+ "print(\"{:,} user rating events\".format(ratings[\"ratingEventId\"].nunique()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tracks.groupby(\"genre\")[\"genre\"].count().plot.bar(title=\"Tracks by Genre\");"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ratings[[\"ratingEventId\", \"userId\"]].plot.hist(\n",
+ " by=\"userId\", bins=50, title=\"Distribution of # of Ratings by User\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create some new data to ingest later"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tracks_new = tracks[:300]\n",
+ "ratings_new = ratings[:1000]\n",
+ "\n",
+ "# export dataframes to csv\n",
+ "tracks_new.to_csv(\"./data/tracks_new.csv\", index=False)\n",
+ "ratings_new.to_csv(\"./data/ratings_new.csv\", index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_client.upload_file(\n",
+ " Filename=\"./data/tracks_new.csv\", Bucket=bucket, Key=f\"{prefix}/data/tracks_new.csv\"\n",
+ ")\n",
+ "s3_client.upload_file(\n",
+ " Filename=\"./data/ratings_new.csv\", Bucket=bucket, Key=f\"{prefix}/data/ratings_new.csv\"\n",
+ ")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "conda_python3",
+ "language": "python",
+ "name": "conda_python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/end_to_end/music_recommendation/02_export_feature_groups.ipynb b/end_to_end/music_recommendation/02_export_feature_groups.ipynb
new file mode 100644
index 0000000000..b53507c28d
--- /dev/null
+++ b/end_to_end/music_recommendation/02_export_feature_groups.ipynb
@@ -0,0 +1,1141 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler\n",
+ "\n",
+ "----\n",
+ "\n",
+ "## Background\n",
+ "\n",
+ "This notebook is part of a notebook series that goes through the ML lifecycle and shows how we can build a Music Recommender System using a combination of SageMaker services and features. This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n",
+ "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n",
+ "Processing Job and ingest processed data to Feature Store. It is the second notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case implement of this sequence of notebooks. \n",
+ "\n",
+ "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n",
+ "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb) (current notebook)\n",
+ "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n",
+ "\n",
+ "----\n",
+ "\n",
+ "## Contents\n",
+ "1. [Prereqs: Get Data](#Prereqs:-Get-Data)\n",
+ "1. [Update the Data Source in the .flow File](#Update-the-Data-Source-in-the-.flow-File)\n",
+ "1. [Create Feature Group](#Create-Feature-Group)\n",
+ "1. [Configure Feature Group](#Configure-Feature-Group)\n",
+ "1. [Initialize & Create Feature Group](#Initialize-&-Create-Feature-Group)\n",
+ "1. [Inputs and Outputs](#Inputs-and-Outputs)\n",
+ "1. [Upload Flow to S3](#Upload-Flow-to-S3)\n",
+ "1. [Run Processing Job](#Run-Processing-Job)\n",
+ "1. [Fetch Data from Offline Feature Store](#Fetch-Data-from-Offline-Feature-Store)\n",
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "import pprint\n",
+ "\n",
+ "sys.path.insert(1, \"./code\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# update pandas to avoid data type issues in older 1.0 version\n",
+ "!pip install pandas --upgrade --quiet\n",
+ "import pandas as pd\n",
+ "\n",
+ "print(pd.__version__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create data folder\n",
+ "!mkdir data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "%matplotlib inline\n",
+ "\n",
+ "import json\n",
+ "import sagemaker\n",
+ "import boto3\n",
+ "import os\n",
+ "from awscli.customizations.s3.utils import split_s3_bucket_key\n",
+ "\n",
+ "# SageMaker session\n",
+ "sess = sagemaker.Session()\n",
+ "# get session bucket name\n",
+ "bucket = sess.default_bucket()\n",
+ "# bucket prefix or the subfolder for everything we produce\n",
+ "prefix = \"music-recommendation\"\n",
+ "# s3 client\n",
+ "s3_client = boto3.client(\"s3\")\n",
+ "\n",
+ "print(f\"this is your default SageMaker Studio bucket name: {bucket}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Prereqs: Get Data \n",
+ "\n",
+ "----\n",
+ "\n",
+ "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from demo_helpers import get_data, get_model, update_data_sources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# public S3 bucket that contains our music data\n",
+ "s3_bucket_music_data = \"s3://sagemaker-sample-files/datasets/tabular/synthetic-music\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "new_data_paths = get_data(\n",
+ " s3_client,\n",
+ " [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"],\n",
+ " bucket,\n",
+ " prefix,\n",
+ " sample_data=0.70,\n",
+ ")\n",
+ "print(new_data_paths)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n",
+ "tracks_data_source = f\"s3://{bucket}/{prefix}/tracks.csv\"\n",
+ "ratings_data_source = f\"s3://{bucket}/{prefix}/ratings.csv\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Update the Data Source in the `.flow` File\n",
+ "\n",
+ "----\n",
+ "The `01_music_datapred.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll be updating the object telling Data Wrangler where to find the input data on S3. We will set this to your default S3 bucket. With this update to the `.flow` file it now points to your new S3 bucket as the data source used by SageMaker Data Wrangler.\n",
+ "\n",
+ "Make sure the `.flow` file is closed before running this next step or it won't update the new s3 file locations in the file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "update_data_sources(\"01_music_dataprep.flow\", tracks_data_source, ratings_data_source)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create Feature Group\n",
+ "----\n",
+ "\n",
+ "[Amazon SageMaker Feature Store](https://www.youtube.com/watch?v=pEg5c6d4etI) is a fully managed, purpose-built repository to store, update, retrieve, and share machine learning (ML) features. Features are the attributes or properties models use during training and inference to make predictions. For example, in a ML application that recommends a music playlist, features could include song ratings, which songs were listened to previously, and how long songs were listened to. The accuracy of a ML model is based on a precise set and composition of features. Often, these features are used repeatedly by multiple teams training multiple models. And whichever feature set was used to train the model needs to be available to make real-time predictions (inference). Keeping a single source of features that is consistent and up-to-date across these different access patterns is a challenge as most organizations keep two different feature stores, one for training and one for inference.\n",
+ "\n",
+ "Amazon SageMaker Feature Store is a purpose-built repository where you can store and access features so it’s much easier to name, organize, and reuse them across teams. SageMaker Feature Store provides a unified store for features during training and real-time inference without the need to write additional code or create manual processes to keep features consistent. SageMaker Feature Store keeps track of the metadata of stored features (e.g. feature name or version number) so that you can query the features for the right attributes in batches or in real time using Amazon Athena, an interactive query service. SageMaker Feature Store also keeps features updated, because as new data is generated during inference, the single repository is updated so new features are always available for models to use during training and inference.\n",
+ "\n",
+ "_What is a feature group_\n",
+ "\n",
+ "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n",
+ "collection of features - each feature in the feature group has a specified data type and name. \n",
+ "A single record in a feature group corresponds to a row in your dataframe. A feature store is a \n",
+ "collection of feature groups. To learn more about SageMaker Feature Store, see \n",
+ "[Amazon Feature Store Documentation](http://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Define Feature Group \n",
+ "\n",
+ "Select Record identifier and Event time feature name. These are required parameters for feature group\n",
+ "creation.\n",
+ "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n",
+ "whose value uniquely identifies a Record defined in the feature group's feature definitions.\n",
+ "* **Event time feature name** is the name of the EventTime feature of a Record in FeatureGroup. An EventTime \n",
+ "is a timestamp that represents the point in time when a new event occurs that corresponds to the creation or \n",
+ "update of a Record in the FeatureGroup. All Records in the FeatureGroup must have a corresponding EventTime.\n",
+ "\n",
+ " 💡Record identifier and Event time feature name are required \n",
+ "for feature group. After filling in the values, you can choose Run Selected Cell and All Below \n",
+ "from the Run Menu from the menu bar. \n",
+ "
"
+ ]
+ },
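+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick reference, Feature Store accepts `EventTime` either as an ISO-8601 string or as Unix epoch seconds; the feature groups in this notebook declare `EventTime` as a fractional feature, so epoch seconds apply here. For example:\n",
+ "\n",
+ "```python\n",
+ "import time\n",
+ "\n",
+ "# EventTime as fractional Unix epoch seconds\n",
+ "event_time = round(time.time(), 3)\n",
+ "print(event_time)  # e.g. 1625081600.123 (illustrative value)\n",
+ "```"
+ ]
+ },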
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# feature group name, with flow_name and an unique id. You can give it a customized name\n",
+ "feature_group_names = [\n",
+ " \"track-features-music-rec\",\n",
+ " \"user-5star-track-features-music-rec\",\n",
+ " \"ratings-features-music-rec\",\n",
+ "]\n",
+ "print(f\"Feature Group Name: {feature_group_names}\")\n",
+ "\n",
+ "record_identifier_feature_names = {\n",
+ " \"track-features-music-rec\": \"trackId\",\n",
+ " \"user-5star-track-features-music-rec\": \"userId\",\n",
+ " \"ratings-features-music-rec\": \"ratingEventId\",\n",
+ "}\n",
+ "event_time_feature_name = \"EventTime\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Feature Definitions\n",
+ "The following is a list of the feature names and feature types of the final dataset that will be produced \n",
+ "when your data flow is used to process your input dataset. These are automatically generated from the \n",
+ "step `Custom Pyspark` from `Source: Answers.Csv`. To save from a different step, go to Data Wrangler to \n",
+ "select a new step to export.\n",
+ "\n",
+ " 💡
Configurable Settings \n",
+ "\n",
+ "1. You can select a subset of the features. By default all columns of the result dataframe will be used as \n",
+ "features.\n",
+ "2. You can change the Data Wrangler data type to one of the Feature Store supported types \n",
+ "(
Integral,
Fractional, or
String). The default type is set to
String. \n",
+ "This means that, if a column in your dataset is not a
float or
long type, it will default \n",
+ "to
String in your Feature Store.\n",
+ "\n",
+ "For
Event Time features, make sure the format follows the feature store\n",
+ "
\n",
+ " \n",
+ " Event Time feature format\n",
+ " \n",
+ "\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following is a list of the feature names and data types of the final dataset that will be produced when your data flow is used to process your input dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "track_column_schemas = [\n",
+ " {\"name\": \"trackId\", \"type\": \"string\"},\n",
+ " {\"name\": \"length\", \"type\": \"float\"},\n",
+ " {\"name\": \"energy\", \"type\": \"float\"},\n",
+ " {\"name\": \"acousticness\", \"type\": \"float\"},\n",
+ " {\"name\": \"valence\", \"type\": \"float\"},\n",
+ " {\"name\": \"speechiness\", \"type\": \"float\"},\n",
+ " {\"name\": \"instrumentalness\", \"type\": \"float\"},\n",
+ " {\"name\": \"liveness\", \"type\": \"float\"},\n",
+ " {\"name\": \"tempo\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Folk\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Country\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Latin\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Jazz\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_RnB\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Reggae\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Rap\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Pop_Rock\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Electronic\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Blues\", \"type\": \"float\"},\n",
+ " {\"name\": \"danceability\", \"type\": \"float\"},\n",
+ " {\"name\": \"EventTime\", \"type\": \"float\"},\n",
+ "]\n",
+ "\n",
+ "user_column_schemas = [\n",
+ " {\"name\": \"userId\", \"type\": \"long\"},\n",
+ " {\"name\": \"energy_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"acousticness_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"valence_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"speechiness_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"instrumentalness_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"liveness_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"tempo_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"danceability_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Latin_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Folk_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Blues_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Rap_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Reggae_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Jazz_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_RnB_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Country_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Electronic_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"genre_Pop_Rock_5star\", \"type\": \"float\"},\n",
+ " {\"name\": \"EventTime\", \"type\": \"float\"},\n",
+ "]\n",
+ "\n",
+ "rating_column_schemas = [\n",
+ " {\"name\": \"ratingEventId\", \"type\": \"string\"},\n",
+ " {\"name\": \"ts\", \"type\": \"long\"},\n",
+ " {\"name\": \"userId\", \"type\": \"long\"},\n",
+ " {\"name\": \"trackId\", \"type\": \"string\"},\n",
+ " {\"name\": \"sessionId\", \"type\": \"long\"},\n",
+ " {\"name\": \"itemInSession\", \"type\": \"long\"},\n",
+ " {\"name\": \"Rating\", \"type\": \"float\"},\n",
+ " {\"name\": \"EventTime\", \"type\": \"float\"},\n",
+ "]\n",
+ "\n",
+ "column_schemas = {\n",
+ " \"track-features-music-rec\": track_column_schemas,\n",
+ " \"user-5star-track-features-music-rec\": user_column_schemas,\n",
+ " \"ratings-features-music-rec\": rating_column_schemas,\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Below we create the SDK input for those feature definitions. Some schema types in Data Wrangler are not \n",
+ "supported by Feature Store. The following will create a default_FG_type set to String for these types."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.feature_store.feature_definition import FeatureDefinition\n",
+ "from sagemaker.feature_store.feature_definition import FeatureTypeEnum\n",
+ "\n",
+ "default_feature_type = FeatureTypeEnum.STRING\n",
+ "column_to_feature_type_mapping = {\n",
+ " \"float\": FeatureTypeEnum.FRACTIONAL,\n",
+ " \"long\": FeatureTypeEnum.INTEGRAL,\n",
+ "}\n",
+ "\n",
+ "feature_definitions = {}\n",
+ "for feature_group_name in feature_group_names:\n",
+ " feature_definition = [\n",
+ " FeatureDefinition(\n",
+ " feature_name=column_schema[\"name\"],\n",
+ " feature_type=column_to_feature_type_mapping.get(\n",
+ " column_schema[\"type\"], default_feature_type\n",
+ " ),\n",
+ " )\n",
+ " for column_schema in column_schemas[feature_group_name]\n",
+ " ]\n",
+ " feature_definitions[feature_group_name] = feature_definition"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Configure Feature Group\n",
+ "\n",
+ "----\n",
+ " 💡 Configurable Settings \n",
+ "\n",
+ "1. feature_group_name: name of the feature group.\n",
+ "1. feature_store_offline_s3_uri: SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a S3 location owned by you.\n",
+ "1. enable_online_store: controls if online store is enabled. Enabling the online store allows quick access to the latest value for a Record via the GetRecord API.\n",
+ "1. iam_role: IAM role for executing the processing job.\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from time import gmtime, strftime\n",
+ "import uuid"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# IAM role for executing the processing job.\n",
+ "iam_role = sagemaker.get_execution_role()\n",
+ "\n",
+ "# flow name and an unique ID for this export (used later as the processing job name for the export)\n",
+ "flow_name = \"01_music_dataprep\"\n",
+ "flow_export_id = f\"{strftime('%d-%H-%M-%S', gmtime())}-{str(uuid.uuid4())[:8]}\"\n",
+ "flow_export_name = f\"flow-{flow_export_id}\"\n",
+ "\n",
+ "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a\n",
+ "# S3 location owned by you.\n",
+ "feature_store_offline_s3_uri = \"s3://\" + bucket\n",
+ "\n",
+ "# controls if online store is enabled. Enabling the online store allows quick access to\n",
+ "# the latest value for a Record via the GetRecord API.\n",
+ "enable_online_store = True\n",
+ "fg_name_tracks = feature_group_name"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Initialize & Create Feature Group\n",
+ "\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize Boto3 session that is required to create feature group\n",
+ "import boto3\n",
+ "from sagemaker.session import Session\n",
+ "\n",
+ "region = boto3.Session().region_name\n",
+ "boto_session = boto3.Session(region_name=region)\n",
+ "\n",
+ "sagemaker_client = boto_session.client(service_name=\"sagemaker\", region_name=region)\n",
+ "featurestore_runtime = boto_session.client(\n",
+ " service_name=\"sagemaker-featurestore-runtime\", region_name=region\n",
+ ")\n",
+ "\n",
+ "feature_store_session = Session(\n",
+ " boto_session=boto_session,\n",
+ " sagemaker_client=sagemaker_client,\n",
+ " sagemaker_featurestore_runtime_client=featurestore_runtime,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.feature_store.feature_group import FeatureGroup\n",
+ "import time\n",
+ "\n",
+ "\n",
+ "def wait_for_feature_group_creation_complete(feature_group):\n",
+ " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n",
+ " status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
+ " while status == \"Creating\":\n",
+ " print(\"Waiting for Feature Group Creation\")\n",
+ " time.sleep(5)\n",
+ " status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
+ " if status != \"Created\":\n",
+ " raise SystemExit(f\"Failed to create feature group {feature_group.name}: {status}\")\n",
+ " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n",
+ "\n",
+ "\n",
+ "def create_feature_group(feature_group_name, feature_store_session, feature_definitions):\n",
+ " feature_group = FeatureGroup(\n",
+ " name=feature_group_name,\n",
+ " sagemaker_session=feature_store_session,\n",
+ " feature_definitions=feature_definitions[feature_group_name],\n",
+ " )\n",
+ "\n",
+ " # only create feature group if it doesn't already exist\n",
+ " try:\n",
+ " sagemaker_client.describe_feature_group(\n",
+ " FeatureGroupName=feature_group_name, NextToken=\"string\"\n",
+ " )\n",
+ " feature_group_exists = True\n",
+ " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))\n",
+ " except Exception as e:\n",
+ " error = e.response.get(\"Error\").get(\"Code\")\n",
+ " if error == \"ResourceNotFound\":\n",
+ " feature_group_exists = False\n",
+ " print(\"Creating Feature Group {}\".format(feature_group_name))\n",
+ " feature_group.create(\n",
+ " s3_uri=feature_store_offline_s3_uri,\n",
+ " record_identifier_name=record_identifier_feature_names[feature_group_name],\n",
+ " event_time_feature_name=event_time_feature_name,\n",
+ " role_arn=iam_role,\n",
+ " enable_online_store=enable_online_store,\n",
+ " )\n",
+ " # Invoke the Feature Store API to create the feature group and wait until it is ready\n",
+ " wait_for_feature_group_creation_complete(feature_group=feature_group)\n",
+ " if error == \"ResourceInUse\":\n",
+ " feature_group_exists = True\n",
+ " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))\n",
+ "\n",
+ " return feature_group_exists"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Feature group is initialized and created below"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_group_existence = {}\n",
+ "for feature_group_name in feature_group_names:\n",
+ " feature_group_exists = create_feature_group(\n",
+ " feature_group_name, feature_store_session, feature_definitions\n",
+ " )\n",
+ " feature_group_existence[feature_group_name] = feature_group_exists"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that the feature group is created, You will use a processing job to process your \n",
+ " data at scale and ingest the transformed data into this feature group."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Inputs and Outputs\n",
+ "\n",
+ "----\n",
+ "\n",
+ "The below settings configure the inputs and outputs for the flow export.\n",
+ "\n",
+ " 💡 Configurable Settings \n",
+ "\n",
+ "In Input - Source you can configure the data sources that will be used as input by Data Wrangler\n",
+ "\n",
+ "1. For S3 sources, configure the source attribute that points to the input S3 prefixes\n",
+ "2. For all other sources, configure attributes like query_string, database in the source's \n",
+ "DatasetDefinition object.\n",
+ "\n",
+ "If you modify the inputs the provided data must have the same schema and format as the data used in the Flow. \n",
+ "You should also re-execute the cells in this section if you have modified the settings in any data sources.\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
+ "from sagemaker.dataset_definition.inputs import (\n",
+ " AthenaDatasetDefinition,\n",
+ " DatasetDefinition,\n",
+ " RedshiftDatasetDefinition,\n",
+ ")\n",
+ "\n",
+ "data_sources = []"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Input - S3 Source: tracks.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_sources.append(\n",
+ " ProcessingInput(\n",
+ " source=f\"{tracks_data_source}\", # You could override this to point to another dataset on S3\n",
+ " destination=\"/opt/ml/processing/tracks.csv\",\n",
+ " input_name=\"tracks.csv\",\n",
+ " s3_data_type=\"S3Prefix\",\n",
+ " s3_input_mode=\"File\",\n",
+ " s3_data_distribution_type=\"FullyReplicated\",\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Input - S3 Source: ratings.csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_sources.append(\n",
+ " ProcessingInput(\n",
+ " source=f\"{ratings_data_source}\", # You could override this to point to another dataset on S3\n",
+ " destination=\"/opt/ml/processing/ratings.csv\",\n",
+ " input_name=\"ratings.csv\",\n",
+ " s3_data_type=\"S3Prefix\",\n",
+ " s3_input_mode=\"File\",\n",
+ " s3_data_distribution_type=\"FullyReplicated\",\n",
+ " )\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Output: Feature Store \n",
+ "\n",
+ "Below are the inputs required by the SageMaker Python SDK to launch a processing job with feature store as an output. Notice the `output_name` variable below; this ID is found within the `.flow` file at the node point you want to capture transformations up to. The `.flow` file contains instructions for SageMaker Data Wrangler to know where to look for data and how to transform it. Each data transformation action is associated with a node and therefore a node ID. Using the associated node ID + output name tells SageMaker up to what point in the transformation process you want to export to a feature store."
+ ]
+ },
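+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you want to verify these node IDs yourself, you can inspect the flow file directly. A small sketch, assuming the standard Data Wrangler flow layout in which each node carries a `node_id` and an `operator` field:\n",
+ "\n",
+ "```python\n",
+ "import json\n",
+ "\n",
+ "with open(\"01_music_dataprep.flow\") as f:\n",
+ "    flow = json.load(f)\n",
+ "\n",
+ "# print each node's ID and the operation it performs\n",
+ "for node in flow[\"nodes\"]:\n",
+ "    print(node.get(\"node_id\"), node.get(\"operator\"))\n",
+ "```"
+ ]
+ },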
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.processing import FeatureStoreOutput\n",
+ "\n",
+ "# Output name is auto-generated from the select node's ID + output name from the .flow file\n",
+ "output_names = {\n",
+ " \"track-features-music-rec\": \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\",\n",
+ " \"user-5star-track-features-music-rec\": \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\",\n",
+ " \"ratings-features-music-rec\": \"9a283380-91ca-478e-be99-6ba3bf57c680.default\",\n",
+ "}\n",
+ "\n",
+ "processing_job_outputs = {}\n",
+ "\n",
+ "for feature_group_name in feature_group_names:\n",
+ " processing_job_output = ProcessingOutput(\n",
+ " output_name=output_names[feature_group_name],\n",
+ " app_managed=True,\n",
+ " feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),\n",
+ " )\n",
+ " processing_job_outputs[feature_group_name] = processing_job_output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Upload Flow to S3\n",
+ "\n",
+ "----\n",
+ "To use the Data Wrangler as an input to the processing job, first upload your flow file to Amazon S3."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import json\n",
+ "import boto3\n",
+ "\n",
+ "# name of the flow file which should exist in the current notebook working directory\n",
+ "flow_file_name = \"01_music_dataprep.flow\"\n",
+ "\n",
+ "# Load .flow file from current notebook working directory\n",
+ "!echo \"Loading flow file from current notebook working directory: $PWD\"\n",
+ "\n",
+ "with open(flow_file_name) as f:\n",
+ " flow = json.load(f)\n",
+ "\n",
+ "# Upload flow to S3\n",
+ "s3_client = boto3.client(\"s3\")\n",
+ "s3_client.upload_file(\n",
+ " flow_file_name, bucket, f\"{prefix}/data_wrangler_flows/{flow_export_name}.flow\"\n",
+ ")\n",
+ "\n",
+ "flow_s3_uri = f\"s3://{bucket}/{prefix}/data_wrangler_flows/{flow_export_name}.flow\"\n",
+ "\n",
+ "print(f\"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The Data Wrangler Flow is also provided to the Processing Job as an input source which we configure below."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Input - Flow: 01_music_dataprep.flow\n",
+ "flow_input = ProcessingInput(\n",
+ " source=flow_s3_uri,\n",
+ " destination=\"/opt/ml/processing/flow\",\n",
+ " input_name=\"flow\",\n",
+ " s3_data_type=\"S3Prefix\",\n",
+ " s3_input_mode=\"File\",\n",
+ " s3_data_distribution_type=\"FullyReplicated\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Run Processing Job\n",
+ "\n",
+ "----\n",
+ "### Job Configurations\n",
+ "\n",
+ " 💡 Configurable Settings \n",
+ "\n",
+ "You can configure the following settings for Processing Jobs. If you change any configurations you will \n",
+ "need to re-execute this and all cells below it by selecting the Run menu above and click \n",
+ "Run Selected Cells and All Below\n",
+ "\n",
+ "1. IAM role for executing the processing job. \n",
+ "2. A unique name of the processing job. Give a unique name every time you re-execute processing jobs\n",
+ "3. Data Wrangler Container URL.\n",
+ "4. Instance count, instance type and storage volume size in GB.\n",
+ "5. Content type for each output. Data Wrangler supports CSV as default and Parquet.\n",
+ "6. Network Isolation settings\n",
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Data Wrangler Container URL.\n",
+ "container_uri = sagemaker.image_uris.retrieve(framework=\"data-wrangler\", region=region)\n",
+ "\n",
+ "# Processing Job Instance count and instance type.\n",
+ "instance_count = 2\n",
+ "instance_type = \"ml.m5.4xlarge\"\n",
+ "\n",
+ "# Size in GB of the EBS volume to use for storing data during processing\n",
+ "volume_size_in_gb = 30\n",
+ "\n",
+ "# Content type for each output. Data Wrangler supports CSV as default and Parquet.\n",
+ "output_content_type = \"CSV\"\n",
+ "\n",
+ "# Network Isolation mode; default is off\n",
+ "enable_network_isolation = False"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create Processing Job\n",
+ "\n",
+ "To launch a Processing Job, you will use the SageMaker Python SDK to create a Processor function."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.processing import Processor\n",
+ "from sagemaker.network import NetworkConfig\n",
+ "\n",
+ "processor = Processor(\n",
+ " role=iam_role,\n",
+ " image_uri=container_uri,\n",
+ " instance_count=instance_count,\n",
+ " instance_type=instance_type,\n",
+ " volume_size_in_gb=volume_size_in_gb,\n",
+ " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n",
+ " sagemaker_session=sess,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Job Status & S3 Output Location\n",
+ "\n",
+ "Below you wait for processing job to finish. If it finishes successfully, your feature group should be populated \n",
+ "with transformed feature values. In addition the raw parameters used by the Processing Job will be printed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "\n",
+ "feature_group_exists = False\n",
+ "for feature_group_name in feature_group_names:\n",
+ " print(f\"Processing {feature_group_name}\")\n",
+ " # Unique processing job name. Give a unique name every time you re-execute processing jobs\n",
+ " processing_job_name = \"dw-flow-proc-music-rec-tracks-{}-{}\".format(\n",
+ " flow_export_id, str(uuid.uuid4())[:8]\n",
+ " )\n",
+ " print(f\"{processing_job_name}\")\n",
+ "\n",
+ " # Output configuration used as processing job container arguments\n",
+ " output_config = {output_names[feature_group_name]: {\"content_type\": output_content_type}}\n",
+ "\n",
+ " # Run Processing Job if job not already previously ran\n",
+ " if feature_group_exists: # feature_group_existence[feature_group_name]\n",
+ " print(\n",
+ " \"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(\n",
+ " feature_group_name\n",
+ " )\n",
+ " )\n",
+ " else:\n",
+ " print(\"Creating Processing Job: {}\".format(feature_group_name))\n",
+ " processor.run(\n",
+ " inputs=[flow_input] + data_sources,\n",
+ " outputs=[processing_job_outputs[feature_group_name]],\n",
+ " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n",
+ " wait=False,\n",
+ " logs=False,\n",
+ " job_name=processing_job_name,\n",
+ " )\n",
+ "\n",
+ " job_result = sess.wait_for_processing_job(processing_job_name)\n",
+ " print(job_result)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You can view newly created feature group in Studio, refer to [Use Amazon SageMaker Feature Store with Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-use-with-studio.html)\n",
+ "for detailed guide. [Learn more about SageMaker Feature Store](https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-featurestore)"
+ ]
+ },
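+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Because the feature groups were created with the online store enabled, you can also fetch the latest record for a given identifier through the `GetRecord` API. A small example; the `record_id` value below is a placeholder, so substitute a `trackId` that actually exists in your data:\n",
+ "\n",
+ "```python\n",
+ "# fetch a single record from the online store using the runtime client created earlier\n",
+ "record_id = \"track_0\"  # placeholder trackId; use one present in your dataset\n",
+ "\n",
+ "response = featurestore_runtime.get_record(\n",
+ "    FeatureGroupName=\"track-features-music-rec\",\n",
+ "    RecordIdentifierValueAsString=record_id,\n",
+ ")\n",
+ "print(response.get(\"Record\"))\n",
+ "```"
+ ]
+ },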
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Fetch Data from Offline Feature Store\n",
+ "\n",
+ "----\n",
+ "There are 3 feature stores for the ratings, tracks, and user preferences data. We retrieve data from all 3 before joining them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_groups = []\n",
+ "for name in feature_group_names:\n",
+ " feature_group = FeatureGroup(name=name, sagemaker_session=feature_store_session)\n",
+ " feature_groups.append(feature_group)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_client = boto3.client(\"s3\")\n",
+ "account_id = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n",
+ "\n",
+ "sagemaker_role = sagemaker.get_execution_role()\n",
+ "\n",
+ "s3_output_path = \"s3://\" + bucket"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_group_s3_prefixes = []\n",
+ "for feature_group in feature_groups:\n",
+ " feature_group_table_name = (\n",
+ " feature_group.describe().get(\"OfflineStoreConfig\").get(\"DataCatalogConfig\").get(\"TableName\")\n",
+ " )\n",
+ " feature_group_s3_prefix = (\n",
+ " f\"{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}\"\n",
+ " )\n",
+ " feature_group_s3_prefixes.append(feature_group_s3_prefix)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "# wait for data to be added to offline feature store\n",
+ "def wait_for_offline_store(feature_group_s3_prefix):\n",
+ " print(feature_group_s3_prefix)\n",
+ " offline_store_contents = None\n",
+ " while offline_store_contents is None:\n",
+ " objects_in_bucket = s3_client.list_objects(Bucket=bucket, Prefix=feature_group_s3_prefix)\n",
+ " if \"Contents\" in objects_in_bucket and len(objects_in_bucket[\"Contents\"]) > 1:\n",
+ " offline_store_contents = objects_in_bucket[\"Contents\"]\n",
+ " else:\n",
+ " print(\"Waiting for data in offline store...\")\n",
+ " time.sleep(60)\n",
+ " print(\"Data available.\")\n",
+ "\n",
+ "\n",
+ "for s3_prefix in feature_group_s3_prefixes:\n",
+ " wait_for_offline_store(s3_prefix)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tables = {\n",
+ " \"ratings\": {\"feature_group\": feature_groups[2], \"cols\": [\"userId\", \"trackid\", \"rating\"]},\n",
+ " \"tracks\": {\n",
+ " \"feature_group\": feature_groups[0],\n",
+ " \"cols\": [\n",
+ " \"trackid\",\n",
+ " \"length\",\n",
+ " \"energy\",\n",
+ " \"acousticness\",\n",
+ " \"valence\",\n",
+ " \"speechiness\",\n",
+ " \"instrumentalness\",\n",
+ " \"liveness\",\n",
+ " \"tempo\",\n",
+ " \"danceability\",\n",
+ " \"genre_latin\",\n",
+ " \"genre_folk\",\n",
+ " \"genre_blues\",\n",
+ " \"genre_rap\",\n",
+ " \"genre_reggae\",\n",
+ " \"genre_jazz\",\n",
+ " \"genre_rnb\",\n",
+ " \"genre_country\",\n",
+ " \"genre_electronic\",\n",
+ " \"genre_pop_rock\",\n",
+ " ],\n",
+ " },\n",
+ " \"user_5star_features\": {\n",
+ " \"feature_group\": feature_groups[1],\n",
+ " \"cols\": [\n",
+ " \"userId\",\n",
+ " \"energy_5star\",\n",
+ " \"acousticness_5star\",\n",
+ " \"valence_5star\",\n",
+ " \"speechiness_5star\",\n",
+ " \"instrumentalness_5star\",\n",
+ " \"liveness_5star\",\n",
+ " \"tempo_5star\",\n",
+ " \"danceability_5star\",\n",
+ " \"genre_latin_5star\",\n",
+ " \"genre_folk_5star\",\n",
+ " \"genre_blues_5star\",\n",
+ " \"genre_rap_5star\",\n",
+ " \"genre_reggae_5star\",\n",
+ " \"genre_jazz_5star\",\n",
+ " \"genre_rnb_5star\",\n",
+ " \"genre_country_5star\",\n",
+ " \"genre_electronic_5star\",\n",
+ " \"genre_pop_rock_5star\",\n",
+ " ],\n",
+ " },\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Check if the Athena queries have been done and the data sets exist, then just do train test split or just proceed to training"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_train_val():\n",
+ " for k, v in tables.items():\n",
+ " query = v[\"feature_group\"].athena_query()\n",
+ " joined_cols = \", \".join(v[\"cols\"])\n",
+ " # limit number of datapoints for training time\n",
+ " query_string = 'SELECT {} FROM \"{}\" LIMIT 500000'.format(joined_cols, query.table_name)\n",
+ " print(query_string, \"\\n\")\n",
+ "\n",
+ " output_location = f\"s3://{bucket}/{prefix}/query_results/\"\n",
+ " query.run(query_string=query_string, output_location=output_location)\n",
+ " query.wait()\n",
+ "\n",
+ " tables[k][\"df\"] = query.as_dataframe()\n",
+ "\n",
+ " ratings = tables[\"ratings\"][\"df\"]\n",
+ " tracks = tables[\"tracks\"][\"df\"]\n",
+ " user_prefs = tables[\"user_5star_features\"][\"df\"]\n",
+ "\n",
+ " print(\"Merging datasets...\")\n",
+ " print(f\"Ratings: {ratings.shape}\\nTracks: {tracks.shape}\\nUser Prefs: {user_prefs.shape}\\n\")\n",
+ "\n",
+ " dataset = pd.merge(ratings, tracks, on=\"trackid\", how=\"inner\")\n",
+ " dataset = pd.merge(dataset, user_prefs, on=\"userId\", how=\"inner\")\n",
+ " dataset.drop_duplicates(inplace=True)\n",
+ " dataset.drop([\"userId\", \"trackid\"], axis=1, inplace=True)\n",
+ "\n",
+ " # split data\n",
+ " from sklearn.model_selection import train_test_split\n",
+ "\n",
+ " train, val = train_test_split(dataset, test_size=0.2, random_state=42)\n",
+ " print(\n",
+ " \"Training dataset shape: {}\\nValidation dataset shape: {}\\n\".format(train.shape, val.shape)\n",
+ " )\n",
+ "\n",
+ " return train, val"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "import pandas as pd\n",
+ "import glob\n",
+ "\n",
+ "\n",
+ "print(\"Creating training and validation sets...\\n\")\n",
+ "train, val = get_train_val()\n",
+ "# Write to csv in S3 without headers and index column\n",
+ "train.to_csv(\"./data/train_data.csv\", header=False, index=False)\n",
+ "val.to_csv(\"./data/val_data.csv\", header=False, index=False)\n",
+ "\n",
+ "pd.DataFrame({\"ColumnName\": train.columns}).to_csv(\n",
+ " \"./data/train_data_headers.csv\", header=False, index=False\n",
+ ")\n",
+ "pd.DataFrame({\"ColumnName\": val.columns}).to_csv(\n",
+ " \"./data/val_data_headers.csv\", header=False, index=False\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "instance_type": "ml.m5.4xlarge",
+ "kernelspec": {
+ "display_name": "conda_python3",
+ "language": "python",
+ "name": "conda_python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/end_to_end/music_recommendation/02a_export_fg_tracks.ipynb b/end_to_end/music_recommendation/02a_export_fg_tracks.ipynb
deleted file mode 100644
index edd8ec7a87..0000000000
--- a/end_to_end/music_recommendation/02a_export_fg_tracks.ipynb
+++ /dev/null
@@ -1,838 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "# Music Recommender Part 2a: Feature Store Creation - Tracks\n",
- "\n",
- "----\n",
- "\n",
- "This notebook creates a feature group for our tracks data to place in our feature store using the transformation instructions found in our `.flow` file. [Amazon SageMaker Feature Store](https://www.youtube.com/watch?v=pEg5c6d4etI) is a fully managed, purpose-built repository to store, update, retrieve, and share machine learning (ML) features.\n",
- "\n",
- "Features are the attributes or properties models use during training and inference to make predictions. For example, in a ML application that recommends a music playlist, features could include song ratings, which songs were listened to previously, and how long songs were listened to. The accuracy of a ML model is based on a precise set and composition of features. Often, these features are used repeatedly by multiple teams training multiple models. And whichever feature set was used to train the model needs to be available to make real-time predictions (inference). Keeping a single source of features that is consistent and up-to-date across these different access patterns is a challenge as most organizations keep two different feature stores, one for training and one for inference.\n",
- "\n",
- "Amazon SageMaker Feature Store is a purpose-built repository where you can store and access features so it’s much easier to name, organize, and reuse them across teams. SageMaker Feature Store provides a unified store for features during training and real-time inference without the need to write additional code or create manual processes to keep features consistent. SageMaker Feature Store keeps track of the metadata of stored features (e.g. feature name or version number) so that you can query the features for the right attributes in batches or in real time using Amazon Athena, an interactive query service. SageMaker Feature Store also keeps features updated, because as new data is generated during inference, the single repository is updated so new features are always available for models to use during training and inference.\n",
- "\n",
- "\n",
- "----\n",
- "### Contents\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- " - [Define Feature Group](#02a-define-fg)\n",
- " - [Configure Feature Group](#02a-config-fg)\n",
- " - [Initialize & Create Feature Group](#02a-init-create-fg)\n",
- " - [Inputs and Outputs](#02a-input-output)\n",
- " - [Upload flow file](#02a-upload-flow)\n",
- " - [Run Processing Job](#02a-run-job)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)\n",
- "\n",
- "\n",
- "\n",
- "This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n",
- "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n",
- "Processing Job and ingest processed data to Feature Store. \n"
- ]
- },
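- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "_Added illustrative sketch (not part of the original walkthrough):_ once a feature group is populated, the latest value for a record can be read back from the online store with the `get_record` API of the `sagemaker-featurestore-runtime` client. The feature group name and record identifier below are placeholders."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Illustrative sketch: fetch one record from the online store.\n",
- "# Assumes a populated feature group; the names below are placeholders.\n",
- "import boto3\n",
- "\n",
- "fs_runtime = boto3.client('sagemaker-featurestore-runtime')\n",
- "response = fs_runtime.get_record(\n",
- "    FeatureGroupName='track-features-music-rec',  # placeholder feature group name\n",
- "    RecordIdentifierValueAsString='example-track-id'  # placeholder record identifier\n",
- ")\n",
- "print(response.get('Record'))"
- ]
- },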
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create Feature Group\n",
- "\n",
- "_What is a feature group_\n",
- "\n",
- "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n",
- "collection of features - each feature in the feature group has a specified data type and name. \n",
- "A single record in a feature group corresponds to a row in your dataframe. A feature store is a \n",
- "collection of feature groups. To learn more about SageMaker Feature Store, see \n",
- "[Amazon Feature Store Documentation](http://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Define Feature Group \n",
- "\n",
- "##### [back to top](#02a-nb)\n",
- "\n",
- "----\n",
- "Select Record identifier and Event time feature name. These are required parameters for feature group\n",
- "creation.\n",
- "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n",
- "whose value uniquely identifies a Record defined in the feature group's feature definitions.\n",
- "* **Event time feature name** is the name of the EventTime feature of a Record in FeatureGroup. An EventTime \n",
- "is a timestamp that represents the point in time when a new event occurs that corresponds to the creation or \n",
- "update of a Record in the FeatureGroup. All Records in the FeatureGroup must have a corresponding EventTime.\n",
- "\n",
- " 💡Record identifier and Event time feature name are required \n",
- "for feature group. After filling in the values, you can choose Run Selected Cell and All Below \n",
- "from the Run Menu from the menu bar. \n",
- "
"
- ]
- },
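- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "_Added illustrative sketch:_ the cell below shows the shape of a single Feature Store record as written with `put_record`: it must carry the record identifier (`trackId` in this notebook) and an `EventTime` value. The feature values are made-up examples, and the `put_record` call is left commented out because the feature group is only created later in this notebook."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Illustrative sketch: shape of a record written to a feature group.\n",
- "# Every record needs the record identifier and an EventTime feature.\n",
- "import time\n",
- "\n",
- "example_record = [\n",
- "    {'FeatureName': 'trackId', 'ValueAsString': 'track-00001'},  # record identifier (made-up value)\n",
- "    {'FeatureName': 'danceability', 'ValueAsString': '0.73'},  # example feature value\n",
- "    {'FeatureName': 'EventTime', 'ValueAsString': str(round(time.time()))},  # event time as Unix seconds\n",
- "]\n",
- "\n",
- "# Once the feature group exists, a record could be ingested like this:\n",
- "# featurestore_runtime.put_record(FeatureGroupName=feature_group_name, Record=example_record)"
- ]
- },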
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install sagemaker boto3 --upgrade --quiet"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sys\n",
- "import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
- "ps = ParameterStore(verbose=False)\n",
- "parameters = ps.read('music-rec')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "bucket = parameters['bucket']\n",
- "prefix = parameters['prefix']\n",
- "pretrained_model_path = parameters['pretrained_model_path']\n",
- "ratings_data_source = parameters['ratings_data_source']\n",
- "tracks_data_source = parameters['tracks_data_source']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "record_identifier_feature_name = 'trackId'\n",
- "if record_identifier_feature_name is None:\n",
- " raise SystemExit(\"Select a column name as the feature group record identifier.\")\n",
- "\n",
- "event_time_feature_name = 'EventTime'\n",
- "if event_time_feature_name is None:\n",
- " raise SystemExit(\"Select a column name as the event time feature name.\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Feature Definitions\n",
- "The following is a list of the feature names and feature types of the final dataset that will be produced \n",
- "when your data flow is used to process your input dataset. These are automatically generated from the \n",
- "step `Custom Pyspark` from `Source: Answers.Csv`. To save from a different step, go to Data Wrangler to \n",
- "select a new step to export.\n",
- "\n",
- " 💡
Configurable Settings \n",
- "\n",
- "1. You can select a subset of the features. By default all columns of the result dataframe will be used as \n",
- "features.\n",
- "2. You can change the Data Wrangler data type to one of the Feature Store supported types \n",
- "(
Integral,
Fractional, or
String). The default type is set to
String. \n",
- "This means that, if a column in your dataset is not a
float or
long type, it will default \n",
- "to
String in your Feature Store.\n",
- "\n",
- "For
Event Time features, make sure the format follows the feature store\n",
- "
\n",
- " \n",
- " Event Time feature format\n",
- " \n",
- "\n",
- "
"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The following is a list of the feature names and data types of the final dataset that will be produced when your data flow is used to process your input dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "column_schemas = [\n",
- " {\n",
- " \"name\": \"trackId\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"length\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"energy\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"acousticness\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"valence\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"speechiness\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"instrumentalness\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"liveness\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"tempo\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Folk\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Country\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Latin\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Jazz\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_RnB\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Reggae\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Rap\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Pop_Rock\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Electronic\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Blues\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"danceability\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"EventTime\",\n",
- " \"type\": \"float\"\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Below we create the SDK input for those feature definitions. Some schema types in Data Wrangler are not \n",
- "supported by Feature Store. The following will create a default_FG_type set to String for these types."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.feature_store.feature_definition import FeatureDefinition\n",
- "from sagemaker.feature_store.feature_definition import FeatureTypeEnum\n",
- "\n",
- "default_feature_type = FeatureTypeEnum.STRING\n",
- "column_to_feature_type_mapping = {\n",
- " \"float\": FeatureTypeEnum.FRACTIONAL,\n",
- " \"long\": FeatureTypeEnum.INTEGRAL\n",
- "}\n",
- "\n",
- "feature_definitions = [\n",
- " FeatureDefinition(\n",
- " feature_name=column_schema['name'], \n",
- " feature_type=column_to_feature_type_mapping.get(column_schema['type'], default_feature_type)\n",
- " ) for column_schema in column_schemas\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Configure Feature Group\n",
- "\n",
- "##### [back to top](#02a-nb)\n",
- "\n",
- "----\n",
- " 💡 Configurable Settings \n",
- "\n",
- "1. feature_group_name: name of the feature group.\n",
- "1. feature_store_offline_s3_uri: SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a S3 location owned by you.\n",
- "1. enable_online_store: controls if online store is enabled. Enabling the online store allows quick access to the latest value for a Record via the GetRecord API.\n",
- "1. iam_role: IAM role for executing the processing job.\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from time import gmtime, strftime\n",
- "import uuid\n",
- "import sagemaker \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Sagemaker session\n",
- "sess = sagemaker.Session()\n",
- "\n",
- "# IAM role for executing the processing job.\n",
- "iam_role = sagemaker.get_execution_role()\n",
- "\n",
- "# flow name and an unique ID for this export (used later as the processing job name for the export)\n",
- "flow_name = \"01_music_dataprep\"\n",
- "flow_export_id = f\"{strftime('%d-%H-%M-%S', gmtime())}-{str(uuid.uuid4())[:8]}\"\n",
- "flow_export_name = f\"flow-{flow_export_id}\"\n",
- "\n",
- "# feature group name, with flow_name and an unique id. You can give it a customized name\n",
- "feature_group_name = 'track-features-music-rec'\n",
- "print(f\"Feature Group Name: {feature_group_name}\")\n",
- "\n",
- "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a \n",
- "# S3 location owned by you.\n",
- "feature_store_offline_s3_uri = 's3://' + bucket\n",
- "\n",
- "# controls if online store is enabled. Enabling the online store allows quick access to \n",
- "# the latest value for a Record via the GetRecord API.\n",
- "enable_online_store = True"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "fg_name_tracks = feature_group_name\n",
- "\n",
- "ps.add({'fg_name_tracks': fg_name_tracks}, namespace='music-rec')\n",
- "ps.add({'flow_export_id': flow_export_id}, namespace='music-rec')\n",
- "\n",
- "dw_ecrlist = {\n",
- " 'region':{'us-west-2':'174368400705',\n",
- " 'us-east-2':'415577184552',\n",
- " 'us-west-1':'926135532090',\n",
- " 'us-east-1':'663277389841'\n",
- " }\n",
- "}\n",
- "\n",
- "ps.add({'dw_ecrlist': dw_ecrlist}, namespace='music-rec')\n",
- "ps.store()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Initialize & Create Feature Group\n",
- "\n",
- "##### [back to top](#02a-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Initialize Boto3 session that is required to create feature group\n",
- "import boto3\n",
- "from sagemaker.session import Session\n",
- "\n",
- "region = boto3.Session().region_name\n",
- "boto_session = boto3.Session(region_name=region)\n",
- "\n",
- "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n",
- "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n",
- "\n",
- "feature_store_session = Session(\n",
- " boto_session=boto_session,\n",
- " sagemaker_client=sagemaker_client,\n",
- " sagemaker_featurestore_runtime_client=featurestore_runtime\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Feature group is initialized and created below"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.feature_store.feature_group import FeatureGroup\n",
- "\n",
- "feature_group = FeatureGroup(\n",
- " name=feature_group_name, sagemaker_session=feature_store_session, feature_definitions=feature_definitions)\n",
- "\n",
- "# only create feature group if it doesn't already exist\n",
- "try:\n",
- " sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name, NextToken='string')\n",
- " feature_group_exists=True\n",
- " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))\n",
- "except Exception as e:\n",
- " error = e.response.get('Error').get('Code')\n",
- " if error == \"ResourceNotFound\":\n",
- " feature_group_exists=False\n",
- " print(\"Creating Feature Group {}\".format(feature_group_name))\n",
- " feature_group.create(\n",
- " s3_uri=feature_store_offline_s3_uri,\n",
- " record_identifier_name=record_identifier_feature_name,\n",
- " event_time_feature_name=event_time_feature_name,\n",
- " role_arn=iam_role,\n",
- " enable_online_store=enable_online_store\n",
- " )\n",
- " if error == 'ResourceInUse':\n",
- " feature_group_exists=True\n",
- " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Invoke the Feature Store API to create the feature group and wait until it is ready"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "\n",
- "def wait_for_feature_group_creation_complete(feature_group):\n",
- " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n",
- " status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
- " while status == \"Creating\":\n",
- " print(\"Waiting for Feature Group Creation\")\n",
- " time.sleep(5)\n",
- " status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
- " if status != \"Created\":\n",
- " raise SystemExit(f\"Failed to create feature group {feature_group.name}: {status}\")\n",
- " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n",
- "\n",
- "wait_for_feature_group_creation_complete(feature_group=feature_group)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that the feature group is created, You will use a processing job to process your \n",
- " data at scale and ingest the transformed data into this feature group."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Inputs and Outputs\n",
- "\n",
- "##### [back to top](#02a-nb)\n",
- "\n",
- "----\n",
- "\n",
- "The below settings configure the inputs and outputs for the flow export.\n",
- "\n",
- " 💡 Configurable Settings \n",
- "\n",
- "In Input - Source you can configure the data sources that will be used as input by Data Wrangler\n",
- "\n",
- "1. For S3 sources, configure the source attribute that points to the input S3 prefixes\n",
- "2. For all other sources, configure attributes like query_string, database in the source's \n",
- "DatasetDefinition object.\n",
- "\n",
- "If you modify the inputs the provided data must have the same schema and format as the data used in the Flow. \n",
- "You should also re-execute the cells in this section if you have modified the settings in any data sources.\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
- "from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition, RedshiftDatasetDefinition\n",
- "\n",
- "data_sources = []"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Input - S3 Source: tracks.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data_sources.append(ProcessingInput(\n",
- " source=f\"{tracks_data_source}\", # You could override this to point to another dataset on S3\n",
- " destination=\"/opt/ml/processing/tracks.csv\",\n",
- " input_name=\"tracks.csv\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Input - S3 Source: ratings.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data_sources.append(ProcessingInput(\n",
- " source=f\"{ratings_data_source}\", # You could override this to point to another dataset on S3\n",
- " destination=\"/opt/ml/processing/ratings.csv\",\n",
- " input_name=\"ratings.csv\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Output: Feature Store \n",
- "\n",
- "Below are the inputs required by the SageMaker Python SDK to launch a processing job with feature store as an output. Notice the `output_name` variable below; this ID is found within the `.flow` file at the node point you want to capture transformations up to. The `.flow` file contains instructions for SageMaker Data Wrangler to know where to look for data and how to transform it. Each data transformation action is associated with a node and therefore a node ID. Using the associated node ID + output name tells SageMaker up to what point in the transformation process you want to export to a feature store."
- ]
- },
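- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "_Added illustrative sketch:_ if you need to find the node ID for a different export point, the `.flow` file is plain JSON and its nodes can be listed directly. The `nodes`, `node_id`, `type`, and `operator` keys assumed below reflect the Data Wrangler flow layout at the time of writing; `.get()` is used so the sketch degrades gracefully if the layout differs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Illustrative sketch: list the node IDs defined in the Data Wrangler flow file.\n",
- "# Assumes the .flow file exposes a 'nodes' list with 'node_id', 'type', and 'operator' keys.\n",
- "import json\n",
- "\n",
- "with open('01_music_dataprep.flow') as f:\n",
- "    flow_definition = json.load(f)\n",
- "\n",
- "for node in flow_definition.get('nodes', []):\n",
- "    print(node.get('node_id'), '-', node.get('type'), '-', node.get('operator'))"
- ]
- },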
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import FeatureStoreOutput\n",
- "\n",
- "# Output name is auto-generated from the select node's ID + output name from the .flow file\n",
- "output_name = \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\" # tracks node\n",
- "\n",
- "processing_job_output = ProcessingOutput(\n",
- " output_name=output_name,\n",
- " app_managed=True,\n",
- " feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Upload Flow to S3\n",
- "\n",
- "##### [back to top](#02a-nb)\n",
- "\n",
- "----\n",
- "To use the Data Wrangler as an input to the processing job, first upload your flow file to Amazon S3."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import json\n",
- "import boto3\n",
- "\n",
- "# name of the flow file which should exist in the current notebook working directory\n",
- "flow_file_name = \"01_music_dataprep.flow\"\n",
- "\n",
- "# Load .flow file from current notebook working directory \n",
- "!echo \"Loading flow file from current notebook working directory: $PWD\"\n",
- "\n",
- "with open(flow_file_name) as f:\n",
- " flow = json.load(f)\n",
- "\n",
- "# Upload flow to S3\n",
- "s3_client = boto3.client(\"s3\")\n",
- "s3_client.upload_file(flow_file_name, bucket, f\"{prefix}/data_wrangler_flows/{flow_export_name}.flow\")\n",
- "\n",
- "flow_s3_uri = f\"s3://{bucket}/{prefix}/data_wrangler_flows/{flow_export_name}.flow\"\n",
- "\n",
- "\n",
- "ps.add({'flow_s3_uri': flow_s3_uri}, namespace='music-rec')\n",
- "ps.store()\n",
- "\n",
- "print(f\"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The Data Wrangler Flow is also provided to the Processing Job as an input source which we configure below."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "## Input - Flow: 01_music_dataprep.flow\n",
- "flow_input = ProcessingInput(\n",
- " source=flow_s3_uri,\n",
- " destination=\"/opt/ml/processing/flow\",\n",
- " input_name=\"flow\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Run Processing Job \n",
- "\n",
- "##### [back to top](#02a-nb)\n",
- "\n",
- "----\n",
- "## Job Configurations\n",
- "\n",
- " 💡 Configurable Settings \n",
- "\n",
- "You can configure the following settings for Processing Jobs. If you change any configurations you will \n",
- "need to re-execute this and all cells below it by selecting the Run menu above and click \n",
- "Run Selected Cells and All Below\n",
- "\n",
- "1. IAM role for executing the processing job. \n",
- "2. A unique name of the processing job. Give a unique name every time you re-execute processing jobs\n",
- "3. Data Wrangler Container URL.\n",
- "4. Instance count, instance type and storage volume size in GB.\n",
- "5. Content type for each output. Data Wrangler supports CSV as default and Parquet.\n",
- "6. Network Isolation settings\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Unique processing job name. Give a unique name every time you re-execute processing jobs\n",
- "processing_job_name = \"dw-flow-proc-music-rec-tracks-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n",
- "print (f\"{processing_job_name}\")\n",
- "\n",
- "# Data Wrangler Container URL.\n",
- "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n",
- "\n",
- "# Processing Job Instance count and instance type.\n",
- "instance_count = 2\n",
- "instance_type = \"ml.m5.4xlarge\"\n",
- "\n",
- "# Size in GB of the EBS volume to use for storing data during processing\n",
- "volume_size_in_gb = 30\n",
- "\n",
- "# Content type for each output. Data Wrangler supports CSV as default and Parquet.\n",
- "output_content_type = \"CSV\"\n",
- "\n",
- "# Network Isolation mode; default is off\n",
- "enable_network_isolation = False\n",
- "\n",
- "# Output configuration used as processing job container arguments \n",
- "output_config = {\n",
- " output_name: {\n",
- " \"content_type\": output_content_type\n",
- " }\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create Processing Job\n",
- "\n",
- "To launch a Processing Job, you will use the SageMaker Python SDK to create a Processor function."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import Processor\n",
- "from sagemaker.network import NetworkConfig\n",
- "\n",
- "processor = Processor(\n",
- " role=iam_role,\n",
- " image_uri=container_uri,\n",
- " instance_count=instance_count,\n",
- " instance_type=instance_type,\n",
- " volume_size_in_gb=volume_size_in_gb,\n",
- " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n",
- " sagemaker_session=sess\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Job Status & S3 Output Location\n",
- "\n",
- "Below you wait for processing job to finish. If it finishes successfully, your feature group should be populated \n",
- "with transformed feature values. In addition the raw parameters used by the Processing Job will be printed."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%time\n",
- "\n",
- "# Run Processing Job if job not already previously ran\n",
- "if feature_group_exists:\n",
- " print(\"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(feature_group_name))\n",
- "else:\n",
- " print(\"Creating Processing Job: {}\".format(feature_group_name))\n",
- " processor.run(\n",
- " inputs=[flow_input] + data_sources, \n",
- " outputs=[processing_job_output],\n",
- " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n",
- " wait=False,\n",
- " logs=False,\n",
- " job_name=processing_job_name\n",
- " ) \n",
- " \n",
- " job_result = sess.wait_for_processing_job(processing_job_name)\n",
- " print(job_result)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can view newly created feature group in Studio, refer to [Use Amazon SageMaker Feature Store with Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-use-with-studio.html)\n",
- "for detailed guide. [Learn more about SageMaker Feature Store](https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-featurestore)"
- ]
- }
- ],
- "metadata": {
- "instance_type": "ml.t3.medium",
- "kernelspec": {
- "display_name": "Python 3 (Data Science)",
- "language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/end_to_end/music_recommendation/02b_export_fg_5star_features.ipynb b/end_to_end/music_recommendation/02b_export_fg_5star_features.ipynb
deleted file mode 100644
index 1a1e9ba331..0000000000
--- a/end_to_end/music_recommendation/02b_export_fg_5star_features.ipynb
+++ /dev/null
@@ -1,762 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "# Music Recommender Part 2b: Feature Store Creation - User Preferences\n",
- "\n",
- "----\n",
- "\n",
- "This notebook creates a feature group for our user music preference data to place in our feature store using the transformation instructions found in our `.flow` file.\n",
- "\n",
- "----\n",
- "### Contents\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- " - [Define Feature Group](#02b-define-fg)\n",
- " - [Configure Feature Group](#02b-config-fg)\n",
- " - [Initialize & Create Feature Group](#02b-init-create-fg)\n",
- " - [Inputs and Outputs](#02b-input-output)\n",
- " - [Run Processing Job](#02b-run-job)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n",
- "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n",
- "Processing Job and ingest processed data to Feature Store. \n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create Feature Group\n",
- "\n",
- "_What is a feature group_\n",
- "\n",
- "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n",
- "collection of features - each feature in the feature group has a specified data type and name. \n",
- "A single record in a feature group corresponds to a row in your dataframe. A feature store is a \n",
- "collection of feature groups. To learn more about SageMaker Feature Store, see \n",
- "[Amazon Feature Store Documentation](http://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Define Feature Group \n",
- "\n",
- "##### [back to top](#02b-nb)\n",
- "\n",
- "----\n",
- "Select Record identifier and Event time feature name. These are required parameters for feature group\n",
- "creation.\n",
- "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n",
- "whose value uniquely identifies a Record defined in the feature group's feature definitions.\n",
- "* **Event time feature name** is the name of the EventTime feature of a Record in FeatureGroup. An EventTime \n",
- "is a timestamp that represents the point in time when a new event occurs that corresponds to the creation or \n",
- "update of a Record in the FeatureGroup. All Records in the FeatureGroup must have a corresponding EventTime.\n",
- "\n",
- " 💡Record identifier and Event time feature name are required \n",
- "for feature group. After filling in the values, you can choose Run Selected Cell and All Below \n",
- "from the Run Menu from the menu bar. \n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sys\n",
- "import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
- "ps = ParameterStore(verbose=False)\n",
- "\n",
- "parameters = ps.read('music-rec')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "bucket = parameters['bucket']\n",
- "dw_ecrlist = parameters['dw_ecrlist']\n",
- "fg_name_tracks = parameters['fg_name_tracks']\n",
- "flow_export_id = parameters['flow_export_id']\n",
- "flow_s3_uri = parameters['flow_s3_uri']\n",
- "pretrained_model_path = parameters['pretrained_model_path']\n",
- "prefix = parameters['prefix']\n",
- "ratings_data_source = parameters['ratings_data_source']\n",
- "tracks_data_source = parameters['tracks_data_source']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "record_identifier_feature_name = 'userId'\n",
- "if record_identifier_feature_name is None:\n",
- " raise SystemExit(\"Select a column name as the feature group record identifier.\")\n",
- "\n",
- "event_time_feature_name = 'EventTime'\n",
- "if event_time_feature_name is None:\n",
- " raise SystemExit(\"Select a column name as the event time feature name.\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Feature Definitions\n",
- "The following is a list of the feature names and feature types of the final dataset that will be produced \n",
- "when your data flow is used to process your input dataset. These are automatically generated from the \n",
- "step `Custom Pyspark` from `Source: Answers.Csv`. To save from a different step, go to Data Wrangler to \n",
- "select a new step to export.\n",
- "\n",
- " 💡
Configurable Settings \n",
- "\n",
- "1. You can select a subset of the features. By default all columns of the result dataframe will be used as \n",
- "features.\n",
- "2. You can change the Data Wrangler data type to one of the Feature Store supported types \n",
- "(
Integral,
Fractional, or
String). The default type is set to
String. \n",
- "This means that, if a column in your dataset is not a
float or
long type, it will default \n",
- "to
String in your Feature Store.\n",
- "\n",
- "For
Event Time features, make sure the format follows the feature store\n",
- "
\n",
- " \n",
- " Event Time feature format\n",
- " \n",
- "\n",
- "
"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The following is a list of the feature names and data types of the final dataset that will be produced when your data flow is used to process your input dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "column_schemas = [\n",
- " {\n",
- " \"name\": \"userId\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"energy_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"acousticness_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"valence_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"speechiness_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"instrumentalness_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"liveness_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"tempo_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"danceability_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Latin_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Folk_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Blues_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Rap_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Reggae_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Jazz_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_RnB_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Country_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Electronic_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"genre_Pop_Rock_5star\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"EventTime\",\n",
- " \"type\": \"float\"\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Below we create the SDK input for those feature definitions. Some schema types in Data Wrangler are not \n",
- "supported by Feature Store. The following will create a default_FG_type set to String for these types."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.feature_store.feature_definition import FeatureDefinition\n",
- "from sagemaker.feature_store.feature_definition import FeatureTypeEnum\n",
- "\n",
- "default_feature_type = FeatureTypeEnum.STRING\n",
- "column_to_feature_type_mapping = {\n",
- " \"float\": FeatureTypeEnum.FRACTIONAL,\n",
- " \"long\": FeatureTypeEnum.INTEGRAL\n",
- "}\n",
- "\n",
- "feature_definitions = [\n",
- " FeatureDefinition(\n",
- " feature_name=column_schema['name'], \n",
- " feature_type=column_to_feature_type_mapping.get(column_schema['type'], default_feature_type)\n",
- " ) for column_schema in column_schemas\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Configure Feature Group\n",
- "\n",
- "##### [back to top](#02b-nb)\n",
- "\n",
- "----\n",
- "\n",
- " 💡 Configurable Settings \n",
- "\n",
- "1. feature_group_name: name of the feature group.\n",
- "1. feature_store_offline_s3_uri: SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a S3 location owned by you.\n",
- "1. enable_online_store: controls if online store is enabled. Enabling the online store allows quick access to the latest value for a Record via the GetRecord API.\n",
- "1. iam_role: IAM role for executing the processing job.\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from time import gmtime, strftime\n",
- "import sagemaker \n",
- "\n",
- "# Sagemaker session\n",
- "sess = sagemaker.Session()\n",
- "\n",
- "# IAM role for executing the processing job.\n",
- "iam_role = sagemaker.get_execution_role()\n",
- "\n",
- "# flow name and an unique ID for this export (used later as the processing job name for the export)\n",
- "flow_name = \"01_music_dataprep\"\n",
- "flow_export_name = f\"flow-{flow_export_id}\"\n",
- "\n",
- "# feature group name, with flow_name and an unique id. You can give it a customized name\n",
- "feature_group_name = 'user-5star-track-features-music-rec'\n",
- "print(f\"Feature Group Name: {feature_group_name}\")\n",
- "\n",
- "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a \n",
- "# S3 location owned by you.\n",
- "feature_store_offline_s3_uri = 's3://' + bucket\n",
- "\n",
- "# controls if online store is enabled. Enabling the online store allows quick access to \n",
- "# the latest value for a Record via the GetRecord API.\n",
- "enable_online_store = True"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "fg_name_user_preferences = feature_group_name\n",
- "\n",
- "ps.add({'fg_name_user_preferences': fg_name_user_preferences}, namespace='music-rec')\n",
- "ps.store()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Initialize & Create Feature Group\n",
- "\n",
- "##### [back to top](#02b-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Initialize Boto3 session that is required to create feature group\n",
- "import boto3\n",
- "from sagemaker.session import Session\n",
- "\n",
- "region = boto3.Session().region_name\n",
- "boto_session = boto3.Session(region_name=region)\n",
- "\n",
- "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n",
- "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n",
- "\n",
- "feature_store_session = Session(\n",
- " boto_session=boto_session,\n",
- " sagemaker_client=sagemaker_client,\n",
- " sagemaker_featurestore_runtime_client=featurestore_runtime\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Feature group is initialized and created below"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.feature_store.feature_group import FeatureGroup\n",
- "\n",
- "feature_group = FeatureGroup(\n",
- " name=feature_group_name, sagemaker_session=feature_store_session, feature_definitions=feature_definitions)\n",
- "\n",
- "# only create feature group if it doesn't already exist\n",
- "try:\n",
- " sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name, NextToken='string')\n",
- " feature_group_exists=True\n",
- " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))\n",
- "except Exception as e:\n",
- " error = e.response.get('Error').get('Code')\n",
- " if error == \"ResourceNotFound\":\n",
- " feature_group_exists=False\n",
- " print(\"Creating Feature Group {}\".format(feature_group_name))\n",
- " feature_group.create(\n",
- " s3_uri=feature_store_offline_s3_uri,\n",
- " record_identifier_name=record_identifier_feature_name,\n",
- " event_time_feature_name=event_time_feature_name,\n",
- " role_arn=iam_role,\n",
- " enable_online_store=enable_online_store\n",
- " )\n",
- " if error == 'ResourceInUse':\n",
- " feature_group_exists=True\n",
- " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Invoke the Feature Store API to create the feature group and wait until it is ready"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "def wait_for_feature_group_creation_complete(feature_group):\n",
- " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n",
- " status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
- " while status == \"Creating\":\n",
- " print(\"Waiting for Feature Group Creation\")\n",
- " time.sleep(5)\n",
- " status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
- " if status != \"Created\":\n",
- " raise SystemExit(f\"Failed to create feature group {feature_group.name}: {status}\")\n",
- " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n",
- "\n",
- "wait_for_feature_group_creation_complete(feature_group=feature_group)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that the feature group is created, You will use a processing job to process your \n",
- " data at scale and ingest the transformed data into this feature group."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Inputs and Outputs\n",
- "\n",
- "##### [back to top](#02b-nb)\n",
- "\n",
- "----\n",
- "The below settings configure the inputs and outputs for the flow export.\n",
- "\n",
- " 💡 Configurable Settings \n",
- "\n",
- "In Input - Source you can configure the data sources that will be used as input by Data Wrangler\n",
- "\n",
- "1. For S3 sources, configure the source attribute that points to the input S3 prefixes\n",
- "2. For all other sources, configure attributes like query_string, database in the source's \n",
- "DatasetDefinition object.\n",
- "\n",
- "If you modify the inputs the provided data must have the same schema and format as the data used in the Flow. \n",
- "You should also re-execute the cells in this section if you have modified the settings in any data sources.\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
- "from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition, RedshiftDatasetDefinition\n",
- "\n",
- "data_sources = []"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Input - S3 Source: tracks.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data_sources.append(ProcessingInput(\n",
- " source=f\"{tracks_data_source}\", # You could override this to point to another dataset on S3\n",
- " destination=\"/opt/ml/processing/tracks.csv\",\n",
- " input_name=\"tracks.csv\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Input - S3 Source: ratings.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data_sources.append(ProcessingInput(\n",
- " source=f\"{ratings_data_source}\", # You could override this to point to another dataset on S3\n",
- " destination=\"/opt/ml/processing/ratings.csv\",\n",
- " input_name=\"ratings.csv\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Output: Feature Store \n",
- "Below are the inputs required by the SageMaker Python SDK to launch a processing job with feature store as an output. Notice the `output_name` variable below; this ID is found within the `.flow` file at the node point you want to capture transformations up to. The `.flow` file contains instructions for SageMaker Data Wrangler to know where to look for data and how to transform it. Each data transformation action is associated with a node and therefore a node ID. Using the associated node ID + output name tells SageMaker up to what point in the transformation process you want to export to a feature store."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import FeatureStoreOutput\n",
- "\n",
- "# Output name is auto-generated from the select node's ID + output name from the flow file.\n",
- "output_name = \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\" # joined node\n",
- "\n",
- "processing_job_output = ProcessingOutput(\n",
- " output_name=output_name,\n",
- " app_managed=True,\n",
- " feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We already uploaded our flow file in the previous notebook. Here the Data Wrangler Flow is also provided to the Processing Job as an input source which we configure below."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "## Input - Flow: 01_music_dataprep.flow\n",
- "flow_input = ProcessingInput(\n",
- " source=flow_s3_uri,\n",
- " destination=\"/opt/ml/processing/flow\",\n",
- " input_name=\"flow\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Run Processing Job \n",
- "\n",
- "##### [back to top](#02b-nb)\n",
- "\n",
- "----\n",
- "## Job Configurations\n",
- "\n",
- " 💡 Configurable Settings \n",
- "\n",
- "You can configure the following settings for Processing Jobs. If you change any configurations you will \n",
- "need to re-execute this and all cells below it by selecting the Run menu above and click \n",
- "Run Selected Cells and All Below\n",
- "\n",
- "1. IAM role for executing the processing job. \n",
- "2. A unique name of the processing job. Give a unique name every time you re-execute processing jobs\n",
- "3. Data Wrangler Container URL.\n",
- "4. Instance count, instance type and storage volume size in GB.\n",
- "5. Content type for each output. Data Wrangler supports CSV as default and Parquet.\n",
- "6. Network Isolation settings\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import uuid\n",
- "\n",
- "# Unique processing job name. Give a unique name every time you re-execute processing jobs\n",
- "#processing_job_name = \"data-wrangler-flow-processing-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n",
- "\n",
- "processing_job_name = \"dw-flow-proc-music-rec-5star-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n",
- "print (f\"{processing_job_name}\")\n",
- "\n",
- "\n",
- "# Data Wrangler Container URL.\n",
- "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n",
- "\n",
- "# Processing Job Instance count and instance type.\n",
- "instance_count = 2\n",
- "instance_type = \"ml.m5.4xlarge\"\n",
- "\n",
- "# Size in GB of the EBS volume to use for storing data during processing\n",
- "volume_size_in_gb = 30\n",
- "\n",
- "# Content type for each output. Data Wrangler supports CSV as default and Parquet.\n",
- "output_content_type = \"CSV\"\n",
- "\n",
- "# Network Isolation mode; default is off\n",
- "enable_network_isolation = False\n",
- "\n",
- "# Output configuration used as processing job container arguments \n",
- "output_config = {\n",
- " output_name: {\n",
- " \"content_type\": output_content_type\n",
- " }\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create Processing Job\n",
- "\n",
- "To launch a Processing Job, you will use the SageMaker Python SDK to create a Processor function."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import Processor\n",
- "from sagemaker.network import NetworkConfig\n",
- "import json\n",
- "\n",
- "processor = Processor(\n",
- " role=iam_role,\n",
- " image_uri=container_uri,\n",
- " instance_count=instance_count,\n",
- " instance_type=instance_type,\n",
- " volume_size_in_gb=volume_size_in_gb,\n",
- " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n",
- " sagemaker_session=sess\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Job Status & S3 Output Location\n",
- "\n",
- "Below you wait for processing job to finish. If it finishes successfully, your feature group should be populated \n",
- "with transformed feature values. In addition the raw parameters used by the Processing Job will be printed."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%time\n",
- "\n",
- "# Run Processing Job if job not already previously ran\n",
- "if feature_group_exists:\n",
- " print(\"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(feature_group_name))\n",
- "else:\n",
- " print(\"Creating Processing Job: {}\".format(feature_group_name))\n",
- " processor.run(\n",
- " inputs=[flow_input] + data_sources, \n",
- " outputs=[processing_job_output],\n",
- " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n",
- " wait=False,\n",
- " logs=False,\n",
- " job_name=processing_job_name\n",
- " ) \n",
- " \n",
- " job_result = sess.wait_for_processing_job(processing_job_name)\n",
- " print(job_result)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can view newly created feature group in Studio, refer to [Use Amazon SageMaker Feature Store with Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-use-with-studio.html)\n",
- "for detailed guide.[Learn more about SageMaker Feature Store](https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-featurestore)"
- ]
- }
- ],
- "metadata": {
- "instance_type": "ml.m5.large",
- "kernelspec": {
- "display_name": "Python 3 (Data Science)",
- "language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/end_to_end/music_recommendation/02c_export_fg_ratings.ipynb b/end_to_end/music_recommendation/02c_export_fg_ratings.ipynb
deleted file mode 100644
index 69d0bc4b2d..0000000000
--- a/end_to_end/music_recommendation/02c_export_fg_ratings.ipynb
+++ /dev/null
@@ -1,706 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "# Music Recommender Part 2c: Feature Store Creation - Ratings\n",
- "\n",
- "----\n",
- "\n",
- "This notebook creates a feature group for our ratings data to place in our feature store using the transformation instructions found in our `.flow` file.\n",
- "\n",
- "----\n",
- "### Contents\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- " - [Define Feature Group](#02c-define-fg)\n",
- " - [Configure Feature Group](#02c-config-fg)\n",
- " - [Initialize & Create Feature Group](#02c-init-create-fg)\n",
- " - [Inputs and Outputs](#02c-input-output)\n",
- " - [Run Processing Job](#02c-run-job)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)\n",
- "\n",
- "\n",
- "\n",
- "\n",
- "This notebook uses Amazon SageMaker Feature Store (Feature Store) to create a feature group, \n",
- "executes your Data Wrangler Flow `01_music_dataprep.flow` on the entire dataset using a SageMaker \n",
- "Processing Job and ingest processed data to Feature Store. \n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sys\n",
- "import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
- "ps = ParameterStore(verbose=False)\n",
- "\n",
- "parameters = ps.read('music-rec')\n",
- "\n",
- "bucket = parameters['bucket']\n",
- "dw_ecrlist = parameters['dw_ecrlist']\n",
- "fg_name_tracks = parameters['fg_name_tracks']\n",
- "flow_export_id = parameters['flow_export_id']\n",
- "flow_s3_uri = parameters['flow_s3_uri']\n",
- "pretrained_model_path = parameters['pretrained_model_path']\n",
- "prefix = parameters['prefix']\n",
- "ratings_data_source = parameters['ratings_data_source']\n",
- "tracks_data_source = parameters['tracks_data_source']"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create Feature Group\n",
- "\n",
- "_What is a feature group_\n",
- "\n",
- "A single feature corresponds to a column in your dataset. A feature group is a predefined schema for a \n",
- "collection of features - each feature in the feature group has a specified data type and name. \n",
- "A single record in a feature group corresponds to a row in your dataframe. A feature store is a \n",
- "collection of feature groups. To learn more about SageMaker Feature Store, see \n",
- "[Amazon Feature Store Documentation](http://docs.aws.amazon.com/sagemaker/latest/dg/feature-store.html)."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Define Feature Group \n",
- "\n",
- "##### [back to top](#02c-nb)\n",
- "\n",
- "----\n",
- "Select Record identifier and Event time feature name. These are required parameters for feature group\n",
- "creation.\n",
- "* **Record identifier name** is the name of the feature defined in the feature group's feature definitions \n",
- "whose value uniquely identifies a Record defined in the feature group's feature definitions.\n",
- "* **Event time feature name** is the name of the EventTime feature of a Record in FeatureGroup. An EventTime \n",
- "is a timestamp that represents the point in time when a new event occurs that corresponds to the creation or \n",
- "update of a Record in the FeatureGroup. All Records in the FeatureGroup must have a corresponding EventTime.\n",
- "\n",
- " 💡Record identifier and Event time feature name are required \n",
- "for feature group. After filling in the values, you can choose Run Selected Cell and All Below \n",
- "from the Run Menu from the menu bar. \n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "record_identifier_feature_name = \"ratingEventId\"\n",
- "if record_identifier_feature_name is None:\n",
- " raise SystemExit(\"Select a column name as the feature group record identifier.\")\n",
- "\n",
- "event_time_feature_name = \"EventTime\"\n",
- "if event_time_feature_name is None:\n",
- " raise SystemExit(\"Select a column name as the event time feature name.\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Feature Definitions\n",
- "The following is a list of the feature names and feature types of the final dataset that will be produced \n",
- "when your data flow is used to process your input dataset. These are automatically generated from the \n",
- "step `Custom Pyspark` from `Source: Answers.Csv`. To save from a different step, go to Data Wrangler to \n",
- "select a new step to export.\n",
- "\n",
- " 💡
Configurable Settings \n",
- "\n",
- "1. You can select a subset of the features. By default all columns of the result dataframe will be used as \n",
- "features.\n",
- "2. You can change the Data Wrangler data type to one of the Feature Store supported types \n",
- "(
Integral,
Fractional, or
String). The default type is set to
String. \n",
- "This means that, if a column in your dataset is not a
float or
long type, it will default \n",
- "to
String in your Feature Store.\n",
- "\n",
- "For
Event Time features, make sure the format follows the feature store\n",
- "
\n",
- " \n",
- " Event Time feature format\n",
- " \n",
- "\n",
- "
"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The following is a list of the feature names and data types of the final dataset that will be produced when your data flow is used to process your input dataset."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "column_schemas = [\n",
- " {\n",
- " \"name\": \"ratingEventId\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"ts\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"userId\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"trackId\",\n",
- " \"type\": \"string\"\n",
- " },\n",
- " {\n",
- " \"name\": \"sessionId\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"itemInSession\",\n",
- " \"type\": \"long\"\n",
- " },\n",
- " {\n",
- " \"name\": \"Rating\",\n",
- " \"type\": \"float\"\n",
- " },\n",
- " {\n",
- " \"name\": \"EventTime\",\n",
- " \"type\": \"float\"\n",
- " }\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Below we create the SDK input for those feature definitions. Some schema types in Data Wrangler are not \n",
- "supported by Feature Store. The following will create a default_FG_type set to String for these types."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.feature_store.feature_definition import FeatureDefinition\n",
- "from sagemaker.feature_store.feature_definition import FeatureTypeEnum\n",
- "\n",
- "default_feature_type = FeatureTypeEnum.STRING\n",
- "column_to_feature_type_mapping = {\n",
- " \"float\": FeatureTypeEnum.FRACTIONAL,\n",
- " \"long\": FeatureTypeEnum.INTEGRAL\n",
- "}\n",
- "\n",
- "feature_definitions = [\n",
- " FeatureDefinition(\n",
- " feature_name=column_schema['name'], \n",
- " feature_type=column_to_feature_type_mapping.get(column_schema['type'], default_feature_type)\n",
- " ) for column_schema in column_schemas\n",
- "]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Configure Feature Group\n",
- "\n",
- "##### [back to top](#02c-nb)\n",
- "\n",
- "----\n",
- "\n",
- " 💡 Configurable Settings \n",
- "\n",
- "1. feature_group_name: name of the feature group.\n",
- "1. feature_store_offline_s3_uri: SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a S3 location owned by you.\n",
- "1. enable_online_store: controls if online store is enabled. Enabling the online store allows quick access to the latest value for a Record via the GetRecord API.\n",
- "1. iam_role: IAM role for executing the processing job.\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from time import gmtime, strftime\n",
- "import uuid\n",
- "import sagemaker \n",
- "\n",
- "# Sagemaker session\n",
- "sess = sagemaker.Session()\n",
- "\n",
- "# IAM role for executing the processing job.\n",
- "iam_role = sagemaker.get_execution_role()\n",
- "\n",
- "# flow name and an unique ID for this export (used later as the processing job name for the export)\n",
- "flow_name = \"01_music_dataprep\"\n",
- "flow_export_name = f\"flow-{flow_export_id}\"\n",
- "\n",
- "# feature group name, with flow_name and an unique id. You can give it a customized name\n",
- "feature_group_name = 'ratings-features-music-rec'\n",
- "print(f\"Feature Group Name: {feature_group_name}\")\n",
- "\n",
- "# SageMaker FeatureStore writes the data in the OfflineStore of a FeatureGroup to a \n",
- "# S3 location owned by you.\n",
- "feature_store_offline_s3_uri = 's3://' + bucket\n",
- "\n",
- "# controls if online store is enabled. Enabling the online store allows quick access to \n",
- "# the latest value for a Record via the GetRecord API.\n",
- "enable_online_store = True"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "fg_name_ratings = feature_group_name\n",
- "\n",
- "ps.add({'fg_name_ratings': fg_name_ratings}, namespace='music-rec')\n",
- "ps.store()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "### Initialize & Create Feature Group\n",
- "\n",
- "##### [back to top](#02c-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Initialize Boto3 session that is required to create feature group\n",
- "import boto3\n",
- "from sagemaker.session import Session\n",
- "\n",
- "region = boto3.Session().region_name\n",
- "boto_session = boto3.Session(region_name=region)\n",
- "\n",
- "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n",
- "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n",
- "\n",
- "feature_store_session = Session(\n",
- " boto_session=boto_session,\n",
- " sagemaker_client=sagemaker_client,\n",
- " sagemaker_featurestore_runtime_client=featurestore_runtime\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Feature group is initialized and created below"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.feature_store.feature_group import FeatureGroup\n",
- "\n",
- "feature_group = FeatureGroup(\n",
- " name=feature_group_name, sagemaker_session=feature_store_session, feature_definitions=feature_definitions)\n",
- "\n",
- "# only create feature group if it doesn't already exist\n",
- "try:\n",
- " sagemaker_client.describe_feature_group(FeatureGroupName=feature_group_name, NextToken='string')\n",
- " feature_group_exists=True\n",
- " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))\n",
- "except Exception as e:\n",
- " error = e.response.get('Error').get('Code')\n",
- " if error == \"ResourceNotFound\":\n",
- " feature_group_exists=False\n",
- " print(\"Creating Feature Group {}\".format(feature_group_name))\n",
- " feature_group.create(\n",
- " s3_uri=feature_store_offline_s3_uri,\n",
- " record_identifier_name=record_identifier_feature_name,\n",
- " event_time_feature_name=event_time_feature_name,\n",
- " role_arn=iam_role,\n",
- " enable_online_store=enable_online_store\n",
- " )\n",
- " if error == 'ResourceInUse':\n",
- " feature_group_exists=True\n",
- " print(\"Feature Group {0} already exists. Using {0}\".format(feature_group_name))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Invoke the Feature Store API to create the feature group and wait until it is ready"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "def wait_for_feature_group_creation_complete(feature_group):\n",
- " \"\"\"Helper function to wait for the completions of creating a feature group\"\"\"\n",
- " status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
- " while status == \"Creating\":\n",
- " print(\"Waiting for Feature Group Creation\")\n",
- " time.sleep(5)\n",
- " status = feature_group.describe().get(\"FeatureGroupStatus\")\n",
- " if status != \"Created\":\n",
- " raise SystemExit(f\"Failed to create feature group {feature_group.name}: {status}\")\n",
- " print(f\"FeatureGroup {feature_group.name} successfully created.\")\n",
- "\n",
- "wait_for_feature_group_creation_complete(feature_group=feature_group)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that the feature group is created, You will use a processing job to process your \n",
- " data at scale and ingest the transformed data into this feature group."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Inputs and Outputs\n",
- "\n",
- "##### [back to top](#02c-nb)\n",
- "\n",
- "----\n",
- "The below settings configure the inputs and outputs for the flow export.\n",
- "\n",
- " 💡 Configurable Settings \n",
- "\n",
- "In Input - Source you can configure the data sources that will be used as input by Data Wrangler\n",
- "\n",
- "1. For S3 sources, configure the source attribute that points to the input S3 prefixes\n",
- "2. For all other sources, configure attributes like query_string, database in the source's \n",
- "DatasetDefinition object.\n",
- "\n",
- "If you modify the inputs the provided data must have the same schema and format as the data used in the Flow. \n",
- "You should also re-execute the cells in this section if you have modified the settings in any data sources.\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
- "from sagemaker.dataset_definition.inputs import AthenaDatasetDefinition, DatasetDefinition, RedshiftDatasetDefinition\n",
- "\n",
- "data_sources = []"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Input - S3 Source: tracks.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data_sources.append(ProcessingInput(\n",
- " source=f\"{tracks_data_source}\", # You could override this to point to another dataset on S3\n",
- " destination=\"/opt/ml/processing/tracks.csv\",\n",
- " input_name=\"tracks.csv\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Input - S3 Source: ratings.csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data_sources.append(ProcessingInput(\n",
- " source=f\"{ratings_data_source}\", # You could override this to point to another dataset on S3\n",
- " destination=\"/opt/ml/processing/ratings.csv\",\n",
- " input_name=\"ratings.csv\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- "))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Output: Feature Store \n",
- "\n",
- "Below are the inputs required by the SageMaker Python SDK to launch a processing job with feature store as an output."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import FeatureStoreOutput\n",
- "\n",
- "# Output name is auto-generated from the select node's ID + output name from the flow file.\n",
- "output_name = \"9a283380-91ca-478e-be99-6ba3bf57c680.default\" # ratings node\n",
- "\n",
- "processing_job_output = ProcessingOutput(\n",
- " output_name=output_name,\n",
- " app_managed=True,\n",
- " feature_store_output=FeatureStoreOutput(feature_group_name=feature_group_name),\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We already uploaded our flow file in the `02a` notebook. Here the Data Wrangler Flow is also provided to the Processing Job as an input source which we configure below."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "## Input - Flow: 01_music_dataprep.flow\n",
- "flow_input = ProcessingInput(\n",
- " source=flow_s3_uri,\n",
- " destination=\"/opt/ml/processing/flow\",\n",
- " input_name=\"flow\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Run Processing Job \n",
- "\n",
- "##### [back to top](#02c-nb)\n",
- "\n",
- "----\n",
- "## Job Configurations\n",
- "\n",
- " 💡 Configurable Settings \n",
- "\n",
- "You can configure the following settings for Processing Jobs. If you change any configurations you will \n",
- "need to re-execute this and all cells below it by selecting the Run menu above and click \n",
- "Run Selected Cells and All Below\n",
- "\n",
- "1. IAM role for executing the processing job. \n",
- "2. A unique name of the processing job. Give a unique name every time you re-execute processing jobs\n",
- "3. Data Wrangler Container URL.\n",
- "4. Instance count, instance type and storage volume size in GB.\n",
- "5. Content type for each output. Data Wrangler supports CSV as default and Parquet.\n",
- "6. Network Isolation settings\n",
- "
"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# IAM role for executing the processing job.\n",
- "iam_role = sagemaker.get_execution_role()\n",
- "\n",
- "processing_job_name = \"dw-flow-proc-music-rec-ratings-{}-{}\".format(flow_export_id, str(uuid.uuid4())[:8])\n",
- "print (f\"{processing_job_name}\")\n",
- "\n",
- "# Data Wrangler Container URL.\n",
- "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n",
- "\n",
- "# Processing Job Instance count and instance type.\n",
- "instance_count = 2\n",
- "instance_type = \"ml.m5.4xlarge\"\n",
- "\n",
- "# Size in GB of the EBS volume to use for storing data during processing\n",
- "volume_size_in_gb = 30\n",
- "\n",
- "# Content type for each output. Data Wrangler supports CSV as default and Parquet.\n",
- "output_content_type = \"CSV\"\n",
- "\n",
- "# Network Isolation mode; default is off\n",
- "enable_network_isolation = False\n",
- "\n",
- "# Output configuration used as processing job container arguments \n",
- "output_config = {\n",
- " output_name: {\n",
- " \"content_type\": output_content_type\n",
- " }\n",
- "}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Create Processing Job\n",
- "\n",
- "To launch a Processing Job, you will use the SageMaker Python SDK to create a Processor function."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import Processor\n",
- "from sagemaker.network import NetworkConfig\n",
- "import json\n",
- "\n",
- "processor = Processor(\n",
- " role=iam_role,\n",
- " image_uri=container_uri,\n",
- " instance_count=instance_count,\n",
- " instance_type=instance_type,\n",
- " volume_size_in_gb=volume_size_in_gb,\n",
- " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n",
- " sagemaker_session=sess\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Job Status & S3 Output Location\n",
- "\n",
- "Below you wait for processing job to finish. If it finishes successfully, your feature group should be populated \n",
- "with transformed feature values. In addition the raw parameters used by the Processing Job will be printed."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%time\n",
- "\n",
- "# Run Processing Job if job not already previously ran\n",
- "if feature_group_exists:\n",
- " print(\"Feature Group {0} already exists therefore we will not run a processing job to create it again\".format(feature_group_name))\n",
- "else:\n",
- " print(\"Creating Processing Job: {}\".format(feature_group_name))\n",
- " processor.run(\n",
- " inputs=[flow_input] + data_sources, \n",
- " outputs=[processing_job_output],\n",
- " arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n",
- " wait=False,\n",
- " logs=False,\n",
- " job_name=processing_job_name\n",
- " ) \n",
- " \n",
- " job_result = sess.wait_for_processing_job(processing_job_name)\n",
- " print(job_result)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "You can view newly created feature group in Studio, refer to [Use Amazon SageMaker Feature Store with Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/feature-store-use-with-studio.html)\n",
- "for detailed guide.[Learn more about SageMaker Feature Store](https://github.com/aws/amazon-sagemaker-examples/tree/master/sagemaker-featurestore)"
- ]
- }
- ],
- "metadata": {
- "instance_type": "ml.t3.medium",
- "kernelspec": {
- "display_name": "Python 3 (Data Science)",
- "language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/end_to_end/music_recommendation/03_train_model_lineage_registry_debugger.ipynb b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb
similarity index 96%
rename from end_to_end/music_recommendation/03_train_model_lineage_registry_debugger.ipynb
rename to end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb
index 713761b922..4463aad1c6 100644
--- a/end_to_end/music_recommendation/03_train_model_lineage_registry_debugger.ipynb
+++ b/end_to_end/music_recommendation/03_train_deploy_debugger_explain_monitor_registry.ipynb
@@ -4,16 +4,18 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "\n",
- "# Music Recommender Part 3: Train Model with Debugger Hooks and Set Artifacts and Register Model\n",
+ "# Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK\n",
"\n",
"----\n",
- "In this notebook, we'll train our model using the data we prepped with SageMaker Data Wrangler and stored in our Feature Store, attaching [SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) to the model training so that we can capture training metrics/statistics about the model. Then, we'll log more model artifacts using [SageMaker ML Lineage Tracking](https://docs.aws.amazon.com/sagemaker/latest/dg/lineage-tracking.html). Finally we'll register the model and save its version.\n",
"\n",
- "A machine learning training job can have problems such as system bottlenecks, overfitting, saturated activation functions, and vanishing gradients, which can compromise model performance. SageMaker Debugger profiles and debugs training jobs to help resolve such problems and improve your ML model's compute resource utilization and performance. Debugger offers tools to send alerts when training anomalies are found, take actions against the problems, and identify the root cause of them by visualizing collected metrics and tensors.\n",
+ "## Background\n",
+ "\n",
+ "This notebook is part of a notebook series that goes through the ML lifecycle and shows how we can build a Music Recommender System using a combination of SageMaker services and features. This notebook will train our model using the data we prepped with SageMaker Data Wrangler and stored in our Feature Store, attaching [SageMaker Debugger](https://docs.aws.amazon.com/sagemaker/latest/dg/train-debugger.html) to the model training so that we can capture training metrics/statistics about the model. Then, we will deploy the model and use SageMaker Explainability and Model Monitor to examine our deployed model. After that, we'll log more model artifacts using [SageMaker ML Lineage Tracking](https://docs.aws.amazon.com/sagemaker/latest/dg/lineage-tracking.html). Finally we'll register the model and save its version. It is one of two notebooks you choose to run as the third notebook in the series. You can choose to run this notebook by itself or in sequence with the other notebooks listed below. Please see the [README.md](README.md) for more information about this use case of this sequence of notebooks.\n",
+ "\n",
+ "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n",
+ "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)\n",
+ "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb) (current notebook)\n",
"\n",
- "Amazon SageMaker ML Lineage Tracking creates and stores information about the steps of a machine learning workflow from data preparation to model deployment. With the tracking information you can reproduce the workflow steps, track model and dataset lineage, and establish model governance and audit standards. \n",
"\n",
" 💡 Alert \n",
"\n",
@@ -21,31 +23,17 @@
"
\n",
"\n",
"----\n",
- "### Contents\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- " - [Fetch Data from Feature Store](#03-feature-store)\n",
- " - [Split Data and Save to S3](#03-split)\n",
- " - [Train Model](#03-train)\n",
- " - [SageMaker Debugger Reports](#03-debugger)\n",
- " - [Set Lineage Artifacts](#03-lineage)\n",
- " - [Register Model](#03-register)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Load stored variables\n",
- "If you ran this notebook before, you may want to re-use the resources you aready created with AWS. Run the cell below to load any prevously created variables. You should see a print-out of the existing variables. If you don't see anything you may need to create them again or it may be your first time running this notebook."
+ "## Contents\n",
+ "1. [Prereqs: Get Data](#Prereqs:-Get-Data)\n",
+ "1. [Train Model](#Train-Model)\n",
+ "1. [Deploy Model](#Deploy-Model)\n",
+ "1. [Create a Predictor](#Create-a-Predictor)\n",
+ "1. [Infer New Songs using Model](#Infer-New-Songs-using-Model)\n",
+ "1. [Explain Model Predictions](#Explain-Model-Predictions)\n",
+ "1. [View SageMaker Debugger Reports](#View-SageMaker-Debugger-Reports)\n",
+ "1. [SageMaker Model Monitor](#SageMaker-Model-Monitor)\n",
+ "1. [Register Model with SageMaker Model Registry](#Register-Model-with-SageMaker-Model-Registry)\n",
+ "1. [Clean Up](#Clean-Up)"
]
},
{
@@ -70,7 +58,10 @@
"import pandas as pd\n",
"import pathlib\n",
"import sagemaker\n",
- "import glob"
+ "import glob\n",
+ "import json\n",
+ "from datetime import datetime\n",
+ "import matplotlib.pyplot as plt"
]
},
{
@@ -81,30 +72,8 @@
"source": [
"import sys\n",
"import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
- "ps = ParameterStore(verbose=False)\n",
"\n",
- "parameters = ps.read('music-rec')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "dw_ecrlist = parameters['dw_ecrlist']\n",
- "fg_name_ratings = parameters['fg_name_ratings']\n",
- "fg_name_tracks = parameters['fg_name_tracks']\n",
- "fg_name_user_preferences = parameters['fg_name_user_preferences']\n",
- "flow_export_id = parameters['flow_export_id']\n",
- "flow_s3_uri = parameters['flow_s3_uri']\n",
- "pretrained_model_path = parameters['pretrained_model_path']\n",
- "prefix = parameters['prefix']\n",
- "bucket = parameters['bucket']\n",
- "ratings_data_source = parameters['ratings_data_source']\n",
- "tracks_data_source = parameters['tracks_data_source']"
+ "sys.path.insert(1, \"./code\")"
]
},
{
@@ -126,21 +95,33 @@
"metadata": {},
"outputs": [],
"source": [
+ "# SageMaker session\n",
+ "sess = sagemaker.Session()\n",
+ "# get session bucket name\n",
+ "bucket = sess.default_bucket()\n",
+ "# bucket prefix or the subfolder for everything we produce\n",
+ "prefix = \"music-recommendation\"\n",
+ "# get sagemaker role\n",
+ "sagemaker_role = sagemaker.get_execution_role()\n",
+ "# s3 client\n",
+ "s3_client = boto3.client(\"s3\")\n",
+ "\n",
"region = boto3.Session().region_name\n",
"boto_session = boto3.Session(region_name=region)\n",
"\n",
"\n",
- "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n",
+ "sagemaker_client = boto_session.client(service_name=\"sagemaker\", region_name=region)\n",
"sagemaker_session = sagemaker.session.Session(\n",
- " boto_session=boto_session,\n",
- " sagemaker_client=sagemaker_client\n",
+ " boto_session=boto_session, sagemaker_client=sagemaker_client\n",
+ ")\n",
+ "featurestore_runtime = boto_session.client(\n",
+ " service_name=\"sagemaker-featurestore-runtime\", region_name=region\n",
")\n",
- "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n",
"\n",
"feature_store_session = sagemaker.session.Session(\n",
" boto_session=boto_session,\n",
" sagemaker_client=sagemaker_client,\n",
- " sagemaker_featurestore_runtime_client=featurestore_runtime\n",
+ " sagemaker_featurestore_runtime_client=featurestore_runtime,\n",
")"
]
},
@@ -148,14 +129,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "\n",
- "## Fetch Data from Offline Feature Store\n",
- "\n",
- "##### [back to top](#03-nb)\n",
+ "## Prereqs: Get Data \n",
"\n",
"----\n",
- "There are 3 feature stores for the ratings, tracks, and user preferences data. We retrieve data from all 3 before joining them."
+ "\n",
+ "Here we will download the music data from a public S3 bucket that we'll be using for this demo and uploads it to your default S3 bucket that was created for you when you initially created a SageMaker Studio workspace. "
]
},
{
@@ -164,13 +142,7 @@
"metadata": {},
"outputs": [],
"source": [
- "from sagemaker.feature_store.feature_group import FeatureGroup\n",
- "\n",
- "feature_group_names = [fg_name_ratings, fg_name_tracks, fg_name_user_preferences]\n",
- "feature_groups = []\n",
- "for name in feature_group_names:\n",
- " feature_group = FeatureGroup(name=name, sagemaker_session=feature_store_session)\n",
- " feature_groups.append(feature_group)"
+ "from demo_helpers import get_data, get_model, update_data_sources"
]
},
{
@@ -179,13 +151,8 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_client = boto3.client('s3')\n",
- "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n",
- "\n",
- "sagemaker_role = sagemaker.get_execution_role()\n",
- "\n",
- "s3_output_path = 's3://' + bucket\n",
- "ps.add({'s3_output_path': s3_output_path}, namespace='music-rec')"
+ "# create data folder\n",
+ "!mkdir data"
]
},
{
@@ -194,36 +161,24 @@
"metadata": {},
"outputs": [],
"source": [
- "feature_group_s3_prefixes = []\n",
- "for feature_group in feature_groups:\n",
- " feature_group_table_name = feature_group.describe().get(\"OfflineStoreConfig\").get(\"DataCatalogConfig\").get(\"TableName\")\n",
- " feature_group_s3_prefix = f'{account_id}/sagemaker/{region}/offline-store/{feature_group_table_name}'\n",
- " feature_group_s3_prefixes.append(feature_group_s3_prefix)"
+ "# public S3 bucket that contains our music data\n",
+ "s3_bucket_music_data = \"s3://sagemaker-sample-files/datasets/tabular/synthetic-music\""
]
},
{
"cell_type": "code",
"execution_count": null,
- "metadata": {
- "scrolled": true
- },
+ "metadata": {},
"outputs": [],
"source": [
- "# wait for data to be added to offline feature store\n",
- "def wait_for_offline_store(feature_group_s3_prefix):\n",
- " print(feature_group_s3_prefix)\n",
- " offline_store_contents = None\n",
- " while (offline_store_contents is None):\n",
- " objects_in_bucket = s3_client.list_objects(Bucket=bucket, Prefix=feature_group_s3_prefix)\n",
- " if ('Contents' in objects_in_bucket and len(objects_in_bucket['Contents']) > 1):\n",
- " offline_store_contents = objects_in_bucket['Contents']\n",
- " else:\n",
- " print('Waiting for data in offline store...')\n",
- " time.sleep(60)\n",
- " print('Data available.')\n",
- " \n",
- "for s3_prefix in feature_group_s3_prefixes:\n",
- " wait_for_offline_store(s3_prefix)"
+ "new_data_paths = get_data(\n",
+ " s3_client,\n",
+ " [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"],\n",
+ " bucket,\n",
+ " prefix,\n",
+ " sample_data=0.70,\n",
+ ")\n",
+ "print(new_data_paths)"
]
},
{
@@ -232,29 +187,19 @@
"metadata": {},
"outputs": [],
"source": [
- "tables = { \n",
- " 'ratings': {'feature_group': feature_groups[0],\n",
- " 'cols': ['userid', 'trackid', 'rating']\n",
- " },\n",
- " 'tracks': {'feature_group': feature_groups[1],\n",
- " 'cols': ['trackid', 'length', 'energy', 'acousticness', 'valence', 'speechiness', 'instrumentalness', \n",
- " 'liveness', 'tempo', 'danceability', 'genre_latin', 'genre_folk', 'genre_blues', 'genre_rap', \n",
- " 'genre_reggae', 'genre_jazz', 'genre_rnb', 'genre_country', 'genre_electronic', 'genre_pop_rock']\n",
- " },\n",
- " 'user_5star_features': {'feature_group': feature_groups[2],\n",
- " 'cols': ['userid', 'energy_5star', 'acousticness_5star', 'valence_5star', 'speechiness_5star', 'instrumentalness_5star', \n",
- " 'liveness_5star','tempo_5star', 'danceability_5star', 'genre_latin_5star', 'genre_folk_5star', 'genre_blues_5star', \n",
- " 'genre_rap_5star','genre_reggae_5star', 'genre_jazz_5star', 'genre_rnb_5star', 'genre_country_5star', \n",
- " 'genre_electronic_5star', 'genre_pop_rock_5star']\n",
- " },\n",
- " }"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "check if the athena queres have been done and the data sets exist, then just do train test split or just proceed to training"
+ "files_to_download = [\n",
+ " f\"sample_tracks.csv\",\n",
+ " f\"sample_user.csv\",\n",
+ " f\"train_data_headers.csv\",\n",
+ " f\"train_data.zip\",\n",
+ " f\"val_data_headers.csv\",\n",
+ " f\"val_data.zip\",\n",
+ "]\n",
+ "\n",
+ "for file in files_to_download:\n",
+ " s3_client.download_file(\n",
+ " f\"sagemaker-sample-files\", f\"datasets/tabular/synthetic-music/{file}\", f\"./data/{file}\"\n",
+ " )"
]
},
{
@@ -263,38 +208,8 @@
"metadata": {},
"outputs": [],
"source": [
- "def get_train_val():\n",
- " for k, v in tables.items():\n",
- " query = v['feature_group'].athena_query()\n",
- " joined_cols = \", \".join(v['cols'])\n",
- " # limit number of datapoints for training time\n",
- " query_string = \"SELECT {} FROM \\\"{}\\\" LIMIT 500000\".format(joined_cols, query.table_name)\n",
- " print(query_string,'\\n')\n",
- "\n",
- " output_location = f's3://{bucket}/{prefix}/query_results/'\n",
- " query.run(query_string=query_string, output_location=output_location)\n",
- " query.wait()\n",
- "\n",
- " tables[k]['df'] = query.as_dataframe() \n",
- " \n",
- " ratings = tables['ratings']['df']\n",
- " tracks = tables['tracks']['df']\n",
- " user_prefs = tables['user_5star_features']['df']\n",
- " \n",
- " print('Merging datasets...')\n",
- " print(f'Ratings: {ratings.shape}\\nTracks: {tracks.shape}\\nUser Prefs: {user_prefs.shape}\\n')\n",
- " \n",
- " dataset = pd.merge(ratings, tracks, on='trackid', how='inner')\n",
- " dataset = pd.merge(dataset, user_prefs, on='userid', how='inner')\n",
- " dataset.drop_duplicates(inplace=True)\n",
- " dataset.drop(['userid', 'trackid'], axis=1, inplace=True)\n",
- "\n",
- " # split data\n",
- " from sklearn.model_selection import train_test_split\n",
- " train, val = train_test_split(dataset, test_size=0.2, random_state=42)\n",
- " print(\"Training dataset shape: {}\\nValidation dataset shape: {}\\n\".format(train.shape, val.shape))\n",
- "\n",
- " return train, val"
+ "! unzip './data/*.zip' -d './data'\n",
+ "! rm ./data/*.zip"
]
},
{
@@ -303,27 +218,16 @@
"metadata": {},
"outputs": [],
"source": [
- "%%time\n",
- "import pandas as pd\n",
- "\n",
- "# create the training data if it has not been created already\n",
- "if glob.glob('data/train_data.csv') and 'feature_names' in parameters:\n",
- " print('Using existing files: train_data.csv & val_data.csv')\n",
- " train = pd.read_csv('data/train_data.csv', names=[\"rating\"]+parameters['feature_names'])\n",
- " val = pd.read_csv('data/val_data.csv', names=[\"rating\"]+parameters['feature_names'])\n",
- "else:\n",
- " print('Creating training and validation sets...\\n')\n",
- " train, val = get_train_val()\n",
- " # Write to csv in S3 without headers and index column\n",
- " train.to_csv('./data/train_data.csv', header=False, index=False)\n",
- " val.to_csv('./data/val_data.csv', header=False, index=False)"
+ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n",
+ "tracks_data_source = f\"s3://{bucket}/{prefix}/tracks.csv\"\n",
+ "ratings_data_source = f\"s3://{bucket}/{prefix}/ratings.csv\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Save data to S3"
+ "### Save data to S3"
]
},
{
@@ -333,39 +237,33 @@
"outputs": [],
"source": [
"%%time\n",
- "s3_client.upload_file('data/train_data.csv', bucket, f'{prefix}/data/train/train_data.csv')\n",
- "s3_client.upload_file('data/val_data.csv', bucket, f'{prefix}/data/val/val_data.csv')\n",
+ "\n",
+ "train_headers = pd.read_csv(\"data/train_data_headers.csv\", header=None)[0].tolist()\n",
+ "val_headers = pd.read_csv(\"data/val_data_headers.csv\", header=None)[0].tolist()\n",
+ "train = pd.read_csv(\"data/train_data.csv\", names=train_headers)\n",
+ "val = pd.read_csv(\"data/val_data.csv\", names=val_headers)\n",
+ "\n",
+ "s3_client.upload_file(\"data/train_data.csv\", bucket, f\"{prefix}/data/train/train_data.csv\")\n",
+ "s3_client.upload_file(\"data/val_data.csv\", bucket, f\"{prefix}/data/val/val_data.csv\")\n",
"\n",
"\n",
- "train_data_uri = f's3://{bucket}/{prefix}/data/train/train_data.csv'\n",
- "val_data_uri = f's3://{bucket}/{prefix}/data/val/val_data.csv'\n",
- "print (f\"Saving training data to {train_data_uri}\")\n",
+ "train_data_uri = f\"s3://{bucket}/{prefix}/data/train/train_data.csv\"\n",
+ "val_data_uri = f\"s3://{bucket}/{prefix}/data/val/val_data.csv\"\n",
+ "print(f\"Saving training data to {train_data_uri}\")\n",
"\n",
"# configure data inputs for SageMaker training\n",
"from sagemaker.inputs import TrainingInput\n",
+ "\n",
"train_input = TrainingInput(train_data_uri, content_type=\"text/csv\")\n",
"val_input = TrainingInput(val_data_uri, content_type=\"text/csv\")"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "ps.add({'train_data_uri': train_data_uri, 'val_data_uri': val_data_uri}, namespace='music-rec')"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "\n",
"## Train Model \n",
"\n",
- "##### [back to top](#03-nb)\n",
- "\n",
"----"
]
},
@@ -387,12 +285,12 @@
"outputs": [],
"source": [
"# variables used for parameterizing the notebook run\n",
- "estimator_output_path = f's3://{bucket}/{prefix}/training_jobs'\n",
+ "estimator_output_path = f\"s3://{bucket}/{prefix}/training_jobs\"\n",
"train_instance_count = 2\n",
- "train_instance_type = 'ml.m5.4xlarge'\n",
+ "train_instance_type = \"ml.m5.4xlarge\"\n",
"save_interval = 2\n",
"image = sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\")\n",
- "model_name = 'music-recommendation-model'\n",
+ "model_name = \"music-rec-model-{}\".format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n",
"\n",
"hyperparameters = {\n",
" \"max_depth\": \"4\",\n",
@@ -414,39 +312,18 @@
" instance_type=train_instance_type,\n",
" image_uri=image,\n",
" hyperparameters=hyperparameters,\n",
- " base_job_name=model_name,\n",
" output_path=estimator_output_path,\n",
- " \n",
" debugger_hook_config=DebuggerHookConfig(\n",
- " s3_output_path=estimator_output_path+'/debugger', \n",
+ " s3_output_path=estimator_output_path + \"/debugger\",\n",
" collection_configs=[\n",
+ " CollectionConfig(name=\"metrics\", parameters={\"save_interval\": str(save_interval)}),\n",
" CollectionConfig(\n",
- " name=\"metrics\",\n",
- " parameters={\n",
- " \"save_interval\": str(save_interval)\n",
- " }\n",
- " ),\n",
- " CollectionConfig(\n",
- " name=\"feature_importance\",\n",
- " parameters={\n",
- " \"save_interval\": str(save_interval)\n",
- " }\n",
- " ),\n",
- " CollectionConfig(\n",
- " name=\"full_shap\",\n",
- " parameters={\n",
- " \"save_interval\": str(save_interval)\n",
- " }\n",
- " ),\n",
- " CollectionConfig(\n",
- " name=\"average_shap\",\n",
- " parameters={\n",
- " \"save_interval\": str(save_interval)\n",
- " }\n",
+ " name=\"feature_importance\", parameters={\"save_interval\": str(save_interval)}\n",
" ),\n",
+ " CollectionConfig(name=\"full_shap\", parameters={\"save_interval\": str(save_interval)}),\n",
+ " CollectionConfig(name=\"average_shap\", parameters={\"save_interval\": str(save_interval)}),\n",
" ],\n",
" ),\n",
- "\n",
" rules=[\n",
" Rule.sagemaker(\n",
" rule_configs.loss_not_decreasing(),\n",
@@ -474,10 +351,7 @@
"outputs": [],
"source": [
"response = sagemaker_client.list_training_jobs(\n",
- " NameContains = model_name,\n",
- " StatusEquals = 'Completed',\n",
- " SortBy='CreationTime',\n",
- " SortOrder='Descending'\n",
+ " NameContains=model_name, StatusEquals=\"Completed\", SortBy=\"CreationTime\", SortOrder=\"Descending\"\n",
")"
]
},
@@ -491,22 +365,18 @@
"source": [
"%%time\n",
"\n",
- "train_model = True # True if training a new model, False if wanting to use an existing estimator once you've already trained\n",
+ "train_model = True # True if training a new model, False if wanting to use an existing estimator once you've already trained\n",
"\n",
"if train_model:\n",
- " print('Training the model')\n",
- " xgb_estimator.fit(inputs = {'train': train_input, 'validation': val_input})\n",
+ " print(\"Training the model\")\n",
+ " xgb_estimator.fit(inputs={\"train\": train_input, \"validation\": val_input}, job_name=model_name)\n",
" s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()\n",
- " ps.add({'s3_debugger_output_path': s3_debugger_output_path}, namespace='music-rec')\n",
- " ps.store()\n",
- "elif len(response['TrainingJobSummaries']) > 0:\n",
- " training_job_name = response['TrainingJobSummaries'][0]['TrainingJobName']\n",
+ "elif len(response[\"TrainingJobSummaries\"]) > 0:\n",
+ " training_job_name = response[\"TrainingJobSummaries\"][0][\"TrainingJobName\"]\n",
" xgb_estimator = Estimator.attach(training_job_name)\n",
- " parameters = ps.read('music-rec')\n",
- " s3_debugger_output_path = parameters['s3_debugger_output_path']\n",
- " print(f'Using estimator from completed training job: {training_job_name}\\nwith debugger path {s3_debugger_output_path}')\n",
+ " s3_debugger_output_path = xgb_estimator.latest_job_debugger_artifacts_path()\n",
"else:\n",
- " print(\"No existing estimator found. You'll need to run as train = True\")\n"
+ " print(\"No existing estimator found. You'll need to run as train = True\")"
]
},
{
@@ -516,10 +386,7 @@
"outputs": [],
"source": [
"training_job_name = xgb_estimator.latest_training_job.job_name\n",
- "print(training_job_name)\n",
- "\n",
- "ps.add({'training_job_name': training_job_name}, namespace='music-rec')\n",
- "ps.store()"
+ "print(training_job_name)"
]
},
{
@@ -529,6 +396,7 @@
"outputs": [],
"source": [
"import pprint\n",
+ "\n",
"training_job_info = sagemaker_client.describe_training_job(TrainingJobName=training_job_name)\n",
"pprint.pprint(f\"{training_job_info}\")"
]
@@ -537,15 +405,331 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "## Deploy Model\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "endpoint_name = \"music-rec-endpoint-{}\".format(datetime.now().strftime(\"%Y%m%d-%H%M%S\"))\n",
+ "print(endpoint_name)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "endpoint_list = sagemaker_client.list_endpoints(\n",
+ " SortBy=\"CreationTime\",\n",
+ " SortOrder=\"Descending\",\n",
+ " NameContains=endpoint_name,\n",
+ " StatusEquals=\"InService\",\n",
+ ")\n",
+ "endpoint_list"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Create endpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "if len(endpoint_list[\"Endpoints\"]) > 0:\n",
+ " print(f\"Using existing endpoint: {endpoint_list['Endpoints'][0]['EndpointName']}\")\n",
+ "else:\n",
+ " # deploy endpoint for model if it doesn't already exist\n",
+ " xgb_estimator.deploy(\n",
+ " initial_instance_count=1, instance_type=\"ml.m4.xlarge\", endpoint_name=endpoint_name\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Create a Predictor\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictor = sagemaker.predictor.Predictor(\n",
+ " endpoint_name=endpoint_name, sagemaker_session=sagemaker_session\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Pull user data from feature group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_user = pd.read_csv(\"./data/sample_user.csv\")\n",
+ "df_user = df_user.set_index(\"FeatureName\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Pull sample of 1000 tracks from feature group"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_tracks = pd.read_csv(\"./data/sample_tracks.csv\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "feature_names = pd.read_csv(\"data/train_data_headers.csv\", header=None)[0].tolist()[1:]\n",
+ "data = (\n",
+ " df_tracks.assign(key=1)\n",
+ " .merge(pd.DataFrame(df_user[\"ValueAsString\"]).T.assign(key=1), on=\"key\")\n",
+ " .drop(\"key\", axis=1)\n",
+ ")\n",
+ "data.columns = [c.lower() for c in data.columns]\n",
+ "inference_df = data[feature_names]\n",
+ "inference_df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Format the datapoint\n",
+ "The datapoint must match the exact input format as the model was trained--with all features in the correct order. In this example, the `col_order` variable was saved when you created the train and test datasets earlier in the guide."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data_inputs = [\",\".join([str(i) for i in row]) for row in inference_df.values]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Infer New Songs using Model\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "predictions = []\n",
+ "for data_input in data_inputs:\n",
+ " results = predictor.predict(data_input, initial_args={\"ContentType\": \"text/csv\"})\n",
+ " prediction = json.loads(results)\n",
+ " predictions.append(prediction)\n",
+ "print(f\"Predicted rating for sample user:\", prediction)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write to csv in S3 without headers and index column.\n",
+ "inference_df[\"rating\"] = predictions\n",
+ "inference_df = inference_df[[\"rating\"] + feature_names]\n",
+ "inference_df.to_csv(\"data/prediction_data.csv\", header=False, index=False)\n",
"\n",
- "## View SageMaker Debugger Reports\n",
+ "s3_client.upload_file(\"data/prediction_data.csv\", bucket, f\"{prefix}/data/pred/prediction_data.csv\")\n",
"\n",
- "##### [back to top](#03-nb)\n",
+ "pred_data_uri = f\"s3://{bucket}/{prefix}/data/pred/prediction_data.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_client.download_file(bucket, f\"{prefix}/data/train/train_data.csv\", f\"train_data.csv\")\n",
+ "df_train = pd.read_csv(\"train_data.csv\")\n",
"\n",
+ "label = \"rating\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Explain Model Predictions\n",
"----"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = xgb_estimator.create_model(name=model_name)\n",
+ "container_def = model.prepare_container_def()\n",
+ "sess.create_model(model_name, sagemaker_role, container_def)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "explainability_output_path = f\"s3://{bucket}/{prefix}/clarify-output/explainability\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "clarify_processor = sagemaker.clarify.SageMakerClarifyProcessor(\n",
+ " role=sagemaker_role,\n",
+ " instance_count=1,\n",
+ " instance_type=\"ml.c4.xlarge\",\n",
+ " sagemaker_session=sagemaker_session,\n",
+ ")\n",
+ "\n",
+ "model_config = sagemaker.clarify.ModelConfig(\n",
+ " model_name=model_name, instance_type=\"ml.m4.xlarge\", instance_count=1, accept_type=\"text/csv\"\n",
+ ")\n",
+ "\n",
+ "shap_config = sagemaker.clarify.SHAPConfig(\n",
+ " baseline=[\n",
+ " df_train.median().values[1:].tolist()\n",
+ " ], # ignore the first column since that is that target\n",
+ " num_samples=100,\n",
+ " agg_method=\"mean_abs\",\n",
+ ")\n",
+ "\n",
+ "explainability_data_config = sagemaker.clarify.DataConfig(\n",
+ " s3_data_input_path=pred_data_uri,\n",
+ " s3_output_path=explainability_output_path,\n",
+ " label=label,\n",
+ " headers=[label] + feature_names,\n",
+ " dataset_type=\"text/csv\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "try:\n",
+ " s3_client.download_file(\n",
+ " Bucket=bucket,\n",
+ " Key=f\"{prefix}/clarify-output/explainability/explanations_shap/out.csv\",\n",
+ " Filename=\"data/shap_output.csv\",\n",
+ " )\n",
+ " print(\"Downloaded output from previous explainability job\")\n",
+ "except Exception as e:\n",
+ " error = e.response.get(\"Error\").get(\"Code\")\n",
+ " if error == \"404\":\n",
+ " print(\"Running explainability job\")\n",
+ " clarify_processor.run_explainability(\n",
+ " data_config=explainability_data_config,\n",
+ " model_config=model_config,\n",
+ " explainability_config=shap_config,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "inference_df[\"trackid\"] = data[\"trackid\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "playlist_length = 10 # number of songs to recommend in playlist\n",
+ "playlist = inference_df.sort_values(by=\"rating\", ascending=False).head(playlist_length)\n",
+ "print(\"Curated Playlist:\\n\", playlist[\"trackid\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s3_client.download_file(\n",
+ " bucket, f\"{prefix}/clarify-output/explainability/explanations_shap/out.csv\", f\"out.csv\"\n",
+ ")\n",
+ "local_explanations_out = pd.read_csv(\"out.csv\")\n",
+ "local_explanations_out.columns = feature_names\n",
+ "\n",
+ "print(\"Model prediction:\", playlist.iloc[0, 0])\n",
+ "plt.figure(figsize=(12, 6))\n",
+ "local_explanations_out.iloc[0].sort_values().plot.barh(title=\"Local explanation for prediction\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## View SageMaker Debugger Reports\n",
+ "\n",
+ "----\n",
+ "\n",
+ "A machine learning training job can have problems such as system bottlenecks, overfitting, saturated activation functions, and vanishing gradients, which can compromise model performance. SageMaker Debugger profiles and debugs training jobs to help resolve such problems and improve your ML model's compute resource utilization and performance. Debugger offers tools to send alerts when training anomalies are found, take actions against the problems, and identify the root cause of them by visualizing collected metrics and tensors.. \n"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -567,8 +751,8 @@
"try:\n",
" trial = create_trial(s3_debugger_output_path)\n",
"except:\n",
- " parameters = ps.read('music-rec')\n",
- " s3_debugger_output_path = parameters['s3_debugger_output_path']\n",
+ " parameters = ps.read(\"music-rec\")\n",
+ " s3_debugger_output_path = parameters[\"s3_debugger_output_path\"]\n",
" trial = create_trial(s3_debugger_output_path)"
]
},
@@ -578,11 +762,8 @@
"metadata": {},
"outputs": [],
"source": [
- "feature_names = list(train.drop('rating', axis=1).columns)\n",
- "print(feature_names)\n",
- "\n",
- "ps.add({'feature_names': feature_names}, namespace='music-rec')\n",
- "ps.store()"
+ "feature_names = list(train.drop(\"rating\", axis=1).columns)\n",
+ "print(feature_names)"
]
},
{
@@ -597,6 +778,7 @@
"\n",
"MAX_PLOTS = 35\n",
"\n",
+ "\n",
"def get_data(trial, tname):\n",
" \"\"\"\n",
" For the given tensor name, walks though all the iterations\n",
@@ -608,27 +790,29 @@
" vals = [tensor.value(s) for s in steps]\n",
" return steps, vals\n",
"\n",
+ "\n",
"def match_tensor_name_with_feature_name(tensor_name, feature_names=feature_names):\n",
" feature_tag = tensor_name.split(\"/\")\n",
" for ifeat, feature_name in enumerate(feature_names):\n",
- " if feature_tag[-1]==\"f{}\".format(str(ifeat)): return feature_name\n",
+ " if feature_tag[-1] == \"f{}\".format(str(ifeat)):\n",
+ " return feature_name\n",
" return tensor_name\n",
"\n",
"\n",
- "def plot_collection(trial, collection_name, regex='.*', figsize=(8, 6)):\n",
+ "def plot_collection(trial, collection_name, regex=\".*\", figsize=(8, 6)):\n",
" \"\"\"\n",
- " Takes a `trial` and a collection name, and \n",
+ " Takes a `trial` and a collection name, and\n",
" plots all tensors that match the given regex.\n",
" \"\"\"\n",
" fig, ax = plt.subplots(figsize=figsize)\n",
- " tensors = (trial.collection(collection_name).tensor_names)\n",
+ " tensors = trial.collection(collection_name).tensor_names\n",
" matched_tensors = [t for t in tensors if re.match(regex, t)]\n",
" for tensor_name in islice(matched_tensors, MAX_PLOTS):\n",
" steps, data = get_data(trial, tensor_name)\n",
" ax.plot(steps, data, label=match_tensor_name_with_feature_name(tensor_name))\n",
"\n",
- " ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))\n",
- " ax.set_xlabel('Iteration')"
+ " ax.legend(loc=\"center left\", bbox_to_anchor=(1, 0.5))\n",
+ " ax.set_xlabel(\"Iteration\")"
]
},
{
@@ -647,15 +831,10 @@
"outputs": [],
"source": [
"def plot_feature_importance(trial, importance_type=\"weight\"):\n",
- " SUPPORTED_IMPORTANCE_TYPES = [\n",
- " \"weight\", \"gain\", \"cover\", \"total_gain\", \"total_cover\"]\n",
+ " SUPPORTED_IMPORTANCE_TYPES = [\"weight\", \"gain\", \"cover\", \"total_gain\", \"total_cover\"]\n",
" if importance_type not in SUPPORTED_IMPORTANCE_TYPES:\n",
- " raise ValueError(\n",
- " f\"{importance_type} is not one of the supported importance types.\")\n",
- " plot_collection(\n",
- " trial,\n",
- " \"feature_importance\",\n",
- " regex=f\"feature_importance/{importance_type}/.*\")"
+ " raise ValueError(f\"{importance_type} is not one of the supported importance types.\")\n",
+ " plot_collection(trial, \"feature_importance\", regex=f\"feature_importance/{importance_type}/.*\")"
]
},
{
@@ -723,7 +902,7 @@
"shap_values = trial.tensor(\"full_shap/f0\").value(trial.last_complete_step)\n",
"shap_no_base = shap_values[:, :-1]\n",
"shap_base_value = shap_values[0, -1]\n",
- "shap.summary_plot(shap_no_base, plot_type='bar', feature_names=feature_names)"
+ "shap.summary_plot(shap_no_base, plot_type=\"bar\", feature_names=feature_names)"
]
},
{
@@ -765,21 +944,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "\n",
- "## Examine Lineage\n",
- "\n",
- "##### [back to top](#03-nb)\n",
+ "## SageMaker Model Monitor\n",
"\n",
- "----\n",
- "Though you already know the training job details from above, if we were just given the model uri, we could use SageMaker Lineage to produce the training job details which produced the model."
+ "### Step 1: Enable real-time inference data capture\n",
+ "----"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Data Lineage and Metrics for Best Model"
+ "To enable data capture for monitoring the model data quality, you specify the new capture option called `DataCaptureConfig`. You can capture the request payload, the response payload or both with this configuration. The capture config applies to all variants. Please provide the Endpoint name in the following cell:"
]
},
{
@@ -788,14 +963,85 @@
"metadata": {},
"outputs": [],
"source": [
- "from sagemaker.lineage import context, artifact, association, action"
+ "from sagemaker.model_monitor import DataCaptureConfig\n",
+ "\n",
+ "# Please fill in the following for enabling data capture\n",
+ "s3_capture_upload_path = f\"s3://{bucket}/{prefix}/endpoint-data-capture/\" # example: s3://bucket-name/path/to/endpoint-data-capture/\n",
+ "\n",
+ "#####\n",
+ "## IMPORTANT\n",
+ "##\n",
+ "## Please make sure to add the \"s3:PutObject\" permission to the \"role' you provided in the SageMaker Model\n",
+ "## behind this Endpoint. Otherwise, Endpoint data capture will not work.\n",
+ "##\n",
+ "#####"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "# Change parameters as you would like - adjust sampling percentage,\n",
+ "# chose to capture request or response or both\n",
+ "data_capture_config = DataCaptureConfig(\n",
+ " enable_capture=True,\n",
+ " sampling_percentage=25,\n",
+ " destination_s3_uri=s3_capture_upload_path,\n",
+ " kms_key_id=None,\n",
+ " capture_options=[\"REQUEST\", \"RESPONSE\"],\n",
+ " csv_content_types=[\"text/csv\"],\n",
+ " json_content_types=[\"application/json\"],\n",
+ ")\n",
+ "\n",
+ "# Now it is time to apply the new configuration and wait for it to be applied\n",
+ "predictor.update_data_capture_config(data_capture_config=data_capture_config)\n",
+ "sess.wait_for_endpoint(endpoint=endpoint_name)"
+ ]
+ },
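+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Optional sketch: once the endpoint has served some traffic with capture enabled, the captured records land as JSON Lines files under `s3_capture_upload_path/<endpoint-name>/<variant-name>/YYYY/MM/DD/HH/`. The cell below simply lists a few of those objects; it reuses the `s3_client`, `bucket`, `prefix`, and `endpoint_name` variables defined earlier and prints nothing until capture files exist."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional: list a few captured inference records (empty until traffic has been captured)\n",
+ "capture_prefix = f\"{prefix}/endpoint-data-capture/{endpoint_name}\"\n",
+ "captured = s3_client.list_objects_v2(Bucket=bucket, Prefix=capture_prefix, MaxKeys=10)\n",
+ "for obj in captured.get(\"Contents\", []):\n",
+ "    print(obj[\"Key\"])"
+ ]
+ },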
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Before you proceed:\n",
+ "Currently SageMaker supports monitoring Endpoints out of the box only for **tabular (csv, flat-json)** datasets. If your Endpoint uses some other datasets, these following steps will NOT work for you.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Training data artifact"
+ "### Step 2: Model Monitor - Baselining\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In addition to collecting the data, SageMaker allows you to monitor and evaluate the data observed by the Endpoints. For this :\n",
+ "1. We need to create a baseline with which we compare the realtime traffic against. \n",
+ "1. Once a baseline is ready, we can setup a schedule to continously evaluate/compare against the baseline."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Constraint suggestion with baseline/training dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The training dataset with which you trained the model is usually a good baseline dataset. Note that the training dataset's data schema and the inference dataset schema should exactly match (i.e. number and order of the features).\n",
+ "\n",
+ "Using our training dataset, we'll ask SageMaker to suggest a set of baseline constraints and generate descriptive statistics to explore the data."
]
},
{
@@ -804,34 +1050,81 @@
"metadata": {},
"outputs": [],
"source": [
- "data_artifact_list = []\n",
- "for data_input in training_job_info['InputDataConfig']:\n",
- " channel = data_input['ChannelName']\n",
- " data_s3_uri = data_input['DataSource']['S3DataSource']['S3Uri']\n",
+ "##'s3://bucketname/path/to/baseline/data' - Where your validation data is\n",
+ "baseline_data_uri = val_data_uri\n",
+ "##'s3://bucketname/path/to/baseline/data' - Where the results are to be stored in\n",
+ "baseline_results_uri = f\"s3://{bucket}/{prefix}/baseline/results\"\n",
"\n",
- " matching_artifacts = list(artifact.Artifact.list(\n",
- " source_uri=data_s3_uri,\n",
- " sagemaker_session=sagemaker_session)\n",
- " )\n",
- " \n",
- " if matching_artifacts:\n",
- " data_artifact = matching_artifacts[0]\n",
- " print(f'Using existing artifact: {data_artifact.artifact_arn}')\n",
- " else:\n",
- " data_artifact = artifact.Artifact.create(\n",
- " artifact_name=channel,\n",
- " source_uri=data_s3_uri,\n",
- " artifact_type='Dataset',\n",
- " sagemaker_session=sagemaker_session)\n",
- " print(f'Create artifact {data_artifact.artifact_arn}: SUCCESSFUL')\n",
- " data_artifact_list.append(data_artifact)"
+ "print(\"Baseline data uri: {}\".format(baseline_data_uri))\n",
+ "print(\"Baseline results uri: {}\".format(baseline_results_uri))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Create a baselining job with the validation dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have the training data ready in S3, let's kick off a job to `suggest` constraints. `DefaultModelMonitor.suggest_baseline(..)` kicks off a `ProcessingJob` using a SageMaker provided Model Monitor container to generate the constraints. Please edit the configurations to fit your needs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "from sagemaker.model_monitor import DefaultModelMonitor\n",
+ "from sagemaker.model_monitor.dataset_format import DatasetFormat\n",
+ "from sagemaker import get_execution_role\n",
+ "import datetime\n",
+ "\n",
+ "role = get_execution_role(sagemaker_session=sess)\n",
+ "\n",
+ "datetime_stamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\")\n",
+ "\n",
+ "my_default_monitor = DefaultModelMonitor(\n",
+ " role=role,\n",
+ " instance_count=2,\n",
+ " instance_type=\"ml.m5.xlarge\",\n",
+ " volume_size_in_gb=20,\n",
+ " max_runtime_in_seconds=1800,\n",
+ " base_job_name=f\"{prefix}-monitor-{datetime_stamp}\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "\n",
+ "monitor_baseline = my_default_monitor.suggest_baseline(\n",
+ " baseline_dataset=baseline_data_uri,\n",
+ " dataset_format=DatasetFormat.csv(header=False),\n",
+ " output_s3_uri=baseline_results_uri,\n",
+ " job_name=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n",
+ " wait=True,\n",
+ ")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Model artifact"
+ "#### Exploratory Analysis of the Processing Jobs underlying SageMaker Monitor\n",
+ "In this short section [next few cells] we will be showing you how to further view the underlying jobs for the monitoring job"
]
},
{
@@ -840,30 +1133,36 @@
"metadata": {},
"outputs": [],
"source": [
- "trained_model_s3_uri = training_job_info['ModelArtifacts']['S3ModelArtifacts']\n",
+ "from time import gmtime, strftime\n",
+ "import boto3\n",
"\n",
- "matching_artifacts = list(artifact.Artifact.list(\n",
- " source_uri=trained_model_s3_uri,\n",
- " sagemaker_session=sagemaker_session)\n",
- ")\n",
+ "client = boto3.client(\"sagemaker\")\n",
"\n",
- "if matching_artifacts:\n",
- " model_artifact = matching_artifacts[0]\n",
- " print(f'Using existing artifact: {model_artifact.artifact_arn}')\n",
- "else:\n",
- " model_artifact = artifact.Artifact.create(\n",
- " artifact_name='TrainedModel',\n",
- " source_uri=trained_model_s3_uri,\n",
- " artifact_type='Model',\n",
- " sagemaker_session=sagemaker_session)\n",
- " print(f'Create artifact {model_artifact.artifact_arn}: SUCCESSFUL')"
+ "\n",
+ "def get_last_processing_job():\n",
+ "\n",
+ " response = client.list_processing_jobs(\n",
+ " NameContains=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n",
+ " StatusEquals=\"Completed\",\n",
+ " SortBy=\"CreationTime\",\n",
+ " SortOrder=\"Descending\",\n",
+ " MaxResults=20,\n",
+ " )\n",
+ " pprint.pprint(response[\"ProcessingJobSummaries\"][0])\n",
+ " return response[\"ProcessingJobSummaries\"][0][\"ProcessingJobName\"]"
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": null,
"metadata": {},
+ "outputs": [],
"source": [
- "#### Set artifact associations"
+ "from sagemaker.processing import ProcessingJob\n",
+ "from sagemaker.estimator import Estimator\n",
+ "from sagemaker.model_monitor.model_monitoring import ModelMonitor\n",
+ "\n",
+ "my_default_monitor_name = get_last_processing_job()"
]
},
{
@@ -872,15 +1171,17 @@
"metadata": {},
"outputs": [],
"source": [
- "trial_component = sagemaker_client.describe_trial_component(TrialComponentName=training_job_info['TrainingJobName']+'-aws-training-job')\n",
- "trial_component_arn = trial_component['TrialComponentArn']"
+ "my_default_monitor_reload = ProcessingJob.from_processing_name(sess, my_default_monitor_name)\n",
+ "\n",
+ "response = client.describe_processing_job(ProcessingJobName=my_default_monitor_name)\n",
+ "pprint.pprint(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### Store artifacts"
+ "#### Explore the generated constraints and statistics"
]
},
{
@@ -889,35 +1190,90 @@
"metadata": {},
"outputs": [],
"source": [
- "artifact_list = data_artifact_list + [model_artifact]\n",
+ "import pandas as pd\n",
"\n",
- "for artif in artifact_list:\n",
- " if artif.artifact_type == 'Dataset':\n",
- " assoc = 'ContributedTo'\n",
- " else:\n",
- " assoc = 'Produced'\n",
- " try:\n",
- " association.Association.create(\n",
- " source_arn=artif.artifact_arn,\n",
- " destination_arn=trial_component_arn,\n",
- " association_type=assoc,\n",
- " sagemaker_session=sagemaker_session)\n",
- " print(f\"Association with {artif.artifact_type}: SUCCESSFUL\")\n",
- " except:\n",
- " print(f\"Association already exists with {artif.artifact_type}\")"
+ "baseline_job = my_default_monitor.latest_baselining_job\n",
+ "schema_df = pd.io.json.json_normalize(baseline_job.baseline_statistics().body_dict[\"features\"])\n",
+ "schema_df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "constraints_df = pd.io.json.json_normalize(\n",
+ " baseline_job.suggested_constraints().body_dict[\"features\"]\n",
+ ")\n",
+ "constraints_df.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Before proceeding to enable monitoring, you could chose to edit the constraint file as required to fine tune the constraints."
+ ]
+ },
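+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a minimal illustration (not required for the rest of this notebook), the next cell loads the suggested constraints, relaxes the completeness requirement for one feature, and writes the edited document to a local file. The feature name used here is a placeholder; to have a monitoring schedule use the edited constraints you would upload the file back to S3 and reference it when creating the schedule."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "# Sketch only: relax the completeness constraint for one (placeholder) feature\n",
+ "edited_constraints = my_default_monitor.suggested_constraints().body_dict\n",
+ "for feature in edited_constraints[\"features\"]:\n",
+ "    if feature[\"name\"] == \"danceability\":  # placeholder feature name\n",
+ "        feature[\"completeness\"] = 0.9  # tolerate up to 10% missing values\n",
+ "\n",
+ "with open(\"constraints_edited.json\", \"w\") as f:\n",
+ "    json.dump(edited_constraints, f, indent=2)"
+ ]
+ },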
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Step 3: Enable continous monitoring\n",
+ "----\n",
+ "\n",
+ "We have collected the data above, here we proceed to analyze and monitor the data with MonitoringSchedules."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Create a schedule"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "We are ready to create a model monitoring schedule for the Endpoint created earlier with the baseline resources (constraints and statistics)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sagemaker.model_monitor import CronExpressionGenerator\n",
+ "import datetime as datetime\n",
+ "from time import gmtime, strftime\n",
"\n",
- "## Model Registry\n",
"\n",
- "##### [back to top](#03-nb)\n",
+ "mon_schedule_name = \"music-rec-monitor-schedule-{}\".format(\n",
+ " datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\")\n",
+ ")\n",
+ "s3_report_path = f\"s3://{bucket}/{prefix}/monitor/report\"\n",
"\n",
- "----"
+ "try:\n",
+ " my_default_monitor.create_monitoring_schedule(\n",
+ " monitor_schedule_name=mon_schedule_name,\n",
+ " endpoint_input=endpoint_name,\n",
+ " output_s3_uri=s3_report_path,\n",
+ " statistics=my_default_monitor.baseline_statistics(),\n",
+ " constraints=my_default_monitor.suggested_constraints(),\n",
+ " schedule_cron_expression=CronExpressionGenerator.daily(),\n",
+ " enable_cloudwatch_metrics=True,\n",
+ " )\n",
+ " print(f\"Created monitoring schedule {mon_schedule_name}\")\n",
+ "except:\n",
+ " my_default_monitor.update_monitoring_schedule(\n",
+ " endpoint_input=endpoint_name,\n",
+ " schedule_cron_expression=CronExpressionGenerator.daily(),\n",
+ " enable_cloudwatch_metrics=True,\n",
+ " )\n",
+ " print(f\"Updated monitoring schedule {my_default_monitor.monitoring_schedule_name}\")"
]
},
{
@@ -926,23 +1282,57 @@
"metadata": {},
"outputs": [],
"source": [
- "mpg_name = prefix+'-notebooks'\n",
+ "import time\n",
"\n",
- "ps.add({'mpg_name':mpg_name}, namespace='music-rec')\n",
+ "desc_schedule_result = my_default_monitor.describe_schedule()\n",
+ "while desc_schedule_result[\"MonitoringScheduleStatus\"] != \"Scheduled\":\n",
+ " print(\"Schedule status: {}\".format(desc_schedule_result[\"MonitoringScheduleStatus\"]))\n",
+ " desc_schedule_result = my_default_monitor.describe_schedule()\n",
+ " time.sleep(30)\n",
+ "print(\"Schedule status: {}\".format(desc_schedule_result[\"MonitoringScheduleStatus\"]))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### All set\n",
+ "Now that your monitoring schedule has been created. Please return to the Amazon SageMaker Studio to list the executions for this Schedule and observe the results going forward."
+ ]
+ },
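+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The sketch below lists any monitoring executions recorded so far; with a daily cron expression the list stays empty until the first scheduled run has completed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Optional: list monitoring executions (empty until the first scheduled run completes)\n",
+ "executions = my_default_monitor.list_executions()\n",
+ "print(f\"Found {len(executions)} monitoring execution(s) so far\")\n",
+ "for execution in executions[-3:]:\n",
+ "    pprint.pprint(execution.describe())"
+ ]
+ },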
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Register Model with SageMaker Model Registry\n",
"\n",
+ "Amazon SageMaker ML Lineage Tracking creates and stores information about the steps of a machine learning workflow from data preparation to model deployment. With the tracking information you can reproduce the workflow steps, track model and dataset lineage, and establish model governance and audit standards\n",
"\n",
- "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)['ModelPackageSummaryList']\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mpg_name = prefix + \"-notebooks\"\n",
+ "\n",
+ "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)[\n",
+ " \"ModelPackageSummaryList\"\n",
+ "]\n",
"\n",
"if model_packages:\n",
- " print(f'Using existing Model Package Group: {mpg_name}')\n",
+ " print(f\"Using existing Model Package Group: {mpg_name}\")\n",
"else:\n",
" mpg_input_dict = {\n",
- " 'ModelPackageGroupName': mpg_name,\n",
- " 'ModelPackageGroupDescription': 'Music Recommendation Models'\n",
+ " \"ModelPackageGroupName\": mpg_name,\n",
+ " \"ModelPackageGroupDescription\": \"Music Recommendation Models\",\n",
" }\n",
"\n",
" mpg_response = sagemaker_client.create_model_package_group(**mpg_input_dict)\n",
- " print(f'Create Model Package Group {mpg_name}: SUCCESSFUL')"
+ " print(f\"Create Model Package Group {mpg_name}: SUCCESSFUL\")"
]
},
{
@@ -953,29 +1343,32 @@
},
"outputs": [],
"source": [
- "sys.path.insert(1, './code')\n",
+ "sys.path.insert(1, \"./code\")\n",
"from inference_specification import InferenceSpecification\n",
"\n",
- "model_uri = training_job_info.get('ModelArtifacts', {}).get('S3ModelArtifacts')\n",
- "training_image = training_job_info['AlgorithmSpecification']['TrainingImage']\n",
+ "model_uri = training_job_info.get(\"ModelArtifacts\", {}).get(\"S3ModelArtifacts\")\n",
+ "training_image = training_job_info[\"AlgorithmSpecification\"][\"TrainingImage\"]\n",
"\n",
"mp_inference_spec = InferenceSpecification().get_inference_specification_dict(\n",
" ecr_image=training_image,\n",
" supports_gpu=False,\n",
- " supported_content_types=['text/csv'],\n",
- " supported_mime_types=['text/csv'])\n",
+ " supported_content_types=[\"text/csv\"],\n",
+ " supported_mime_types=[\"text/csv\"],\n",
+ ")\n",
"\n",
- "mp_inference_spec['InferenceSpecification']['Containers'][0]['ModelDataUrl'] = model_uri\n",
+ "mp_inference_spec[\"InferenceSpecification\"][\"Containers\"][0][\"ModelDataUrl\"] = model_uri\n",
"mp_input_dict = {\n",
- " 'ModelPackageGroupName': mpg_name,\n",
- " 'ModelPackageDescription': 'SageMaker Music Recommender',\n",
- " 'ModelApprovalStatus': 'PendingManualApproval'\n",
+ " \"ModelPackageGroupName\": mpg_name,\n",
+ " \"ModelPackageDescription\": \"SageMaker Music Recommender\",\n",
+ " \"ModelApprovalStatus\": \"PendingManualApproval\",\n",
"}\n",
"\n",
"mp_input_dict.update(mp_inference_spec)\n",
"mp_response = sagemaker_client.create_model_package(**mp_input_dict)\n",
- " \n",
- "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)['ModelPackageSummaryList']\n",
+ "\n",
+ "model_packages = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)[\n",
+ " \"ModelPackageSummaryList\"\n",
+ "]\n",
"model_packages"
]
},
@@ -983,7 +1376,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Create Model from Estimator"
+ "### Create Model from Estimator"
]
},
{
@@ -992,23 +1385,47 @@
"metadata": {},
"outputs": [],
"source": [
- "model_matches = sagemaker_client.list_models(NameContains=model_name)['Models']\n",
+ "model_matches = sagemaker_client.list_models(NameContains=model_name)[\"Models\"]\n",
"\n",
"for model_name_match in model_matches:\n",
- " sagemaker_session.delete_model(model_name_match['ModelName'])\n",
+ " sagemaker_session.delete_model(model_name_match[\"ModelName\"])\n",
" print(f\"Deleted existing model: {model_name_match['ModelName']}\")\n",
- " \n",
+ "\n",
"model = sagemaker_session.create_model_from_job(\n",
" name=model_name,\n",
" training_job_name=training_job_name,\n",
" role=sagemaker_role,\n",
- " image_uri=training_job_info['AlgorithmSpecification']['TrainingImage']\n",
+ " image_uri=training_job_info[\"AlgorithmSpecification\"][\"TrainingImage\"],\n",
")\n",
"\n",
- "print(f\"Created new model: {model_name}\")\n",
- "\n",
- "ps.add({'model_name':model_name}, namespace='music-rec')\n",
- "ps.store()"
+ "print(f\"Created new model: {model_name}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Clean Up\n",
+ "----"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import demo_helpers # our custom set of functions\n",
+ "\n",
+ "demo_helpers.delete_project_resources(\n",
+ " sagemaker_boto_client=sagemaker_client,\n",
+ " sagemaker_session=sagemaker_session,\n",
+ " endpoint_names=[endpoint_name],\n",
+ " mpg_name=mpg_name,\n",
+ " prefix=prefix,\n",
+ " delete_s3_objects=True,\n",
+ " bucket_name=bucket,\n",
+ ")"
]
},
{
@@ -1022,9 +1439,9 @@
"metadata": {
"instance_type": "ml.m5.4xlarge",
"kernelspec": {
- "display_name": "Python 3 (Data Science)",
+ "display_name": "conda_python3",
"language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
+ "name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
@@ -1036,7 +1453,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.10"
+ "version": "3.6.13"
}
},
"nbformat": 4,
diff --git a/end_to_end/music_recommendation/04_deploy_infer_explain.ipynb b/end_to_end/music_recommendation/04_deploy_infer_explain.ipynb
deleted file mode 100644
index f8f6009869..0000000000
--- a/end_to_end/music_recommendation/04_deploy_infer_explain.ipynb
+++ /dev/null
@@ -1,567 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "# Music Recommender Part 4: Deploy Model & Inference using Online Feature Store\n",
- "\n",
- "----\n",
- "\n",
- "In this notebook, we'll deploy our chosen model as an endpoint so that we can make predictions/inferences against it. \n",
- "Under the hood the *model.deploy* function creates a model, an endpoint configuration and an endpoint. \n",
- "\n",
- "Then we'll make music recommendations for a single user by inferencing against our model. We'll query our Feature Store to get some data to use for inferencing and show you how [SageMaker Clarify](https://docs.aws.amazon.com/sagemaker/latest/dg/clarify-model-explainability.html) can explain which features were most useful in making the recommended music predictions using SHAP values.\n",
- "\n",
- "Amazon SageMaker Clarify provides tools to help explain how machine learning models make predictions. These tools can help ML modelers and developers and other internal stakeholders understand model characteristics as a whole prior to deployment and to debug predictions provided by the model after it's deployed. Transparency about how ML models arrive at their predictions is also critical to consumers and regulators who need to trust the model predictions if they are going to accept the decisions based on them.\n",
- "\n",
- "----\n",
- "### Contents\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_inference_explainability.ipynb)\n",
- " - [Deploy model](#04-deploy)\n",
- " - [Create predictor](#04-predictor)\n",
- " - [Infer new songs](#04-infer)\n",
- " - [Explain model predictions](#04-explain)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "try:\n",
- " !pip install -U awswrangler\n",
- "except ModuleNotFoundError:\n",
- " !pip install --no-input awswrangler"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# update pandas to avoid data type issues in older 1.0 version\n",
- "!pip install -qU pandas==1.2.0\n",
- "import pandas as pd\n",
- "print(pd.__version__)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import time\n",
- "import boto3\n",
- "import argparse\n",
- "import pathlib\n",
- "\n",
- "import sagemaker\n",
- "from sagemaker.feature_store.feature_group import FeatureGroup\n",
- "from sagemaker.estimator import Estimator\n",
- "import awswrangler as wr\n",
- "\n",
- "import os\n",
- "import json\n",
- "import matplotlib.pyplot as plt\n",
- "import numpy as np"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import sys\n",
- "import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
- "ps = ParameterStore(verbose=False)\n",
- "\n",
- "parameters = ps.read('music-rec')\n",
- "\n",
- "bucket = parameters['bucket']\n",
- "dw_ecrlist = parameters['dw_ecrlist']\n",
- "fg_name_ratings = parameters['fg_name_ratings']\n",
- "fg_name_tracks = parameters['fg_name_tracks']\n",
- "fg_name_user_preferences = parameters['fg_name_user_preferences']\n",
- "\n",
- "flow_export_id = parameters['flow_export_id']\n",
- "flow_s3_uri = parameters['flow_s3_uri']\n",
- "pretrained_model_path = parameters['pretrained_model_path']\n",
- "prefix = parameters['prefix']\n",
- "ratings_data_source = parameters['ratings_data_source']\n",
- "tracks_data_source = parameters['tracks_data_source']\n",
- "model_name = parameters['model_name']\n",
- "training_job_name = parameters['training_job_name']\n",
- "mpg_name = parameters['mpg_name']\n",
- "model_name = parameters['model_name']\n",
- "feature_names = parameters['feature_names']\n",
- "train_data_uri = parameters['train_data_uri']\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sess = sagemaker.Session()\n",
- "region = boto3.Session().region_name\n",
- "boto3.setup_default_session(region_name=region)\n",
- "\n",
- "s3_client = boto3.client('s3')\n",
- "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]\n",
- "\n",
- "boto_session = boto3.Session(region_name=region)\n",
- "\n",
- "sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)\n",
- "\n",
- "sagemaker_session = sagemaker.session.Session(\n",
- " boto_session=boto_session,\n",
- " sagemaker_client=sagemaker_client\n",
- ")\n",
- "\n",
- "sagemaker_role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Deploy Model\n",
- "\n",
- "##### [back to top](#04-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "endpoint_name = '{}-endpoint-notebooks'.format(model_name)\n",
- "print(endpoint_name)\n",
- "\n",
- "ps.add({'endpoint_name':endpoint_name}, namespace='music-rec')\n",
- "ps.store()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# if you want to use a pretrained model, set use_pretrained = True\n",
- "## else use_pretrained = False to use the model you trained in the previous notebook\n",
- "use_pretrained = False\n",
- "\n",
- "if use_pretrained:\n",
- " # or use a pretrained model if you skipped model training in the last notebook\n",
- " xgb_estimator = sagemaker.model.Model(\n",
- " image_uri=sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\"),\n",
- " model_data=pretrained_model_path,\n",
- " role=sagemaker_role\n",
- " )\n",
- "else:\n",
- " print(training_job_name)\n",
- " # reinstantiate the estimator we trained in the previous notebook\n",
- " xgb_estimator = Estimator.attach(training_job_name)\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "endpoint_list = sagemaker_client.list_endpoints(\n",
- " SortBy='CreationTime',\n",
- " SortOrder='Descending',\n",
- " NameContains=endpoint_name,\n",
- " StatusEquals='InService'\n",
- ")\n",
- "endpoint_list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create endpoint"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "%%time\n",
- "if len(endpoint_list['Endpoints']) > 0:\n",
- " print(f\"Using existing endpoint: {endpoint_list['Endpoints'][0]['EndpointName']}\")\n",
- "else:\n",
- " # deploy endpoint for model if it doesn't already exist\n",
- " xgb_estimator.deploy(initial_instance_count=1,\n",
- " instance_type='ml.m4.xlarge',\n",
- " model_name=model_name,\n",
- " endpoint_name=endpoint_name\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "model_package = sagemaker_client.list_model_packages(ModelPackageGroupName=mpg_name)['ModelPackageSummaryList'][0]\n",
- "model_package_update = {\n",
- " 'ModelPackageArn': model_package['ModelPackageArn'],\n",
- " 'ModelApprovalStatus': 'Approved'\n",
- "}\n",
- "\n",
- "update_response = sagemaker_client.update_model_package(**model_package_update)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- " \n",
- "\n",
- "## Create a predictor\n",
- "\n",
- "##### [back to top](#04-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "predictor = sagemaker.predictor.Predictor(\n",
- " endpoint_name=endpoint_name,\n",
- " sagemaker_session=sagemaker_session)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Pull user data from feature group"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# random user ID. You can try any other ID\n",
- "sample_user_id = 11005"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)\n",
- "\n",
- "feature_store_session = sagemaker.Session(\n",
- " boto_session=boto_session,\n",
- " sagemaker_client=sagemaker_client,\n",
- " sagemaker_featurestore_runtime_client=featurestore_runtime\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# pull the sample user's 5 star preferences record from the feature store\n",
- "fg_response = featurestore_runtime.get_record(\n",
- " FeatureGroupName=fg_name_user_preferences, \n",
- " RecordIdentifierValueAsString=str(sample_user_id)\n",
- ")\n",
- "\n",
- "record = fg_response['Record']\n",
- "df_user = pd.DataFrame(record).set_index('FeatureName')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Pull sample of 1000 tracks from feature group"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# pull a sample of the tracks data (multiple records) from the feature store using athena query\n",
- "fg_name_tracks_obj = FeatureGroup(name=fg_name_tracks, sagemaker_session=feature_store_session)\n",
- "tracks_query = fg_name_tracks_obj.athena_query()\n",
- "tracks_table = tracks_query.table_name\n",
- "\n",
- "# use escaped quotes aound table name since it contains '-' symbols\n",
- "query_string = (\"SELECT * FROM \\\"{}\\\" LIMIT 1000\".format(tracks_table))\n",
- "print(\"Running \" + query_string)\n",
- "\n",
- "# run Athena query. The output is loaded to a Pandas dataframe.\n",
- "tracks_query.run(query_string=query_string, output_location=f\"s3://{bucket}/{prefix}/query_results/\")\n",
- "tracks_query.wait()\n",
- "df_tracks = tracks_query.as_dataframe()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data = df_tracks.merge(pd.DataFrame(df_user['ValueAsString']).T, how='cross')\n",
- "data.columns = [c.lower() for c in data.columns]\n",
- "inference_df = data[feature_names]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Format the datapoint\n",
- "The datapoint must match the exact input format as the model was trained--with all features in the correct order. In this example, the `col_order` variable was saved when you created the train and test datasets earlier in the guide."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "data_inputs = [','.join([str(i) for i in row]) for row in inference_df.values]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- " \n",
- "\n",
- "## Infer (predict) new songs using model\n",
- "\n",
- "##### [back to top](#04-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "predictions = []\n",
- "for data_input in data_inputs:\n",
- " results = predictor.predict(data_input, initial_args = {\"ContentType\": \"text/csv\"})\n",
- " prediction = json.loads(results)\n",
- " predictions.append(prediction)\n",
- "print(f'Predicted rating for user {int(sample_user_id)}:', prediction)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Write to csv in S3 without headers and index column.\n",
- "inference_df['rating'] = predictions\n",
- "inference_df = inference_df[['rating']+feature_names]\n",
- "inference_df.to_csv('data/prediction_data.csv', header=False, index=False)\n",
- "\n",
- "s3_client.upload_file('data/prediction_data.csv', bucket, f'{prefix}/data/pred/prediction_data.csv')\n",
- "\n",
- "pred_data_uri = f's3://{bucket}/{prefix}/data/pred/prediction_data.csv'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df_train = pd.read_csv(train_data_uri)\n",
- "\n",
- "label = 'rating'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- " \n",
- "\n",
- "## Explain model predictions\n",
- "\n",
- "##### [back to top](#04-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "explainability_output_path = f's3://{bucket}/{prefix}/clarify-output/explainability'"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "clarify_processor = sagemaker.clarify.SageMakerClarifyProcessor(\n",
- " role=sagemaker_role,\n",
- " instance_count=1,\n",
- " instance_type='ml.c4.xlarge',\n",
- " sagemaker_session=sagemaker_session)\n",
- "\n",
- "model_config = sagemaker.clarify.ModelConfig(\n",
- " model_name=model_name,\n",
- " instance_type='ml.m4.xlarge',\n",
- " instance_count=1,\n",
- " accept_type='text/csv')\n",
- "\n",
- "shap_config = sagemaker.clarify.SHAPConfig(\n",
- " baseline=[df_train.median().values[1:].tolist()], # ignore the first column since that is that target\n",
- " num_samples=100,\n",
- " agg_method='mean_abs')\n",
- "\n",
- "explainability_data_config = sagemaker.clarify.DataConfig(\n",
- " s3_data_input_path=pred_data_uri,\n",
- " s3_output_path=explainability_output_path,\n",
- " label=label,\n",
- " headers=[label]+feature_names,\n",
- " dataset_type='text/csv')\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "%%time\n",
- "try:\n",
- " s3_client.download_file(\n",
- " Bucket = bucket, \n",
- " Key = f'{prefix}/clarify-output/explainability/explanations_shap/out.csv', \n",
- " Filename = 'data/shap_output.csv'\n",
- " )\n",
- " print('Downloaded output from previous explainability job')\n",
- "except Exception as e:\n",
- " error = e.response.get('Error').get('Code')\n",
- " if error == '404':\n",
- " print('Running explainability job')\n",
- " clarify_processor.run_explainability(\n",
- " data_config=explainability_data_config,\n",
- " model_config=model_config,\n",
- " explainability_config=shap_config)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "inference_df['trackid'] = data['trackid']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "playlist_length = 10 # number of songs to recommend in playlist\n",
- "playlist = inference_df.sort_values(by='rating', ascending=False).head(playlist_length)\n",
- "print('Curated Playlist:\\n', playlist['trackid'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "local_explanations_out = pd.read_csv(explainability_output_path+'/explanations_shap/out.csv')\n",
- "local_explanations_out.columns = feature_names\n",
- "\n",
- "print(\"Model prediction:\", playlist.iloc[0, 0])\n",
- "plt.figure(figsize=(12,6))\n",
- "local_explanations_out.iloc[0].sort_values().plot.barh(title='Local explanation for prediction')"
- ]
- }
- ],
- "metadata": {
- "instance_type": "ml.t3.medium",
- "kernelspec": {
- "display_name": "Python 3 (Data Science)",
- "language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/end_to_end/music_recommendation/05_model_monitor.ipynb b/end_to_end/music_recommendation/05_model_monitor.ipynb
deleted file mode 100644
index 3517343b4e..0000000000
--- a/end_to_end/music_recommendation/05_model_monitor.ipynb
+++ /dev/null
@@ -1,494 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "# Music Recommender Part 5: Model Monitor\n",
- "\n",
- "----\n",
- "In this notebook, we'll set up [SageMaker Model Monitor](https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor.html) to detect when our model or data significantly deviates from its \"normal\" behavior. SageMaker Model Monitor provides the ability to monitor machine learning models in production and detect deviations in data quality in comparison to a baseline dataset (e.g. training data set). This notebook walks you through enabling data capture and setting up continous monitoring for an existing Endpoint.\n",
- "\n",
- "This Notebook helps with the following:\n",
- "* Update your existing SageMaker Endpoint to enable Model Monitoring\n",
- "* Analyze the training dataset to generate a baseline constraint\n",
- "* Setup a MonitoringSchedule for monitoring deviations from the specified baseline\n",
- "\n",
- "----\n",
- "### Contents\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_inference_explainability.ipynb)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- " - [Enable data capture](#05-capture)\n",
- " - [Baselining](#05-baseline)\n",
- " - [Enable continous monitoring](#05-continuous)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "\n",
- "\n",
- "## Step 1: Enable real-time inference data capture\n",
- "\n",
- "##### [back to top](#05-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "To enable data capture for monitoring the model data quality, you specify the new capture option called `DataCaptureConfig`. You can capture the request payload, the response payload or both with this configuration. The capture config applies to all variants. Please provide the Endpoint name in the following cell:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.model_monitor import DataCaptureConfig\n",
- "from sagemaker.predictor import Predictor\n",
- "from sagemaker import session\n",
- "import boto3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "import sys\n",
- "import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
- "ps = ParameterStore(verbose=False)\n",
- "\n",
- "parameters = ps.read('music-rec')\n",
- "\n",
- "bucket = parameters['bucket']\n",
- "dw_ecrlist = parameters['dw_ecrlist']\n",
- "fg_name_ratings = parameters['fg_name_ratings']\n",
- "fg_name_tracks = parameters['fg_name_tracks']\n",
- "fg_name_user_preferences = parameters['fg_name_user_preferences']\n",
- "\n",
- "flow_export_id = parameters['flow_export_id']\n",
- "flow_s3_uri = parameters['flow_s3_uri']\n",
- "pretrained_model_path = parameters['pretrained_model_path']\n",
- "prefix = parameters['prefix']\n",
- "ratings_data_source = parameters['ratings_data_source']\n",
- "tracks_data_source = parameters['tracks_data_source']\n",
- "endpoint_name = parameters['endpoint_name']\n",
- "val_data_uri = parameters['val_data_uri']\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "sm_session = session.Session(boto3.Session())\n",
- "region = boto3.Session().region_name"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Please fill in the following for enabling data capture\n",
- "s3_capture_upload_path = f's3://{bucket}/{prefix}/endpoint-data-capture/' #example: s3://bucket-name/path/to/endpoint-data-capture/\n",
- "\n",
- "##### \n",
- "## IMPORTANT\n",
- "##\n",
- "## Please make sure to add the \"s3:PutObject\" permission to the \"role' you provided in the SageMaker Model \n",
- "## behind this Endpoint. Otherwise, Endpoint data capture will not work.\n",
- "## \n",
- "##### "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "%%time\n",
- "# Change parameters as you would like - adjust sampling percentage, \n",
- "# chose to capture request or response or both\n",
- "data_capture_config = DataCaptureConfig(\n",
- " enable_capture = True,\n",
- " sampling_percentage=25,\n",
- " destination_s3_uri=s3_capture_upload_path,\n",
- " kms_key_id=None,\n",
- " capture_options=[\"REQUEST\", \"RESPONSE\"],\n",
- " csv_content_types=[\"text/csv\"],\n",
- " json_content_types=[\"application/json\"]\n",
- ")\n",
- "\n",
- "# Now it is time to apply the new configuration and wait for it to be applied\n",
- "predictor = Predictor(endpoint_name=endpoint_name)\n",
- "predictor.update_data_capture_config(data_capture_config=data_capture_config)\n",
- "sm_session.wait_for_endpoint(endpoint=endpoint_name)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Before you proceed:\n",
- "Currently SageMaker supports monitoring Endpoints out of the box only for **tabular (csv, flat-json)** datasets. If your Endpoint uses some other datasets, these following steps will NOT work for you.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Step 2: Model Monitor - Baselining\n",
- "\n",
- "##### [back to top](#05-nb)\n",
- "\n",
- "----"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "In addition to collecting the data, SageMaker allows you to monitor and evaluate the data observed by the Endpoints. For this :\n",
- "1. We need to create a baseline with which we compare the realtime traffic against. \n",
- "1. Once a baseline is ready, we can setup a schedule to continously evaluate/compare against the baseline."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Constraint suggestion with baseline/training dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The training dataset with which you trained the model is usually a good baseline dataset. Note that the training dataset's data schema and the inference dataset schema should exactly match (i.e. number and order of the features).\n",
- "\n",
- "Using our training dataset, we'll ask SageMaker to suggest a set of baseline constraints and generate descriptive statistics to explore the data."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "##'s3://bucketname/path/to/baseline/data' - Where your validation data is\n",
- "baseline_data_uri = val_data_uri \n",
- "##'s3://bucketname/path/to/baseline/data' - Where the results are to be stored in\n",
- "baseline_results_uri = f's3://{bucket}/{prefix}/baseline/results' \n",
- "\n",
- "print('Baseline data uri: {}'.format(baseline_data_uri))\n",
- "print('Baseline results uri: {}'.format(baseline_results_uri))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create a baselining job with the validation dataset"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that we have the training data ready in S3, let's kick off a job to `suggest` constraints. `DefaultModelMonitor.suggest_baseline(..)` kicks off a `ProcessingJob` using a SageMaker provided Model Monitor container to generate the constraints. Please edit the configurations to fit your needs."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "from sagemaker.model_monitor import DefaultModelMonitor\n",
- "from sagemaker.model_monitor.dataset_format import DatasetFormat\n",
- "from sagemaker import get_execution_role\n",
- "import datetime\n",
- "\n",
- "role = get_execution_role(sagemaker_session=sm_session)\n",
- "\n",
- "datetime_stamp = datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\")\n",
- "\n",
- "my_default_monitor = DefaultModelMonitor(\n",
- " role=role,\n",
- " instance_count=2,\n",
- " instance_type='ml.m5.xlarge',\n",
- " volume_size_in_gb=20,\n",
- " max_runtime_in_seconds=1800,\n",
- " base_job_name=f\"{prefix}-monitor-{datetime_stamp}\"\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "%%time\n",
- "\n",
- "monitor_baseline = my_default_monitor.suggest_baseline(\n",
- " baseline_dataset=baseline_data_uri,\n",
- " dataset_format=DatasetFormat.csv(header=False),\n",
- " output_s3_uri=baseline_results_uri,\n",
- " job_name=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n",
- " wait=True\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Exploratory Analysis of the Processing Jobs underlying SageMaker Monitor\n",
- "In this short section [next few cells] we will be showing you how to further view the underlying jobs for the monitoring job"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from time import gmtime, strftime\n",
- "import boto3\n",
- "\n",
- "client = boto3.client('sagemaker')\n",
- "\n",
- "def get_last_processing_job():\n",
- " \n",
- " response = client.list_processing_jobs(\n",
- " NameContains=f\"{prefix}-monitor-baseline-{datetime_stamp}\",\n",
- " StatusEquals='Completed',\n",
- " SortBy='CreationTime',\n",
- " SortOrder='Descending',\n",
- " MaxResults=20\n",
- " )\n",
- " pprint.pprint(response['ProcessingJobSummaries'][0])\n",
- " return response['ProcessingJobSummaries'][0]['ProcessingJobName']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.processing import ProcessingJob \n",
- "from sagemaker.estimator import Estimator\n",
- "from sagemaker.model_monitor.model_monitoring import ModelMonitor\n",
- "\n",
- "my_default_monitor_name = get_last_processing_job()\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "my_default_monitor_reload = ProcessingJob.from_processing_name(sm_session, my_default_monitor_name)\n",
- "\n",
- "response = client.describe_processing_job(\n",
- " ProcessingJobName=my_default_monitor_name\n",
- ")\n",
- "pprint.pprint(response)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Explore the generated constraints and statistics"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd\n",
- "\n",
- "baseline_job = my_default_monitor.latest_baselining_job\n",
- "schema_df = pd.io.json.json_normalize(baseline_job.baseline_statistics().body_dict[\"features\"])\n",
- "schema_df.head(10)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "constraints_df = pd.io.json.json_normalize(baseline_job.suggested_constraints().body_dict[\"features\"])\n",
- "constraints_df.head(10)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Before proceeding to enable monitoring, you could chose to edit the constraint file as required to fine tune the constraints."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "## Step 3: Enable continous monitoring\n",
- "\n",
- "##### [back to top](#05-nb)\n",
- "\n",
- "----\n",
- "\n",
- "We have collected the data above, here we proceed to analyze and monitor the data with MonitoringSchedules."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Create a schedule"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We are ready to create a model monitoring schedule for the Endpoint created earlier with the baseline resources (constraints and statistics)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sagemaker.model_monitor import CronExpressionGenerator\n",
- "import datetime as datetime\n",
- "from time import gmtime, strftime\n",
- "\n",
- "\n",
- "mon_schedule_name = 'music-rec-monitor-schedule-{}'.format(datetime.datetime.now().strftime(\"%Y-%m-%d-%H%M%S\"))\n",
- "s3_report_path = f's3://{bucket}/{prefix}/monitor/report'\n",
- "\n",
- "try:\n",
- " my_default_monitor.create_monitoring_schedule(\n",
- " monitor_schedule_name=mon_schedule_name,\n",
- " endpoint_input=endpoint_name,\n",
- " output_s3_uri=s3_report_path,\n",
- " statistics=my_default_monitor.baseline_statistics(),\n",
- " constraints=my_default_monitor.suggested_constraints(),\n",
- " schedule_cron_expression=CronExpressionGenerator.daily(),\n",
- " enable_cloudwatch_metrics=True,\n",
- " )\n",
- " print(f\"Created monitoring schedule {mon_schedule_name}\")\n",
- "except:\n",
- " my_default_monitor.update_monitoring_schedule(\n",
- " endpoint_input=endpoint_name,\n",
- " schedule_cron_expression=CronExpressionGenerator.daily(),\n",
- " enable_cloudwatch_metrics=True,\n",
- " )\n",
- " print(f\"Updated monitoring schedule {my_default_monitor.monitoring_schedule_name}\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Schedule status: Pending\n"
- ]
- }
- ],
- "source": [
- "import time\n",
- "\n",
- "desc_schedule_result = my_default_monitor.describe_schedule()\n",
- "while desc_schedule_result['MonitoringScheduleStatus'] != 'Scheduled':\n",
- " print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))\n",
- " desc_schedule_result = my_default_monitor.describe_schedule()\n",
- " time.sleep(30)\n",
- "print('Schedule status: {}'.format(desc_schedule_result['MonitoringScheduleStatus']))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### All set\n",
- "Now that your monitoring schedule has been created. Please return to the Amazon SageMaker Studio to list the executions for this Schedule and observe the results going forward."
- ]
- }
- ],
- "metadata": {
- "anaconda-cloud": {},
- "instance_type": "ml.m5.large",
- "kernelspec": {
- "display_name": "Python 3 (Data Science)",
- "language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.10"
- },
- "notice": "Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License."
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/end_to_end/music_recommendation/07_clean_up.ipynb b/end_to_end/music_recommendation/07_clean_up.ipynb
deleted file mode 100644
index a718a1218e..0000000000
--- a/end_to_end/music_recommendation/07_clean_up.ipynb
+++ /dev/null
@@ -1,187 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "\n",
- "\n",
- "# Music Recommender Part 7: Clean Up\n",
- "\n",
- "## Overview\n",
- "\n",
- "----\n",
- "### Clean up : Delete all Resources Created in the past 8 notebooks (nb 00-06)\n",
- "In the past notebooks we have created many Amazon Resources; represented by their ARNs : Amazon Resource Names.\n",
- "In order not to incur any cost in keeping those resources running, such as endpoints etc. We will use this notebook as a reminder to clean up and delete all the resources you have created in this music recommendation example.\n",
- "\n",
- "First we will read in all parameters saved in the 'music-rec' namespace as we went from one notebook to the next,\n",
- "second we will use a little utility under the `./code/demo_helpers.py` script file to actually delete all resources passed\n",
- "----\n",
- "### Contents\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_infer_explain.ipynb)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "import json\n",
- "import boto3\n",
- "import pathlib\n",
- "import sagemaker\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "import awswrangler as wr\n",
- "\n",
- "from sagemaker.estimator import Estimator\n",
- "from sagemaker.workflow.pipeline import Pipeline\n",
- "from sagemaker.workflow.steps import CreateModelStep\n",
- "from sagemaker.sklearn.processing import SKLearnProcessor\n",
- "from sagemaker.workflow.step_collections import RegisterModel\n",
- "from sagemaker.workflow.steps import ProcessingStep, TrainingStep\n",
- "from sagemaker.processing import ProcessingInput, ProcessingOutput\n",
- "from sagemaker.workflow.parameters import ParameterInteger, ParameterFloat, ParameterString\n",
- "from sagemaker.feature_store.feature_group import FeatureGroup"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "import sys\n",
- "import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
- "\n",
- "ps = ParameterStore(verbose=False)\n",
- "\n",
- "parameters = ps.read('music-rec')\n",
- "\n",
- "bucket = parameters['bucket']\n",
- "prefix = parameters['prefix']\n",
- "ratings_data_source = parameters['ratings_data_source']\n",
- "tracks_data_source = parameters['tracks_data_source']\n",
- "val_data_uri = f\"s3://{bucket}/{prefix}/data/val/val_data.csv\"\n",
- "\n",
- "pipeline_endpoint_name = parameters['pipeline_endpoint_name']\n",
- "pipeline_name = parameters['pipeline_name']\n",
- "\n",
- "fg_name_tracks = parameters['fg_name_tracks']\n",
- "fg_name_ratings = parameters['fg_name_ratings']\n",
- "fg_name_user_preferences = parameters['fg_name_user_preferences']\n",
- "\n",
- "dw_ecrlist = parameters['dw_ecrlist']\n",
- "\n",
- "pipeline_name = parameters['pipeline_name']\n",
- "dataprep_pipeline_name = parameters['dataprep_pipeline_name']\n",
- "train_deploy_pipeline_name = parameters['train_deploy_pipeline_name']\n",
- "\n",
- "endpoint_name = parameters['endpoint_name']\n",
- "pipeline_endpoint_name = parameters['pipeline_endpoint_name']\n",
- "\n",
- "mpg_name = parameters['mpg_name']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "region = boto3.Session().region_name\n",
- "boto3.setup_default_session(region_name=region)\n",
- "boto_session = boto3.Session(region_name=region)\n",
- "\n",
- "s3_client = boto3.client('s3', region_name=region)\n",
- "\n",
- "sagemaker_boto_client = boto_session.client('sagemaker')\n",
- "sagemaker_session = sagemaker.session.Session(\n",
- " boto_session=boto_session,\n",
- " sagemaker_client=sagemaker_boto_client)\n",
- "sagemaker_role = sagemaker.get_execution_role()\n",
- "\n",
- "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
-    "# when demo_helpers.delete_project_resources() is run, it will delete all the resources created by this demo\n",
- "sys.path.insert(1, './code')\n",
- "import demo_helpers # our custom set of functions\n",
- "\n",
- "\n",
- "def remove_all_resources():\n",
- " demo_helpers.delete_project_resources(\n",
- " sagemaker_boto_client=sagemaker_boto_client, \n",
- " sagemaker_session=sagemaker_session,\n",
- " endpoint_names=[pipeline_endpoint_name, endpoint_name],\n",
- " pipeline_names=[pipeline_name, dataprep_pipeline_name, train_deploy_pipeline_name], \n",
- " mpg_name=mpg_name,\n",
- " feature_groups=[fg_name_ratings, fg_name_tracks, fg_name_user_preferences], \n",
- " prefix=prefix,\n",
- " delete_s3_objects=True,\n",
- " bucket_name=bucket\n",
- " )"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Uncomment the next line and run to delete all resources\n",
- "# remove_all_resources()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "instance_type": "ml.t3.medium",
- "kernelspec": {
- "display_name": "Python 3 (Data Science)",
- "language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.10"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/end_to_end/music_recommendation/README.md b/end_to_end/music_recommendation/README.md
index 84f6256819..21273be10a 100644
--- a/end_to_end/music_recommendation/README.md
+++ b/end_to_end/music_recommendation/README.md
@@ -54,17 +54,12 @@ For this tutorial, we'll be using our own generated track and user ratings data,
# Approach
In the following notebooks we'll take 2 different approaches with the same modeling solution to create our music recommender.
-1. Run each notebook, 02a_ to 05_, to walkthrough each data prep and modeling step
+1. Run the following notebooks in order to walk through each data prep and modeling step
- 01_music_dataprep.flow: Flow file defining our data input and transformation steps; this file is created in the Sagemaker Data Wrangler GUI
- - 02a_export_fs_tracks.ipynb: export our tracks data created in Data Wrangler to a feature store
- - 02b_export_fs_5star_features.ipynb: export our 5-star rated tracks data created in Data Wrangler to a feature store
- - 02c_fs_create_ratings.ipynb: export our user ratings data created in Data Wrangler to a feature store
- - 03_train_model_lineage_registry_debugger.ipynb: train the model using xgboost to predict each song rating for each user
- - 04_inference_explainability.ipynb: go over feature importances using SHAP values
- - 05_model_monitor.ipynb: setup Sagemaker Model Monitor
+ - 02_export_feature_groups.ipynb: export our tracks data, 5-star rated tracks data, and user ratings data created in Data Wrangler to a feature store
+ - 03_train_deploy_debugger_explain_monitor_registry.ipynb: train and deploy the model using XGBoost to predict each song's rating for each user. We also go over feature importances using SHAP values and set up SageMaker Model Monitor.
 1. Set up a SageMaker Pipeline to do all the aforementioned steps in a single notebook so that it can be run automatically over time
- - 01_music_dataprep.flow: Flow file defining our data input and transformation steps; this file is created in the Sagemaker Data Wrangler GUI
- - 06_pipeline.ipynb: setup each modeling step using sagemaker.workflow Pipeline object
+ - end_to_end_pipeline.ipynb: set up each modeling step using the sagemaker.workflow Pipeline object
### Solution Architecture

@@ -72,4 +67,4 @@ In the following notebooks we'll take 2 different approaches with the same model
# Clean Up
-In order to prevent ongoing charges to your AWS account, clean up any resources we spun up during this tutorial. We've also included a notebook, `07_clean_up.ipynb`, to delete all resources spun up by this demo.
+To prevent ongoing charges to your AWS account, clean up the resources created during this tutorial by running the clean-up steps at the end of [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb) and [Train, Deploy, and Monitor the Music Recommender Model using SageMaker Pipelines](end_to_end_pipeline.ipynb).
diff --git a/end_to_end/music_recommendation/code/demo_helpers.py b/end_to_end/music_recommendation/code/demo_helpers.py
index a68beda069..cd9bd416a1 100644
--- a/end_to_end/music_recommendation/code/demo_helpers.py
+++ b/end_to_end/music_recommendation/code/demo_helpers.py
@@ -1,9 +1,72 @@
+import os
+import json
import boto3
import time
+import pandas as pd
from sagemaker.lineage.context import Context
from sagemaker.lineage.action import Action
from sagemaker.lineage.association import Association
from sagemaker.lineage.artifact import Artifact
+from awscli.customizations.s3.utils import split_s3_bucket_key
+
+def get_data(s3_client, public_s3_data, to_bucket, to_prefix, sample_data=1):
+ new_paths = []
+ for f in public_s3_data:
+ bucket_name, key_name = split_s3_bucket_key(f)
+ filename = f.split('/')[-1]
+ new_path = "s3://{}/{}/{}".format(to_bucket, to_prefix, filename)
+ new_paths.append(new_path)
+
+ # only download if not already downloaded
+ if not os.path.exists('./data/{}'.format(filename)):
+ # download s3 data
+ print("Downloading file from {}".format(f))
+ s3_client.download_file(bucket_name, key_name, './data/{}'.format(filename))
+
+            # subsample the data to create a smaller dataset for this demo
+ new_df = pd.read_csv('./data/{}'.format(filename))
+ new_df = new_df.sample(frac=sample_data)
+ new_df.to_csv('./data/{}'.format(filename), index=False)
+
+ # upload s3 data to our default s3 bucket for SageMaker Studio
+ print("Uploading {} to {}\n".format(filename, new_path))
+ s3_client.upload_file('./data/{}'.format(filename), to_bucket, os.path.join(to_prefix,filename))
+
+ return new_paths
+
+
+def get_model(s3_client, model_path, to_bucket, to_prefix):
+ # upload model to our default s3 bucket for SageMaker Studio
+ filename = model_path.split('/')[-1]
+ print("Uploading {} to {}\n".format(model_path, os.path.join(to_bucket,to_prefix,filename)))
+ s3_client.upload_file(model_path, to_bucket, os.path.join(to_prefix,filename))
+    return "s3://{}".format(os.path.join(to_bucket, to_prefix, filename))
+
+
+def update_data_sources(flow_path, tracks_data_source, ratings_data_source):
+ with open(flow_path) as flowf:
+ flow = json.load(flowf)
+
+ for node in flow['nodes']:
+ # if the key exists for our s3 endpoint
+ try:
+ if node['parameters']['dataset_definition']['name'] == 'tracks.csv':
+ # reset the s3 data source for tracks data
+ old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']
+ print("Changed {} to {}".format(old_source, tracks_data_source))
+ node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = tracks_data_source
+ elif node['parameters']['dataset_definition']['name'] == 'ratings.csv':
+ # reset the s3 data source for ratings data
+ old_source = node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri']
+ print("Changed {} to {}".format(old_source, ratings_data_source))
+ node['parameters']['dataset_definition']['s3ExecutionContext']['s3Uri'] = ratings_data_source
+        except KeyError:  # node has no dataset_definition; skip transform nodes
+ continue
+ # write out the updated json flow file
+ with open(flow_path, 'w') as outfile:
+ json.dump(flow, outfile)
+
+ return flow
def delete_project_resources(sagemaker_boto_client, sagemaker_session, endpoint_names=None, pipeline_names=None, mpg_name=None,
@@ -53,7 +116,7 @@ def delete_lineage_data():
delete_associations(summary.context_arn)
ctx = Context(context_name=summary.context_name, sagemaker_session=sagemaker_session)
ctx.delete()
- time.sleep(1)
+ time.sleep(2)
for summary in Action.list():
if prefix in summary.source.source_uri:
@@ -70,8 +133,12 @@ def delete_lineage_data():
artfct = Artifact(artifact_arn=summary.artifact_arn, sagemaker_session=sagemaker_session)
artfct.delete()
time.sleep(1)
+
# Delete model lineage associations and artifacts created in demo
- delete_lineage_data()
+ try:
+ delete_lineage_data()
+ except Exception as err:
+ print(f"Failed to delete lineage data: {err}")
if endpoint_names is not None:
try:
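The new `update_data_sources` helper above walks the Data Wrangler flow file and rewrites the S3 URIs of its source nodes. A minimal sketch of the node shape it expects, inferred from the keys the helper reads (illustrative only, not a complete flow definition):

```python
# Illustrative only: the subset of a 01_music_dataprep.flow node that
# update_data_sources() reads and rewrites; real flow files contain more fields.
example_flow = {
    "nodes": [
        {
            "parameters": {
                "dataset_definition": {
                    "name": "tracks.csv",
                    "s3ExecutionContext": {"s3Uri": "s3://old-bucket/tracks.csv"},
                }
            }
        }
        # transform nodes without a dataset_definition raise KeyError and are skipped
    ]
}

# Typical call, as used later in end_to_end_pipeline.ipynb:
# update_data_sources("01_music_dataprep.flow", tracks_data_source, ratings_data_source)
```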
diff --git a/end_to_end/music_recommendation/06_pipeline.ipynb b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb
similarity index 52%
rename from end_to_end/music_recommendation/06_pipeline.ipynb
rename to end_to_end/music_recommendation/end_to_end_pipeline.ipynb
index 522b8e7b81..7fb39c5676 100644
--- a/end_to_end/music_recommendation/06_pipeline.ipynb
+++ b/end_to_end/music_recommendation/end_to_end_pipeline.ipynb
@@ -4,27 +4,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "\n",
- "# Music Recommender Part 6: SageMaker Pipelines\n",
+ "# Train, Deploy, and Monitor the Music Recommender Model using SageMaker Pipelines\n",
"\n",
"----\n",
- "In this final notebook, we'll combine all the steps we've gone over in each individual notebook, and condense them down into a [SageMaker Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines.html) object which will automate the entire modeling process from the beginning of data ingestion to monitoring the model. SageMaker Pipelines is a tool for building machine learning pipelines that take advantage of direct SageMaker integration. Because of this integration, you can create a pipeline and set up SageMaker Projects for orchestration using a tool that handles much of the step creation and management for you.\n",
+ "## Background\n",
+ "\n",
+ "In this notebook, we'll build an end-to-end pipeline to create a music recommender using [SageMaker Pipelines](https://docs.aws.amazon.com/sagemaker/latest/dg/pipelines.html), which will automate the entire modeling process from the beginning of data ingestion to monitoring the model. SageMaker Pipelines is a tool for building machine learning pipelines that take advantage of direct SageMaker integration. Because of this integration, you can create a pipeline and set up SageMaker Projects for orchestration using a tool that handles much of the step creation and management for you.\n",
+ "\n",
+    "If you want to learn more about each step of the pipeline, take a look at the series of notebooks listed below. They implement the same process as this notebook, but manually and with more detailed descriptions of what each step does. Please see the [README.md](README.md) for more information about the use case implemented by this sequence of notebooks.\n",
+ "\n",
+ "1. [Music Recommender Data Exploration](01_data_exploration.ipynb)\n",
+ "1. [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)\n",
+ "1. [Train, Deploy, and Monitor the Music Recommender Model using SageMaker SDK](03_train_deploy_debugger_explain_monitor_registry.ipynb)\n",
"\n",
"----\n",
- "### Contents\n",
- "- [Overview](00_overview_arch_data.ipynb)\n",
- "- [Part 1: Data Prep using Data Wrangler](01_music_dataprep.flow)\n",
- "- [Part 2a: Feature Store Creation - Tracks](02a_export_fg_tracks.ipynb)\n",
- "- [Part 2b: Feature Store Creation - User Preferences](02b_export_fg_5star_features.ipynb)\n",
- "- [Part 2c: Feature Store Creation - Ratings](02c_export_fg_ratings.ipynb)\n",
- "- [Part 3: Train Model with Debugger Hooks. Set Artifacts and Register Model.](03_train_model_lineage_registry_debugger.ipynb)\n",
- "- [Part 4: Deploy Model & Inference using Online Feature Store](04_deploy_inference_explainability.ipynb)\n",
- "- [Part 5: Model Monitor](05_model_monitor.ipynb)\n",
- "- [Part 6: SageMaker Pipelines](06_pipeline.ipynb)\n",
- " - [Architecture](#06-arch)\n",
- " - [Pipelines Overview](#pipelines)\n",
- "- [Part 7: Resource Cleanup](07_clean_up.ipynb)"
+ "## Contents\n",
+ "1. [Architecture: Create a SageMaker Pipeline to Automate All the Steps from Data Prep to Model Deployment](#Architecture:-Create-a-SageMaker-Pipeline-to-Automate-All-the-Steps-from-Data-Prep-to-Model-Deployment)\n",
+ "1. [SageMaker Pipeline Overview](#SageMaker-Pipeline-Overview)\n",
+ "1. [Clean Up](#Clean-Up)"
]
},
{
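Before diving into the individual steps, it helps to see the overall shape this notebook builds toward: typed pipeline parameters, a list of steps, and a `Pipeline` object that is created and started. A condensed sketch follows; the step list is left empty and the default instance type is illustrative, with the real definitions appearing in the cells below.

```python
# Condensed sketch of the pattern the rest of this notebook builds out.
from sagemaker.workflow.parameters import ParameterString
from sagemaker.workflow.pipeline import Pipeline

# Typed parameter that can be overridden at execution time.
train_instance_param = ParameterString(
    name="TrainingInstance", default_value="ml.m5.4xlarge"  # default here is illustrative
)

pipeline = Pipeline(
    name="MusicRecommendationPipeline",
    parameters=[train_instance_param],
    steps=[],  # the Processing/Training/CreateModel/RegisterModel steps defined below
)

# pipeline.upsert(role_arn=sagemaker_role)  # create or update the pipeline definition
# execution = pipeline.start(parameters={"TrainingInstance": "ml.m5.4xlarge"})
# execution.wait()
```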
@@ -102,21 +99,8 @@
"source": [
"import sys\n",
"import pprint\n",
- "sys.path.insert(1, './code')\n",
- "from parameter_store import ParameterStore\n",
"\n",
- "ps = ParameterStore(verbose=False)\n",
- "\n",
- "parameters = ps.read('music-rec')\n",
- "\n",
- "bucket = parameters['bucket']\n",
- "prefix = parameters['prefix']\n",
- "ratings_data_source = parameters['ratings_data_source']\n",
- "tracks_data_source = parameters['tracks_data_source']\n",
- "val_data_uri = f\"s3://{bucket}/{prefix}/data/val/val_data.csv\"\n",
- "endpoint_name = parameters['endpoint_name']\n",
- "mpg_name = parameters['mpg_name']\n",
- "dw_ecrlist = parameters['dw_ecrlist']"
+ "sys.path.insert(1, \"./code\")"
]
},
{
@@ -129,15 +113,19 @@
"boto3.setup_default_session(region_name=region)\n",
"boto_session = boto3.Session(region_name=region)\n",
"\n",
- "s3_client = boto3.client('s3', region_name=region)\n",
+ "s3_client = boto3.client(\"s3\", region_name=region)\n",
"\n",
- "sagemaker_boto_client = boto_session.client('sagemaker')\n",
+ "sagemaker_boto_client = boto_session.client(\"sagemaker\")\n",
"sagemaker_session = sagemaker.session.Session(\n",
- " boto_session=boto_session,\n",
- " sagemaker_client=sagemaker_boto_client)\n",
+ " boto_session=boto_session, sagemaker_client=sagemaker_boto_client\n",
+ ")\n",
"sagemaker_role = sagemaker.get_execution_role()\n",
"\n",
- "account_id = boto3.client('sts').get_caller_identity()[\"Account\"]"
+ "account_id = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n",
+ "\n",
+ "sess = sagemaker.Session()\n",
+ "bucket = sess.default_bucket()\n",
+ "prefix = \"music-recommendation-pipeline\""
]
},
{
@@ -148,13 +136,15 @@
"source": [
"processing_dir = \"/opt/ml/processing\"\n",
"\n",
- "# Output name is auto-generated from the select node's ID + output name from the flow file. \n",
+ "# Output name is auto-generated from the select node's ID + output name from the flow file.\n",
"# You can change to a different node ID to export a different step in the flow file\n",
- "output_name_tracks = \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\" # tracks node in flow file\n",
- "output_name_user_preferences = \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\" # joined node in flow file\"\n",
- "output_name_ratings = \"9a283380-91ca-478e-be99-6ba3bf57c680.default\" # ratings node in flow file\n",
+ "output_name_tracks = \"19ad8e80-2002-4ee9-9753-fe9a384b1166.default\" # tracks node in flow file\n",
+ "output_name_user_preferences = (\n",
+ " \"7a6dad19-2c80-43e3-b03d-ec23c3842ae9.default\" # joined node in flow file\"\n",
+ ")\n",
+ "output_name_ratings = \"9a283380-91ca-478e-be99-6ba3bf57c680.default\" # ratings node in flow file\n",
"\n",
- "#======> variables used for parameterizing the notebook run\n",
+ "# ======> variables used for parameterizing the notebook run\n",
"flow_instance_count = 1\n",
"flow_instance_type = \"ml.m5.4xlarge\"\n",
"\n",
@@ -165,12 +155,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- " \n",
- "\n",
"## Architecture: Create a SageMaker Pipeline to Automate All the Steps from Data Prep to Model Deployment\n",
"\n",
- "##### [back to top](#06-nb)\n",
- "\n",
"----\n",
"\n",
""
@@ -180,22 +166,147 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "## Prereqs: Get Data \n",
+ "----\n",
"\n",
- "## SageMaker Pipeline Overview\n",
+    "Here we download the music data for this demo from a public S3 bucket and upload it to the default S3 bucket that was created for you when you first set up your SageMaker Studio workspace."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from demo_helpers import get_data, get_model, update_data_sources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# create data folder\n",
+ "!mkdir data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# public S3 bucket that contains our music data\n",
+ "s3_bucket_music_data = \"s3://sagemaker-sample-files/datasets/tabular/synthetic-music\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "new_data_paths = get_data(\n",
+ " s3_client,\n",
+ " [f\"{s3_bucket_music_data}/tracks.csv\", f\"{s3_bucket_music_data}/ratings.csv\"],\n",
+ " bucket,\n",
+ " prefix,\n",
+ " sample_data=0.70,\n",
+ ")\n",
+ "print(new_data_paths)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# these are the new file paths located on your SageMaker Studio default s3 storage bucket\n",
+ "tracks_data_source = f\"s3://{bucket}/{prefix}/tracks.csv\"\n",
+ "ratings_data_source = f\"s3://{bucket}/{prefix}/ratings.csv\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "For this example, we provide the processed data you need to complete this task, but you are free to take a look at how it was generated:\n",
"\n",
- "##### [back to top](#06-nb)\n",
+ "* If you are curious as to how `tracks_new.csv` and `ratings_new.csv` are generated, see [Music Recommender Data Exploration](01_data_exploration.ipynb)\n",
+ "* If you are curious as to how the rest of the files are generated, see [Music Recommender Data Preparation with SageMaker Feature Store and SageMaker Data Wrangler](02_export_feature_groups.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "files_to_download = [\n",
+ " f\"sample_tracks.csv\",\n",
+ " f\"sample_user.csv\",\n",
+ " f\"train_data_headers.csv\",\n",
+ " f\"train_data.zip\",\n",
+ " f\"val_data_headers.csv\",\n",
+ " f\"val_data.zip\",\n",
+ " f\"tracks_new.csv\",\n",
+ " f\"ratings_new.csv\",\n",
+ "]\n",
+ "\n",
+ "for file in files_to_download:\n",
+ " s3_client.download_file(\n",
+ " f\"sagemaker-sample-files\", f\"datasets/tabular/synthetic-music/{file}\", f\"./data/{file}\"\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "! unzip -o './data/*.zip' -d './data'\n",
+ "! rm ./data/*.zip"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# upload train and validation datasets as well\n",
+ "s3_client.upload_file(\"data/tracks_new.csv\", bucket, f\"{prefix}/data/tracks_new.csv\")\n",
+ "s3_client.upload_file(\"data/ratings_new.csv\", bucket, f\"{prefix}/data/ratings_new.csv\")\n",
+ "s3_client.upload_file(\"data/train_data.csv\", bucket, f\"{prefix}/data/train/train_data.csv\")\n",
+ "s3_client.upload_file(\"data/val_data.csv\", bucket, f\"{prefix}/data/val/val_data.csv\")\n",
+ "\n",
+ "\n",
+ "train_data_uri = f\"s3://{bucket}/{prefix}/data/train/train_data.csv\"\n",
+ "val_data_uri = f\"s3://{bucket}/{prefix}/data/val/val_data.csv\"\n",
+ "print(f\"Saving training data to {train_data_uri}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## SageMaker Pipeline Overview\n",
"\n",
"---- \n",
"\n",
- "#### [Step 1: Data Wrangler Preprocessing Step](#data-wrangler)\n",
- "#### [Step 2: Dataset and train test split](#dataset-train-test)\n",
- "#### [Step 3: Train XGboost Model](#pipe-train-xgb)\n",
- "#### [Step 4: Model Pre-deployment](#pipe-pre-deploy)\n",
- "#### [Step 5: Register Model](#pipe-Register-Model)\n",
- "#### [Step 6: Deploy Model](#deploy)\n",
- "#### [Step 7: Monitor Model](#monitor)\n",
- "#### [Combine Steps and Run Pipeline](#combine)"
+ "### List of Steps\n",
+ "\n",
+ "1. [Step 1: Data Wrangler Preprocessing Step](#Step-1:-Data-Wrangler-Preprocessing-Step)\n",
+ "1. [Step 2: Create Dataset and Train/Test Split](#Step-2:-Create-Dataset-and-Train/Test-Split)\n",
+ "1. [Step 3: Train XGBoost Model](#Step-3:-Train-XGBoost-Model)\n",
+ "1. [Step 4: Model Pre-Deployment Step](#Step-4:-Model-Pre-Deployment-Step)\n",
+ "1. [Step 5: Register Model](#Step-5:-Register-Model)\n",
+ "1. [Step 6: Deploy Model](#Step-6:-Deploy-Model)\n",
+ "1. [Step 7: Monitor Model Deployed to SageMaker Hosted Endpoint](#Step-7:-Monitor-Model-Deployed-to-SageMaker-Hosted-Endpoint)\n",
+ "1. [Combine Steps and Run Pipeline](#Combine-Steps-and-Run-Pipeline)"
]
},
{
@@ -226,8 +337,7 @@
")\n",
"\n",
"model_approval_status = ParameterString(\n",
- " name=\"ModelApprovalStatus\",\n",
- " default_value=\"PendingManualApproval\"\n",
+ " name=\"ModelApprovalStatus\", default_value=\"PendingManualApproval\"\n",
")"
]
},
@@ -235,10 +345,34 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### Step 1: Data Wranger Preprocessing Step\n",
- "[Pipeline Overview](#pipelines)\n",
+ "### Step 1: Data Wrangler Preprocessing Step"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Update the data source in the `.flow` file\n",
+    "The `01_music_dataprep.flow` file is a JSON file containing instructions for where to find your data sources and how to transform the data. We'll update the objects that tell Data Wrangler where to find the input data on S3, pointing them at your default S3 bucket so that SageMaker Data Wrangler uses it as the data source.\n",
"\n",
+    "Make sure the `.flow` file is closed before running the next cell, or the new S3 file locations won't be written to the file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "update_data_sources(\"01_music_dataprep.flow\", tracks_data_source, ratings_data_source)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
"#### Upload flow to S3\n",
"This will become an input to the first step and, as such, needs to be in S3."
]
@@ -252,8 +386,10 @@
"# name of the flow file which should exist in the current notebook working directory\n",
"flow_file_name = \"01_music_dataprep.flow\"\n",
"\n",
- "s3_client.upload_file(Filename=flow_file_name, Bucket=bucket, Key=f'{prefix}/dataprep-notebooks/music_dataprep.flow')\n",
- "flow_s3_uri = f's3://{bucket}/{prefix}/dataprep-notebooks/music_dataprep.flow'\n",
+ "s3_client.upload_file(\n",
+ " Filename=flow_file_name, Bucket=bucket, Key=f\"{prefix}/dataprep-notebooks/music_dataprep.flow\"\n",
+ ")\n",
+ "flow_s3_uri = f\"s3://{bucket}/{prefix}/dataprep-notebooks/music_dataprep.flow\"\n",
"\n",
"print(f\"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}\")"
]
@@ -275,24 +411,28 @@
"data_sources = []\n",
"\n",
"## Input - S3 Source: tracks.csv\n",
- "data_sources.append(ProcessingInput(\n",
- " source=f\"s3://{bucket}/{prefix}/data/tracks_new.csv\", # You can override this to point to another dataset on S3\n",
- " destination=f\"{processing_dir}/data/tracks_new.csv\",\n",
- " input_name=\"tracks_new.csv\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- "))\n",
+ "data_sources.append(\n",
+ " ProcessingInput(\n",
+ " source=f\"s3://{bucket}/{prefix}/data/tracks_new.csv\", # You can override this to point to another dataset on S3\n",
+ " destination=f\"{processing_dir}/data/tracks_new.csv\",\n",
+ " input_name=\"tracks_new.csv\",\n",
+ " s3_data_type=\"S3Prefix\",\n",
+ " s3_input_mode=\"File\",\n",
+ " s3_data_distribution_type=\"FullyReplicated\",\n",
+ " )\n",
+ ")\n",
"\n",
"## Input - S3 Source: ratings.csv\n",
- "data_sources.append(ProcessingInput(\n",
- " source=f\"s3://{bucket}/{prefix}/data/ratings_new.csv\", # You can override this to point to another dataset on S3\n",
- " destination=f\"{processing_dir}/data/ratings_new.csv\",\n",
- " input_name=\"ratings_new.csv\",\n",
- " s3_data_type=\"S3Prefix\",\n",
- " s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
- "))\n",
+ "data_sources.append(\n",
+ " ProcessingInput(\n",
+ " source=f\"s3://{bucket}/{prefix}/data/ratings_new.csv\", # You can override this to point to another dataset on S3\n",
+ " destination=f\"{processing_dir}/data/ratings_new.csv\",\n",
+ " input_name=\"ratings_new.csv\",\n",
+ " s3_data_type=\"S3Prefix\",\n",
+ " s3_input_mode=\"File\",\n",
+ " s3_data_distribution_type=\"FullyReplicated\",\n",
+ " )\n",
+ ")\n",
"\n",
"## Input - Flow: 01_music_dataprep.flow\n",
"flow_input = ProcessingInput(\n",
@@ -301,7 +441,7 @@
" input_name=\"flow\",\n",
" s3_data_type=\"S3Prefix\",\n",
" s3_input_mode=\"File\",\n",
- " s3_data_distribution_type=\"FullyReplicated\"\n",
+ " s3_data_distribution_type=\"FullyReplicated\",\n",
")"
]
},
@@ -319,10 +459,9 @@
"outputs": [],
"source": [
"# Define feature group names we previously created in notebooks 02a-c\n",
- "fg_name_tracks = parameters['fg_name_tracks']\n",
- "fg_name_ratings = parameters['fg_name_ratings']\n",
- "fg_name_user_preferences = parameters['fg_name_user_preferences']\n",
- "dw_ecrlist = parameters['dw_ecrlist']"
+ "fg_name_tracks = \"track-features-music-rec\"\n",
+ "fg_name_ratings = \"ratings-features-music-rec\"\n",
+ "fg_name_user_preferences = \"user-5star-track-features-music-rec\""
]
},
{
@@ -334,23 +473,24 @@
"flow_output_tracks = sagemaker.processing.ProcessingOutput(\n",
" output_name=output_name_tracks,\n",
" app_managed=True,\n",
- " feature_store_output=sagemaker.processing.FeatureStoreOutput(\n",
- " feature_group_name=fg_name_tracks)\n",
- " )\n",
+ " feature_store_output=sagemaker.processing.FeatureStoreOutput(feature_group_name=fg_name_tracks),\n",
+ ")\n",
"\n",
"flow_output_user_preferences = sagemaker.processing.ProcessingOutput(\n",
" output_name=output_name_user_preferences,\n",
" app_managed=True,\n",
" feature_store_output=sagemaker.processing.FeatureStoreOutput(\n",
- " feature_group_name=fg_name_user_preferences)\n",
- " )\n",
+ " feature_group_name=fg_name_user_preferences\n",
+ " ),\n",
+ ")\n",
"\n",
"flow_output_ratings = sagemaker.processing.ProcessingOutput(\n",
" output_name=output_name_ratings,\n",
" app_managed=True,\n",
" feature_store_output=sagemaker.processing.FeatureStoreOutput(\n",
- " feature_group_name=fg_name_ratings)\n",
- " )"
+ " feature_group_name=fg_name_ratings\n",
+ " ),\n",
+ ")"
]
},
{
@@ -359,24 +499,12 @@
"metadata": {},
"outputs": [],
"source": [
- "# Output configuration used as processing job container arguments \n",
- "output_config_tracks = {\n",
- " output_name_tracks: {\n",
- " \"content_type\": \"CSV\"\n",
- " }\n",
- "}\n",
+ "# Output configuration used as processing job container arguments\n",
+ "output_config_tracks = {output_name_tracks: {\"content_type\": \"CSV\"}}\n",
"\n",
- "output_config_user_preferences = {\n",
- " output_name_user_preferences: {\n",
- " \"content_type\": \"CSV\"\n",
- " }\n",
- "}\n",
+ "output_config_user_preferences = {output_name_user_preferences: {\"content_type\": \"CSV\"}}\n",
"\n",
- "output_config_ratings = {\n",
- " output_name_ratings: {\n",
- " \"content_type\": \"CSV\"\n",
- " }\n",
- "}"
+ "output_config_ratings = {output_name_ratings: {\"content_type\": \"CSV\"}}"
]
},
{
@@ -404,41 +532,41 @@
"# Data Wrangler Container URL\n",
"# You can also find the proper container uri by exporting your Data Wrangler flow to a pipeline notebook\n",
"\n",
- "container_uri = f\"{dw_ecrlist['region'][region]}.dkr.ecr.{region}.amazonaws.com/sagemaker-data-wrangler-container:1.x\"\n",
+ "container_uri = sagemaker.image_uris.retrieve(framework=\"data-wrangler\", region=region)\n",
"\n",
"\n",
"flow_processor = sagemaker.processing.Processor(\n",
- " role=sagemaker_role, \n",
- " image_uri=container_uri, \n",
- " instance_count=flow_instance_count, \n",
- " instance_type=flow_instance_type, \n",
+ " role=sagemaker_role,\n",
+ " image_uri=container_uri,\n",
+ " instance_count=flow_instance_count,\n",
+ " instance_type=flow_instance_type,\n",
" volume_size_in_gb=30,\n",
" network_config=NetworkConfig(enable_network_isolation=False),\n",
- " sagemaker_session=sagemaker_session\n",
+ " sagemaker_session=sagemaker_session,\n",
")\n",
"\n",
"flow_step_tracks = ProcessingStep(\n",
- " name='DataWranglerStepTracks', \n",
- " processor=flow_processor, \n",
- " inputs=[flow_input] + data_sources, \n",
+ " name=\"DataWranglerStepTracks\",\n",
+ " processor=flow_processor,\n",
+ " inputs=[flow_input] + data_sources,\n",
" outputs=[flow_output_tracks],\n",
" job_arguments=[f\"--output-config '{json.dumps(output_config_tracks)}'\"],\n",
")\n",
"\n",
"flow_step_ratings = ProcessingStep(\n",
- " name='DataWranglerStepRatings', \n",
- " processor=flow_processor, \n",
- " inputs=[flow_input] + data_sources, \n",
+ " name=\"DataWranglerStepRatings\",\n",
+ " processor=flow_processor,\n",
+ " inputs=[flow_input] + data_sources,\n",
" outputs=[flow_output_ratings],\n",
- " job_arguments=[f\"--output-config '{json.dumps(output_config_ratings)}'\"]\n",
+ " job_arguments=[f\"--output-config '{json.dumps(output_config_ratings)}'\"],\n",
")\n",
"\n",
"flow_step_user_preferences = ProcessingStep(\n",
- " name='DataWranglerStepUserPref', \n",
- " processor=flow_processor, \n",
- " inputs=[flow_input] + data_sources, \n",
+ " name=\"DataWranglerStepUserPref\",\n",
+ " processor=flow_processor,\n",
+ " inputs=[flow_input] + data_sources,\n",
" outputs=[flow_output_user_preferences],\n",
- " job_arguments=[f\"--output-config '{json.dumps(output_config_user_preferences)}'\"]\n",
+ " job_arguments=[f\"--output-config '{json.dumps(output_config_user_preferences)}'\"],\n",
")"
]
},
@@ -446,10 +574,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### Step 2: Create Dataset and Train/Test Split\n",
- "\n",
- "[Pipeline Overview](#pipelines)"
+ "### Step 2: Create Dataset and Train/Test Split"
]
},
{
@@ -458,46 +583,57 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_client.upload_file(Filename='./code/create_datasets.py', Bucket=bucket, Key=f'{prefix}/code/create_datasets.py')\n",
- "create_dataset_script_uri = f's3://{bucket}/{prefix}/code/create_datasets.py'\n",
+ "s3_client.upload_file(\n",
+ " Filename=\"./code/create_datasets.py\", Bucket=bucket, Key=f\"{prefix}/code/create_datasets.py\"\n",
+ ")\n",
+ "create_dataset_script_uri = f\"s3://{bucket}/{prefix}/code/create_datasets.py\"\n",
"\n",
"create_dataset_processor = SKLearnProcessor(\n",
- " framework_version='0.23-1',\n",
+ " framework_version=\"0.23-1\",\n",
" role=sagemaker_role,\n",
" instance_type=\"ml.m5.4xlarge\",\n",
" instance_count=2,\n",
" volume_size_in_gb=100,\n",
- " base_job_name='music-recommendation-split-data',\n",
- " sagemaker_session=sagemaker_session)\n",
+ " base_job_name=\"music-rec-pipeline-split-data\",\n",
+ " sagemaker_session=sagemaker_session,\n",
+ ")\n",
"\n",
"create_dataset_step = ProcessingStep(\n",
- " name='SplitData',\n",
+ " name=\"SplitData\",\n",
" processor=create_dataset_processor,\n",
- " outputs = [\n",
- " sagemaker.processing.ProcessingOutput(output_name='train_data', source=f'{processing_dir}/output/train'),\n",
- " sagemaker.processing.ProcessingOutput(output_name='test_data', source=f'{processing_dir}/output/test')\n",
+ " outputs=[\n",
+ " sagemaker.processing.ProcessingOutput(\n",
+ " output_name=\"train_data\", source=f\"{processing_dir}/output/train\"\n",
+ " ),\n",
+ " sagemaker.processing.ProcessingOutput(\n",
+ " output_name=\"test_data\", source=f\"{processing_dir}/output/test\"\n",
+ " ),\n",
+ " ],\n",
+ " job_arguments=[\n",
+ " \"--feature-group-name-tracks\",\n",
+ " fg_name_tracks,\n",
+ " \"--feature-group-name-ratings\",\n",
+ " fg_name_ratings,\n",
+ " \"--feature-group-name-user-preferences\",\n",
+ " fg_name_user_preferences,\n",
+ " \"--bucket-name\",\n",
+ " bucket,\n",
+ " \"--bucket-prefix\",\n",
+ " prefix,\n",
+ " \"--region\",\n",
+ " region,\n",
" ],\n",
- " job_arguments=[\"--feature-group-name-tracks\", fg_name_tracks,\n",
- " \"--feature-group-name-ratings\", fg_name_ratings,\n",
- " \"--feature-group-name-user-preferences\", fg_name_user_preferences,\n",
- " \"--bucket-name\", bucket,\n",
- " \"--bucket-prefix\", prefix,\n",
- " \"--region\", region\n",
- " ],\n",
" code=create_dataset_script_uri,\n",
- " depends_on=[flow_step_tracks.name, flow_step_ratings.name, flow_step_user_preferences.name]\n",
- ")\n"
+ " depends_on=[flow_step_tracks.name, flow_step_ratings.name, flow_step_user_preferences.name],\n",
+ ")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
"### Step 3: Train XGBoost Model\n",
- "In this step we use the ParameterString `train_instance_param` defined at the beginning of the pipeline.\n",
- "\n",
- "[Pipeline Overview](#pipelines)"
+ "In this step we use the ParameterString `train_instance_param` defined at the beginning of the pipeline."
]
},
{
@@ -510,7 +646,7 @@
" \"max_depth\": \"4\",\n",
" \"eta\": \"0.2\",\n",
" \"objective\": \"reg:squarederror\",\n",
- " \"num_round\": \"100\"\n",
+ " \"num_round\": \"100\",\n",
"}\n",
"\n",
"save_interval = 5"
@@ -525,13 +661,13 @@
"xgb_estimator = Estimator(\n",
" role=sagemaker_role,\n",
" instance_count=2,\n",
- " instance_type='ml.m5.4xlarge',\n",
+ " instance_type=\"ml.m5.4xlarge\",\n",
" volume_size=60,\n",
" image_uri=sagemaker.image_uris.retrieve(\"xgboost\", region, \"0.90-2\"),\n",
" hyperparameters=hyperparameters,\n",
- " output_path=f's3://{bucket}/{prefix}/training_jobs',\n",
- " base_job_name='xgb-music-rec-model-pipeline',\n",
- " max_run=1800\n",
+ " output_path=f\"s3://{bucket}/{prefix}/training_jobs\",\n",
+ " base_job_name=\"xgb-music-rec-pipeline-model\",\n",
+ " max_run=1800,\n",
")"
]
},
@@ -542,45 +678,30 @@
"outputs": [],
"source": [
"train_step = TrainingStep(\n",
- " name='TrainStep',\n",
+ " name=\"TrainStep\",\n",
" estimator=xgb_estimator,\n",
" inputs={\n",
- " 'train': sagemaker.inputs.TrainingInput(\n",
- " s3_data=create_dataset_step.properties.ProcessingOutputConfig.Outputs['train_data'].S3Output.S3Uri,\n",
- " content_type=\"text/csv\"\n",
+ " \"train\": sagemaker.inputs.TrainingInput(\n",
+ " s3_data=create_dataset_step.properties.ProcessingOutputConfig.Outputs[\n",
+ " \"train_data\"\n",
+ " ].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
" ),\n",
- " 'validation': sagemaker.inputs.TrainingInput(\n",
- " s3_data=create_dataset_step.properties.ProcessingOutputConfig.Outputs['test_data'].S3Output.S3Uri,\n",
- " content_type=\"text/csv\"\n",
- " )\n",
- " }\n",
+ " \"validation\": sagemaker.inputs.TrainingInput(\n",
+ " s3_data=create_dataset_step.properties.ProcessingOutputConfig.Outputs[\n",
+ " \"test_data\"\n",
+ " ].S3Output.S3Uri,\n",
+ " content_type=\"text/csv\",\n",
+ " ),\n",
+ " },\n",
")"
]
},
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "#TuningStep"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### Step 4: Model Pre-Deployment Step\n",
- "\n",
- "[Pipeline Overview](#pipelines)"
+ "### Step 4: Model Pre-Deployment Step"
]
},
{
@@ -590,34 +711,24 @@
"outputs": [],
"source": [
"model = sagemaker.model.Model(\n",
- " name='music-recommender-xgboost-model',\n",
+ " name=\"music-rec-pipeline-xgboost-model\",\n",
" image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,\n",
" model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n",
" sagemaker_session=sagemaker_session,\n",
- " role=sagemaker_role\n",
+ " role=sagemaker_role,\n",
")\n",
"\n",
- "inputs = sagemaker.inputs.CreateModelInput(\n",
- " instance_type=\"ml.m4.xlarge\"\n",
- ")\n",
+ "inputs = sagemaker.inputs.CreateModelInput(instance_type=\"ml.m4.xlarge\")\n",
"\n",
- "create_model_step = CreateModelStep(\n",
- " name=\"CreateModel\",\n",
- " model=model,\n",
- " inputs=inputs\n",
- ")"
+ "create_model_step = CreateModelStep(name=\"CreateModel\", model=model, inputs=inputs)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
"### Step 5: Register Model\n",
- "In this step you will use the ParameterString `model_approval_status` defined at the outset of the pipeline code.\n",
- "\n",
- "\n",
- "[Pipeline Overview](#pipelines)"
+ "In this step you will use the ParameterString `model_approval_status` defined at the outset of the pipeline code."
]
},
{
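The register cell itself is unchanged by this diff and therefore not visible here; as a hedged sketch only, a `RegisterModel` step built from the estimator and training step above typically looks like the following, with the instance lists and model package group name being illustrative placeholders.

```python
# Hedged sketch of a typical RegisterModel step; values are illustrative and not
# the notebook's actual cell.
register_step = RegisterModel(
    name="RegisterModel",
    estimator=xgb_estimator,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.xlarge"],
    transform_instances=["ml.m5.xlarge"],
    model_package_group_name="music-rec-model-group",  # placeholder group name
    approval_status=model_approval_status,
)
```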
@@ -643,10 +754,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### Step 6: Deploy Model\n",
- "\n",
- "[Pipeline Overview](#pipelines)"
+ "### Step 6: Deploy Model"
]
},
{
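The `deploy_model.py` script referenced by the `job_arguments` below is not part of this diff. Under the assumption that it simply turns the created model into a hosted endpoint, a minimal sketch of such a script would be:

```python
# Hedged sketch only: ./code/deploy_model.py is not shown in this diff. This
# illustrates what a deploy script driven by the job_arguments below typically
# does: create an endpoint config for the model, then create the endpoint.
import argparse
import boto3

parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str)
parser.add_argument("--region", type=str)
parser.add_argument("--endpoint-instance-type", type=str)
parser.add_argument("--endpoint-name", type=str)
args = parser.parse_args()

sm = boto3.client("sagemaker", region_name=args.region)

config_name = f"{args.endpoint_name}-config"  # illustrative naming convention
sm.create_endpoint_config(
    EndpointConfigName=config_name,
    ProductionVariants=[
        {
            "VariantName": "AllTraffic",
            "ModelName": args.model_name,
            "InstanceType": args.endpoint_instance_type,
            "InitialInstanceCount": 1,
        }
    ],
)
sm.create_endpoint(EndpointName=args.endpoint_name, EndpointConfigName=config_name)
sm.get_waiter("endpoint_in_service").wait(EndpointName=args.endpoint_name)
```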
@@ -655,39 +763,44 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_client.upload_file(Filename='./code/deploy_model.py', Bucket=bucket, Key=f'{prefix}/code/deploy_model.py')\n",
- "deploy_model_script_uri = f's3://{bucket}/{prefix}/code/deploy_model.py'\n",
- "pipeline_endpoint_name = 'music-rec-model-endpoint'\n",
+ "s3_client.upload_file(\n",
+ " Filename=\"./code/deploy_model.py\", Bucket=bucket, Key=f\"{prefix}/code/deploy_model.py\"\n",
+ ")\n",
+ "deploy_model_script_uri = f\"s3://{bucket}/{prefix}/code/deploy_model.py\"\n",
+ "pipeline_endpoint_name = \"music-rec-pipeline-endpoint\"\n",
"\n",
"deploy_model_processor = SKLearnProcessor(\n",
- " framework_version='0.23-1',\n",
+ " framework_version=\"0.23-1\",\n",
" role=sagemaker_role,\n",
- " instance_type='ml.m5.xlarge',\n",
+ " instance_type=\"ml.m5.xlarge\",\n",
" instance_count=1,\n",
" volume_size_in_gb=60,\n",
- " base_job_name='music-recommender-deploy-model',\n",
- " sagemaker_session=sagemaker_session)\n",
+ " base_job_name=\"music-recommender-deploy-model\",\n",
+ " sagemaker_session=sagemaker_session,\n",
+ ")\n",
"\n",
"deploy_step = ProcessingStep(\n",
- " name='DeployModel',\n",
+ " name=\"DeployModel\",\n",
" processor=deploy_model_processor,\n",
" job_arguments=[\n",
- " \"--model-name\", create_model_step.properties.ModelName, \n",
- " \"--region\", region,\n",
- " \"--endpoint-instance-type\", deploy_model_instance_type,\n",
- " \"--endpoint-name\", pipeline_endpoint_name\n",
+ " \"--model-name\",\n",
+ " create_model_step.properties.ModelName,\n",
+ " \"--region\",\n",
+ " region,\n",
+ " \"--endpoint-instance-type\",\n",
+ " deploy_model_instance_type,\n",
+ " \"--endpoint-name\",\n",
+ " pipeline_endpoint_name,\n",
" ],\n",
- " code=deploy_model_script_uri)"
+ " code=deploy_model_script_uri,\n",
+ ")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "### Step 7: Monitor Model Deployed to SageMaker Hosted Endpoint\n",
- "\n",
- "[Pipeline Overview](#pipelines)"
+ "### Step 7: Monitor Model Deployed to SageMaker Hosted Endpoint\n"
]
},
{
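Likewise, `model_monitor.py` is not shown in this diff. Assuming it baselines the validation data and then attaches a daily data-quality schedule to the endpoint, a hedged sketch of the script's core logic using the SageMaker SDK's `DefaultModelMonitor` could look like this:

```python
# Hedged sketch only: ./code/model_monitor.py is not shown in this diff. A data
# quality monitoring setup driven by the job_arguments below would typically
# baseline the validation data and create a daily monitoring schedule.
from sagemaker.model_monitor import CronExpressionGenerator, DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

role = "<execution-role-arn>"  # placeholder; the real script resolves the role itself
baseline_data_uri = "s3://bucket/prefix/data/val/val_data.csv"  # --baseline-data-uri
reports_uri = "s3://bucket/prefix/monitor/reports"  # illustrative output path

monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    volume_size_in_gb=60,
)
monitor.suggest_baseline(
    baseline_dataset=baseline_data_uri,
    dataset_format=DatasetFormat.csv(header=True),
    output_s3_uri=reports_uri,
    wait=True,
)
monitor.create_monitoring_schedule(
    monitor_schedule_name="music-rec-pipeline-daily-monitor",  # --schedule-name
    endpoint_input="music-rec-pipeline-endpoint",  # --endpoint
    output_s3_uri=reports_uri,
    statistics=monitor.baseline_statistics(),
    constraints=monitor.suggested_constraints(),
    schedule_cron_expression=CronExpressionGenerator.daily(),
)
```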
@@ -696,35 +809,47 @@
"metadata": {},
"outputs": [],
"source": [
- "s3_client.upload_file(Filename='./code/model_monitor.py', Bucket=bucket, Key=f'{prefix}/code/model_monitor.py')\n",
- "model_monitor_script_uri = f's3://{bucket}/{prefix}/code/model_monitor.py'\n",
- "mon_schedule_name_base = 'music-recommender-daily-monitor'\n",
+ "s3_client.upload_file(\n",
+ " Filename=\"./code/model_monitor.py\", Bucket=bucket, Key=f\"{prefix}/code/model_monitor.py\"\n",
+ ")\n",
+ "model_monitor_script_uri = f\"s3://{bucket}/{prefix}/code/model_monitor.py\"\n",
+ "mon_schedule_name_base = \"music-rec-pipeline-daily-monitor\"\n",
"\n",
"\n",
"model_monitor_processor = SKLearnProcessor(\n",
- " framework_version='0.23-1',\n",
+ " framework_version=\"0.23-1\",\n",
" role=sagemaker_role,\n",
- " instance_type='ml.m5.xlarge',\n",
+ " instance_type=\"ml.m5.xlarge\",\n",
" instance_count=1,\n",
" volume_size_in_gb=60,\n",
- " base_job_name='music-recommendation-model-monitor',\n",
- " sagemaker_session=sagemaker_session)\n",
+ " base_job_name=\"music-rec-pipeline-model-monitor\",\n",
+ " sagemaker_session=sagemaker_session,\n",
+ ")\n",
"\n",
"monitor_model_step = ProcessingStep(\n",
- " name='ModelMonitor',\n",
+ " name=\"ModelMonitor\",\n",
" processor=model_monitor_processor,\n",
- " outputs = [\n",
- " sagemaker.processing.ProcessingOutput(output_name='model_baseline', source=f'{processing_dir}/output/baselineresults')\n",
+ " outputs=[\n",
+ " sagemaker.processing.ProcessingOutput(\n",
+ " output_name=\"model_baseline\", source=f\"{processing_dir}/output/baselineresults\"\n",
+ " )\n",
+ " ],\n",
+ " job_arguments=[\n",
+ " \"--baseline-data-uri\",\n",
+ " val_data_uri,\n",
+ " \"--bucket-name\",\n",
+ " bucket,\n",
+ " \"--bucket-prefix\",\n",
+ " prefix,\n",
+ " \"--endpoint\",\n",
+ " pipeline_endpoint_name,\n",
+ " \"--region\",\n",
+ " region,\n",
+ " \"--schedule-name\",\n",
+ " mon_schedule_name_base,\n",
" ],\n",
- " job_arguments=[\"--baseline-data-uri\", val_data_uri,\n",
- " \"--bucket-name\", bucket,\n",
- " \"--bucket-prefix\", prefix,\n",
- " \"--endpoint\", pipeline_endpoint_name,\n",
- " \"--region\", region,\n",
- " \"--schedule-name\", mon_schedule_name_base\n",
- " ],\n",
" code=model_monitor_script_uri,\n",
- " depends_on=[deploy_step.name]\n",
+ " depends_on=[deploy_step.name],\n",
")"
]
},
@@ -732,10 +857,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
- "\n",
- "### Combine the Pipeline Steps and Run\n",
- "[Pipeline Overview](#pipelines)\n",
+ "### Combine Steps and Run Pipeline\n",
"\n",
"Once all of our steps are defined, we can put them together using the SageMaker `Pipeline` object. While we pass the steps in order so that it is easier to read, technically the order that we pass them does not matter since the pipeline DAG will parse it out properly based on any dependencies between steps. If the input of one step is the output of another step, the Pipelines understands which must come first."
]
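One nuance behind the paragraph above: Pipelines infers ordering from data dependencies, but steps with no data handoff between them must be ordered explicitly, which is why several cells above pass `depends_on`. A small reminder sketch, reusing step names already defined in this notebook:

```python
# Ordering inferred from a data dependency: TrainStep consumes SplitData's output.
train_data_uri = create_dataset_step.properties.ProcessingOutputConfig.Outputs[
    "train_data"
].S3Output.S3Uri

# No data handoff between deploy and monitor, so ordering is declared explicitly,
# mirroring the monitor_model_step definition above:
# ProcessingStep(..., depends_on=[deploy_step.name])
```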
@@ -748,23 +870,16 @@
},
"outputs": [],
"source": [
- "pipeline_name = f'MusicRecommendationPipeline'\n",
- "dataprep_pipeline_name = f'MusicRecommendationDataPrepPipeline'\n",
- "train_deploy_pipeline_name = f'MusicRecommendationTrainDeployPipeline'\n",
- "\n",
- "ps.add({'pipeline_name':pipeline_name, 'dataprep_pipeline_name':dataprep_pipeline_name, \n",
- " 'train_deploy_pipeline_name':train_deploy_pipeline_name,\n",
- " 'pipeline_endpoint_name':pipeline_endpoint_name}, \n",
- " namespace='music-rec'\n",
- ")\n",
- "ps.store()"
+ "pipeline_name = f\"MusicRecommendationPipeline\"\n",
+ "dataprep_pipeline_name = f\"MusicRecommendationDataPrepPipeline\"\n",
+ "train_deploy_pipeline_name = f\"MusicRecommendationTrainDeployPipeline\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Option 1: The Entire Pipeline End to end"
+ "#### Option 1: The Entire Pipeline End to end"
]
},
{
@@ -773,9 +888,7 @@
"metadata": {},
"outputs": [],
"source": [
- "pipeline_name = f'MusicRecommendationPipeline'\n",
- "\n",
- "ps.add({'pipeline_name':pipeline_name}, namespace='music-rec')"
+ "pipeline_name = f\"MusicRecommendationPipeline\""
]
},
{
@@ -786,20 +899,19 @@
"source": [
"pipeline = Pipeline(\n",
" name=pipeline_name,\n",
- " parameters=[\n",
- " train_instance_param, \n",
- " model_approval_status],\n",
+ " parameters=[train_instance_param, model_approval_status],\n",
" steps=[\n",
" flow_step_tracks,\n",
" flow_step_user_preferences,\n",
" flow_step_ratings,\n",
" create_dataset_step,\n",
- " train_step, \n",
- " create_model_step, \n",
+ " train_step,\n",
+ " create_model_step,\n",
" register_step,\n",
" deploy_step,\n",
- " monitor_model_step \n",
- " ])"
+ " monitor_model_step,\n",
+ " ],\n",
+ ")"
]
},
{
@@ -832,15 +944,9 @@
"source": [
"pipeline_dataprep = Pipeline(\n",
" name=dataprep_pipeline_name,\n",
- " parameters=[\n",
- " train_instance_param, \n",
- " model_approval_status],\n",
- " steps=[\n",
- " flow_step_tracks,\n",
- " flow_step_user_preferences,\n",
- " flow_step_ratings,\n",
- " create_dataset_step\n",
- " ])"
+ " parameters=[train_instance_param, model_approval_status],\n",
+ " steps=[flow_step_tracks, flow_step_user_preferences, flow_step_ratings, create_dataset_step],\n",
+ ")"
]
},
{
@@ -857,20 +963,31 @@
"outputs": [],
"source": [
"create_dataset_step_no_depend = ProcessingStep(\n",
- " name='SplitData',\n",
+ " name=\"SplitData\",\n",
" processor=create_dataset_processor,\n",
- " outputs = [\n",
- " sagemaker.processing.ProcessingOutput(output_name='train_data', source=f'{processing_dir}/output/train'),\n",
- " sagemaker.processing.ProcessingOutput(output_name='test_data', source=f'{processing_dir}/output/test')\n",
+ " outputs=[\n",
+ " sagemaker.processing.ProcessingOutput(\n",
+ " output_name=\"train_data\", source=f\"{processing_dir}/output/train\"\n",
+ " ),\n",
+ " sagemaker.processing.ProcessingOutput(\n",
+ " output_name=\"test_data\", source=f\"{processing_dir}/output/test\"\n",
+ " ),\n",
+ " ],\n",
+ " job_arguments=[\n",
+ " \"--feature-group-name-tracks\",\n",
+ " fg_name_tracks,\n",
+ " \"--feature-group-name-ratings\",\n",
+ " fg_name_ratings,\n",
+ " \"--feature-group-name-user-preferences\",\n",
+ " fg_name_user_preferences,\n",
+ " \"--bucket-name\",\n",
+ " bucket,\n",
+ " \"--bucket-prefix\",\n",
+ " prefix,\n",
+ " \"--region\",\n",
+ " region,\n",
" ],\n",
- " job_arguments=[\"--feature-group-name-tracks\", fg_name_tracks,\n",
- " \"--feature-group-name-ratings\", fg_name_ratings,\n",
- " \"--feature-group-name-user-preferences\", fg_name_user_preferences,\n",
- " \"--bucket-name\", bucket,\n",
- " \"--bucket-prefix\", prefix,\n",
- " \"--region\", region\n",
- " ],\n",
- " code=create_dataset_script_uri\n",
+ " code=create_dataset_script_uri,\n",
")"
]
},
@@ -882,17 +999,16 @@
"source": [
"pipeline_train_deploy_monitor = Pipeline(\n",
" name=train_deploy_pipeline_name,\n",
- " parameters=[\n",
- " train_instance_param, \n",
- " model_approval_status],\n",
+ " parameters=[train_instance_param, model_approval_status],\n",
" steps=[\n",
" create_dataset_step_no_depend,\n",
- " train_step, \n",
- " create_model_step, \n",
+ " train_step,\n",
+ " create_model_step,\n",
" register_step,\n",
" deploy_step,\n",
- " monitor_model_step \n",
- " ])"
+ " monitor_model_step,\n",
+ " ],\n",
+ ")"
]
},
{
@@ -937,7 +1053,7 @@
},
"outputs": [],
"source": [
- "#json.loads(pipeline.describe()['PipelineDefinition'])"
+ "# json.loads(pipeline.describe()['PipelineDefinition'])"
]
},
{
@@ -957,7 +1073,7 @@
"outputs": [],
"source": [
"# Special pipeline parameters can be defined or changed here\n",
- "parameters = {'TrainingInstance': 'ml.m5.4xlarge'}"
+ "parameters = {\"TrainingInstance\": \"ml.m5.4xlarge\"}"
]
},
{
@@ -977,7 +1093,7 @@
"start_response = pipeline.start(parameters=parameters)\n",
"# start_response = pipeline_dataprep.start(parameters=parameters)\n",
"# start_response = pipeline_train_deploy_monitor.start(parameters=parameters)\n",
- "start_response.wait(delay=60, max_attempts=200)\n",
+ "start_response.wait(delay=60, max_attempts=1000)\n",
"start_response.describe()"
]
},
@@ -988,20 +1104,41 @@
"After completion we can use Sagemaker Studio's **Components and Registries** tab to see our Pipeline graph and any further error or log messages."
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Clean Up\n",
+ "\n",
+ "----"
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "import demo_helpers\n",
+ "\n",
+ "demo_helpers.delete_project_resources(\n",
+ " sagemaker_boto_client=sagemaker_boto_client,\n",
+ " sagemaker_session=sagemaker_session,\n",
+ " endpoint_names=[pipeline_endpoint_name],\n",
+ " pipeline_names=[pipeline_name, dataprep_pipeline_name, train_deploy_pipeline_name],\n",
+ " prefix=prefix,\n",
+ " delete_s3_objects=True,\n",
+ " bucket_name=bucket,\n",
+ ")"
+ ]
}
],
"metadata": {
- "instance_type": "ml.t3.medium",
+ "instance_type": "ml.m5.4xlarge",
"kernelspec": {
- "display_name": "Python 3 (Data Science)",
+ "display_name": "conda_python3",
"language": "python",
- "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0"
+ "name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
@@ -1013,7 +1150,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.7.10"
+ "version": "3.6.13"
}
},
"nbformat": 4,
diff --git a/end_to_end/music_recommendation/index.rst b/end_to_end/music_recommendation/index.rst
index 9fd6d132b3..e8ddb460ee 100644
--- a/end_to_end/music_recommendation/index.rst
+++ b/end_to_end/music_recommendation/index.rst
@@ -9,12 +9,7 @@ Music Recommender System across the Entire ML-Lifecycle with Amazon SageMaker
.. toctree::
:maxdepth: 1
- 00_overview_arch_data.ipynb
- 02a_export_fg_tracks.ipynb
- 02b_export_fg_5star_features.ipynb
- 02c_export_fg_ratings.ipynb
- 03_train_model_lineage_registry_debugger.ipynb
- 04_deploy_infer_explain.ipynb
- 05_model_monitor.ipynb
- 06_pipeline.ipynb
- 07_clean_up.ipynb
+ 01_data_exploration
+ 02_export_feature_groups
+ 03_train_deploy_debugger_explain_monitor_registry
+ end_to_end_pipeline
\ No newline at end of file