Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
710 changes: 0 additions & 710 deletions use-cases/computer_vision/1-metastases-detection-train-model.ipynb

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Computer Vision for Medical Imaging: Part 4. SageMaker Pipelines\n",
"This notebook is the final part of a 4-part series of techniques and services offer by SageMaker to build a model which predicts if an image of cells contains cancer. This notebook describes how to automate the ML workflow using SageMaker Pipelines."
"# Computer Vision for Medical Imaging - Pipeline Mode\n",
"This notebook showcases techniques and services offered by SageMaker to build a model which predicts if an image of cells contains cancer. This notebook describes how to automate the ML workflow using SageMaker Pipelines."
]
},
{
Expand Down Expand Up @@ -37,8 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
"%store -r\n",
"%store"
"! pip install --upgrade sagemaker boto3"
]
},
{
Expand All @@ -48,6 +47,28 @@
"## Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Install any missing dependencies into the running kernel.\n",
"# NOTE: a pip package name can differ from its import name\n",
"# (pip's opencv-python is imported as cv2), so the two are mapped\n",
"# explicitly -- probing __import__(\"opencv-python\") would fail on\n",
"# every run and trigger a redundant install each time.\n",
"def import_or_install(package, import_name=None):\n",
"    \"\"\"Import import_name (default: package); pip install package on failure.\"\"\"\n",
"    try:\n",
"        __import__(import_name or package)\n",
"    except ImportError:\n",
"        # %pip targets the current kernel's environment (unlike ! pip)\n",
"        %pip install $package\n",
"\n",
"\n",
"required_packages = [\n",
"    (\"sagemaker\", None),\n",
"    (\"boto3\", None),\n",
"    (\"h5py\", None),\n",
"    (\"tqdm\", None),\n",
"    (\"matplotlib\", None),\n",
"    (\"opencv-python\", \"cv2\"),\n",
"]\n",
"\n",
"for package, import_name in required_packages:\n",
"    import_or_install(package, import_name)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -59,6 +80,12 @@
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import cv2\n",
"import os\n",
"import zipfile\n",
"import h5py\n",
"import mxnet as mx\n",
"from datetime import datetime\n",
"from tqdm import tqdm\n",
"\n",
"from sagemaker.workflow.pipeline import Pipeline\n",
"from sagemaker.workflow.steps import CreateModelStep\n",
Expand Down Expand Up @@ -96,6 +123,208 @@
"bucket = sagemaker.Session().default_bucket()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Ensure the local data directory exists before downloading into it;\n",
"# exist_ok makes this a no-op when the directory is already present.\n",
"os.makedirs(\"data\", exist_ok=True)\n",
"\n",
"# download zip file from public s3 bucket\n",
"!wget -P data https://sagemaker-sample-files.s3.amazonaws.com/datasets/image/pcam/medical_images.zip"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Unpack the archive, then load the tile images (x) and labels (y).\n",
"with zipfile.ZipFile(\"data/medical_images.zip\") as zf:\n",
"    zf.extractall()  # assumes archive members carry the data/ prefix -- TODO confirm\n",
"\n",
"# Open the HDF5 file by path inside a context manager so the handle is\n",
"# closed on exit (the original left the h5py.File object open).\n",
"with h5py.File(\"data/camelyon16_tiles.h5\", \"r\") as f:\n",
"    X = f[\"x\"][()]  # [()] reads the whole dataset into an in-memory array\n",
"    y = f[\"y\"][()]\n",
"\n",
"print(\"Shape of X:\", X.shape)\n",
"print(\"Shape of y:\", y.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# write to session s3 bucket\n",
"s3_client.upload_file(\"data/medical_images.zip\", bucket, f\"data/medical_images.zip\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Remove the local archive copy (it has already been uploaded to S3).\n",
"import os\n",
"\n",
"zip_path = \"data/medical_images.zip\"\n",
"if not os.path.exists(zip_path):\n",
"    print(\"The file does not exist\")\n",
"else:\n",
"    os.remove(zip_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## View Sample Images from Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def preview_images(X, y, n, cols):\n",
"    \"\"\"Show the first n images of X in a grid of `cols` columns, titled by label.\n",
"\n",
"    X: array of images; y: array of labels aligned with X;\n",
"    n: number of images to display; cols: grid columns.\n",
"    \"\"\"\n",
"    sample_images = X[:n]\n",
"    sample_labels = y[:n]\n",
"\n",
"    rows = int(np.ceil(n / cols))\n",
"    fig, axs = plt.subplots(rows, cols, figsize=(11.5, 7))\n",
"\n",
"    for i, ax in enumerate(axs.flatten()):\n",
"        # The grid may contain more slots than images (rows * cols >= n);\n",
"        # the original indexed past the sample and raised IndexError when\n",
"        # n was not a multiple of cols. Blank out the spare axes instead.\n",
"        ax.axis(\"off\")\n",
"        if i >= len(sample_images):\n",
"            continue\n",
"        ax.imshow(sample_images[i])\n",
"        ax.set_title(f\"Label: {sample_labels[i]}\")\n",
"\n",
"    plt.tight_layout()\n",
"\n",
"\n",
"preview_images(X, y, 15, 5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Shuffle and Split Dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# NOTE(review): import is mid-notebook and sklearn is not in the\n",
"# required_packages install list -- consider moving/adding it.\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Take plain array copies of the loaded data before splitting.\n",
"X_numpy = X[:]\n",
"y_numpy = y[:]\n",
"\n",
"# Hold out 1000 samples for test, then 2000 of the remainder for\n",
"# validation; fixed random_state values keep the splits reproducible.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X_numpy, y_numpy, test_size=1000, random_state=0\n",
")\n",
"X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=2000, random_state=1)\n",
"\n",
"print(X_train.shape)\n",
"print(X_val.shape)\n",
"print(X_test.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert Splits to RecordIO Format"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def write_to_recordio(X: np.ndarray, y: np.ndarray, prefix: str):\n",
"    \"\"\"Write images X with labels y to {prefix}.rec / {prefix}.idx in\n",
"    MXNet indexed RecordIO format, one JPEG-encoded record per image.\"\"\"\n",
"    record = mx.recordio.MXIndexedRecordIO(idx_path=f\"{prefix}.idx\", uri=f\"{prefix}.rec\", flag=\"w\")\n",
"    for idx, arr in enumerate(tqdm(X)):\n",
"        # IRHeader(flag, label, id, id2): label from y, record id = index.\n",
"        header = mx.recordio.IRHeader(0, y[idx], idx, 0)\n",
"        # Encode the image as JPEG (quality 95) together with its header.\n",
"        s = mx.recordio.pack_img(\n",
"            header,\n",
"            arr,\n",
"            quality=95,\n",
"            img_fmt=\".jpg\",\n",
"        )\n",
"        record.write_idx(idx, s)\n",
"    record.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"write_to_recordio(X_train, y_train, prefix=\"data/train\")\n",
"write_to_recordio(X_val, y_val, prefix=\"data/val\")\n",
"write_to_recordio(X_test, y_test, prefix=\"data/test\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Upload Data Splits to S3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"prefix = \"cv-metastasis\"\n",
"\n",
"try:\n",
"    # us-east-1 rejects an explicit LocationConstraint, so only pass a\n",
"    # CreateBucketConfiguration for other regions.\n",
"    config = {} if region == \"us-east-1\" else {\"CreateBucketConfiguration\": {\"LocationConstraint\": region}}\n",
"    s3_client.create_bucket(Bucket=bucket, ACL=\"private\", **config)\n",
"    print(f\"Created S3 bucket: {bucket}\")\n",
"\n",
"except Exception as e:\n",
"    # Only botocore ClientError carries .response; getattr keeps other\n",
"    # exception types from being masked by an AttributeError here.\n",
"    error_code = getattr(e, \"response\", {}).get(\"Error\", {}).get(\"Code\")\n",
"    if error_code == \"BucketAlreadyOwnedByYou\":\n",
"        print(f\"Using existing bucket: {bucket}\")\n",
"    else:\n",
"        raise  # re-raise without wrapping to preserve the traceback\n",
"]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"s3_client.upload_file(\"data/train.rec\", bucket, f\"{prefix}/data/train/train.rec\")\n",
"s3_client.upload_file(\"data/val.rec\", bucket, f\"{prefix}/data/val/val.rec\")\n",
"s3_client.upload_file(\"data/test.rec\", bucket, f\"{prefix}/data/test/test.rec\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -110,13 +339,15 @@
"outputs": [],
"source": [
"training_image = sagemaker.image_uris.retrieve(\"image-classification\", region)\n",
"num_training_samples = X_train.shape[0]\n",
"num_classes = len(np.unique(y_train))\n",
"\n",
"hyperparameters = {\n",
" \"num_layers\": 18,\n",
" \"use_pretrained_model\": 1,\n",
" \"augmentation_type\": \"crop_color_transform\",\n",
" \"image_shape\": \"3,96,96\",\n",
" \"num_classes\": 2,\n",
" \"num_classes\": num_classes,\n",
" \"num_training_samples\": num_training_samples,\n",
" \"mini_batch_size\": 64,\n",
" \"epochs\": 5,\n",
Expand Down Expand Up @@ -255,6 +486,8 @@
"metadata": {},
"outputs": [],
"source": [
"mpg_name = \"cv-metastasis-{}\".format(datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\"))\n",
"\n",
"model_approval_status = ParameterString(\n",
" name=\"ModelApprovalStatus\", default_value=\"PendingManualApproval\"\n",
")\n",
Expand Down Expand Up @@ -287,7 +520,7 @@
"source": [
"model = sagemaker.model.Model(\n",
" name=f\"{mpg_name}-pipline\",\n",
" image_uri=train_step.properties.AlgorithmSpecification.TrainingImage,\n",
" image_uri=training_image,\n",
" model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,\n",
" sagemaker_session=sagemaker_session,\n",
" role=sagemaker_role,\n",
Expand Down Expand Up @@ -315,6 +548,7 @@
" Filename=\"deploy_model.py\", Bucket=bucket, Key=f\"{prefix}/code/deploy_model.py\"\n",
")\n",
"deploy_model_script_uri = f\"s3://{bucket}/{prefix}/code/deploy_model.py\"\n",
"deploy_instance_type = \"ml.m4.xlarge\"\n",
"\n",
"deploy_model_processor = SKLearnProcessor(\n",
" framework_version=\"0.23-1\",\n",
Expand Down Expand Up @@ -355,7 +589,7 @@
"metadata": {},
"outputs": [],
"source": [
"pipeline_name = f\"{prefix}-pipeline\"\n",
"pipeline_name = \"{}-pipeline-{}\".format(prefix, datetime.now().strftime(\"%Y-%m-%d-%H-%M-%S\"))\n",
"\n",
"pipeline = Pipeline(\n",
" name=pipeline_name,\n",
Expand Down Expand Up @@ -419,7 +653,49 @@
"metadata": {},
"outputs": [],
"source": [
"import time  # used by time.sleep below; absent from the notebook's import cell\n",
"\n",
"\n",
"def delete_model_package_group(sm_client, package_group_name):\n",
"    \"\"\"Delete every model package version in a group, then the group itself.\n",
"\n",
"    Failures are printed and swallowed so notebook cleanup keeps going.\n",
"    \"\"\"\n",
"    try:\n",
"        model_versions = sm_client.list_model_packages(ModelPackageGroupName=package_group_name)\n",
"\n",
"    except Exception as e:\n",
"        print(\"{} \\n\".format(e))\n",
"        return\n",
"\n",
"    # A group cannot be deleted while it still contains package versions.\n",
"    for model_version in model_versions[\"ModelPackageSummaryList\"]:\n",
"        try:\n",
"            sm_client.delete_model_package(ModelPackageName=model_version[\"ModelPackageArn\"])\n",
"        except Exception as e:\n",
"            print(\"{} \\n\".format(e))\n",
"        time.sleep(0.5)  # Ensure requests aren't throttled\n",
"\n",
"    try:\n",
"        sm_client.delete_model_package_group(ModelPackageGroupName=package_group_name)\n",
"        print(\"{} model package group deleted\".format(package_group_name))\n",
"    except Exception as e:\n",
"        print(\"{} \\n\".format(e))\n",
"        return\n",
"\n",
"\n",
"def delete_sagemaker_pipeline(sm_client, pipeline_name):\n",
"    \"\"\"Delete the named SageMaker pipeline; print and swallow any failure.\"\"\"\n",
"    try:\n",
"        sm_client.delete_pipeline(\n",
"            PipelineName=pipeline_name,\n",
"        )\n",
"        print(\"{} pipeline deleted\".format(pipeline_name))\n",
"    except Exception as e:\n",
"        print(\"{} \\n\".format(e))\n",
"        return"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Clean up: delete the model package group and pipeline created above.\n",
"client = sagemaker.Session().sagemaker_client\n",
"delete_model_package_group(client, mpg_name)\n",
"delete_sagemaker_pipeline(client, pipeline_name)"
]
},
{
Expand All @@ -433,9 +709,9 @@
"metadata": {
"instance_type": "ml.t3.medium",
"kernelspec": {
"display_name": "conda_python3",
"display_name": "conda_mxnet_p36",
"language": "python",
"name": "conda_python3"
"name": "conda_mxnet_p36"
},
"language_info": {
"codemirror_mode": {
Expand Down
Loading