diff --git a/VantageCloud_Lake/Getting_Started/Demo_XGB_Scoring.py b/VantageCloud_Lake/Getting_Started/Demo_XGB_Scoring.py new file mode 100644 index 00000000..3760fd5e --- /dev/null +++ b/VantageCloud_Lake/Getting_Started/Demo_XGB_Scoring.py @@ -0,0 +1,71 @@ +# Load dependency packages +import sys +import csv +import numpy as np +import pandas as pd +from xgboost import XGBClassifier, Booster +import warnings + +# pickle will issue a caution warning, if model pickling was done with +# different library version than used here. The following disables any warnings +# that might otherwise show in the scriptlog files on the Advanced SQL Engine +# nodes in this case. Yet, do keep an eye for incompatible pickle versions. +warnings.filterwarnings('ignore') + +# Know your data: You must know in advance the number and data types of the +# incoming columns from the SQL Engine database! +# For this script, the input expected format is: +colNames = ['txn_id', + 'txn_type_CASH_OUT', + 'txn_type_CASH_IN', + 'txn_type_TRANSFER', + 'txn_type_DEBIT', + 'txn_type_PAYMENT', + 'txn_type_other', + 'amount', + 'oldbalanceOrig', + 'newbalanceOrig', + 'oldbalanceDest', + 'newbalanceDest', + 'isFraud'] + + + +model = XGBClassifier() +booster = Booster() +booster.load_model('xgb_model') +model._Booster = booster + + +d = csv.DictReader(sys.stdin.readlines(), fieldnames = colNames) + +df = pd.DataFrame(d, columns = colNames) + +# Use try...except to produce an error if something goes wrong in the try block +try: + # Exit gracefully if DataFrame is empty + if df.empty: + sys.exit() + + # Specify the rows to be scored by the model and call the predictor. 
+ X_test = df[['txn_type_CASH_OUT', 'txn_type_CASH_IN','txn_type_TRANSFER', 'txn_type_DEBIT','txn_type_PAYMENT', 'txn_type_other','amount','oldbalanceOrig', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest']].astype(float) + + y_prob = model.predict_proba(X_test) + df[['prob_0', 'prob_1']] = y_prob + + y_pred = model.predict(X_test) + df['prediction'] = y_pred + + # Export results to the Database through standard output. + for index, value in df.iterrows(): + my_str = str(value['txn_id']) + ',' + str(value['prob_0']) + ',' + str(value['prob_1']) + ',' + str(value['prediction']) + ',' + str(value['isFraud']) + print(my_str) + + +except (SystemExit): + # Skip exception if system exit requested in try block + pass +except: # Specify in standard error any other error encountered + print("Script Failure :", sys.exc_info()[0], file=sys.stderr) + raise + sys.exit() diff --git a/VantageCloud_Lake/Getting_Started/Opensource_Data_Science_OAF.ipynb b/VantageCloud_Lake/Getting_Started/Opensource_Data_Science_OAF.ipynb new file mode 100644 index 00000000..57ef699d --- /dev/null +++ b/VantageCloud_Lake/Getting_Started/Opensource_Data_Science_OAF.ipynb @@ -0,0 +1,1037 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "hawaiian-daniel", + "metadata": {}, + "source": [ + "\n", + "\n", + "
\n", + "

\n", + " Leveraging Open Source Machine Learning with ClearScape Analytics and Open Analytics Framework\n", + "
\n", + " \"Teradata\"\n", + "

\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "c5a2465a", + "metadata": {}, + "source": [ + "

Introduction:

\n", + "\n", + "

Open-source Machine Learning, AI, and Advanced Analytics tools, techniques, and resources offer enterprises limitless opportunities to drive new insights and business value from their internal and external data landscape. Unfortunately, with these opportunities come significant challenges to realizing success. Some of these challenges include:

\n", + "\n", + " \n", + " \n", + " \n", + "

VantageCloud Lake Edition Open Analytics Framework is the only enterprise-class platform that addresses these challenges with a simple, powerful architecture. The following demonstration will illustrate how users can use any open-source tool or package of choice, deploy it to a custom, isolated environment, and then execute it in parallel and at massive scale.

\n", + "\n", + "
\n", + "\n", + "Environment Overview\n", + "\n", + "

This demonstration utilizes a VantageCloud Lake Analytic Cluster architecture, using the shared data sets created in the previous demonstration. Specifically, it uses the \"Txn_History\" data, which represents \"CashApp\" style transaction history stored in the Vantage Object File System (OFS).

\n", + "\n", + "

The high level process is as follows:

\n", + "\n", + "\n", + " \n", + "
\n", + "
    \n", + "
  1. The Data Scientist conducts analytics activities using his or her own python tools and packages of choice, then connects to VantageCloud Lake through teradataml client library and teradatasql python driver.
  2. \n", + "
    \n", + "
  3. Teradataml provides APIs to create and manage artifacts in User Environment Service, including custom libraries, dependencies, model artifacts, and scoring scripts. The user can leverage these APIs to create one or many custom, dedicated environments to host their code.
  4. \n", + "
    \n", + "
  5. The Data Scientist will then execute their pipeline that will;\n", + "
    • Call ClearScape Analytics functions on Compute Clusters (data prep, transformation, etc.)
    • \n", + "
    • Prepared data is passed to the python container running in parallel on cluster nodes.
    • \n", + "
    • Results (inference/predictions) are returned as \"virtual\" dataframes; where the data resides in Vantage
    • \n", + "
    • Data can be persisted in the Object Filesystem, written to open object storage, or copied to the client
    • \n", + "
  6. \n", + "
\n", + "
\n", + "\n", + "Demonstration Overview\n", + "\n", + "

This notebook consists of three primary demonstrations and an appendix:

\n", + "
    \n", + "
  1. Custom Environment Management - Create a server-side, custom python container with explicit package and versions installed
  2. \n", + "
  3. File Management - Upload model files, scoring scripts, and any other asset type
  4. \n", + "
  5. Analytics - Execute powerful feature engineering and statistical functions and pass this directly to the python container running in parallel
  6. \n", + "
  7. Appendix - Model Training and Testing - The process for creating and testing the model using open-source tools is provided in the Appendix
  8. \n", + "
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "transsexual-poverty", + "metadata": {}, + "source": [ + "
\n", + "1. Configure the environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "southeast-density", + "metadata": {}, + "outputs": [], + "source": [ + "# install other required packages\n", + "%pip install xgboost dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "great-shadow", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Import the Python library teradataml and the specific environment setup modules.\n", + "import warnings\n", + "from teradataml import *\n", + "warnings.filterwarnings('ignore')\n", + "display.suppress_vantage_runtime_warnings = True\n", + "\n", + "from IPython.display import display as ipydisplay\n", + "from IPython.display import clear_output \n", + "from dotenv import load_dotenv, dotenv_values\n", + "from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Account for the data types to be used with the script.\n", + "from teradatasqlalchemy.types import BIGINT, VARCHAR, FLOAT, INTEGER\n", + "from collections import OrderedDict\n", + "\n", + "# Other case-specific imports.\n", + "import json, os, sys, getpass\n", + "import pandas as pd\n", + "from time import sleep\n", + "import time\n", + "# container name - set here for easier notebook navigation\n", + "# User will also be asked to change it \n", + "oaf_name = 'OAF_demo_env'\n", + "print(f'using \"{oaf_name}\" for the OAF environment')\n", + "\n", + "# get the current python version to match deploy a custom container\n", + "python_version = str(sys.version_info[0]) + '.' + str(sys.version_info[1])\n", + "print(f'Using Python version {python_version} for user environment')" + ] + }, + { + "cell_type": "markdown", + "id": "muslim-intention", + "metadata": {}, + "source": [ + "
\n", + "

2. Connect to VantageCloud Lake

\n", + "

Connect to VantageCloud using `create_context` from the teradataml Python library. Input your connection details, including the host, username, password and Analytic Compute Group name.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "700c32b0-bd3d-4cee-85db-788889f0c7a7", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=Opensource_Data_Science_OAF.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "offshore-watch", + "metadata": {}, + "source": [ + "
\n", + "

3. Demo 1 - Custom Container Management

\n", + "\n", + "\n", + "\n", + "

The Teradata Vantage Python Client Library provides simple, powerful methods for the creation and maintenance of custom Python runtime environments in the VantageCloud environment. This gives practitioners complete control over the behavior and quality of their model performance and analytic accuracy running on the Analytic Cluster. The following demonstration will show how easy it is to create a custom xgboost-based scoring environment.

\n", + "\n", + "\n", + "\n", + "

Custom environments are persistent. Users only need to create them once; afterwards they can be saved, updated, or modified as needed.

\n", + "\n", + "
\n", + "

Container Management Process

\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
    \n", + "
  1. Set up a connection to the Environment Service
  2. \n", + "
    \n", + "
  3. Create a unique User Environment based on available base images
  4. \n", + "
    \n", + "
  5. Install custom libraries and specific versions if required
  6. \n", + "
    \n", + "
  7. Monitor packages installation/view installed packages
  8. \n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "bridal-matrix", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

3.1 Connect to the Environment Service

\n", + "\n", + "

To better support integration with Cloud Services and common automation tools, the User Environment Service is accessed via RESTful APIs. These APIs can be called directly or, as in the examples shown below, through the Python Package for Teradata (teradataml) methods.

\n", + "\n", + "

In order to properly authenticate to the UES infrastructure, the user must authenticate as the same user that is connected to the database. The following cell authenticates with a Personal Access Token (PAT) and its PEM key file, both read from the environment parameter file, so no interactive browser login is required.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "734d8327-f92c-4843-84be-b89b8fdf690f", + "metadata": {}, + "outputs": [], + "source": [ + "# We've already loaded all the values into our environment variables and into a dictionary, env_vars.\n", + "# username=env_vars.get(\"username\") isn't required when using base_url, pat and pem.\n", + "\n", + "if set_auth_token(base_url=env_vars.get(\"ues_uri\"),\n", + " pat_token=env_vars.get(\"access_token\"), \n", + " pem_file=env_vars.get(\"pem_file\"),\n", + " valid_from=int(time.time())\n", + " ):\n", + " print(\"UES Authentication successful\")\n", + "else:\n", + " print(\"UES Authentication failed. Check credentials.\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "869bec81-31cd-4ec3-97e7-1802ae2cfd7b", + "metadata": {}, + "outputs": [], + "source": [ + "gpu_compute_group = env_vars.get(\"gpu_compute_group\")\n", + "execute_sql(f\"SET SESSION COMPUTE GROUP {gpu_compute_group};\")\n", + "print(f\"Compute group set to {gpu_compute_group}\") " + ] + }, + { + "cell_type": "markdown", + "id": "eligible-newfoundland", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

3.2 Create a Custom Container in Vantage

\n", + "\n", + "

If desired, the user can create a new custom environment by starting with a \"base\" image and customizing it. The steps are:

\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "consistent-component", + "metadata": {}, + "outputs": [], + "source": [ + "# List available Base Python environments\n", + "ipydisplay(list_base_envs())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c613d90f-4943-4d13-a97b-ede53b30c901", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a new environment, or connect to an existing one\n", + "try:\n", + " ipydisplay(list_user_envs())\n", + "except Exception as e:\n", + " \n", + " if str(e).find('No user environments found') > 0:\n", + " print('No user environments found')\n", + " pass\n", + " else:\n", + " raise\n", + "\n", + "print('Use an existing environment, or create a new one:')\n", + "print(f'OAF Environment is set to {oaf_name}.')\n", + "print('Enter to accept, or input a new value.')\n", + "print('If the environment is not in the list, a new one will be created')\n", + "i = input()\n", + "if len(i) != 0:\n", + " oaf_name = i\n", + " print(f'OAF Environment is now {oaf_name}')\n", + "\n", + "try:\n", + " demo_env = create_env(env_name = oaf_name,\n", + " base_env = f'python_{python_version}',\n", + " desc = 'OAF Demo environment')\n", + "except Exception as e:\n", + " if str(e).find('same name already exists') > 0:\n", + " print('Environment already exists, obtaining a reference to it')\n", + " demo_env = get_env(oaf_name)\n", + " pass\n", + " elif 'Invalid value for base environment name' in str(e):\n", + " print('Unsupported base environment version, using defaults')\n", + " demo_env = create_env(env_name = oaf_name,\n", + " desc = 'OAF Demo environment')\n", + " else:\n", + " raise\n", + "\n", + "# Note create_env seems to be asynchronous - sleep a bit for it to register\n", + "sleep(5)\n", + "\n", + "try:\n", + " ipydisplay(list_user_envs())\n", + "except Exception as e:\n", + " if str(e).find('No user environments found') > 0:\n", + " print('No user environments found')\n", + " 
pass\n", + " else:\n", + " raise" + ] + }, + { + "cell_type": "markdown", + "id": "breeding-shame", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

3.3 Install Dependencies

\n", + "\n", + "

The second step in the customization process is to install Python package dependencies. This set of code:\n", + "

\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "plain-psychology", + "metadata": {}, + "outputs": [], + "source": [ + "# View existing libraries in the user environment.\n", + "demo_env.libs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "premier-agenda", + "metadata": {}, + "outputs": [], + "source": [ + "# Install any Python add-ons needed by the script in the user environment\n", + "# Using option asynchronous=True for an asychronous execution of the statement.\n", + "# Note: Avoid asynchronous installation when batch-executing all notebook statements,\n", + "# as execution will continue even without installation being complete.\n", + "#\n", + "claim_id = demo_env.install_lib(['numpy','pandas','scikit-learn', 'xgboost==1.6.2'], asynchronous=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "blond-reliance", + "metadata": {}, + "outputs": [], + "source": [ + "# Check the status of installation using status() API.\n", + "# Create a loop here for demo purposes\n", + "\n", + "ipydisplay(demo_env.status(claim_id))\n", + "stage = demo_env.status(claim_id)['Stage'].iloc[-1]\n", + "while stage == 'Started':\n", + " stage = demo_env.status(claim_id)['Stage'].iloc[-1]\n", + " clear_output()\n", + " ipydisplay(demo_env.status(claim_id))\n", + " sleep(5)\n", + " \n", + "# Verify the Python libraries have been installed correctly.\n", + "ipydisplay(demo_env.libs)" + ] + }, + { + "cell_type": "markdown", + "id": "innovative-monster", + "metadata": {}, + "source": [ + "
\n", + "

4. Demo 2 - Install Custom Models and Scripts

\n", + "\n", + "

Once the custom runtime environment has been created, the user can then load custom user-created assets. For the purposes of this demonstration, we will load two files:

\n", + "\n", + "
    \n", + "
  1. 'xgb_model' - This is a simple XGBoost Classifier model that was trained on the \"Financial Fraud\" data in the OFS table. It has an accuracy score of approximately 97.4%. The Appendix provides the code used to train, test, and save this model file.
  2. \n", + "
    \n", + "
  3. 'Demo_XGB_Scoring.py' - This file is a simple python program that acts as the bridge between EDW processing on the Analytics Cluster and the XGBoost model scoring. It simply formats the incoming data, calls the model, and outputs the model predictions. When executed on the individual parallel Analytic Cluster Nodes, it will use the XGBoost model file to score its portion of the data.
  4. \n", + "
\n", + " \n", + "

Once again, the Vantage Python Library makes this process straightforward by calling two simple methods:

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
    \n", + "
  • \"install_file\" for each of the two assets
  • \n", + "
    \n", + "
  • Verification using the \"files\" property
  • \n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "configured-skiing", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

4.1 Install User Files in the Cluster Container

\n", + "\n", + "

Users can load any asset to the environment using the install_file method. This ensures that only authenticated users can install specific files into a dedicated filesystem, and helps prevent malicious code injection. Users pass the file name, and whether to replace an existing file.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "large-luther", + "metadata": {}, + "outputs": [], + "source": [ + "# Install xgboost model file.\n", + "demo_env.install_file('xgb_model', replace = True)\n", + "\n", + "# Install the desired Python script into the environment.\n", + "demo_env.install_file('Demo_XGB_Scoring.py', replace = True)" + ] + }, + { + "cell_type": "markdown", + "id": "minimal-transport", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

4.2 List all installed files

\n", + "\n", + "

The files property lists the asset, size, and last updated timestamp. As above, these methods are available to manage the container remotely, since these containers live in the Vantage environment.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "running-tribute", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify the files have been installed correctly.\n", + "demo_env.files" + ] + }, + { + "cell_type": "markdown", + "id": "responsible-switzerland", + "metadata": {}, + "source": [ + "
\n", + "

5. Demo 3 - Model Scoring at Scale

\n", + "\n", + "

VantageCloud Lake Edition Analytic Clusters combine the power and scale of native ClearScape Analytics Functions with the open and flexible runtime environments; offering users the flexibility to balance built-in data prep, transformation and feature engineering functions with custom code and models at massive scale.

\n", + "\n", + "

Enterprise Class customers report the ability to reduce data prep and model scoring times from several hours per run to seconds; effectively allowing model scoring in near-real-time.

\n", + "\n", + "

This demonstration will illustrate these key concepts:

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
    \n", + "
  • Leverage native data preparation functions to process incoming data for the model scoring
  • \n", + "
    \n", + "
  • Execute the combined native query and the python scoring functions together, in parallel
  • \n", + "
    \n", + "
  • Analyze the results of the process to determine ongoing model accuracy and efficacy
  • \n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "involved-assist", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

5.1 Data Transformation/Feature Engineering

\n", + "\n", + "

Create a reference to the data set in Vantage, and apply powerful transformation functions directly on the Data. ClearScape Analytics is a suite of in-database massively-parallel-processing functions for statistical analysis, data cleaning and transformation, machine learning, text analytics, and model scoring. Practitioners can leverage these functions together with open-source modeling as illustrated here, or create powerful, native end-to-end pipelines using just these functions.

\n", + "\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "material-personality", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Create a reference to the data set in-Vantage\n", + "# by creating a \"Teradata DataFrame\"\n", + "# which is a reference to the data.\n", + "tdf_test = DataFrame(in_schema(\"DEMO_GLM_Fraud\", \"transaction_data\"))\n", + "tdf_test.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "signal-induction", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

5.2 Engineer Features

\n", + "\n", + "

Call the ClearScape One Hot Encoding function to transform the categorical column into numeric features.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imposed-match", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Perform native one-hot encoding on the data\n", + "# These functions use a \"fit-and-transform\" pattern\n", + "# that supports reuse and easier operationalization of the transformation process\n", + "\n", + "from teradataml import OneHotEncodingFit, OneHotEncodingTransform\n", + "\n", + "res_ohe = OneHotEncodingFit(data = tdf_test, \n", + " target_column = 'type', \n", + " categorical_values = ['CASH_OUT', 'CASH_IN', 'TRANSFER', 'DEBIT', 'PAYMENT'], \n", + " other_column = 'other',\n", + " is_input_dense = True)\n", + "\n", + "res_transformed = OneHotEncodingTransform(data = tdf_test, object = res_ohe.result, is_input_dense = True)\n", + "res_transformed.result.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "collectible-gather", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

5.3 Execute the Scoring function

\n", + "\n", + "

Now that the categorical column has been encoded, the XGBoost model can be called. This is executed via the Apply method, where we pass;

\n", + "\n", + "\n", + " \n", + "\n", + "

Finally, the script is executed by calling the \"execute_script\" method; this \"lazy\" evaluation allows for a more modular and performant architecture.

\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4af9a813-110a-4b7a-b46f-3b4a12aaa585", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(res_transformed.result, table_name = 'Transformed_Tbl', if_exists = 'replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36b7284c-9522-453b-9a8b-59adebeb335e", + "metadata": {}, + "outputs": [], + "source": [ + "res_transformed = DataFrame.from_query(\"SELECT TOP 1000 * FROM Transformed_Tbl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unlimited-liver", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "apply_obj = Apply(data = res_transformed.drop(['step', 'nameOrig', 'nameDest', 'isFlaggedFraud'], axis = 1),\n", + " apply_command = 'python3 Demo_XGB_Scoring.py',\n", + " returns = {'Actual': VARCHAR(2) , 'Prob_0': VARCHAR(30), 'Prob_1': VARCHAR(30), 'Prediction':VARCHAR(2), 'txn_id': VARCHAR(20)},\n", + " env_name = demo_env,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "opening-manner", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Execute the Python script inside the remote user environment.\n", + "# The result is a teradataml DataFrame. \n", + "scored_data = apply_obj.execute_script()\n", + "\n", + "# Only return five rows - minimize network overhead\n", + "scored_data.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "chief-falls", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

5.4 Analyze the Results

\n", + "\n", + "

It is common practice to measure the efficacy of a model. For this demonstration, a \"Confusion Matrix\" is generated that shows the quantity of true vs. false positives and negatives for the model.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "distinguished-motor", + "metadata": {}, + "outputs": [], + "source": [ + "# Copy the predictions to the client\n", + "# to generate the simple Confusion Matrix\n", + "# and print the AUC (Area Under Curve)\n", + "\n", + "df_test = scored_data.to_pandas(all_rows = True)\n", + "cm = confusion_matrix(df_test['Actual'].astype(int), df_test['Prediction'].astype(int))\n", + "disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = ['0', '1'])\n", + "fig, ax = plt.subplots(figsize=(10,10))\n", + "disp.plot(ax=ax)\n", + "\n", + "plt.show()\n", + "\n", + "#Get AUC score - anything over .75 is decent\n", + "AUC = roc_auc_score(df_test['Actual'].astype(int), df_test['Prediction'].astype(int))\n", + "print(f'AUC: {AUC}')" + ] + }, + { + "cell_type": "markdown", + "id": "conceptual-crash", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

5.5 Disconnect from Vantage

\n", + "\n", + "

Once complete, one can remove the custom environment (if desired) and close the \"context\" to the Vantage system.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tired-purple", + "metadata": {}, + "outputs": [], + "source": [ + "# uninstall the libraries from the environment first before removing it\n", + "demo_env.uninstall_lib(libs = demo_env.libs['name'].to_list())\n", + "remove_env(demo_env.env_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fiscal-animal", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "material-groove", + "metadata": {}, + "source": [ + "
\n", + "

6. Appendix - Model Training and Evaluation

\n", + "\n", + "

VantageCloud Lake Edition Analytic Clusters and ClearScape Analytics functions can also be leveraged for model training. This brief addendum shows an abbreviated process for developing and testing an open-source fraud detection model with Vantage and XGBoost.

" + ] + }, + { + "cell_type": "markdown", + "id": "abroad-underground", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

6.1 Connect to Vantage

\n", + "\n", + "

If necessary, connect to Vantage. If the context from above is still valid, this doesn't need to be run. The code below reads the environment parameter file (.env - the same file used to connect in section 2) and connects to Vantage with this information. The Vantage connection is referred to as a \"Context\" - a common python-rdbms connection architecture.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "contemporary-rouge", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=Opensource_Data_Science_OAF.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "modified-services", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

6.2 Get a reference to the data

\n", + "\n", + "

Create a Teradataml DataFrame which references the data set in Vantage. This could be a table stored in direct-attach block storage, Performance-Optimized Object Storage (OFS), or stored in an open format in any Object Store.

\n", + "\n", + "

Teradataml DataFrames do not copy data into local memory, so complex analytic and transformation operations can run against data at any scale, while leveraging the parallel processing and workload isolation of Vantage Analytic Clusters.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "american-centre", + "metadata": {}, + "outputs": [], + "source": [ + "# Updated variables to insure they are the same\n", + "tdf_test = DataFrame(in_schema(\"DEMO_GLM_Fraud\", \"transaction_data\"))\n", + "tdf_test.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "terminal-network", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

6.3 Engineer Features

\n", + "\n", + "

Call the ClearScape One Hot Encoding function to transform the categorical column into numeric features.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "higher-courage", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import OneHotEncodingFit, OneHotEncodingTransform\n", + "\n", + "res_ohe = OneHotEncodingFit(data = tdf_test, \n", + " target_column = 'type', \n", + " categorical_values = ['CASH_OUT', 'CASH_IN', 'TRANSFER', 'DEBIT', 'PAYMENT'], \n", + " other_column = 'other',\n", + " is_input_dense = True)\n", + "\n", + "res_transformed = OneHotEncodingTransform(data = tdf_test, object = res_ohe.result, is_input_dense = True)\n", + "res_transformed.result.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "billion-drawing", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

Design for Operations

\n", + "\n", + "

Persist the \"Fit\" table to reuse it for the Operational transformation of new data

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "meaning-trading", + "metadata": {}, + "outputs": [], + "source": [ + "# copy the fit table to a permanent table for use later\n", + "res = copy_to_sql(res_ohe.result, table_name = 'OHE_FIT_TABLE', if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "id": "cognitive-dream", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

6.4 Test/Train Split

\n", + "\n", + "

The extraordinarily fast \"Sample\" function can split the data into multiple data sets in seconds.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ignored-scholar", + "metadata": {}, + "outputs": [], + "source": [ + "tdf_samples = res_transformed.result.sample(frac = [0.2, 0.8])\n", + "copy_to_sql(tdf_samples[tdf_samples['sampleid'] == 2], table_name = 'txns_train', if_exists = 'replace')\n", + "copy_to_sql(tdf_samples[tdf_samples['sampleid'] == 1], table_name = 'txns_test' , if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "id": "major-nudist", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

6.5 Train the Model

\n", + "\n", + "

Use open-source XGBoost Classifier to train the model using the \"training\" data split above.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "demanding-bouquet", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a Pandas DataFrame\n", + "df_train = DataFrame(\"txns_train\").to_pandas(all_rows = True)\n", + "\n", + "# define the input columns and target variable:\n", + "X_train = df_train[['type_CASH_OUT', 'type_CASH_IN', 'type_TRANSFER',\n", + " 'type_DEBIT', 'type_PAYMENT', 'type_other', 'amount','oldbalanceOrig', 'newbalanceOrig',\n", + " 'oldbalanceDest', 'newbalanceDest']]\n", + "y_train = df_train[['isFraud']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "strong-lottery", + "metadata": {}, + "outputs": [], + "source": [ + "# Fit the Model\n", + "warnings.filterwarnings('ignore')\n", + "from xgboost import XGBClassifier\n", + "\n", + "model = XGBClassifier()\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "id": "atmospheric-occasions", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

6.6 Test the Model

\n", + "\n", + "

It is common practice to measure the efficacy of a model. For this demonstration, a \"Confusion Matrix\" is generated that shows the quantity of true vs. false positives and negatives for the model.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "australian-religion", + "metadata": {}, + "outputs": [], + "source": [ + "# Return a Pandas DataFrame from the split data above\n", + "\n", + "df_test = DataFrame(\"txns_test\").to_pandas(all_rows = True)\n", + "\n", + "# Define the input columns and target\n", + "X_test = df_test[['type_CASH_OUT', 'type_CASH_IN', 'type_TRANSFER',\n", + " 'type_DEBIT', 'type_PAYMENT', 'type_other', 'amount','oldbalanceOrig', 'newbalanceOrig',\n", + " 'oldbalanceDest', 'newbalanceDest']]\n", + "y_test = df_test[['isFraud']]\n", + "\n", + "\n", + "# Predict the class and the probability of Fraud\n", + "y_pred = model.predict(X_test)\n", + "y_prob = model.predict_proba(X_test)\n", + "\n", + "\n", + "# Generate the Confusion Matrix\n", + "df_test[['prob_0', 'prob_1']] = y_prob\n", + "df_test['prediction'] = y_pred\n", + "\n", + "cm = confusion_matrix(df_test['isFraud'], df_test['prediction'])\n", + "disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = ['0', '1'])\n", + "fig, ax = plt.subplots(figsize=(10,10))\n", + "disp.plot(ax=ax)\n", + "\n", + "plt.show()\n", + "\n", + "#Get AUC score - anything over .75 is decent\n", + "AUC = roc_auc_score(df_test['isFraud'], df_test['prediction'])\n", + "print(f'AUC: {AUC}')" + ] + }, + { + "cell_type": "markdown", + "id": "proper-friendship", + "metadata": {}, + "source": [ + "
\n", + "\n", + "

6.7 Save the Model

\n", + "\n", + "

Save the model file in native xgboost format. This is used above in the main demonstration.

" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "assured-progressive", + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model('xgb_model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "formed-sheet", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "1a4db3dc-2241-4735-9a1e-a489c8986bdb", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "toc-autonumbering": false, + "toc-showmarkdowntxt": true + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/Getting_Started/images/Container_Layout.png b/VantageCloud_Lake/Getting_Started/images/Container_Layout.png new file mode 100644 index 00000000..79fac5d8 Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/Container_Layout.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/In_DB_Functions.png b/VantageCloud_Lake/Getting_Started/images/In_DB_Functions.png new file mode 100644 index 00000000..7445ea5f Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/In_DB_Functions.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/Model.png b/VantageCloud_Lake/Getting_Started/images/Model.png new file mode 100644 index 00000000..228bf77b Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/Model.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/OAF_Env.png b/VantageCloud_Lake/Getting_Started/images/OAF_Env.png new file mode 100644 index 00000000..1be627c3 Binary files /dev/null and 
b/VantageCloud_Lake/Getting_Started/images/OAF_Env.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/OAF_Overview.png b/VantageCloud_Lake/Getting_Started/images/OAF_Overview.png new file mode 100644 index 00000000..73b29048 Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/OAF_Overview.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/OAF_Scoring.png b/VantageCloud_Lake/Getting_Started/images/OAF_Scoring.png new file mode 100644 index 00000000..239be028 Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/OAF_Scoring.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/TeradataLogo.png b/VantageCloud_Lake/Getting_Started/images/TeradataLogo.png new file mode 100644 index 00000000..a6811164 Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/TeradataLogo.png differ diff --git a/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python.ipynb b/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python.ipynb new file mode 100644 index 00000000..fa2b09ac --- /dev/null +++ b/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python.ipynb @@ -0,0 +1,1900 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b1378a69-ac58-4d0c-af22-7ef881abac45", + "metadata": {}, + "source": [ + "
\n", + "

\n", + " Anomaly Detection in Robot Welding Process\n", + "
\n", + " \"Teradata\"\n", + "

\n", + "
\n", + "\n", + "

Introduction

\n", + "\n", + "

Detecting anomalies reduces issues and delays in many industries, especially in manufacturing. Several approaches to anomaly detection have been used in the past, such as engineering rules, graph-based methods, and deep learning. However, it still proves difficult to detect all the existing anomalies. In addition, companies are striving to minimize false positives, cope with the diversity of sensors and metrology issues, and deliver actionable insights at a business pace. Fortunately, Teradata and ClearScape Analytics have the solution. In ClearScape Analytics, users can execute all steps of anomaly detection, from data preparation and exploration to model training, evaluation, and adjustment. These analyses can improve the process and ensure accuracy in anomaly detection.

\n", + "\n", + "

Spot Welding Quality Assessment

\n", + "

Spot welding is a common technique used for welding car body panels, particularly in the assembly of smaller parts and components. Spot welding involves using a pair of copper electrodes to apply a series of short, high-current welding pulses to the metal, fusing the parts together at specific points or “spots”.

\n", + "\n", + "

The automotive industry is known for its high level of automation, and spot welding is one of the most automated processes, heavily reliant on robots to improve efficiency, reduce labor costs, and improve the consistency and quality of the finished product. Poor welding quality is rare, but even so, the consequences of poor quality may not be negligible in terms of rework costs and customer satisfaction, especially when quality issues are detected too late.

\n", + "\n", + "\n", + "\n", + "

Spot welding is a resistance welding process that uses large electrical current. There are many ways to assess the quality of a spot, like tensile or ultrasonic testing to assess the weld strength or the analysis of the welding current measured and recorded during the welding process. In this demo, we focus on the analysis of the anomalies in the welding spot due to welding current, and more specifically the resistance, i.e. the voltage-current ratio which impacts the quality of the welding. The shape of the resistance curve depends on many factors like the nature of the materials, the geometry, and the quality of the electrodes etc.

\n", + "\n", + "\n", + "

Business Values

\n", + "
  • Improve accuracy in the production and manufacturing process.
  • \n", + "
  • Reduce the number of false positive anomalies detected in a system.
  • \n", + "
  • Decrease additional costs and time wasted due to undetected anomalies.
  • \n", + "
  • Determine patterns and significant factors that lead to anomalies.
  • \n", + "

    Why Vantage?

    \n", + "

    Many organizations fail to realize value from their ML and AI investments due to a lack of scale. It is estimated that for broad adoption across many industries, the number of models and model deployments needs to scale 100-1000x larger than their organizations currently support.

    \n", + "

    The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.

    \n", + "

    In this particular use case, the volume of machine sensor data was so great that millions of ML models were created to derive analytic features that ultimately deployed tens of thousands of models for real-time scoring. This extent of scale is only possible by combining the power of Vantage with native ClearScape Analytic functions.

    \n", + "\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "b33aebf1-80cf-4043-99de-b2ac0356ea64", + "metadata": { + "tags": [] + }, + "source": [ + "
    \n", + "

    1. Configure the environment

    \n", + "

    \n", + "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n", + "
    \n", + "Here's how we can do this:

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caa4ef22-2129-4713-9483-2b64565deda7", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3a0b76c2-b211-452f-949c-676da6da9540", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import getpass\n", + "import pandas as pd\n", + "import datetime\n", + "from teradataml import *\n", + "\n", + "# import tdsense\n", + "# from tdsense.plot import plotcurves \n", + "import numpy as np # linear algebra\n", + "import matplotlib.pyplot as plt\n", + "import sklearn\n", + "from sklearn import preprocessing\n", + "# from tdsense.clustering import hierarchy_dendrogram, hierarchy_clustering\n", + "# from tdnpathviz.visualizations import plotcurves\n", + "%matplotlib inline\n", + "\n", + "from sklearn import datasets\n", + "from sklearn2pmml.pipeline import PMMLPipeline\n", + "from sklearn2pmml import sklearn2pmml\n", + "from sklearn.model_selection import train_test_split\n", + "from imblearn.over_sampling import SMOTE\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score,confusion_matrix, roc_curve, ConfusionMatrixDisplay\n", + "import time\n", + "import pytz\n", + "\n", + "\n", + "import os\n", + "from jdk4py import JAVA, JAVA_HOME, JAVA_VERSION\n", + "# Set java path\n", + "\n", + "os.environ['PATH'] = os.environ['PATH'] + os.pathsep + str(JAVA_HOME)\n", + "os.environ['PATH'] = os.environ['PATH'] + os.pathsep + str(JAVA)[:-5]\n", + "\n", + "from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot\n", + "from collections import defaultdict\n", + "import plotly.offline as offline\n", + "offline.init_notebook_mode()\n", + "from dotenv import load_dotenv, dotenv_values\n", + "\n", + "from 
teradataml.dataframe.sql_functions import case\n", + "from teradataml import db_drop_table\n", + "configure.byom_install_location = \"td_mldb\"\n", + "\n", + "display.max_rows = 5\n", + "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter(action='ignore', category=DeprecationWarning)\n", + "warnings.simplefilter(action='ignore', category=RuntimeWarning)\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)" + ] + }, + { + "cell_type": "markdown", + "id": "8c250746-66ba-40aa-b41b-c791786f61a0", + "metadata": {}, + "source": [ + "
    \n", + "

    2. Connect to VantageCloud Lake

    \n", + "

    Connect to VantageCloud using create_context from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2be07d96-51d3-4aee-b025-582af97119da", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=VCL_AnomalyDetection_Python.ipynb;' UPDATE FOR SESSION; ''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a182d300-1103-4902-8973-715b805f3059", + "metadata": {}, + "source": [ + "
    \n", + "

    3. Load the data

    \n" + ] + }, + { + "cell_type": "markdown", + "id": "9476f53a-7115-4018-a58f-dd09f7fc8b88", + "metadata": {}, + "source": [ + "

    We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_AnomalyDetection\". Your user should have read access to the database. If you run into any issues, please email the support group (\"SC230208@teradata.com\").

    \n", + "\n", + "

    **Note: The tables are available in DEMO_AnomalyDetection_DB database and we have created views in DEMO_AnomalyDetection database which are used in the cells below

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99598e0a-8a6c-4539-a06d-f6723f67134f", + "metadata": {}, + "outputs": [], + "source": [ + "Sensor_Data = DataFrame(in_schema('DEMO_AnomalyDetection', 'Sensor_Data'))\n", + "Sensor_Data" + ] + }, + { + "cell_type": "markdown", + "id": "d4b9b958-737d-41a0-adec-91614fa0fe2e", + "metadata": {}, + "source": [ + "

    We get the above data from sensors. We focus on one plant (PLANT=1) and one robot (ROBOT_ID=41). The Partition_ID is the type of welding, ID is the WELDING_ID, X is time required for welding in ms and Y is the RESISTANCE. We create a view with the columns required to get data with proper column names.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88cde234-6107-487e-92f2-7f045576cc1d", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "query = f\"\"\"\n", + "REPLACE VIEW DEMO_AnomalyDetection.V_dataset_01 AS\n", + "SELECT\n", + " 1 AS PLANT\n", + ", {41} AS ROBOT_ID\n", + ", CAST(A.PARTITION_ID AS BIGINT) AS WELDING_TYPE\n", + ", CAST((DATE '{str(datetime.datetime.now()).split(' ')[0]}' + FLOOR((WELDING_ID-700*WELDING_TYPE)/100)) AS DATE FORMAT 'YYYY-MM-DD') AS WELDING_DAY\n", + ", CAST(A.ID AS BIGINT) AS WELDING_ID\n", + ", CAST(A.X AS INTEGER) AS TIME_MS\n", + ", A.Y AS RESISTANCE\n", + "FROM DEMO_AnomalyDetection.Sensor_Data A\n", + "\"\"\"\n", + "execute_sql(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ec3a959-c5e0-4039-88f8-846adca6f113", + "metadata": {}, + "outputs": [], + "source": [ + "welding_dataset_new = DataFrame(in_schema('DEMO_AnomalyDetection', 'V_dataset_01'))\n", + "welding_dataset_new" + ] + }, + { + "cell_type": "markdown", + "id": "09198aa2-6ab7-4339-a01a-365cba02c772", + "metadata": {}, + "source": [ + "
    \n", + "

    3.1 - Some aggregations and visualization.

    \n" + ] + }, + { + "cell_type": "markdown", + "id": "f83b1b1a-eece-487a-97d7-b4759ea624ce", + "metadata": {}, + "source": [ + "

    We will check the histogram based on the minimum and maximum Time for welding.

    \n", + "

    A histogram is a better way to assess the distribution. To scale to large data volumes, it is recommended to compute the histogram bins in-database to leverage the Massively Parallel Architecture of Teradata Vantage. For that, we use the Histogram function of teradataml, which pushes the computations down to Vantage.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a5d38c3-ebb9-47a2-b8ad-f00acd9d769b", + "metadata": {}, + "outputs": [], + "source": [ + "welding_duration_ms = welding_dataset_new. \\\n", + " groupby(['PLANT','ROBOT_ID','WELDING_TYPE', 'WELDING_ID']). \\\n", + " agg({'TIME_MS':['min','max','count']})\n", + "welding_duration_ms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "642bf739-a421-4ffd-8fc1-53f273db9bd9", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Histogram\n", + "obj = Histogram(data=welding_duration_ms,\n", + " target_columns=\"count_TIME_MS\",\n", + " method_type=\"Scott\")\n", + "res = obj.result.sort('MinValue')\n", + "res" + ] + }, + { + "cell_type": "markdown", + "id": "62b099f0-eb76-45a2-9c0e-983399c59570", + "metadata": {}, + "source": [ + "

    The histogram values above were calculated in-database using teradataml functions. ClearScape Analytics integrates easily with third-party visualization tools such as Tableau and Power BI, and with Python modules such as plotly and seaborn. We can do all the calculations and pre-processing in Vantage and pass only the necessary information to the visualization tools; this not only makes the calculation faster but also saves time because less data moves between tools. For this and the subsequent visualizations, we transfer data only where necessary.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c9b72ab-7d3c-4964-9199-ee1dcc17c928", + "metadata": {}, + "outputs": [], + "source": [ + "res = obj.result.sort('MinValue').to_pandas()\n", + "res['duration_ms'] = [str(row['MinValue'])+'-'+str(row['MaxValue']) for i,row in res.iterrows()]\n", + "res.plot(x='duration_ms',y='CountOfValues',kind='bar', figsize=(15,10), legend=False,xlabel='Duration(ms)', ylabel='Welding Counts')" + ] + }, + { + "cell_type": "markdown", + "id": "88429a10-aa8b-459f-976a-6276ab121bbc", + "metadata": {}, + "source": [ + "

    In the above histogram we can see the bins between the Min and the Max value of the durations and the welding counts.

    \n", + "
    \n", + "

    3.2 - More advanced processing using window functions and delta_t

    \n", + "

    Resistance is an important parameter in resistance welding. The resistance should not vary too much. If there are any significant changes in resistance over time, it could indicate an issue with the weld quality. For example, an unusually high resistance could indicate poor contact between the parts being welded or a problem with the welding equipment.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5615026-52eb-4aae-8bb2-146e88ef4502", + "metadata": {}, + "outputs": [], + "source": [ + "welding_dataset_new.loc[welding_dataset_new.WELDING_ID == 854]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50c72091-f7f3-4ed3-a436-ee5c44335f4e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from tdnpathviz.visualizations import plotcurves\n", + "plotcurves(welding_dataset_new.loc[welding_dataset_new.WELDING_ID == 854],field='RESISTANCE',row_axis='TIME_MS', series_id='WELDING_ID',select_id=None)" + ] + }, + { + "cell_type": "markdown", + "id": "ae924828-6e92-4003-93c9-b66aeec1821f", + "metadata": {}, + "source": [ + "

    The above graph shows the variation of the resistance of the welding with respect to time. We see that the most interesting part lies between 40 and 400ms from the start of the curve.

    \n", + "\n", + "

    Next, we apply a moving window function to the resistance and take the mean value within each window to smooth the curve.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "516d8fd4-ab2c-44cd-89d2-d8075e40cf82", + "metadata": {}, + "outputs": [], + "source": [ + "# curve smoothing\n", + "window_for_smoothing = welding_dataset_new.RESISTANCE.window(\n", + " partition_columns = \"WELDING_ID\",\n", + " order_columns = 'TIME_MS',\n", + " window_start_point = -15,\n", + " window_end_point = 15\n", + ")\n", + "welding_dataset_smooth = welding_dataset_new.assign(RESISTANCE_SMOOTHED = window_for_smoothing.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c351bab-cd80-452c-b600-79efaec9f769", + "metadata": {}, + "outputs": [], + "source": [ + "id_curve = 854\n", + "single_welding = welding_dataset_smooth[welding_dataset_smooth.WELDING_ID == id_curve].sort('TIME_MS')\n", + "single_welding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44d1ffb7-1bf2-4770-8b0d-f21ed5a589e4", + "metadata": {}, + "outputs": [], + "source": [ + "figure = Figure(width=1000, height=400, image_type=\"jpg\",\n", + " heading=\"RESISTANCE and RESISTANCE SMOOTHED\")\n", + "plot = single_welding.plot(x=single_welding.TIME_MS, y=[single_welding.RESISTANCE, single_welding.RESISTANCE_SMOOTHED],\n", + " style=['blue', 'red'],xlabel='time in ms', ylabel='resistance ',figure=figure)\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "299bf795-653e-45a4-8f39-5143d81173cf", + "metadata": {}, + "source": [ + "

    The above graph shows the variation of the resistance of the welding with respect to time and the smoothed resistance, as shown by the Red line, after applying the window function.

    \n", + "\n", + "

    The window function generates a Window object on a teradataml DataFrame Column to run window aggregate functions.\n", + "

    Function allows user to specify window for different types of computations:\n", + "

  • Cumulative\n", + "
  • Group\n", + "
  • Moving\n", + "
  • Remaining\n", + "

    By default, a window from Unbounded Preceding to Unbounded Following is used for the calculation.

    \n", + "\n", + "

    Next, we calculate the derivative by using the lead function and taking the difference between the next smoothed resistance value (the lead value) and the current smoothed resistance value. Smoothing the resistance curve with a window function first helps to eliminate noise and makes it easier to see the overall trend. The derivative of the resistance gives an indication of how quickly the resistance is changing, which can be a useful measure for detecting anomalies and predicting potential issues.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e489b230-97b7-4f91-a001-3355da9b20bd", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(welding_dataset_smooth,table_name='welding_dataset_smooth', if_exists='replace')\n", + "welding_dataset_smooth = DataFrame('welding_dataset_smooth')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fccb6149-ce72-4601-983b-a87f2bc52417", + "metadata": {}, + "outputs": [], + "source": [ + "# let's compute the lead\n", + "window_for_lead = welding_dataset_smooth.RESISTANCE_SMOOTHED.window(\n", + " partition_columns = \"WELDING_ID\",\n", + " order_columns = 'TIME_MS')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6a9bc90-f330-467f-8765-5a00578c6c6e", + "metadata": {}, + "outputs": [], + "source": [ + "welding_dataset_smooth = welding_dataset_smooth.assign(RESISTANCE_SMOOTHED_AFTER = window_for_lead.lead())\n", + "welding_dataset_smooth = welding_dataset_smooth.assign(DERIVATIVE = (welding_dataset_smooth.RESISTANCE_SMOOTHED_AFTER - welding_dataset_smooth.RESISTANCE_SMOOTHED).zeroifnull())\n", + "welding_dataset_smooth.sort(['WELDING_ID','TIME_MS'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d019941f-4422-4012-8984-0dce20d10e48", + "metadata": {}, + "outputs": [], + "source": [ + "id_curve = 854\n", + "single_welding_subplot = welding_dataset_smooth[welding_dataset_smooth.WELDING_ID == id_curve].sort('TIME_MS')\n", + "single_welding_subplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bd9b71a-b668-44f9-a0bd-e74b2c82462e", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import subplots\n", + "# fig, axes = subplots(grid = {(1, 1): (1, 1),(2, 1): (1, 2)})\n", + "# Plot 1980 data at first Axis.\n", + "fig, axes = subplots(nrows=2, ncols=1)\n", + "plot = single_welding_subplot.plot(x=single_welding_subplot.TIME_MS, \n", + " y=[single_welding_subplot.RESISTANCE, 
single_welding_subplot.RESISTANCE_SMOOTHED],\n", + " legend=[\"RESISTANCE\", \"RESISTANCE SMOOTHED\"],\n", + " figure=fig,\n", + " style=['blue', 'red'],xlabel='time in ms', ylabel='resistance ', \n", + " ax=axes[0])\n", + "\n", + "# Plot 1981 data at second Axis.\n", + "plot = single_welding_subplot.plot(x=single_welding_subplot.TIME_MS, \n", + " y=single_welding_subplot.DERIVATIVE,\n", + " legend=[\"DERIVATIVE\"],\n", + " figure=fig,\n", + " style=\"red\",xlabel='time in ms', ylabel='derivative ' , \n", + " ax=axes[1])\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "92c9f6e0-7b26-4fed-9b43-1d35989affad", + "metadata": {}, + "source": [ + "

    We see that the most interesting part lies between 40 and 400ms from the start of the curve, so we plot only that subset.

    " + ] + }, + { + "cell_type": "markdown", + "id": "4615d965-6892-4729-81b0-9dd39f7d9411", + "metadata": { + "tags": [] + }, + "source": [ + "

    It is hard to assess the diversity of curve shapes in this plot since many of them are superimposed. However, we see in the middle of the picture a sharp drop that looks unusual. Moreover, we guess that there are shifts in time and height.

    \n", + "\n", + "
    \n", + "

    4. Feature Engineering

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da82ee40-3e38-49af-a6ca-a678ba240ca2", + "metadata": {}, + "outputs": [], + "source": [ + "welding_dataset_new.columns" + ] + }, + { + "cell_type": "markdown", + "id": "539a4c25-f868-44af-bca3-13b4ca477445", + "metadata": {}, + "source": [ + "

    We will create a feature table by using different functions on the Resistance column. Valid values for functions are: 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique','median', 'var', 'skew', 'kurtosis'.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa37d2af-c185-4a84-9ca5-8628a216aa27", + "metadata": {}, + "outputs": [], + "source": [ + "features = welding_dataset_new.loc[welding_dataset_new.TIME_MS > 20,:]. \\\n", + " groupby(welding_dataset_new.columns[0:5]). \\\n", + " agg({\n", + " 'TIME_MS':['min','max'],\n", + " 'RESISTANCE':['count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique','median', 'var','skew','kurtosis']\n", + " })\n", + "features" + ] + }, + { + "cell_type": "markdown", + "id": "0196e16a-9d9d-4d44-a0ed-e5220c3314e2", + "metadata": {}, + "source": [ + "
    \n", + "

    5. Anomaly Detection on Sensor Data

    \n", + " \n", + "

    Let's start by getting the feature columns from the features table.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27cdf0f8-e0b3-41b5-b18d-b77cdbc5652b", + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = features.columns[7::]\n", + "feature_names" + ] + }, + { + "cell_type": "markdown", + "id": "9655f048-ffbd-4785-9e8b-39d192ff7808", + "metadata": {}, + "source": [ + "
    \n", + "

    5.1 Clustering by curve shape

    \n", + "

    To cluster time series by shapes, we will use the Dynamic Time Warping (DTW) distance that measures the similarity between two time series. This distance is well adapted to this kind of problem since it provides robustness to shifts in time and height.

    \n", + "\n", + "

    Distance Matrix in-database Computations

    \n", + "\n", + "

    ClearScape Analytics offers an in-database dynamic time warping function, callable in SQL as TD_DTW, which measures the similarity of two time series. It uses the FastDTW algorithm, an approximation of Dynamic Time Warping (DTW) that is efficient in both space and time. The function computes, at scale, the DTW distances between one reference curve and a set of curves (a many-to-one approach). We want to compute the distance matrix of our subset, i.e. the DTW distance between every pair of curves. Since the DTW distance is symmetric, so is the distance matrix; hence we only need to compute the triangular matrix. We wrapped this computation in the tdsense package, which calls the TD_DTW function and iterates over the matrix rows to compute and store the whole triangular distance matrix in a table.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "207e72c8-41e3-481a-9727-a4c7510f4206", + "metadata": {}, + "outputs": [], + "source": [ + "overview = welding_dataset_new.groupby('WELDING_DAY').count(distinct=True)\n", + "dates = list(overview.to_pandas().reset_index()['WELDING_DAY'].values.astype('str'))\n", + "dates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c7180b4-a8b5-450a-96be-8aed93d1199a", + "metadata": {}, + "outputs": [], + "source": [ + "subset = welding_dataset_new[ \\\n", + " (welding_dataset_new['PLANT'] == 1) & \\\n", + " (welding_dataset_new['ROBOT_ID'] == 41) & \\\n", + " (welding_dataset_new['WELDING_TYPE'] in (8,9)) & \\\n", + " (welding_dataset_new['WELDING_DAY'].isin(dates)) \\\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dda2eca-af26-4741-abeb-b63758f8c996", + "metadata": {}, + "outputs": [], + "source": [ + "subset_zoom = subset[(subset.TIME_MS < 400) & (subset.TIME_MS > 40)]\n", + "subset_zoom.shape" + ] + }, + { + "cell_type": "markdown", + "id": "9d40f422-886d-48e5-a4ce-03b259523917", + "metadata": {}, + "source": [ + "

    The subset of data we have taken contains 7 columns and 344,622 rows.

    \n", + "\n", + "

    Since this is a 2-CPU system, the computation below takes more than 2 hours for about 350k rows, so we have precomputed it and stored the result in a table in the database.

    \n", + "\n", + "

    **If you still want to compute the matrix yourself, change the condition of the if statement in the code below from False to True.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "276fd1b7-e057-4c0c-b8b0-4e063d70eb7a", + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " dtw_matrix = dtw_distance_matrix_computation2(subset_zoom,field='RESISTANCE',\n", + " table_name=dtw_result_table,\n", + " schema_name = Param['database'],\n", + " row_axis='TIME_MS',\n", + " series_id = 'WELDING_ID')\n", + "else:\n", + " dtw_matrix = DataFrame(in_schema('DEMO_AnomalyDetection','DTW_Matrix'))" + ] + }, + { + "cell_type": "markdown", + "id": "42f770a5-f3b2-4862-8256-b1cc1f969750", + "metadata": {}, + "source": [ + "
    \n", + "

    5.2 Hierarchical clustering with Scipy

    \n", + "\n", + "

    Now that the distance matrix is available, we can perform the clustering. Here, we will use the open-source package SciPy and its cluster.hierarchy module, which are used by the tdsense package for convenience.

    \n", + "\n", + "

    Hierarchical clustering is an alternative class of clustering algorithms that produce 1 to n clusters, where n is the number of observations in the data set. As you go down the hierarchy from 1 cluster (which contains all the data) to n clusters (where each observation is its own cluster), the observations within each cluster become more and more similar (almost always).

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b87b35b-c283-42d8-845b-5c9c7851c822", + "metadata": {}, + "outputs": [], + "source": [ + "dtw_matrix_loc = dtw_matrix.sort(columns=['WELDING_ID_2','WELDING_ID_1']).to_pandas(all_rows=True)\n", + "dtw_matrix_loc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49f64fd3-1f33-4b7c-9d8f-b0636bffc2f4", + "metadata": {}, + "outputs": [], + "source": [ + "from tdsense.clustering import hierarchy_dendrogram, hierarchy_clustering\n", + "linked, labelList = hierarchy_dendrogram(dtw_matrix_loc, cluster_distance = 'ward')" + ] + }, + { + "cell_type": "markdown", + "id": "df0a3961-8cd1-43b8-9c11-9e229648d1eb", + "metadata": {}, + "source": [ + "

    The dendrogram is useful for visualizing the structure of the hierarchical clustering and identifying the optimal number of clusters to use for further analysis. The optimal number of clusters can be determined by examining the dendrogram to identify a level at which the clusters start to merge more slowly or by using a threshold for the maximum distance between clusters.

    \n", + "\n", + "

    The dendrogram above shows how the hierarchical clustering algorithm has merged the data points into clusters based on their pairwise distances using the Ward linkage criterion. The dendrogram is a summary of the distance matrix. The X axis carries the WELDING_ID values, but they are not legible because we have more than 450k rows. Looking at the dendrogram, we see that we have about 6 clusters. When we select 6 clusters, here is what we get.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e168ff-626b-47b8-bc2b-ecfaac22a8f4", + "metadata": {}, + "outputs": [], + "source": [ + "cluster = hierarchy_clustering(linked, labelList, n_clusters=6)\n", + "cluster.head()" + ] + }, + { + "cell_type": "markdown", + "id": "48b62135-409c-45a9-b604-6e98ccf059fd", + "metadata": {}, + "source": [ + "

    The above dendrogram is for only 6 clusters, with the colors representing the different clusters. Now, we plot the resistance curves for each cluster.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d31bafdc-9f43-4083-9677-ef7d94c18eb1", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(2,3,figsize=(20,10))\n", + "colors = cluster[['cluster','leaves_color_list']].copy().drop_duplicates()\n", + "for k in range(6):\n", + " plt.subplot(2,3,k+1)\n", + " img = plotcurves( subset_zoom,\n", + " field='RESISTANCE',\n", + " row_axis='TIME_MS',\n", + " series_id='WELDING_ID',\n", + " select_id=list(cluster[cluster.cluster ==k].CURVE_ID.values),\n", + " noplot=True)\n", + " plt.imshow(img)\n", + " plt.title('cluster : ' +str(k) + '\\n' + str(cluster.groupby('cluster').count()['CURVE_ID'][k]) + ' obs.',fontdict = {'fontsize' : 10, 'color':colors.leaves_color_list.values[k]})\n", + " plt.axis('off')" + ] + }, + { + "cell_type": "markdown", + "id": "f50fab99-9231-410d-bdd3-1132fc98575f", + "metadata": {}, + "source": [ + "

    If we plot the curves per cluster, we spot the curves with a sharp drop (cluster 4); these are the curves we are interested in, i.e. the curves exhibiting the anomaly we are looking for. We also note that the other clusters look more or less similar. By monitoring the resistance over time and calculating its derivative, you can detect any sudden changes or anomalies. Anomalies might indicate a problem with the welding process, such as a sudden drop in current or a sudden increase in resistance.

    " + ] + }, + { + "cell_type": "markdown", + "id": "9b99a7ac-6a99-4c9e-9ead-0f6d6e5c4759", + "metadata": {}, + "source": [ + "
    \n", + "

    5.3 Create the anomaly dataset

    \n", + "

    Now we create a table containing the anomaly flag that will be the target of a supervised machine learning model or a relevant KPI to monitor in production dashboards.

    \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cec5b577-b0dd-45c8-8fad-fee1fb1f952a", + "metadata": {}, + "outputs": [], + "source": [ + "target = cluster.copy().drop('leaves_color_list',axis=1)\n", + "target = target[target.cluster.isin([1,2])]\n", + "target['WELDING_ID'] = target['CURVE_ID']\n", + "target['anomaly'] = 0\n", + "target.loc[target.cluster==2,'anomaly'] = 1\n", + "target.drop(['cluster','CURVE_ID'],axis=1, inplace=True)\n", + "target.groupby('anomaly').count().plot(y='WELDING_ID',kind='bar',figsize=(10,10))\n", + "copy_to_sql( target,\n", + " table_name = 'Anomaly_Target',\n", + " if_exists='replace',\n", + " primary_index='WELDING_ID')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ac7c451-2fb3-45fa-895d-e881cc88a9ba", + "metadata": {}, + "outputs": [], + "source": [ + "anomalies = DataFrame('Anomaly_Target')\n", + "anomalies" + ] + }, + { + "cell_type": "markdown", + "id": "da6297fd-6f49-4619-af30-791db2af90da", + "metadata": {}, + "source": [ + "

    The above anomaly data has the welding ID and the anomaly flag.

    \n", + "
    \n", + "

    5.4 Build the analytical dataset

    \n", + "\n", + "

    We prepare the analytical dataset by joining the feature table with the anomaly table using the Welding ID so that we get the anomalies for the weldings.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe4cfcfb-7d91-47e5-a4cc-e44428e51cfe", + "metadata": {}, + "outputs": [], + "source": [ + "ADS = features[['WELDING_ID']+feature_names].join(other=anomalies, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "ADS = ADS.assign(WELDING_ID=ADS.WELDING_ID_l).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1).select(['WELDING_ID']+feature_names+['anomaly'])\n", + "ADS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20a2163c-9fea-4f3d-ab0b-696b3cccaad9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ADS.shape" + ] + }, + { + "cell_type": "markdown", + "id": "c03b26f4-0fa4-4478-922e-9cb850acbe34", + "metadata": {}, + "source": [ + "

    The analytical dataset we created has 14 columns and 391 rows which will be used to build the model below.

    " + ] + }, + { + "cell_type": "markdown", + "id": "09b3168b-8c53-4ffd-ba75-b26f40608654", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    6. Build the model

    \n", + "

    We have datasets in which different columns have different units – like one column can be in kilograms, while another column can be in centimetres. If we feed these features to the model as is, there is every chance that one feature will influence the result more due to its value than the others. But this doesn’t necessarily mean it is more important as a predictor. So, to give importance to all the features we need feature scaling.

    \n", + " \n", + "

    Here, we apply standard scaling using the ScaleFit and ScaleTransform functions in Vantage. The ScaleFit() function outputs statistics that are passed as input to the ScaleTransform() function, which scales the specified input DataFrame columns.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5d0898e-53a7-4aca-9f24-2e2f06ac73dc", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ScaleFit , ScaleTransform\n", + "scaler = ScaleFit(\n", + " data=ADS,\n", + " target_columns=feature_names,\n", + " scale_method=\"STD\",\n", + " global_scale=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76af7c0a-b1cf-4914-a099-aeaeeb0c4977", + "metadata": {}, + "outputs": [], + "source": [ + "ADS_scaled = ScaleTransform(data=ADS,\n", + " object=scaler.output,\n", + " accumulate=\"anomaly\").result\n", + "ADS_scaled" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cc1ed77-bd6e-4476-9b76-abb448c7199b", + "metadata": {}, + "outputs": [], + "source": [ + "df = ADS_scaled.to_pandas()" + ] + }, + { + "cell_type": "markdown", + "id": "3b3a8548-555a-48fd-88e4-795abaff2cc5", + "metadata": {}, + "source": [ + "
    \n", + "

    6.1 Create a model file using the python libraries.

    \n", + "\n", + "

    The Vantage Bring Your Own Model (BYOM) package gives data scientists and analysts the ability to operationalize predictive models in Vantage. Predictive models trained in external tools with sample data can be used to score data stored in Vantage using the BYOM Predict. Create or convert your predictive model using a supported model interchange format (PMML, MOJO, ONNX, Dataiku, and DataRobot are currently available), store it in a Vantage table, and use the BYOM PMMLPredict, H2OPredict, ONNXPredict, DataikuPredict, or DataRobotPredict to score your data with the model.

    \n", + "\n", + "

    A problem with imbalanced classification is that there are too few examples of the minority class for a model to effectively learn the decision boundary. One way to solve this problem is to oversample the examples in the minority class. The most widely used approach to synthesizing new examples is called the Synthetic Minority Oversampling Technique, or SMOTE for short. SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.

    \n", + "\n", + "

    Then we use the RandomForestClassifier to create the model. A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The random forest classifier creates a set of decision trees (DT), each from a randomly selected subset of the training set, and then collects the votes from the different decision trees to decide the final prediction.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d847d16a-9735-4482-953d-66c80faf0bdc", + "metadata": {}, + "outputs": [], + "source": [ + "X_train = df[feature_names]\n", + "y_train = df['anomaly']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4350a66c-2ff9-483c-ae30-8f17c5d375b3", + "metadata": {}, + "outputs": [], + "source": [ + "# Balance the training set using SMOTE\n", + "smote = SMOTE(random_state=42)\n", + "X_train, y_train = smote.fit_resample(X_train, y_train)\n", + "\n", + "\n", + "# Create a random forest classifier\n", + "model = RandomForestClassifier(n_estimators=10,max_depth= 3, random_state=42)\n", + "\n", + "# Create a pipeline that includes the SMOTE transformer and the model\n", + "pipeline = PMMLPipeline([ ('model', model)])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "455a3ff5-e8ee-4c9b-909e-3e1a79fa6612", + "metadata": {}, + "outputs": [], + "source": [ + "# Train the pipeline\n", + "start = time.time()\n", + "pipeline.fit(X_train, y_train)\n", + "end = time.time()\n", + "print('duration : ', end-start, 's')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61ff634a-aea7-4966-bf38-30b77547f0a3", + "metadata": {}, + "outputs": [], + "source": [ + "# make predictions on the training set\n", + "y_train_pred = pipeline.predict(X_train)\n", + "\n", + "# calculate and print the accuracy score\n", + "acc = accuracy_score(y_train, y_train_pred)\n", + "print(\"Accuracy: {:.2f}%\".format(acc * 100))\n", + "\n", + "# calculate and print precision, AUC and F1-score\n", + "prec = precision_score(y_train, y_train_pred)\n", + "print(\"Precision: {:.2f}%\".format(prec * 100))\n", + "\n", + "# calculate AUC, AUC requires probability for positive class\n", + "prob = pipeline.predict_proba(X_train)[:, 1]\n", + "auc = roc_auc_score(y_train, prob)\n", + "print(\"AUC: {:.2f}%\".format(auc * 100))\n", + "\n", + "f1 = f1_score(y_train, y_train_pred)\n", + 
"print(\"F1-Score: {:.2f}%\".format(f1 * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60a0b3c9-4a3f-478c-a9f9-2ddd786aa332", + "metadata": {}, + "outputs": [], + "source": [ + "pmml_metrics=pd.DataFrame([{'Model':'PMML using BYOM','Accuracy':acc, 'Precision':prec, 'F1-Score':f1}])\n", + "pmml_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da084cfa-5c7b-4899-9c9b-41b065546bf6", + "metadata": {}, + "outputs": [], + "source": [ + "sklearn2pmml(pipeline, \"my_model.pmml\", with_repr = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c35b23c2-c4c4-4601-b374-9d021a4845b0", + "metadata": {}, + "outputs": [], + "source": [ + "additional_columns = {\"Description\": type(\"RandomForestClassifier model\"),\n", + " \"UserId\": type('demo_user'),\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": prec,\n", + " \"ModelAUC\": auc,\n", + " \"Modelf1Score\": f1,\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": end-start,\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + "for k in additional_columns.keys():\n", + " print(type(additional_columns[k]))" + ] + }, + { + "cell_type": "markdown", + "id": "8351d68c-fed5-4034-b00f-fe0379625090", + "metadata": {}, + "source": [ + "
    \n", + "

    6.2 Save the model file

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ffc1be2-d980-4468-9fc9-58ef30e5cb27", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " save_byom(model_id = 'model_anomaly1',\n", + " model_file = 'my_model.pmml',\n", + " table_name = 'BYOM_PMMLMODELS_REPOSITORY',\n", + " additional_columns={\"Description\": \"RandomForestClassifier model\",\n", + " \"UserId\": 'demo_user',\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": float(prec),\n", + " \"ModelAUC\": float(auc),\n", + " \"Modelf1Score\": float(f1),\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": float(end-start),\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + " )\n", + "except Exception as e: \n", + " # if our model exists, delete and rewrite \n", + " if str(e.args).find('TDML_2200') >= 1: \n", + " delete_byom(model_id = 'model_anomaly1', table_name = 'BYOM_PMMLMODELS_REPOSITORY') \n", + " save_byom(model_id = 'model_anomaly1',\n", + " model_file = 'my_model.pmml',\n", + " table_name = 'BYOM_PMMLMODELS_REPOSITORY',\n", + " additional_columns={\"Description\": \"RandomForestClassifier model\",\n", + " \"UserId\": 'demo_user',\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": float(prec),\n", + " \"ModelAUC\": float(auc),\n", + " \"Modelf1Score\": float(f1),\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": float(end-start),\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + " )\n", + " else: \n", + " raise ValueError(f\"Unable to save the model due to the following error: {e}\")\n", + "# pass \n", + "# else: \n", + "# raise \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "60c0f97c-52b2-407e-921c-75a61ca2d3fa", + "metadata": {}, + "source": [ + "

    The model file is saved and can be found in the left navigation pane under /UseCases/Anomaly_Detection.

    \n", + "\n", + "

    We create new scaled data to apply this model and predict data. New dataset is created by joining the features and the anomalies.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60fe7dff-a0fa-43a6-aa03-d11aeed2904e", + "metadata": {}, + "outputs": [], + "source": [ + "newdata = features[['WELDING_ID']+feature_names].join(other=anomalies, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "newdata = newdata.assign(WELDING_ID=newdata.WELDING_ID_l).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1).select(['WELDING_ID']+feature_names+['anomaly'])\n", + "newdata" + ] + }, + { + "cell_type": "markdown", + "id": "bd7108ab-49b6-411a-a919-4ab7f859252e", + "metadata": {}, + "source": [ + "

    We transform this new data using the same ScaleFit object we used earlier, which gives us the scaled version of the new data.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "099b4d80-3bb8-4e96-ba57-c85c84ae990a", + "metadata": {}, + "outputs": [], + "source": [ + "newdata_scaled = ScaleTransform(data=newdata,\n", + " object=scaler.output,\n", + " # DataFrame(in_schema('demo_user','scaler_anomaly')),\n", + " accumulate=[\"WELDING_ID\",\"anomaly\"]).result\n", + "newdata_scaled" + ] + }, + { + "cell_type": "markdown", + "id": "46bb63a9-35eb-40e9-a4d4-d1aa558b19d1", + "metadata": {}, + "source": [ + "
    \n", + "

    6.3 Retrieve the model file and use it to predict

    \n", + "

    We use the PMMLPredict function from the teradataml library to predict the anomalies.

    \n", + "

    Predictive Model Markup Language (PMML) is an XML-based standard established by the Data Mining Group (DMG) for defining statistical and data-mining models. PMML models can be shared between PMML-compliant platforms and across organizations so that business analysts and developers are unified in designing, analyzing, and implementing PMML-based assets and services.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16f0c6bb-3551-4337-a4e3-8c2a79fd55cc", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import PMMLPredict\n", + "modeldata_anomaly = retrieve_byom(\"model_anomaly1\", table_name=\"BYOM_PMMLMODELS_REPOSITORY\")\n", + "result=PMMLPredict(\n", + " modeldata = modeldata_anomaly,\n", + " newdata = newdata_scaled,\n", + " accumulate = ['WELDING_ID'],\n", + " model_output_fields=['probability(0)','probability(1)'],\n", + " overwrite_cached_models = '*'\n", + " )\n", + "pmml_predict=result.result\n", + "pmml_predict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f03ec30-32a9-4b13-af64-78eaa88b79e1", + "metadata": {}, + "outputs": [], + "source": [ + "pmml_predict_result = pmml_predict.join(other=newdata_scaled, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "pmml_predict_result = pmml_predict_result.assign(prob_0=pmml_predict_result['probability(0)'])\n", + "pmml_predict_result = pmml_predict_result.assign(prob_1=pmml_predict_result['probability(1)'])\n", + "pmml_predict_result = pmml_predict_result.assign(WELDING_ID=pmml_predict_result.WELDING_ID_l)\n", + "pmml_predict_result = pmml_predict_result.assign(prediction=case([(pmml_predict_result.prob_1>pmml_predict_result.prob_0, 1 )],else_ = 0))\n", + "pmml_predict_result = pmml_predict_result.select(['WELDING_ID']+['anomaly']+['prob_0']+['prob_1']+['prediction'])\n", + "pmml_predict_result" + ] + }, + { + "cell_type": "markdown", + "id": "220bb477-2d63-4672-98a1-cb50d40f960f", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    7. Decision Forest

    \n", + " \n", + "

    We will now use the DecisionForest model to predict the anomalies. A decision forest is a generic term to describe models made of multiple decision trees. The prediction of a decision forest is the aggregation of the predictions of its decision trees. The implementation of this aggregation depends on the algorithm used to train the decision forest. The goal of using a decision tree is to create a training model that can be used to predict the class or value of the target variable by learning simple decision rules inferred from prior data (training data).

    \n", + "\n", + "

    We start by creating a subset of the data, since the most interesting part of each curve lies between 40 and 400 ms from the start of the curve.

    \n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8a84c6-2c67-43c7-86e2-1f31c6bd1c18", + "metadata": {}, + "outputs": [], + "source": [ + "DF_curves_zoom = welding_dataset_new[(welding_dataset_new.TIME_MS > 40) & (welding_dataset_new.TIME_MS < 400) ]\n", + "DF_curves_zoom" + ] + }, + { + "cell_type": "markdown", + "id": "58c9f479-f2ff-4863-b969-b9b8a873e6d4", + "metadata": {}, + "source": [ + "

    We create various features by using the window function on the Resistance and taking the difference between the previous and current resistance based on time. We will create these features by using the aggregation function on this resistance and the difference of the resistance.

    \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a227337c-3b57-443c-a256-dd5230ed98dd", + "metadata": {}, + "outputs": [], + "source": [ + "DF_curves_zoom = DF_curves_zoom.assign(\n", + " resistance_diff = DF_curves_zoom.RESISTANCE \n", + " - DF_curves_zoom.RESISTANCE.window(\n", + " partition_columns=['WELDING_ID'],\n", + " order_columns=[\"TIME_MS\"]\n", + " ).lag(1)\n", + ")\n", + "# DF_curves_zoom[DF_curves_zoom.WELDING_ID==138].sort(\"TIME_MS\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb8c00e7-c465-46ba-99ae-c094969a2eed", + "metadata": {}, + "outputs": [], + "source": [ + "DF_features = DF_curves_zoom.groupby(\"WELDING_ID\").agg({\n", + " 'RESISTANCE':['sum', 'min', 'max', 'mean', 'std', 'var','skew','kurtosis'],\n", + " 'resistance_diff':['min']\n", + "})\n", + "DF_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6498373-8b50-49fb-ac0b-b0db7b0cb522", + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = DF_features.columns[1:]\n", + "feature_names" + ] + }, + { + "cell_type": "markdown", + "id": "57712977-e195-4ce9-9867-a7cdbc772279", + "metadata": {}, + "source": [ + "
    \n", + "

    7.1 Build the analytical dataset.

    \n", + "

    We create the analytical dataset joining the anomaly table created above and the dataset with the features created.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55686241-b413-45eb-a495-9888c946c634", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DF_target = DataFrame('Anomaly_Target')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f0b595e-d794-4797-9125-b0bd2e9b046a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DF_ADS_train = DF_features[['WELDING_ID']+feature_names].join(\n", + " other=DF_target, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "DF_ADS_train = DF_ADS_train.assign(WELDING_ID=DF_ADS_train.WELDING_ID_l\n", + " ).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1\n", + " ).select(['WELDING_ID']+feature_names+['anomaly']\n", + " ).assign(anomaly_int = DF_ADS_train.anomaly.cast(INTEGER()))\n", + "DF_ADS_train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0199e5db-a881-4a2e-92df-0fcc0a54158b", + "metadata": {}, + "outputs": [], + "source": [ + "DF_ADS_score = DF_features[['WELDING_ID']+feature_names]\\\n", + " [DF_features.WELDING_ID>800]\n", + "DF_ADS_score" + ] + }, + { + "cell_type": "markdown", + "id": "d3865607-6205-43e4-a3be-2142af2dd340", + "metadata": {}, + "source": [ + "

    We store these training and scoring datasets into Vantage to be used by the In-DB functions.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d0d263-3183-4e37-aa05-6f5ccd61ac49", + "metadata": {}, + "outputs": [], + "source": [ + "DF_ADS_train.to_sql(\n", + " table_name = 'ADS_train_data',\n", + " primary_index= 'WELDING_ID',\n", + " if_exists = 'replace'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db8815e7-cdc4-40fb-9160-bfd466d7535f", + "metadata": {}, + "outputs": [], + "source": [ + "DF_ADS_score.to_sql(\n", + " table_name = 'ADS_test_data',\n", + " primary_index= 'WELDING_ID',\n", + " if_exists = 'replace'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f38cc3c9-6828-4c65-9b72-53ea02a172cd", + "metadata": {}, + "source": [ + "
    \n", + "

    7.2 Train Decision Forest

    \n", + "

    The DecisionForest is an ensemble algorithm used for classification and regression predictive modelling problems. It is an extension of bootstrap aggregation (bagging) of decision trees.

    \n", + "\n", + "

    This function takes the training data as input, as well as the following function parameters

    \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5217c805-010b-4184-b312-b22c7f0b1d49", + "metadata": {}, + "outputs": [], + "source": [ + "DecisionForest_out = DecisionForest(data = DF_ADS_train, \n", + " input_columns = ['sum_RESISTANCE', 'min_RESISTANCE', 'max_RESISTANCE', 'mean_RESISTANCE', 'std_RESISTANCE', 'var_RESISTANCE', 'skew_RESISTANCE',\n", + " 'kurtosis_RESISTANCE', 'min_resistance_diff'], \n", + " response_column = 'anomaly_int', \n", + " max_depth = 16, \n", + " num_trees = 8, \n", + " min_node_size = 1, \n", + " mtry = 1, \n", + " mtry_seed = 3, \n", + " seed = 3, \n", + " tree_type = 'CLASSIFICATION')\n", + "# Print the result DataFrame.\n", + "# print(DecisionForest_out.result) \n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "aca1ef54-8f11-48af-9d9f-ffe19a08b050", + "metadata": {}, + "source": [ + "
    \n", + "

    7.3 Predict and Evaluate Decision Forest model

    \n", + "

    Execute a testing prediction using the split data above. Evaluate the model by creating a confusion matrix with the ClassificationEvaluator SQL Function.

    \n", + "\n", + "\n", + "
      \n", + "
    1. Execute DecisionForestPredict using the model built above
    2. \n", + "
    3. Execute ClassificationEvaluator and pass the actual classification and the predicted value
    4. \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12ca4daf-6b7f-453b-b690-3ca59df0fb6b", + "metadata": {}, + "outputs": [], + "source": [ + "decision_forest_predict_out = TDDecisionForestPredict(object = DecisionForest_out.result,\n", + " newdata = DF_ADS_train,\n", + " id_column = \"WELDING_ID\",\n", + " detailed = False,\n", + " output_prob = True,\n", + " output_responses = ['0','1'],\n", + " accumulate = 'anomaly_int')\n", + "decision_forest_predict_out.result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "46f0ff60-f38b-40a4-aafe-0d644a284e8f", + "metadata": {}, + "outputs": [], + "source": [ + "# df_predict= DataFrame('DF_Predict')\n", + "df_predict = decision_forest_predict_out.result\n", + "df_predict_char = df = df_predict.assign(anomaly = df_predict.anomaly_int.cast(type_=VARCHAR(2))\n", + " ,prediction_ch = df_predict.prediction.cast(type_=VARCHAR(2)))\n", + "df_predict_char" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7844e297-5036-4ea2-bf0d-aff9b7c8d5d0", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_obj = ClassificationEvaluator(data=df_predict_char,\n", + " observation_column='anomaly',\n", + " prediction_column='prediction_ch',\n", + " labels=['0','1'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c2c4cd5-6f62-4ad3-a8b0-3db15872b87c", + "metadata": {}, + "outputs": [], + "source": [ + "df_metrics = ClassificationEvaluator_obj.output_data\n", + "df_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c771f8db-68a9-4995-bcc9-872892b8bb85", + "metadata": {}, + "outputs": [], + "source": [ + "df_metric_pd = df_metrics.to_pandas(all_rows = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1c78bb0-a809-4a71-bc77-3f2d3bf31285", + "metadata": {}, + "outputs": [], + "source": [ + "df_metric_pd['Metric'] = df_metric_pd['Metric'].str.strip('\\x00')" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "d7771b01-523d-4737-b7ef-75246f87d7f0", + "metadata": {}, + "outputs": [], + "source": [ + "accuracy = df_metric_pd[df_metric_pd['Metric'] == 'Accuracy']['MetricValue'][0]\n", + "precision = df_metric_pd[df_metric_pd['Metric'] == 'Micro-Precision']['MetricValue'][1]\n", + "f1score = df_metric_pd[df_metric_pd['Metric'] == 'Micro-F1']['MetricValue'][3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "07bf6fba-e68c-4a78-9eae-5aa134aa3655", + "metadata": {}, + "outputs": [], + "source": [ + "df_metrics_new=pd.DataFrame([{'Model':'In-DB DecisionForest','Accuracy':accuracy, 'Precision':precision, 'F1-Score':f1score}])\n", + "df_metrics_new" + ] + }, + { + "cell_type": "markdown", + "id": "0311a230-7911-4859-8658-4130c893b72d", + "metadata": {}, + "source": [ + "
    \n", + "

    7.4 Score new Data

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5fde359-12da-4038-963c-c34ce410de04", + "metadata": {}, + "outputs": [], + "source": [ + "decision_forest_predict_test_out = TDDecisionForestPredict(object = DecisionForest_out.result,\n", + " newdata = DF_ADS_score,\n", + " id_column = \"WELDING_ID\",\n", + " detailed = False,\n", + " output_prob = True,\n", + " output_responses = ['0','1'])\n", + "decision_forest_predict_test_out.result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58448648-772a-445e-9989-8b174ac9db2f", + "metadata": {}, + "outputs": [], + "source": [ + "# df_predict_test= DataFrame('DF_Predict_test')\n", + "df_predict_test=decision_forest_predict_test_out.result\n", + "df_predict_test" + ] + }, + { + "cell_type": "markdown", + "id": "cda02bba-235d-4f1a-b2a7-3e2ea619cce2", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    8. Compare PMML and DecisionForest

    \n", + "
    \n", + "

    8.1 Show AUC-ROC Curve

    \n", + "\n", + "

    The ROC curve shows the performance of a binary classification model as its discrimination threshold varies. For a range of thresholds, the curve plots the true positive rate against false-positive rate.

    \n", + "\n", + "

    This function accepts a set of prediction-actual pairs as input and calculates the following values for a range of discrimination thresholds.

    \n", + " \n", + "\n", + "

    ROC for PMML

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c4b179b-a334-4dc0-b3f8-71c35f87283e", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ROC \n", + "roc_pmml = ROC(data = pmml_predict_result, \n", + " probability_column = \"prob_1\",\n", + " observation_column = \"anomaly\",\n", + " positive_class=\"1\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32b946fb-e09e-4e62-b78a-c5325d84c175", + "metadata": {}, + "outputs": [], + "source": [ + "roc_data_pmml = roc_pmml.output_data.to_pandas().sort_values(\"fpr\", ascending=True)\n", + "roc_data_pmml.tail(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de67ebb4-b0f9-4a8c-9559-e6a44f1c9a21", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "auc_pmml = roc_pmml.result.to_pandas().iloc[0,0]\n", + "auc_pmml" + ] + }, + { + "cell_type": "markdown", + "id": "baf0989e-387a-4ee9-b99e-0687d5a97799", + "metadata": {}, + "source": [ + "

    ROC for DecisionForest

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02a1c9e2-be8c-44da-9e0a-9056a2ec8243", + "metadata": {}, + "outputs": [], + "source": [ + "roc_obj = ROC(data = df_predict, \n", + " probability_column = \"prob_1\",\n", + " observation_column = \"anomaly_int\",\n", + " positive_class=\"1\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27834036-13cc-49e9-a34e-b2bcb2c192b1", + "metadata": {}, + "outputs": [], + "source": [ + "roc_data = roc_obj.output_data.to_pandas().sort_values(\"fpr\", ascending=True)\n", + "roc_data.tail(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab90afd6-b0c1-4edd-9492-c97b16c8d4e0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "auc = roc_obj.result.to_pandas().iloc[0,0]\n", + "auc" + ] + }, + { + "cell_type": "markdown", + "id": "abb98428-872c-41d5-b8b1-79804c772a8a", + "metadata": {}, + "source": [ + "

    Plot ROC Curves

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93ab97d1-cbd3-4044-8546-0f170a5ca9ce", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot 1\n", + "plt.plot(roc_data_pmml['fpr'], roc_data_pmml['tpr'], color='orange', label='PMML ROC. AUC = {}'.format(str(auc_pmml)), drawstyle='steps') \n", + "# Plot 2\n", + "plt.plot(roc_data['fpr'], roc_data['tpr'], color='green', label='DecisionForest ROC. AUC = {}'.format(str(auc)), drawstyle='steps') \n", + "# Plot the diagonal dashed line\n", + "plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--') \n", + "# Set labels and title\n", + "plt.xlabel('False Positive Rate',fontsize=12) \n", + "plt.ylabel('True Positive Rate',fontsize=12) \n", + "plt.title('Receiver Operating Characteristic (ROC) Curve',fontsize=16) \n", + "# Add legend\n", + "plt.legend(loc=\"lower right\",fontsize=10) \n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c721c745-be69-4eee-a8e2-9faa4ecff46e", + "metadata": {}, + "source": [ + "

    The closer the ROC curve is to the upper left corner of the graph, the higher the accuracy of the test because in the upper left corner, the sensitivity = 1 and the false positive rate = 0 (specificity = 1). The ideal ROC curve thus has an AUC = 1.0. As seen in the above graph the AUC for both the models is close to 1 so the accuracy of both models is very good.

    \n", + "\n", + "
    \n", + "

    8.2 Show Confusion Matrix

    \n", + "\n", + "

    A confusion matrix is a performance measurement for machine learning classification problems where the output can be two or more classes. For a binary problem, it is a table with 4 different combinations of predicted and actual values.

    \n", + "\n", + "

    Confusion matrices represent counts from predicted and actual values. The output “TN” stands for True Negative which shows the number of negative examples classified accurately. Similarly, “TP” stands for True Positive which indicates the number of positive examples classified accurately. The term “FP” shows False Positive value, i.e., the number of actual negative examples classified as positive; and “FN” means a False Negative value which is the number of actual positive examples classified as negative.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cac3275-2854-464a-b240-03e7b836b96d", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate confusion matrix for PMML\n", + "DF_result=df_predict.to_pandas().reset_index()\n", + "pmml_result=pmml_predict_result.to_pandas()\n", + "cm_pmml = confusion_matrix(pmml_result['anomaly'], pmml_result['prediction']) \n", + "# Calculate confusion matrix for DecisionForest\n", + "cm_df = confusion_matrix(DF_result['anomaly_int'], DF_result['prediction']) \n", + "# Create figure and axes objects\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8)) \n", + "# Plot PMML confusion matrix\n", + "disp_pmml = ConfusionMatrixDisplay(confusion_matrix=cm_pmml, display_labels=['No Anomaly', 'Anomaly']) \n", + "disp_pmml.plot(ax=ax1, cmap='Blues', colorbar=False) \n", + "ax1.set_title('PMML Confusion Matrix') \n", + "ax1.set_xlabel('Predicted Label') \n", + "ax1.set_ylabel('True Label') \n", + "ax1.set_xticks([0, 1]) \n", + "ax1.set_yticks([0, 1]) \n", + "ax1.set_xticklabels(['No Anomaly', 'Anomaly']) \n", + "ax1.set_yticklabels(['No Anomaly', 'Anomaly'])\n", + "\n", + "# Add text to the plot to show the actual values of the confusion matrix\n", + "for i in range(cm_pmml.shape[0]): \n", + " for j in range(cm_pmml.shape[1]): \n", + " ax1.text(j, i, f'{cm_pmml[i, j]}', ha='center', va='center', color='white' if cm_pmml[i, j] > cm_pmml.max() / 2 else 'black') \n", + "\n", + "# Plot DecisionForest confusion matrix\n", + "disp_df = ConfusionMatrixDisplay(confusion_matrix=cm_df, display_labels=['No Anomaly', 'Anomaly']) \n", + "disp_df.plot(ax=ax2, cmap='Blues', colorbar=False) \n", + "ax2.set_title('DecisionForest Confusion Matrix') \n", + "ax2.set_xlabel('Predicted Label') \n", + "ax2.set_ylabel('True Label') \n", + "ax2.set_xticks([0, 1]) \n", + "ax2.set_yticks([0, 1]) \n", + "ax2.set_xticklabels(['No Anomaly', 'Anomaly']) \n", + "ax2.set_yticklabels(['No Anomaly', 'Anomaly'])\n", + "\n", + "# 
Add text to the plot to show the actual values of the confusion matrix\n", + "for i in range(cm_df.shape[0]): \n", + " for j in range(cm_df.shape[1]): \n", + " ax2.text(j, i, f'{cm_df[i, j]}', ha='center', va='center', color='white' if cm_df[i, j] > cm_df.max() / 2 else 'black') \n", + "\n", + "# Adjust layout and spacing\n", + "plt.tight_layout() \n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "6f7bd547-6020-42c0-b2a7-d1938a9bdb30", + "metadata": {}, + "source": [ + "

    The confusion matrix for this binary class classification problem has the below 4 quadrants:

    \n", + "\n", + "
  • True Positive (TP) refers to a sample belonging to the positive class being classified correctly.
  • \n", + "
  • True Negative (TN) refers to a sample belonging to the negative class being classified correctly.
  • \n", + "
  • False Positive (FP) refers to a sample belonging to the negative class but being classified wrongly as belonging to the positive class.
  • \n", + "
  • False Negative (FN) refers to a sample belonging to the positive class but being classified wrongly as belonging to the negative class.
  • \n", + "\n", + "
    \n", + "

    8.3 Show Metrics

    \n", + "\n", + "

    Below is a comparison of the Accuracy, Precision, and F1-Score of the two models.

    \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    ColumnDescription
    PrecisionThe positive predictive value. Refers to the fraction of relevant instances among\n", + "the total retrieved instances.\n", + " Precision answers the following question: what proportion of predicted Positives is truly Positive? \n", + " Precision = (TP)/(TP+FP)
    AccuracyAccuracy simply measures how often the classifier correctly predicts. We can define accuracy as the ratio of the number of correct predictions and the total number of predictions.
    F1F1 score is defined as the harmonic mean of precision and recall and is a number between 0 and 1. F1 score maintains a balance between the precision and recall for your classifier. \n", + " F1 = 2*(precision*recall)/(precision+recall)
    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5559331d-05a2-417a-aa23-cd9eae401b9c", + "metadata": {}, + "outputs": [], + "source": [ + "combined_metrics=pd.concat([pmml_metrics, df_metrics_new], axis=0)\n", + "combined_metrics" + ] + }, + { + "cell_type": "markdown", + "id": "41344788-3979-4546-a6e2-ae16b6dccb79", + "metadata": {}, + "source": [ + "

    From the above metrics, we can conclude that both models perform almost identically and have similar Accuracy and Precision.

    " + ] + }, + { + "cell_type": "markdown", + "id": "43be6263-22d8-43d2-94e2-1f58d730f567", + "metadata": {}, + "source": [ + "

    Conclusion

    \n", + "

    We have seen an end-to-end exploration process for labelling anomalous time series using ClearScape Analytics on Teradata Vantage. Thanks to the in-database capabilities offered by Teradata Vantage with ClearScape Analytics, we were able to run this exploration with the smallest notebook instance. The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.

    \n", + "

    In this particular use case, we have observed that, with a large volume of machine sensor data, millions of ML models were created to derive analytic features that ultimately deployed tens of thousands of models for real-time scoring. This extent of scale is only possible by combining the power of Vantage with native ClearScape Analytic functions.

    " + ] + }, + { + "cell_type": "markdown", + "id": "29e90d19-1b71-44e8-b6d5-aa53e3b673c1", + "metadata": {}, + "source": [ + "
    \n", + "

    8. Cleanup

    \n", + "

    Work Tables

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48a959e6-319f-4592-93af-482d391224b4", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['ADS_train_data', 'ADS_test_data','DF_train', 'DF_Predict', 'DF_Predict_test','additional_metrics_test']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbf8f9bc-9f3a-47e9-b2d4-81fd00291bc8", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "d51fd98f-b9b2-48b9-b639-16cc51f9116f", + "metadata": {}, + "source": [ + "
    \n", + "

    9. Exploring the Versatility of this Analytical Approach in Alternative Use Case Settings

    \n", + "

    How this analytic approach can be leveraged in other use case settings

    \n", + "\n", + "

    The analytical approach of leveraging clustering followed by classification for anomaly detection in short time series data is highly adaptable and can be broadly applied across various industries, especially in settings where operations or processes are characterized by short, continuous time series with a defined start and end and where ground truth labels are not initially available.

    \n", + "

    This method begins with unsupervised learning to explore and understand the data, identifying patterns, similarities, and potential outliers through techniques like Dynamic Time Warping (DTW). Such exploration is crucial in settings where anomalies are not predefined or where the data’s inherent complexity requires initial unsupervised insight to develop an understanding of what constitutes normal behavior versus an anomaly. Following the clustering phase, supervised classification models are trained on the newly identified labels to predict anomalies. This generic approach is particularly effective for short time series data, where each sequence represents a process or event whose normal operational parameters need to be defined through exploratory analysis before precise anomaly detection can occur.

    \n", + "

    Potential Use Cases Across Industries:

    \n", + "
  • Telco & Utilities - Power Grid Load Monitoring: Analyzing short time series of electricity load during peak usage times to identify anomalies that could indicate equipment failure, energy theft, or inefficiencies in power distribution. Each series could represent the load profile for a brief, high-demand period.
  • \n", + "
  • Healthcare - ECG or EEG Analysis: Short segments of electrocardiogram (ECG) or electroencephalogram (EEG) readings can be analyzed to detect anomalies indicating cardiac arrhythmias or neurological issues, respectively. Each segment represents a complete heartbeat or a brief brain activity pattern.
  • \n", + "
  • Manufacturing - CNC Machine Operations: Monitoring the torque and force profiles of a CNC (Computer Numerical Control) machine during a single machining operation. Anomalies could indicate tool wear, material inconsistency, or operational errors.
  • \n", + "
  • Travel & Transport - Aircraft Engine Test Runs: Analyzing the time series data of engine parameters (e.g., temperature, pressure, vibration) during short test runs to identify deviations from normal operational profiles, suggesting maintenance or safety issues.
  • \n", + "
  • Hospitality & Entertainment - Theme Park Ride Operations: Analyzing sensor data from individual rides, where each ride cycle produces a time series of mechanical or operational parameters. Anomalies in these series could indicate safety concerns or maintenance needs.
  • \n", + "

    Conclusion

    \n", + "

    In each of these scenarios, the focus is on analyzing the shape or behavior of a curve within a short time frame, similar to observing a spot welding curve. These curves are shaped by the specific activity taking place, whether it’s a machine at work, a health test running, financial trades happening, or people interacting with a service. The method begins by sorting these curves into groups based on their patterns, without needing to know ahead of time which ones are out of the ordinary. Then, it moves on to use a more detailed approach to pinpoint which curves don’t fit the expected pattern, labeling them as either normal or not normal. This way of doing things is great for quickly finding and addressing issues, and it also helps in getting a better grasp of how these processes work. This can lead to making things run more smoothly and keeping equipment in good shape before problems even start.

    " + ] + }, + { + "cell_type": "markdown", + "id": "91bd8857-19e0-4200-b3ae-b2efdbca73d3", + "metadata": {}, + "source": [ + "
    \n", + "Resources\n", + "

    Let’s look at the elements we have available for reference for this notebook:

    \n", + "Filters: \n", + "
  • Industry: Manufacturing
  • \n", + "
  • Functionality: Machine Learning
  • \n", + "
  • Use Case: Anomaly Detection
  • \n", + "Related Resources:\n", + "
  • Hyper-scale time series forecasting done right
  • \n", + "
  • Stay Ahead of Continuous and Rapid Change with a Dynamic Supply Chain
  • \n", + "
  • Achieve industry 4.0 using advanced manufacturing analytics at scale
  • \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "1da48da7-d4de-4693-9365-5d5f63810673", + "metadata": { + "tags": [] + }, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python_TrustedAI.ipynb b/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python_TrustedAI.ipynb new file mode 100644 index 00000000..e1d35dbb --- /dev/null +++ b/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python_TrustedAI.ipynb @@ -0,0 +1,80545 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b1378a69-ac58-4d0c-af22-7ef881abac45", + "metadata": {}, + "source": [ + "
    \n", + "

    \n", + " Anomaly Detection in Robot Welding Process
    Trusted AI\n", + "
    \n", + " \"Teradata\"\n", + "

    \n", + "
    \n", + "\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "d0f87457-4516-4584-be1c-165cbf682c24", + "metadata": {}, + "source": [ + "

    Introduction

    \n", + "\n", + "

    Detecting anomalies reduces issues and delays in many industries, especially in the manufacturing field. There have been approaches to detect anomalies in the past, such as engineering rules and graph and deep learning. However, it still proves difficult to detect all the existing anomalies. Plus, companies are striving to minimize false positives, cope with the diversity of sensors and metrology issues, and deliver actionable insights at a business pace. Fortunately, Teradata and ClearScape Analytics have the solution. In ClearScape Analytics, users can execute all steps of anomaly detection from data preparation and exploration to model training and evaluations and adjustments. These analyses can improve the process and ensure accuracy in anomaly detection.

    \n", + "\n", + "

    Spot Welding Quality Assessment

    \n", + "

    Spot welding is a common technique used for welding car body panels, particularly in the assembly of smaller parts and components. Spot welding involves using a pair of copper electrodes to apply a series of short, high-current welding pulses to the metal, fusing the parts together at specific points or “spots”.

    \n", + "\n", + "

    The automotive industry is known for its high level of automation, and spot welding is one of the most automated processes, heavily reliant on robots to improve efficiency, reduce labor costs, and improve the consistency and quality of the finished product. Poor welding quality is rare, but even so, the consequences of poor quality may not be negligible in terms of rework costs and customer satisfaction, especially when quality issues are detected too late.

    \n", + "\n", + "\n", + "\n", + "

    Spot welding is a resistance welding process that uses large electrical current. There are many ways to assess the quality of a spot, like tensile or ultrasonic testing to assess the weld strength or the analysis of the welding current measured and recorded during the welding process. In this demo, we focus on the analysis of the anomalies in the welding spot due to welding current, and more specifically the resistance, i.e. the voltage-current ratio which impacts the quality of the welding. The shape of the resistance curve depends on many factors like the nature of the materials, the geometry, and the quality of the electrodes etc.

    \n", + "\n", + "\n", + "

    Business Values

    \n", + "
  • Improve accuracy in the production and manufacturing process.
  • \n", + "
  • Reduce the number of false positive anomalies detected in a system.
  • \n", + "
  • Decrease additional costs and time wasted due to undetected anomalies.
  • \n", + "
  • Determine patterns and significant factors that lead to anomalies.
  • \n", + "

    Why Vantage?

    \n", + "

    Many organizations fail to realize value from their ML and AI investments due to a lack of scale. It is estimated that for broad adoption across many industries, the number of models and model deployments needs to scale 100-1000x larger than their organizations currently support.

    \n", + "

    The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.

    \n", + "

    In this particular use case, the volume of machine sensor data was so great that millions of ML models were created to derive analytic features that ultimately deployed tens of thousands of models for real-time scoring. This extent of scale is only possible by combining the power of Vantage with native ClearScape Analytic functions.

    \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "7f41da40-f1e9-4979-9e0f-bd5ba6460443", + "metadata": {}, + "source": [ + "
    \n", + "

    1. Configure the environment

    \n", + "

    \n", + "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n", + "
    \n", + "Here's how we can do this:

    " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "d5933363-d749-42da-bde7-4524f5f013b9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install dotenv\n", + "!pip install lime\n", + "!pip install scikit-learn==1.1.3" + ] + }, + { + "cell_type": "markdown", + "id": "8f352f94-db8e-48e8-8040-a63c466fd4f5", + "metadata": {}, + "source": [ + "
    \n", + "

    Note: After installing the above libraries, please restart the kernel. The simplest way is by typing zero zero: 0 0

    \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3a0b76c2-b211-452f-949c-676da6da9540", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import json\n", + "import getpass\n", + "import pandas as pd\n", + "import datetime\n", + "from teradataml import *\n", + "\n", + "import numpy as np # linear algebra\n", + "import matplotlib.pyplot as plt\n", + "import sklearn\n", + "from sklearn import preprocessing\n", + "# from tdsense.clustering import hierarchy_dendrogram, hierarchy_clustering\n", + "# from tdnpathviz.visualizations import plotcurves\n", + "%matplotlib inline\n", + "\n", + "from sklearn import datasets\n", + "from sklearn2pmml.pipeline import PMMLPipeline\n", + "from sklearn2pmml import sklearn2pmml\n", + "from sklearn.model_selection import train_test_split\n", + "# from imblearn.over_sampling import SMOTE\n", + "import imblearn\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import accuracy_score, precision_score, roc_auc_score, f1_score,confusion_matrix, roc_curve, ConfusionMatrixDisplay\n", + "import time\n", + "import pytz\n", + "import lime\n", + "from dotenv import load_dotenv, dotenv_values\n", + "from IPython.display import display as ipydisplay\n", + "from IPython.display import clear_output\n", + "from time import sleep\n", + "\n", + "#Set java path\n", + "from jdk4py import JAVA, JAVA_HOME, JAVA_VERSION\n", + "os.environ['PATH'] = os.environ['PATH'] + os.pathsep + str(JAVA_HOME)\n", + "os.environ['PATH'] = os.environ['PATH'] + os.pathsep + str(JAVA)[:-5]\n", + "\n", + "from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot\n", + "from collections import defaultdict\n", + "import plotly.offline as offline\n", + "offline.init_notebook_mode()\n", + "\n", + "from 
teradataml.dataframe.sql_functions import case\n", + "from teradataml import db_drop_table\n", + "configure.byom_install_location = \"td_mldb\"\n", + "\n", + "display.max_rows = 5\n", + "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter(action='ignore', category=DeprecationWarning)\n", + "warnings.simplefilter(action='ignore', category=RuntimeWarning)\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)" + ] + }, + { + "cell_type": "markdown", + "id": "8c250746-66ba-40aa-b41b-c791786f61a0", + "metadata": {}, + "source": [ + "
    \n", + "

    2. Connect to VantageCloud Lake

    \n", + "

    Connect to VantageCloud using create_context from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2be07d96-51d3-4aee-b025-582af97119da", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking if this environment is ready to connect to VantageCloud Lake...\n", + "Your environment parameter file exist. Please proceed with this use case.\n", + "Connected to VantageCloud Lake with: Engine(teradatasql://jd255091:***@54.156.178.22)\n" + ] + } + ], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=VCL_AnomalyDetection_TrustedAI.ipynb;' UPDATE FOR SESSION; ''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c80f069f-5d34-4f18-93fd-c784897102c0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Authentication token is generated, authenticated and set for the session.\n", + "UES Authentication successful\n" + ] + } + ], + "source": [ + "# We've already loaded all the values into our environment variables and into a dictionary, env_vars.\n", + "# username=env_vars.get(\"username\") isn't required when using base_url, pat and pem.\n", + 
"\n", + "if set_auth_token(base_url=env_vars.get(\"ues_uri\"),\n", + " pat_token=env_vars.get(\"access_token\"), \n", + " pem_file=env_vars.get(\"pem_file\"),\n", + " valid_from=int(time.time())\n", + " ):\n", + " print(\"UES Authentication successful\")\n", + "else:\n", + " print(\"UES Authentication failed. Check credentials.\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "id": "9476f53a-7115-4018-a58f-dd09f7fc8b88", + "metadata": {}, + "source": [ + "
    \n", + "

    3. Load the data

    \n", + "\n", + "

    We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_AnomalyDetection\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").

    \n", + " \n", + "

    **Note: The tables are available in DEMO_AnomalyDetection_DB database and we have created views in DEMO_AnomalyDetection database which are used in the cells below

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "99598e0a-8a6c-4539-a06d-f6723f67134f", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "add51f496db2440e9195e9a4369d27cc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PARTITION_IDIDXY
    10905105233.91074826423207
    10905208293.063489325248
    1090564221.8570197466893
    10905194299.3853606177206
    1090554219.24825515502602
    " + ], + "text/plain": [ + " X Y\n", + "PARTITION_ID ID \n", + "10 905 105 233.910748\n", + " 905 208 293.063489\n", + " 905 64 221.857020\n", + " 905 194 299.385361\n", + " 905 54 219.248255" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Sensor_Data = DataFrame(in_schema('DEMO_AnomalyDetection', 'Sensor_Data'))\n", + "Sensor_Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "fb3791b3-ba71-4c52-bec7-bb6350a498e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(908440, 4)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Sensor_Data.shape" + ] + }, + { + "cell_type": "markdown", + "id": "d4b9b958-737d-41a0-adec-91614fa0fe2e", + "metadata": {}, + "source": [ + "

    We get the above data from sensors. We focus on one plant (PLANT=1) and one robot (ROBOT_ID=41). The Partition_ID is the type of welding, ID is the WELDING_ID, X is time required for welding in ms and Y is the RESISTANCE. We create a view with the columns required to get data with proper column names.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "88cde234-6107-487e-92f2-7f045576cc1d", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "query = f\"\"\"\n", + "REPLACE VIEW DEMO_AnomalyDetection.V_dataset_01 AS\n", + "SELECT\n", + " 1 AS PLANT\n", + ", {41} AS ROBOT_ID\n", + ", CAST(A.PARTITION_ID AS BIGINT) AS WELDING_TYPE\n", + ", CAST((DATE '{str(datetime.datetime.now()).split(' ')[0]}' + FLOOR((WELDING_ID-700*WELDING_TYPE)/100)) AS DATE FORMAT 'YYYY-MM-DD') AS WELDING_DAY\n", + ", CAST(A.ID AS BIGINT) AS WELDING_ID\n", + ", CAST(A.X AS INTEGER) AS TIME_MS\n", + ", A.Y AS RESISTANCE\n", + "FROM DEMO_AnomalyDetection.Sensor_Data A\n", + "\"\"\"\n", + "execute_sql(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8ec3a959-c5e0-4039-88f8-846adca6f113", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dc7b3e698fe042d3847a1767b5cd7fc8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PLANTROBOT_IDWELDING_TYPEWELDING_DAYWELDING_IDTIME_MSRESISTANCE
    141102025-06-12905105233.91074826423207
    141102025-06-12905208293.063489325248
    141102025-06-1290564221.8570197466893
    141102025-06-12905194299.3853606177206
    141102025-06-1290554219.24825515502602
    " + ], + "text/plain": [ + " PLANT ROBOT_ID WELDING_TYPE WELDING_DAY WELDING_ID TIME_MS RESISTANCE\n", + "0 1 41 10 2025-06-12 905 105 233.910748\n", + "1 1 41 10 2025-06-12 905 208 293.063489\n", + "2 1 41 10 2025-06-12 905 64 221.857020\n", + "3 1 41 10 2025-06-12 905 194 299.385361\n", + "4 1 41 10 2025-06-12 905 54 219.248255" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "welding_dataset_new = DataFrame(in_schema('DEMO_AnomalyDetection', 'V_dataset_01'))\n", + "welding_dataset_new" + ] + }, + { + "cell_type": "markdown", + "id": "09198aa2-6ab7-4339-a01a-365cba02c772", + "metadata": {}, + "source": [ + "
    \n", + "

    3.1 - Some aggregations and visualization.

    \n" + ] + }, + { + "cell_type": "markdown", + "id": "f83b1b1a-eece-487a-97d7-b4759ea624ce", + "metadata": {}, + "source": [ + "

    We will check the histogram based on the minimum and maximum Time for welding.

    \n", + "

    A histogram is a better way to assess a distribution. To cope with scale, it is recommended to compute the histogram bins in-database to leverage the Massively Parallel Architecture of Teradata Vantage. For that, we use the Histogram function of teradataml, which pushes down the computations to Vantage.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1a5d38c3-ebb9-47a2-b8ad-f00acd9d769b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3f8e6f0f137346148636bb3c10aca42b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PLANTROBOT_IDWELDING_TYPEWELDING_IDmin_TIME_MSmax_TIME_MScount_TIME_MS
    14143201806806
    14154181642642
    1419801114781478
    1419884110851085
    141109381928928
    " + ], + "text/plain": [ + " PLANT ROBOT_ID WELDING_TYPE WELDING_ID min_TIME_MS max_TIME_MS count_TIME_MS\n", + "0 1 41 4 320 1 806 806\n", + "1 1 41 5 418 1 642 642\n", + "2 1 41 9 801 1 1478 1478\n", + "3 1 41 9 884 1 1085 1085\n", + "4 1 41 10 938 1 928 928" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "welding_duration_ms = welding_dataset_new. \\\n", + " groupby(['PLANT','ROBOT_ID','WELDING_TYPE', 'WELDING_ID']). \\\n", + " agg({'TIME_MS':['min','max','count']})\n", + "welding_duration_ms" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "642bf739-a421-4ffd-8fc1-53f273db9bd9", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f522d71d68b84df89cb78c6b92684e2b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    ColumnNameLabelMinValueMaxValueCountOfValuesBin_Percent
    count_TIME_MS00.0100.0111.0999999999999999
    count_TIME_MS1100.0200.0141.4000000000000001
    count_TIME_MS2200.0300.0343.4000000000000004
    count_TIME_MS3300.0400.0393.9
    count_TIME_MS4400.0500.0474.7
    " + ], + "text/plain": [ + " ColumnName Label MinValue MaxValue CountOfValues Bin_Percent\n", + "0 count_TIME_MS 0 0.0 100.0 11 1.1\n", + "1 count_TIME_MS 1 100.0 200.0 14 1.4\n", + "2 count_TIME_MS 2 200.0 300.0 34 3.4\n", + "3 count_TIME_MS 3 300.0 400.0 39 3.9\n", + "4 count_TIME_MS 4 400.0 500.0 47 4.7" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from teradataml import Histogram\n", + "obj = Histogram(data=welding_duration_ms,\n", + " target_columns=\"count_TIME_MS\",\n", + " method_type=\"Scott\")\n", + "res = obj.result.sort('MinValue')\n", + "res" + ] + }, + { + "cell_type": "markdown", + "id": "62b099f0-eb76-45a2-9c0e-983399c59570", + "metadata": {}, + "source": [ + "

    We can see that the histogram values have been calculated using the teradataml functions. ClearScape Analytics integrates easily with third-party visualization tools such as Tableau and Power BI, and with the many Python modules available, such as plotly and seaborn. We can do all the calculations and pre-processing on Vantage and pass only the necessary information to the visualization tools; this not only makes the calculation faster but also saves time, because less data moves between tools. We do the data transfer for this and the subsequent visualizations wherever necessary.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7c9b72ab-7d3c-4964-9199-ee1dcc17c928", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "res = obj.result.sort('MinValue').to_pandas()\n", + "res['duration_ms'] = [str(row['MinValue'])+'-'+str(row['MaxValue']) for i,row in res.iterrows()]\n", + "res.plot(x='duration_ms',y='CountOfValues',kind='bar', figsize=(15,10), legend=False,xlabel='Duration(ms)', ylabel='Welding Counts')" + ] + }, + { + "cell_type": "markdown", + "id": "88429a10-aa8b-459f-976a-6276ab121bbc", + "metadata": {}, + "source": [ + "

    In the above histogram we can see the bins between the Min and the Max value of the durations and the welding counts.

    \n", + "
    \n", + "

    3.2 - More advanced processing using window functions and delta_t

    \n", + "

    Resistance is an important parameter in resistance welding. The resistance should not vary too much. If there are any significant changes in resistance over time, it could indicate an issue with the weld quality. For example, an unusually high resistance could indicate poor contact between the parts being welded or a problem with the welding equipment.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b5615026-52eb-4aae-8bb2-146e88ef4502", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d628ce44d1b549acb4bbd0b861240b89", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PLANTROBOT_IDWELDING_TYPEWELDING_DAYWELDING_IDTIME_MSRESISTANCE
    14192025-06-18854714103.14563314224813
    14192025-06-18854130171.2900655657514
    14192025-06-18854352161.46970909346348
    14192025-06-18854353154.4865896266523
    14192025-06-1885472387.61515040184788
    " + ], + "text/plain": [ + " PLANT ROBOT_ID WELDING_TYPE WELDING_DAY WELDING_ID TIME_MS RESISTANCE\n", + "0 1 41 9 2025-06-18 854 714 103.145633\n", + "1 1 41 9 2025-06-18 854 130 171.290066\n", + "2 1 41 9 2025-06-18 854 352 161.469709\n", + "3 1 41 9 2025-06-18 854 353 154.486590\n", + "4 1 41 9 2025-06-18 854 723 87.615150" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "welding_dataset_new.loc[welding_dataset_new.WELDING_ID == 854]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "50c72091-f7f3-4ed3-a436-ee5c44335f4e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from tdnpathviz.visualizations import plotcurves\n", + "plotcurves(welding_dataset_new.loc[welding_dataset_new.WELDING_ID == 854],field='RESISTANCE',row_axis='TIME_MS', series_id='WELDING_ID',select_id=None)" + ] + }, + { + "cell_type": "markdown", + "id": "ae924828-6e92-4003-93c9-b66aeec1821f", + "metadata": {}, + "source": [ + "

    The above graph shows the variation of the resistance of the welding with respect to time. We see that the most interesting part lies between 40 and 400ms from the start of the curve.

    \n", + "\n", + "

    Next, we apply a window function to the resistance and take the mean value over each window to smooth the curve.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "516d8fd4-ab2c-44cd-89d2-d8075e40cf82", + "metadata": {}, + "outputs": [], + "source": [ + "# curve smoothing\n", + "window_for_smoothing = welding_dataset_new.RESISTANCE.window(\n", + " partition_columns = \"WELDING_ID\",\n", + " order_columns = 'TIME_MS',\n", + " window_start_point = -15,\n", + " window_end_point = 15\n", + ")\n", + "welding_dataset_smooth = welding_dataset_new.assign(RESISTANCE_SMOOTHED = window_for_smoothing.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6c351bab-cd80-452c-b600-79efaec9f769", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "28f1b58f8da242acbd3aae6f80816278", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PLANTROBOT_IDWELDING_TYPEWELDING_DAYWELDING_IDTIME_MSRESISTANCERESISTANCE_SMOOTHED
    14192025-06-188541353.0226900449998253.14252688841657
    14192025-06-188542335.5825378948886248.69892486462675
    14192025-06-188543324.9522665624502244.90621844326972
    14192025-06-188544297.6567809200622241.13418644931798
    14192025-06-188545288.1532672344735237.60587172607387
    " + ], + "text/plain": [ + " PLANT ROBOT_ID WELDING_TYPE WELDING_DAY WELDING_ID TIME_MS RESISTANCE RESISTANCE_SMOOTHED\n", + "0 1 41 9 2025-06-18 854 1 353.022690 253.142527\n", + "1 1 41 9 2025-06-18 854 2 335.582538 248.698925\n", + "2 1 41 9 2025-06-18 854 3 324.952267 244.906218\n", + "3 1 41 9 2025-06-18 854 4 297.656781 241.134186\n", + "4 1 41 9 2025-06-18 854 5 288.153267 237.605872" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "id_curve = 854\n", + "single_welding = welding_dataset_smooth[welding_dataset_smooth.WELDING_ID == id_curve].sort('TIME_MS')\n", + "single_welding" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "44d1ffb7-1bf2-4770-8b0d-f21ed5a589e4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/jpeg": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "figure = Figure(width=1000, height=400, image_type=\"jpg\",\n", + " heading=\"RESISTANCE and RESISTANCE SMOOTHED\")\n", + "plot = single_welding.plot(x=single_welding.TIME_MS, y=[single_welding.RESISTANCE, single_welding.RESISTANCE_SMOOTHED],\n", + " style=['blue', 'red'],xlabel='time in ms', ylabel='resistance ',figure=figure)\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "299bf795-653e-45a4-8f39-5143d81173cf", + "metadata": {}, + "source": [ + "

    The above graph shows the variation of the resistance of the welding with respect to time and the smoothed resistance, as shown by the Red line, after applying the window function.

    \n", + "\n", + "

    The window function generates a Window object on a teradataml DataFrame Column to run window aggregate functions.\n", + "

    The function allows the user to specify a window for different types of computations:

  • Cumulative\n", + "
  • Group\n", + "
  • Moving\n", + "
  • Remaining\n", + "

    By default, a window with Unbounded Preceding and Unbounded Following is used for the calculation.

    \n", + "\n", + "

    Next, we calculate the derivative by using the lead function and taking the difference between the lead value and the current value of the smoothed resistance. Applying a window function to smooth the resistance curve helps to eliminate noise and makes it easier to see the overall trend. The derivative of the resistance indicates how quickly the resistance is changing, which can be a useful measure for detecting anomalies and predicting potential issues.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c3719c73-495e-4651-b764-c01678f07417", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(welding_dataset_smooth,table_name='welding_dataset_smooth', if_exists='replace')\n", + "welding_dataset_smooth = DataFrame('welding_dataset_smooth')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fccb6149-ce72-4601-983b-a87f2bc52417", + "metadata": {}, + "outputs": [], + "source": [ + "# let's compute the lead\n", + "window_for_lead = welding_dataset_smooth.RESISTANCE_SMOOTHED.window(\n", + " partition_columns = \"WELDING_ID\",\n", + " order_columns = 'TIME_MS')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e6a9bc90-f330-467f-8765-5a00578c6c6e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cce13b3b0c7b4f79878530fbd803efca", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PLANTROBOT_IDWELDING_TYPEWELDING_DAYWELDING_IDTIME_MSRESISTANCERESISTANCE_SMOOTHEDRESISTANCE_SMOOTHED_AFTERDERIVATIVE
    141125/08/0511474.381162422724316.7743268301616311.4723766637594-5.301950166402207
    141125/08/0512428.56930331230063311.4723766637594306.6978645868533-4.774512076906092
    141125/08/0513398.452732368239306.6978645868533302.25553180871196-4.442332778141349
    141125/08/0514372.90267653201755302.25553180871196298.4338994297387-3.821632378973277
    141125/08/0515350.6458154372735298.4338994297387294.38610093105206-4.047798498686632
    " + ], + "text/plain": [ + " PLANT ROBOT_ID WELDING_TYPE WELDING_DAY WELDING_ID TIME_MS RESISTANCE RESISTANCE_SMOOTHED RESISTANCE_SMOOTHED_AFTER DERIVATIVE\n", + "0 1 41 1 25/08/05 1 1 474.381162 316.774327 311.472377 -5.301950\n", + "1 1 41 1 25/08/05 1 2 428.569303 311.472377 306.697865 -4.774512\n", + "2 1 41 1 25/08/05 1 3 398.452732 306.697865 302.255532 -4.442333\n", + "3 1 41 1 25/08/05 1 4 372.902677 302.255532 298.433899 -3.821632\n", + "4 1 41 1 25/08/05 1 5 350.645815 298.433899 294.386101 -4.047798" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "welding_dataset_smooth = welding_dataset_smooth.assign(RESISTANCE_SMOOTHED_AFTER = window_for_lead.lead())\n", + "welding_dataset_smooth = welding_dataset_smooth.assign(DERIVATIVE = (welding_dataset_smooth.RESISTANCE_SMOOTHED_AFTER - welding_dataset_smooth.RESISTANCE_SMOOTHED).zeroifnull())\n", + "welding_dataset_smooth.sort(['WELDING_ID','TIME_MS'])" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "d019941f-4422-4012-8984-0dce20d10e48", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8ead2da622de485b9109f40a7498cb0d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + 
"\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PLANTROBOT_IDWELDING_TYPEWELDING_DAYWELDING_IDTIME_MSRESISTANCERESISTANCE_SMOOTHEDRESISTANCE_SMOOTHED_AFTERDERIVATIVE
    141925/06/188541353.0226900449998253.14252688841657248.69892486462675-4.443602023789822
    141925/06/188542335.5825378948886248.69892486462675244.90621844326972-3.7927064213570247
    141925/06/188543324.9522665624502244.90621844326972241.13418644931798-3.7720319939517424
    141925/06/188544297.6567809200622241.13418644931798237.60587172607387-3.528314723244108
    141925/06/188545288.1532672344735237.60587172607387234.07488665282776-3.5309850732461143
    " + ], + "text/plain": [ + " PLANT ROBOT_ID WELDING_TYPE WELDING_DAY WELDING_ID TIME_MS RESISTANCE RESISTANCE_SMOOTHED RESISTANCE_SMOOTHED_AFTER DERIVATIVE\n", + "0 1 41 9 25/06/18 854 1 353.022690 253.142527 248.698925 -4.443602\n", + "1 1 41 9 25/06/18 854 2 335.582538 248.698925 244.906218 -3.792706\n", + "2 1 41 9 25/06/18 854 3 324.952267 244.906218 241.134186 -3.772032\n", + "3 1 41 9 25/06/18 854 4 297.656781 241.134186 237.605872 -3.528315\n", + "4 1 41 9 25/06/18 854 5 288.153267 237.605872 234.074887 -3.530985" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "id_curve = 854\n", + "single_welding_subplot = welding_dataset_smooth[welding_dataset_smooth.WELDING_ID == id_curve].sort('TIME_MS')\n", + "single_welding_subplot" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "0bd9b71a-b668-44f9-a0bd-e74b2c82462e", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from teradataml import subplots\n", + "# fig, axes = subplots(grid = {(1, 1): (1, 1),(2, 1): (1, 2)})\n", + "# Plot 1980 data at first Axis.\n", + "fig, axes = subplots(nrows=2, ncols=1)\n", + "plot = single_welding_subplot.plot(x=single_welding_subplot.TIME_MS, \n", + " y=[single_welding_subplot.RESISTANCE, single_welding_subplot.RESISTANCE_SMOOTHED],\n", + " legend=[\"RESISTANCE\", \"RESISTANCE SMOOTHED\"],\n", + " figure=fig,\n", + " style=['blue', 'red'],xlabel='time in ms', ylabel='resistance ', \n", + " ax=axes[0])\n", + "\n", + "# Plot 1981 data at second Axis.\n", + "plot = single_welding_subplot.plot(x=single_welding_subplot.TIME_MS, \n", + " y=single_welding_subplot.DERIVATIVE,\n", + " legend=[\"DERIVATIVE\"],\n", + " figure=fig,\n", + " style=\"red\",xlabel='time in ms', ylabel='derivative ' , \n", + " ax=axes[1])\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": 
"92c9f6e0-7b26-4fed-9b43-1d35989affad", + "metadata": {}, + "source": [ + "

    We see that the most interesting part lies between 40 and 400ms from the start of the curve, so we plot only that subset.

    " + ] + }, + { + "cell_type": "markdown", + "id": "4615d965-6892-4729-81b0-9dd39f7d9411", + "metadata": { + "tags": [] + }, + "source": [ + "

    It is hard to assess the diversity of curve shapes in this plot since many of them are superimposed. However, we see in the middle of the picture a sharp drop that looks unusual. Moreover, we guess that there are shifts in time and height.

    \n", + "\n", + "
    \n", + "

    4. Feature Engineering

    " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "da82ee40-3e38-49af-a6ca-a678ba240ca2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['PLANT',\n", + " 'ROBOT_ID',\n", + " 'WELDING_TYPE',\n", + " 'WELDING_DAY',\n", + " 'WELDING_ID',\n", + " 'TIME_MS',\n", + " 'RESISTANCE']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "welding_dataset_new.columns" + ] + }, + { + "cell_type": "markdown", + "id": "539a4c25-f868-44af-bca3-13b4ca477445", + "metadata": {}, + "source": [ + "

    We will create a feature table by using different functions on the Resistance column. Valid values for functions are: 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique','median', 'var', 'skew', 'kurtosis'.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "aa37d2af-c185-4a84-9ca5-8628a216aa27", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3d71e7bf77b44a3994113f0fe2dc8068", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PLANTROBOT_IDWELDING_TYPEWELDING_DAYWELDING_IDmin_TIME_MSmax_TIME_MScount_RESISTANCEsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEpercentile_RESISTANCEunique_RESISTANCEmedian_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCE
    14152025-07-1247121646626108176.805487928988.4331503589787308.8898639352117172.806398543017467.43470974015446155.80095377664534626155.800953776645344547.4400777388830.49796745320216573-1.1467845713561156
    14182025-06-2470521633613117433.3847865466882.76152873653831316.66867750259905191.5715901901250878.08923446856586206.63668033820593613206.636680338205936097.9285398866550.04364970537902622-1.5898532421335452
    14152025-07-124462111061086166518.945355507370.0355276366476315.4566895327382153.3323622058078472.20479609391711108.804345050039221086108.804345050039225213.5325789641480.9529411796445812-0.6269191898462032
    14182025-06-2474821764744109921.2726066986484.02047298240994282.98968717382587147.7436459767454859.15325216552249111.91280735495172744111.912807354951723499.1072417578910.8606279445787202-0.7317898244111711
    14172025-06-3060521590570108569.6322568252281.47252264867211299.90656597268855190.473039047061867.5249319900682195.99852023934073570195.998520239340734559.616440263336-0.07402858894022789-1.4208348503506993
    " + ], + "text/plain": [ + " PLANT ROBOT_ID WELDING_TYPE WELDING_DAY WELDING_ID min_TIME_MS max_TIME_MS count_RESISTANCE sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE percentile_RESISTANCE unique_RESISTANCE median_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE\n", + "0 1 41 5 2025-07-12 471 21 646 626 108176.805488 88.433150 308.889864 172.806399 67.434710 155.800954 626 155.800954 4547.440078 0.497967 -1.146785\n", + "1 1 41 8 2025-06-24 705 21 633 613 117433.384787 82.761529 316.668678 191.571590 78.089234 206.636680 613 206.636680 6097.928540 0.043650 -1.589853\n", + "2 1 41 5 2025-07-12 446 21 1106 1086 166518.945356 70.035528 315.456690 153.332362 72.204796 108.804345 1086 108.804345 5213.532579 0.952941 -0.626919\n", + "3 1 41 8 2025-06-24 748 21 764 744 109921.272607 84.020473 282.989687 147.743646 59.153252 111.912807 744 111.912807 3499.107242 0.860628 -0.731790\n", + "4 1 41 7 2025-06-30 605 21 590 570 108569.632257 81.472523 299.906566 190.473039 67.524932 195.998520 570 195.998520 4559.616440 -0.074029 -1.420835" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features = welding_dataset_new.loc[welding_dataset_new.TIME_MS > 20,:]. \\\n", + " groupby(welding_dataset_new.columns[0:5]). \\\n", + " agg({\n", + " 'TIME_MS':['min','max'],\n", + " 'RESISTANCE':['count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique','median', 'var','skew','kurtosis']\n", + " })\n", + "features" + ] + }, + { + "cell_type": "markdown", + "id": "0196e16a-9d9d-4d44-a0ed-e5220c3314e2", + "metadata": {}, + "source": [ + "
    \n", + "

    5. Anomaly Detection on Sensor Data

    \n", + " \n", + "

    Let's start by getting the feature columns from the features table.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "27cdf0f8-e0b3-41b5-b18d-b77cdbc5652b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['count_RESISTANCE',\n", + " 'sum_RESISTANCE',\n", + " 'min_RESISTANCE',\n", + " 'max_RESISTANCE',\n", + " 'mean_RESISTANCE',\n", + " 'std_RESISTANCE',\n", + " 'percentile_RESISTANCE',\n", + " 'unique_RESISTANCE',\n", + " 'median_RESISTANCE',\n", + " 'var_RESISTANCE',\n", + " 'skew_RESISTANCE',\n", + " 'kurtosis_RESISTANCE']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_names = features.columns[7::]\n", + "feature_names" + ] + }, + { + "cell_type": "markdown", + "id": "9655f048-ffbd-4785-9e8b-39d192ff7808", + "metadata": {}, + "source": [ + "
    \n", + "

    5.1 Clustering by curve shape

    \n", + "

    To cluster time series by shape, we will use the Dynamic Time Warping (DTW) distance, which measures the similarity between two time series. This distance is well suited to this kind of problem since it is robust to shifts in time and height.

    \n", + "\n", + "

    Distance Matrix in-database Computations

    \n", + "\n", + "

    The ClearScape Analytics DTW function computes, at scale, the distances between one reference curve and a set of curves: a many-to-one approach. ClearScape Analytics offers an in-database dynamic time warping function, callable in SQL as TD_DTW. TD_DTW measures the similarity of two time series using the Dynamic Time Warping (DTW) algorithm; its implementation uses the FastDTW algorithm, which is efficient in both space and time. We want to compute the distance matrix of our subset, i.e. the DTW distance between each pair of curves. The distance matrix is symmetric, since the DTW distance is, hence we only need to compute the triangular matrix. We wrapped this computation in the tdsense package, which calls the TD_DTW function and iterates over the matrix rows to compute and store the whole triangular distance matrix in a table.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "207e72c8-41e3-481a-9727-a4c7510f4206", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['2025-07-12',\n", + " '2025-07-06',\n", + " '2025-06-12',\n", + " '2025-07-30',\n", + " '2025-06-24',\n", + " '2025-07-18',\n", + " '2025-06-30',\n", + " '2025-08-05',\n", + " '2025-07-24',\n", + " '2025-06-18']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "overview = welding_dataset_new.groupby('WELDING_DAY').count(distinct=True)\n", + "dates = list(overview.to_pandas().reset_index()['WELDING_DAY'].values.astype('str'))\n", + "dates" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "7c7180b4-a8b5-450a-96be-8aed93d1199a", + "metadata": {}, + "outputs": [], + "source": [ + "subset = welding_dataset_new[ \\\n", + " (welding_dataset_new['PLANT'] == 1) & \\\n", + " (welding_dataset_new['ROBOT_ID'] == 41) & \\\n", + " (welding_dataset_new['WELDING_TYPE'] in (8,9)) & \\\n", + " (welding_dataset_new['WELDING_DAY'].isin(dates)) \\\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0dda2eca-af26-4741-abeb-b63758f8c996", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(344622, 7)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subset_zoom = subset[(subset.TIME_MS < 400) & (subset.TIME_MS > 40)]\n", + "subset_zoom.shape" + ] + }, + { + "cell_type": "markdown", + "id": "9d40f422-886d-48e5-a4ce-03b259523917", + "metadata": {}, + "source": [ + "

    The subset of data we have taken contains 7 columns and 344,622 rows.

    \n", + "\n", + "

    Since this is a 2-CPU system, the computation below takes more than 2 hours for 350k rows, so we have precomputed it and stored the result in a table in the database.

    \n", + "\n", + "

    **If you still want to compute the matrix, change the condition of the if statement in the code below from False to True.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "276fd1b7-e057-4c0c-b8b0-4e063d70eb7a", + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " dtw_matrix = dtw_distance_matrix_computation2(subset_zoom,field='RESISTANCE',\n", + " table_name=dtw_result_table,\n", + " schema_name = Param['database'],\n", + " row_axis='TIME_MS',\n", + " series_id = 'WELDING_ID')\n", + "else:\n", + " dtw_matrix = DataFrame(in_schema('DEMO_AnomalyDetection','DTW_Matrix'))" + ] + }, + { + "cell_type": "markdown", + "id": "42f770a5-f3b2-4862-8256-b1cc1f969750", + "metadata": {}, + "source": [ + "
    \n", + "

    5.2 Hierarchical clustering with Scipy

    \n", + "\n", + "

    Now that the distance matrix is available, we can perform the clustering. Here, we will use the open-source package SciPy and its cluster.hierarchy module, which is used through the tdsense package for convenience.

    \n", + "\n", + "

    Hierarchical clustering is an alternative class of clustering algorithms that produce 1 to n clusters, where n is the number of observations in the data set. As you go down the hierarchy from 1 cluster (containing all the data) to n clusters (each observation is its own cluster), the observations within each cluster become more and more similar (almost always).

    " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "0b87b35b-c283-42d8-845b-5c9c7851c822", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    MATRIX_ROWWELDING_ID_1WELDING_ID_2ROW_IDDISTANCE
    012103404.181076
    123102879.840770
    234102035.367486
    345102641.395023
    4561010501.828495
    ..................
    49749899699999704333.889951
    497499997100099704851.558731
    49750099699999802143.966383
    497501997100099802132.062450
    497502997100099902119.041857
    \n", + "

    497503 rows × 5 columns

    \n", + "
    " + ], + "text/plain": [ + " MATRIX_ROW WELDING_ID_1 WELDING_ID_2 ROW_ID DISTANCE\n", + "0 1 2 1 0 3404.181076\n", + "1 2 3 1 0 2879.840770\n", + "2 3 4 1 0 2035.367486\n", + "3 4 5 1 0 2641.395023\n", + "4 5 6 1 0 10501.828495\n", + "... ... ... ... ... ...\n", + "497498 996 999 997 0 4333.889951\n", + "497499 997 1000 997 0 4851.558731\n", + "497500 996 999 998 0 2143.966383\n", + "497501 997 1000 998 0 2132.062450\n", + "497502 997 1000 999 0 2119.041857\n", + "\n", + "[497503 rows x 5 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dtw_matrix_loc = dtw_matrix.sort(columns=['WELDING_ID_2','WELDING_ID_1']).to_pandas(all_rows=True)\n", + "dtw_matrix_loc" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "49f64fd3-1f33-4b7c-9d8f-b0636bffc2f4", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "from tdsense.clustering import hierarchy_dendrogram, hierarchy_clustering\n", + "linked, labelList = hierarchy_dendrogram(dtw_matrix_loc, cluster_distance = 'ward')" + ] + }, + { + "cell_type": "markdown", + "id": "df0a3961-8cd1-43b8-9c11-9e229648d1eb", + "metadata": {}, + "source": [ + "

    The dendrogram is useful for visualizing the structure of the hierarchical clustering and identifying the optimal number of clusters to use for further analysis. The optimal number of clusters can be determined by examining the dendrogram to identify a level at which the clusters start to merge more slowly or by using a threshold for the maximum distance between clusters.

    \n", + "\n", + "

    The dendrogram above shows how the hierarchical clustering algorithm has merged the data points into clusters based on their pairwise distances using the Ward linkage criterion. The dendrogram is a summary of the distance matrix. The X axis carries the WELDING_ID labels, but they are not legible because the distance matrix has more than 450k rows. Looking at the dendrogram, we see that there are about 6 clusters. Selecting 6 clusters gives the following result.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "d2e168ff-626b-47b8-bc2b-ecfaac22a8f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    CURVE_IDclusterleaves_color_list
    67510#ff964f
    80520#ff964f
    47131#b2f396
    67040#ff964f
    83950#ff964f
    \n", + "
    " + ], + "text/plain": [ + " CURVE_ID cluster leaves_color_list\n", + "675 1 0 #ff964f\n", + "805 2 0 #ff964f\n", + "471 3 1 #b2f396\n", + "670 4 0 #ff964f\n", + "839 5 0 #ff964f" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA7EAAANOCAYAAADH54uVAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAAsTAAALEwEAmpwYAADIIUlEQVR4nOzde3RX1Z3//+dGLoIhdpAgqCByEYWCWsBLGRTwghaLogjWUtGZKRahWusPtOPXGeyIFdovWhUEa20q9VY7Fr8gTZUiKHIRECgN2AIqIkUUBQFFELJ/f3w+oUn4JCQh8OHY52OtrJNzzn7vs0+G1TUv99nnhBgjkiRJkiQlQa1sD0CSJEmSpMoyxEqSJEmSEsMQK0mSJElKDEOsJEmSJCkxDLGSJEmSpMQwxEqSJEmSEqN2tgegf2jcuHFs2bJltochSZIkSVmxePHiTTHGvIraGGIPIy1btmTRokXZHoYkSZIkZUUIYe3+2vg4sSRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMWpnewCSJEmSMisqKmLz5s1s376dzz//nKKiomwPSdqvWrVqceSRR5KTk8O//Mu/UKtWzc6dGmIlSZKkw9Du3btZt24dtWvXplGjRjRo0IBatWoRQsj20KRyxRgpKiris88+Y8uWLWzdupXmzZtTu3bNRU8fJ5YkSZIOQx9//DH16tXjhBNOoGHDhhxxxBEGWB32QggcccQRNGzYkBNOOIF69erx8ccf1+g1DLGSJEnSYeiTTz7hmGOOMbgqsUIIHHPMMXzyySc12q8hVpIkSToM7d69m7p162Z7GNIBqVu3Lrt3767RPg2xkiRJ0mHKWVgl3cH4N2yIlSRJkiQlhiFWkiRJkpQYhlhJkiRJUmIYYiVJkiQl3vTp0/nOd75DmzZtyMnJ2ft5oj59+jBx4kS2bdtWqv2oUaMIITBq1KjsDPgw9de//pX77ruPiy++mGbNmlGnTh2OPvpozjnnHO6//3527tyZ7SFSc1+clSRJkqRD7IMPPmDAgAHMnj0bgFNPPZWLLrqIunXr8t577zFjxgymT5/OnXfeyaJFizjxxBOzPOKUHj16MHv2bF5++WV69OiR7eHsdf7557N+/XqOPPJIunTpQo8ePdi4cSPz5s1j/vz5PP7448yYMYNGjRplbYyGWEmSJEmJtGXLFrp168bq1as555xzmDhxIp06dSrVZtu2bTz88MOMHj2azZs3HzYh9nDVrl07fvzjHzNgwABycnL2Hn/nnXe49NJLWbJkCbfccgu//vWvszZGHyeWJEmSlEjDhw9n9erVnHnmmcycOXOfAAvQsGFDRo4cyeLFizn22GOzMMpk+dOf/sS//du/lQqwAC1btmTixIkA/Pa3v2XXrl3ZGB5giJUkSZKUQGvWrOGpp54CYOLEiRx55JEVtm/Tpg3NmjXbb7/7Wyubn59PCIHrrrtun3NPP/00vXr1olGjRtSpU4fGjRvTsWNHhg0bxpo1awCYNWsWIYS9jz/37NmTEMLen1mzZpXqc926ddx88820a9eO+vXrk5ubS7du3cjPzyfGuM8YevTosbefV155hT59+tC4cWNq1arFlClT9nv/FTnjjDMA+Pzzz/noo48Oq
K8D4ePEkiRJkhJn2rRpFBUV0bFjx73hKptGjRrFXXfdRZ06dfj617/Occcdx5YtW3jnnXeYMGEC3bt3p3Xr1jRt2pTBgwdTUFDAxo0b6d27N02bNt3bT8nfX375Zfr168cnn3xCmzZtuPjii9m+fTvz58/n+uuvZ+bMmTz++OMZx/Pss88yceJE2rdvz4UXXsimTZuoU6fOAd3jqlWrAKhbt65rYiVJkiSpKhYvXgxA165dszwS2LlzJ2PHjiUnJ4fFixdz8sknlzq/atUqatdORa9TTjmF/Pz8vS9Muv322zO+2GnDhg1ceeWVbN++nfz8fK699lpCCEBqdrZv375MnjyZXr16ZZwVnjBhApMmTWLIkCE1dp/33nsvAJdeein16tWrsX6ryhArSZIkJVR+j2yPoGqum1VzfX344YcANGnSpOY6raatW7eyY8cOTjvttH0CLEDbtm2r3Of999/P5s2bGTlyJIMHDy51rnnz5vziF7+ga9euPPjggxlD7IUXXlijATY/P59nnnmGBg0acM8999RYv9XhmlhJkiRJOgB5eXm0bNmSZcuWceutt/Lmm28ecJ/Tp08H4Kqrrsp4vnPnzuTk5LB06VI+//zzfc5fccUVBzyGYn/605+44YYbCCEwadIk2rVrV2N9V4czsZIkSVJC1eTMZtLk5eUBqe/EHg4ef/xx+vfvz7hx4xg3bhx5eXmcffbZ9O7dm0GDBnH00UdXqb+33noLqNzj0h999BHHH398qWM19SmhOXPmcNlll7Fr1y4eeOABBg0aVCP9HghDrCRJkqTE6dy5M5MnT2bhwoWH9LpFRUUZj3fv3p23336badOmMWvWLObOncu0adOYOnUqo0aN4sUXX6zSC6j27NkDwMCBA/f75uVM61Pr169f6WuVZ+7cuXzjG9/g008/ZezYsXz/+98/4D5rgiFWkiRJUuL06dOHH/7whyxfvpwlS5bU2BuK69atC8D27dsznl+7dm25tQ0aNGDAgAEMGDAASL2c6ZZbbuGZZ55h2LBhzJ07t9LjaN68OatXr+bOO++kQ4cOVbiDmjF//nwuvvhitm3bxt13382IESMO+RjK45pYSZIkSYnTpk0bBg4cCMDQoUPZuXNnhe3XrFnDhg0b9ttv8WO5mda1xhgpKCio9BibNWvG6NGjAVi2bFmpc8Vheffu3RlrL7nkEiD1qZxD7fXXX6d3795s27aNUaNGcccddxzyMVTEECtJkiQpkR566CFatWrFggUL6NWrF8uXL9+nzaeffsq4cePo3LkzGzdu3G+fPXv2pFatWhQUFPDaa6/tPb5nzx7uuOMOXn/99X1q1q5dy6OPPsrWrVv3OTd16lRg3zWqxWF55cqVGccxYsQIcnNzueeeexg/fnzGsFtYWMhzzz2333uqikWLFnHRRRexdetW7rzzTv77v/+7RvuvCT5OLEmSJCmRGjVqxJw5cxgwYABz5syhU6dOtG/fnlNOOYW6deuyfv16Xn/9dXbu3Mmxxx5Lo0aN9ttnixYtGDp0KOPHj6dnz550796d3Nxc3njjDTZv3sxNN93EAw88UKpm8+bNfPe732XYsGGcfvrpnHTSSRQVFbFixQoKCwupU6cOY8eOLVXTr18/8vPzGTFiBC+99NLeTwWNGDGCdu3a0bx5c6ZMmUL//v0ZPnw4o0ePpkOHDjRp0oQtW7awfPly1q1bx8CBA2v0TcQXXXQRn3zyCV/5yld49913M36+B+BnP/sZjRs3rrHrVoUhVpIkSVJiNWvWjFdffZVp06bx1FNPMW/ePAoKCti9ezd5eXlccMEFXHbZZVxzzTUcddRRlerzgQceoEWLFjz22GPMmTOH3Nxcevbsyd13351xXWvr1q257777mDVrFoWFhRQWFlKrVi2OP/54hgwZws0330z79u1L1fTt25cJEyYwadIkZsyYwY4dOwAYNGjQ3k/Y9OzZk8LCQh588EFeeOEF5s+fzxdffEHTpk1p1
aoVN954Y7mf4KmuzZs3A7BlyxZ+/etfl9tu1KhRWQuxIcaYlQtrX126dImLFi3K9jAkSZJ0GFi5ciWnnnpqtochHbCq/FsOISyOMXapqI0zsToknlzwLs8vXZ/tYUiSDpHLTj+ea85qke1hSJK+hHyxkw6J55euZ8WGfRe6S5K+fFZs2Op/uJQkHTTOxOqQad8sl2duOCfbw5AkHWQDJ83L9hAkSV9izsRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkhJv+vTpfOc736FNmzbk5ORQr149TjjhBPr06cPEiRPZtm1bqfajRo0ihMCoUaOyM+AE+ctf/kK9evUIIfDVr34128MxxEqSJElKrg8++IAePXrQp08ffvOb31C3bl0uuugi+vXrR8uWLZkxYwZDhw6lVatWrF27NtvD3atHjx6EEJg1a1a2h1Kh3bt3M3jwYL744otsD2UvQ6wkSZKkRNqyZQvdunVj9uzZnHPOOSxbtowVK1bw3HPP8fTTTzNnzhw2bdrEmDFj2LVrF5s3b872kBPnnnvu4Y033uDGG2/M9lD2MsRKkiRJSqThw4ezevVqzjzzTGbOnEmnTp32adOwYUNGjhzJ4sWLOfbYY7MwyuRatmwZd999N1dccQX9+/fP9nD2MsRKkiRJSpw1a9bw1FNPATBx4kSOPPLICtu3adOGZs2a7bff/a2Vzc/PJ4TAddddt8+5p59+ml69etGoUSPq1KlD48aN6dixI8OGDWPNmjUAzJo1ixACs2fPBqBnz56EEPb+lH28eN26ddx88820a9eO+vXrk5ubS7du3cjPzyfGuM8YSj6m/Morr9CnTx8aN25MrVq1mDJlyn7vv9gXX3zBddddR8OGDZkwYUKl6w6F2tkegCRJkiRV1bRp0ygqKqJjx46cccYZ2R4Oo0aN4q677qJOnTp8/etf57jjjmPLli288847TJgwge7du9O6dWuaNm3K4MGDKSgoYOPGjfTu3ZumTZvu7afk7y+//DL9+vXjk08+oU2bNlx88cVs376d+fPnc/311zNz5kwef/zxjON59tlnmThxIu3bt+fCCy9k06ZN1KlTp9L3c/fdd7N06VJ+/etfc+yxx7Jy5crq/3FqmCFWkiRJUuIsXrwYgK5du2Z5JLBz507Gjh1LTk4Oixcv5uSTTy51ftWqVdSunYpep5xyCvn5+fTo0YONGzdy++2306NHj3363LBhA1deeSXbt28nPz+fa6+9lhACkJqd7du3L5MnT6ZXr14ZZ4UnTJjApEmTGDJkSJXvZ8mSJdxzzz1ccsklXHvttVWuP9gMsZIkSVJCXfn8Z9keQpX872UNaqyvDz/8EIAmTZrUWJ/VtXXrVnbs2MFpp522T4AFaNu2bZX7vP/++9m8eTMjR45k8ODBpc41b96cX/ziF3Tt2pUHH3wwY4i98MILqxVgd+3axeDBg6lfvz6TJk2qcv2h4JpYSZIkSToAeXl5tGzZkmXLlnHrrbfy5ptvHnCf06dPB+Cqq67KeL5z587k5OSwdOlSPv/8833OX3HFFdW67o9//GOWL1/O2LFjad68ebX6ONiciZUkSZISqiZnNpMmLy8PSH0n9nDw+OOP079/f8aNG8e4cePIy8vj7LPPpnfv3gwaNIijjz66Sv299dZbQOUel/7oo484/vjjSx078cQTq3Q9SD2iPWbMGHr06MENN9xQ5fpDxRArSZIkKXE6d+7M5MmTWbhw4SG9blFRUcbj3bt35+2332batGnMmjWLuXPnMm3aNKZOncqoUaN48cUXq/QCqj179gAwcODA/b55uV69evscq1+/fqWvVWzq1Kns3r2bjRs30rNnz1LntmzZAsDbb7+9dw3vo48+Sps2bap8nQNVqRAbQqgDnAt8AzgPOBk4E
vgQmAc8FGOcVUH9NcBQoBNwBPAm8Cvg4Rhj5n8FqbqLgR8CXdLXewt4CvhZjHFnBXVnAbcD3YBcYB3we2B0jPGTCuraAXcCvYBjgPeB6cCPY4wbKqg7Ll33DaAp8BHwJ+B/Yox/K69OkiRJUvX06dOHH/7whyxfvpwlS5bU2BuK69atC8D27dsznl+7dm25tQ0aNGDAgAEMGDAASL2c6ZZbbuGZZ55h2LBhzJ07t9LjaN68OatXr+bOO++kQ4cOVbiDA7dy5cpy30b82Wef7f08UHl/o4OtsmtizwNmkAqUxwOvkAqFHwNXAi+HEH6cqTCEMB54glQQfRV4iVQIfgj4XQgh4xhCCCOBP5AKlG8ALwBNgLuBWSGEjM9OhBC+BbwGXA78DXgeqAuMABaFEDKu/A4hnAcsAb4NbEjf32fA94BlIYR9V2in6k4F/pxu91m67n1gELAkhNAtU50kSZKk6mvTpg0DBw4EYOjQoezcWe4cF5D6ruyGDeXOS+1V/FhupnWtMUYKCgoqPcZmzZoxevRoAJYtW1bqXHFY3r17d8baSy65BEh9KudQGTVqFDHGjD8vv/wyAB06dNh77PTTTz9kYyupsiG2CPhf4NwYY7MY46UxxoExxo7A1cAe4M4QQqk55xDClcCNpEJdp3RdP6AtsBLoB3y/7MVCCF2Ae0mFwm4xxgtijFcBrUgF6LOB0RnqTgB+CQTg8hjjv8YYBwKtgWeANsA+r9gKIRwFPA3UB74fY+wcY7w6xngq8H+BPOCpUPxO63/U1UrXHUNqdvjUdN3XgJuABsBvywvckiRJkqrvoYceolWrVixYsIBevXqxfPnyfdp8+umnjBs3js6dO7Nx48b99tmzZ09q1apFQUEBr7322t7je/bs4Y477uD111/fp2bt2rU8+uijbN26dZ9zU6dOBfZdo1oclsub8RwxYgS5ubncc889jB8/PmPYLSws5LnnntvvPX3ZVCrExhhnxhj7xxhfzXDuGSA/vTuozOkfpbe3xRhXlajZSOrxYoDbM8zG3k4qiI6JMS4oUbcduJ5UqL4xhPCVMnU/IBVEfx1jfL5E3W5gCLAVuDyE0L5M3fWkHgN+Ocb4UJlztwFrgK8Bl5Q59w1Sj0ivTo95rxjjg8As4DjgOiRJkiTVqEaNGjFnzhz+9V//lblz59KpUyc6dOjAlVdeybe+9S3OPfdcjjnmGG699VaOPPJIGjVqtN8+W7RowdChQ9mzZw89e/bk/PPPp1+/frRq1YqHHnqIm266aZ+azZs3893vfpe8vDzOOussrr76agYMGMBXv/pVbrnlFurUqcPYsWNL1fTr1w9IhdW+ffvyH//xH/zHf/wHf/3rX4HU48RTpkyhYcOGDB8+nBYtWnDhhRfy7W9/mz59+tCiRQu++tWv8tvf/rYG/pLJUlOf2FmS3p5QfCA9K9oZ2AXsMwceY5wNrCcVHs8uUVeXf4TFJzLUvUVqHW5dUiGypMsrqNsKTC3TrjJ1e0jNtlZU93S6XVlPlGknSZIkqQY1a9aMV199lalTp3LNNdewY8cOCgoKeO6553jrrbe44IILeOSRR1izZg0tWrSoVJ8PPPAAY8aMoVWrVsyZM4c5c+Zw1llnsWjRooxrb1u3bs19993HJZdcwscff8y0adMoKChgz549DBkyhKVLl3LppZeWqunbty8TJkzglFNOYcaMGfzyl7/kl7/8ZalHnnv27ElhYSH/+Z//SZMmTZg/fz7/+7//S2FhIa1ateInP/nJ3seV/5mEGOOBdxLC/cDNpGZAr0sf+ybw/4Al6cdrM9X9nlTAGx5jHJ8+1pHUGtOPY4zHlFN3H6lZ15/FGEekj+UCxS9tOjodWsvW3QzcD/wu/Xhy8fGPgEbAaTHGP2eouwyYAiyKMXYtcXwxqRnay2KM/y9D3WnAUmBTjDEv072U1KVLl7ho0aL9NUukgZPmAfDMDedkeSSSpIPN/82XasbKlSs59dRTsz0M6
YBV5d9yCGFxjLFLRW0OeCY2hNCUfzwu+78lTp2U3pb/+i54t0zbkr+/S/ky1bVMb7dkCrDl1aXDb/FzBeWNNdP1Su7vr65xCCGnnDaSJEmSpEo6oBAbQqgN/AY4GvhTjHFqidPFoe3TCroofidzw8OgrqLaTHWVuWbJd06XrZUkSZIkVdGBzsROBM4n9R3Wsi91UiWEEIaEEBaFEBZ9+OGH2R6OJEmSJB3Wqh1iQwg/B/6d1Odzzo8xvl+mSfEs5FEVdFM8k7ntMKirqDZTXWWuWXKWt2wtADHGR2KMXWKMXfLy9rtsVpIkSZL+qVUrxIYQ/i+p76B+SCrArsrQ7J309sQM54o1L9O25O8VvTosU13xutSvpNe5VqouvX52c3q3vLFmul7J/f3VfZT+PJAkSZIk6QBUOcSGEMYCPwQ+Ai6IMa4op2nxZ3c6hBDql9Oma5m2AG8CO4BGIYTW5dSdWbYuxvgJqe+5lux3v3VpbxziOkmSJElSNVQpxIYQ7gVGkJq5vDDT52iKxRjXkQp5dYGryp4PIZxH6ruy75P67mtx3S7gD+ndb2eoawWcQ+r7sy+UOf18BXW5wDfTu7+vQt0RwNX7qbs63a6s4v7K1kmSJEmSqqHSITaEcDdwG7CFVICtzOziT9LbMSGENiX6agJMSO/eG2MsKlN3LxCB20IIZ5aoywEeS497QoxxS5m6+0nN4g4OIfQtUVcbmATkAlMyzB7/ilSY7hlCGJZhLK1Jzab+ocy5F0h907ZNiXstvuZwoAfwdyAfSZIkSdIBq12ZRulAeEd6dzXw/RBCpqZvxhjvLd6JMf4uhPAwMBRYHkKYAXxB6o3GucAU4KGyncQYF4YQbgfGAHNDCDNJhefzgCbAghLjKVm3LoTw78BkYEoIYQ6pEHk2qXWrq4EbMtRtDyFcTSqkPhRCuB5YBZwGnApsAr4VY4xl6opCCN8CXgFGhBAuBZYBbYHOpAL1wBjjZ5n+WJIkSZKkqqlUiAUalfi9S/onk9mkZi73ijHemA6Tw0iF0CNIrXt9DHg4wyxscd3YEMKfgVtJrTk9EngLeAD4WYxxZzl1T4UQ3gJ+BHQDziL1CaCfAqPTa2cz1c0OIZwB/BepkN0R2EhqBveuGOOGcupWhBA6peu+AVwBfAw8Afw4xvi3THWSJEmSpKqrVIiNMeZzAI/ExhifBJ6sRl0BUFCNugXA5dWo+ysZ1sVWou7vwPeqWidJkiRJqppqfydWkiRJkqRDzRArSZIkSUoMQ6wkSZIkKTEMsZIkSZISb/r06XznO9+hTZs25OTkUK9ePU444QT69OnDxIkT2bZtW6n2o0aNIoTAqFGjsjPgw9TcuXMZOnQoZ511Fscddxz16tUjJyeHTp06cfvtt/Phhx9me4iGWEmSJEnJ9cEHH9CjRw/69OnDb37zG+rWrctFF11Ev379aNmyJTNmzGDo0KG0atWKtWvXZnu4e/Xo0YMQArNmzcr2UEqZPn06EydO5IMPPqB9+/ZcccUVdOvWjffee48xY8bw1a9+lZUrV2Z1jJX9xI4kSZIkHVa2bNlCt27dWL16Neeccw4TJ06kU6dOpdps27aNhx9+mNGjR7N582ZOPPHELI02GQYNGsSQIUNo0aJFqeOffvop//7v/84zzzzD9773PWbPnp2lEToTK0mSJCmhhg8fzurVqznzzDOZOXPmPgEWoGHDhowcOZLFixdz7LHHZmGUyXLKKafsE2ABjjrqKH76058C8Oqrr7Jz585DPbS9DLGSJEmSEmfNmjU89dRTAEycOJEjjzyywvZt2rShWbNm++13f2tl8/PzCSFw3XXX7XPu6aefplevXjRq1Ig6derQuHFjOnbsyLBhw1izZg0As2bNIoSwdyazZ8+ehBD2/pR9vHjdunXcfPPNtGvXjvr165Obm0u3bt3Iz88nxrjPGEo+pvzKK6/Qp08fGjduTK1atZgyZcp+778itWvX3rs94ogjDqivAxpH1
q4sSZIkSdU0bdo0ioqK6NixI2eccUa2h8OoUaO46667qFOnDl//+tc57rjj2LJlC++88w4TJkyge/futG7dmqZNmzJ48GAKCgrYuHEjvXv3pmnTpnv7Kfn7yy+/TL9+/fjkk09o06YNF198Mdu3b2f+/Plcf/31zJw5k8cffzzjeJ599lkmTpxI+/btufDCC9m0aRN16tSp9v3t2rWLO++8E4BLLrlkb6DNBkOsJEmSpMRZvHgxAF27ds3ySGDnzp2MHTuWnJwcFi9ezMknn1zq/KpVq/aGvlNOOYX8/Hx69OjBxo0buf322+nRo8c+fW7YsIErr7yS7du3k5+fz7XXXksIAUjNzvbt25fJkyfTq1evjLPCEyZMYNKkSQwZMqRa97Rq1SpGjx4NwKZNm1i4cCEffPABXbt25eGHH65WnzXFECtJkiQl1H9+tirbQ6iSexq0rbG+ij/10qRJkxrrs7q2bt3Kjh07OO200/YJsABt21b9vu+//342b97MyJEjGTx4cKlzzZs35xe/+AVdu3blwQcfzBhiL7zwwmoHWICNGzfy61//utSx888/n0ceeYTjjjuu2v3WBNfESpIkSdIByMvLo2XLlixbtoxbb72VN99884D7nD59OgBXXXVVxvOdO3cmJyeHpUuX8vnnn+9z/oorrjig6//rv/4rMUb27NnDu+++y+TJk3nrrbf46le/yu9+97sD6vtAORMrSZIkJVRNzmwmTV5eHpD6Tuzh4PHHH6d///6MGzeOcePGkZeXx9lnn03v3r0ZNGgQRx99dJX6e+utt4DKPS790Ucfcfzxx5c6VlOfEqpVqxbNmzdn0KBBdOvWjdNPP53rr79+77rfbDDESpIkSUqczp07M3nyZBYuXHhIr1tUVJTxePfu3Xn77beZNm0as2bNYu7cuUybNo2pU6cyatQoXnzxxSq9gGrPnj0ADBw4cL9vXq5Xr94+x+rXr1/pa1XWSSedxLnnnsu0adP44x//yPXXX1/j16gMQ6wkSZKkxOnTpw8//OEPWb58OUuWLKmxNxTXrVsXgO3bt2c8v3bt2nJrGzRowIABAxgwYACQejnTLbfcwjPPPMOwYcOYO3dupcfRvHlzVq9ezZ133kmHDh2qcAcH1+EwA+6aWEmSJEmJ06ZNGwYOHAjA0KFD2blzZ4Xt16xZw4YNG/bbb/FjuZnWtcYYKSgoqPQYmzVrtvcNv8uWLSt1rjgs7969O2PtJZdcAqQ+lXO42L17N6+88gpQvZdV1RRDrCRJkqREeuihh2jVqhULFiygV69eLF++fJ82n376KePGjaNz585s3Lhxv3327NmTWrVqUVBQwGuvvbb3+J49e7jjjjt4/fXX96lZu3Ytjz76KFu3bt3n3NSpU4F916gWh+WVK1dmHMeIESPIzc3lnnvuYfz48RnDbmFhIc8999x+76kq7r33XjZt2rTP8Q8++IB/+7d/Y82aNTRv3pyLL764Rq9bFT5OLEmSJCmRGjVqxJw5cxgwYABz5syhU6dOtG/fnlNOOYW6deuyfv16Xn/9dXbu3Mmxxx5Lo0aN9ttnixYtGDp0KOPHj6dnz550796d3Nxc3njjDTZv3sxNN93EAw88UKpm8+bNfPe732XYsGGcfvrpnHTSSRQVFbFixQoKCwupU6cOY8eOLVXTr18/8vPzGTFiBC+99NLeTwWNGDGCdu3a0bx5c6ZMmUL//v0ZPnw4o0ePpkOHDjRp0oQtW7awfPly1q1bx8CBAw/4TcQl/ehHP+L//J//Q6dOnWjdujVHHHEE7733Hm+88QY7duzg2GOP5bnnnqNBgwY1ds2qMsRKkiRJSqxmzZrx6quvMm3aNJ566inmzZtHQUEBu3fvJi8vjwsuuIDLLruMa665hqOOOqpSfT7wwAO0aNGCxx57jDlz5pCbm0vPnj25++67M65rbd26Nffddx+zZs2isLCQwsJCatWqxfHHH8+QIUO4+eabad++famavn37M
mHCBCZNmsSMGTPYsWMHAIMGDaJdu3ZAala4sLCQBx98kBdeeIH58+fzxRdf0LRpU1q1asWNN95Y7id4quuhhx7ilVdeYenSpbz00kt8+umnHH300Xzta1/j0ksv5Xvf+x5f+cpXavSaVRVijFkdgP6hS5cucdGiRdkexkExcNI8AJ654Zwsj0SSdLD5v/lSzVi5ciWnnnpqtochHbCq/FsOISyOMXapqI1rYiVJkiRJiWGIlSRJkiQlhiFWkiRJkpQYhlhJkiRJUmIYYiVJkiRJiWGIlSRJkiQlhiFWkiRJkpQYhlhJkiRJUmIYYiVJkiRJiWGIlSRJkiQlhiFWkiRJkpQYhlhJkiRJUmIYYiVJkiRJiWGIlSRJkiQlhiFWkiRJUuJNnz6d73znO7Rp04acnBzq1avHCSecQJ8+fZg4cSLbtm0r1X7UqFGEEBg1alR2BnyY+vTTT3niiSf4wQ9+QLdu3TjqqKMIIXDppZdme2h71c72ACRJkiSpuj744AMGDBjA7NmzATj11FO56KKLqFu3Lu+99x4zZsxg+vTp3HnnnSxatIgTTzwxyyNO6dGjB7Nnz+bll1+mR48e2R7OXqtWrWLQoEHZHkaFDLGSJEmSEmnLli1069aN1atXc8455zBx4kQ6depUqs22bdt4+OGHGT16NJs3bz5sQuzhqmHDhvzbv/0bXbp0oXPnzixZsoTvfe972R5WKYZYSZIkSYk0fPhwVq9ezZlnnsnMmTM58sgj92nTsGFDRo4cyRVXXMFRRx2VhVEmS+vWrfnlL3+5d3/FihVZHE1mromVJEmSlDhr1qzhqaeeAmDixIkZA2xJbdq0oVmzZvvtd39rZfPz8wkhcN111+1z7umnn6ZXr140atSIOnXq0LhxYzp27MiwYcNYs2YNALNmzSKEsPfx5549exJC2Psza9asUn2uW7eOm2++mXbt2lG/fn1yc3Pp1q0b+fn5xBj3GUOPHj329vPKK6/Qp08fGjduTK1atZgyZcp+7z8JnImVJEmSlDjTpk2jqKiIjh07csYZZ2R7OIwaNYq77rqLOnXq8PWvf53jjjuOLVu28M477zBhwgS6d+9O69atadq0KYMHD6agoICNGzfSu3dvmjZturefkr+//PLL9OvXj08++YQ2bdpw8cUXs337dubPn8/111/PzJkzefzxxzOO59lnn2XixIm0b9+eCy+8kE2bNlGnTp2D/nc4FAyxkiRJkhJn8eLFAHTt2jXLI4GdO3cyduxYcnJyWLx4MSeffHKp86tWraJ27VT0OuWUU8jPz6dHjx5s3LiR22+/PeOLnTZs2MCVV17J9u3byc/P59prryWEAKRmZ/v27cvkyZPp1atXxlnhCRMmMGnSJIYMGVLj95tthlhJkiQpoeZ+9ny2h1AlX29wWY319eGHHwLQpEmTGuuzurZu3cqOHTs47bTT9gmwAG3btq1yn/fffz+bN29m5MiRDB48uNS55s2b84tf/IKuXbvy4IMPZgyxF1544ZcywIJrYiVJkiTpgOTl5dGyZUuWLVvGrbfeyptvvnnAfU6fPh2Aq666KuP5zp07k5OTw9KlS/n888/3OX/FFVcc8BgOV87ESpIkSQlVkzObSZOXlwekvhN7OHj88cfp378/48aNY9y4ceTl5XH22WfTu3dvBg0axNFHH12l/t566y2gco9Lf/TRRxx//PGljn2ZPyVkiJUkSZKUOJ07d2by5MksXLjwkF63qKgo4/Hu3bvz9ttvM23aNGbNmsXcuXOZNm0aU6dOZdSoUbz44otVegHVnj17ABg4cOB+37xcr169fY7Vr1+/0tdKGkOsJEmSpMTp06cPP/zhD1m+fDlLliypsTcU161bF4Dt27dnPL927dpyaxs0aMCAAQMYMGAAkHo50y233MIzzzzDsGHDmDt3bqXH0bx5c1avXs2dd95Jhw4dqnAHX36uiZUkSZKUOG3atGHgwIEADB06lJ07d1bYfs2aNWzYs
GG//RY/lptpXWuMkYKCgkqPsVmzZowePRqAZcuWlTpXHJZ3796dsfaSSy4BUp/KUWmGWEmSJEmJ9NBDD9GqVSsWLFhAr169WL58+T5tPv30U8aNG0fnzp3ZuHHjfvvs2bMntWrVoqCggNdee23v8T179nDHHXfw+uuv71Ozdu1aHn30UbZu3brPualTpwL7rlEtDssrV67MOI4RI0aQm5vLPffcw/jx4zOG3cLCQp577rn93tOXjY8TS5IkSUqkRo0aMWfOHAYMGMCcOXPo1KkT7du355RTTqFu3bqsX7+e119/nZ07d3LsscfSqFGj/fbZokULhg4dyvjx4+nZsyfdu3cnNzeXN954g82bN3PTTTfxwAMPlKrZvHkz3/3udxk2bBinn346J510EkVFRaxYsYLCwkLq1KnD2LFjS9X069eP/Px8RowYwUsvvbT3U0EjRoygXbt2NG/enClTptC/f3+GDx/O6NGj6dChA02aNGHLli0sX76cdevWMXDgwBp/E3G/fv32zloXf8rotdde4+yzz97b5s4776RPnz41et3KMsRKkiRJSqxmzZrx6quvMm3aNJ566inmzZtHQUEBu3fvJi8vjwsuuIDLLruMa665hqOOOqpSfT7wwAO0aNGCxx57jDlz5pCbm0vPnj25++67M65rbd26Nffddx+zZs2isLCQwsJCatWqxfHHH8+QIUO4+eabad++famavn37MmHCBCZNmsSMGTPYsWMHAIMGDaJdu3ZAala4sLCQBx98kBdeeIH58+fzxRdf0LRpU1q1asWNN95Y7id4DsSSJUv2Wfu7ZcsWFixYsHe/ONxmQ4gxZu3iKq1Lly5x0aJF2R7GQTFw0jwAnrnhnCyPRJJ0sPm/+VLNWLlyJaeeemq2hyEdsKr8Ww4hLI4xdqmojWtiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYtTO9gAkfbk8ueBdnl+6PtvDkJRFKzZsBWDgpHlZHomUbMPOqE/dD7dnexjV9pX6dTgmp162h6EvIWdiJdWo55eu3/v/wEr659S+WS7tm+VmexiSsujzL/awZccX2R6GvqSciZVU49o3y+WZG87J9jAkSUq0lStX0jovJ9vDqJY1WZhBnj59Ok899RTz5s3j/fff54svviAvL4/TTjuNb37zm3z729+mYcOGe9uPGjWKu+66i//+7/9m1KhRh3y8h7u//vWv/M///A8zZ87ko48+omnTpnzjG9/gv/7rv2jWrFlWx+ZMrCRJkqTE+uCDD+jRowd9+vThN7/5DXXr1uWiiy6iX79+tGzZkhkzZjB06FBatWrF2rVrsz3cvXr06EEIgVmzZmV7KPuYPXs2Z5xxBk888QTNmjWjX79+NGjQgIkTJ3Laaafxt7/9LavjM8RKkiRJSqQtW7bQrVs3Zs+ezTnnnMOyZctYsWIFzz33HE8//TRz5sxh06ZNjBkzhl27drF58+ZsD/mw9+mnn3L11VezY8cOHnzwQRYvXszTTz/NypUrufXWW/nwww/51re+RYwxa2M0xEqSJElKpOHDh7N69WrOPPNMZs6cSadOnfZp07BhQ0aOHMnixYs59thjszDKZPnVr37F+++/T8+ePRk+fHipc2PGjKF169a88cYb/OEPf8jSCA2xkiRJkhJozZo1PPXUUwBMnDiRI488ssL2bdq0qdRazlGjRhFCKHedbH5+PiEErrvuun3OPf300/Tq1YtGjRpRp04dGjduTMeOHRk2bBhr1qwBYNasWYQQmD17NgA9e/YkhLD3p+zjxevWrePmm2+mXbt21K9fn9zcXLp160Z+fn7G2dCSjym/8sor9OnTh8aNG1OrVi2mTJmy3/svbvPtb397n3NHHHEEV199dal22eCLnSRJkiQlzrRp0
ygqKqJjx46cccYZ2R7O3hdF1alTh69//escd9xxbNmyhXfeeYcJEybQvXt3WrduTdOmTRk8eDAFBQVs3LiR3r1707Rp0739lPz95Zdfpl+/fnzyySe0adOGiy++mO3btzN//nyuv/56Zs6cyeOPP55xPM8++ywTJ06kffv2XHjhhWzatIk6ders9z6WLFkCQNeuXTOeLz5e3C4bDLGSJEmSEmfx4sVA+WHrUNq5cydjx44lJyeHxYsXc/LJJ5c6v2rVKmrXTkWvU045hfz8fHr06MHGjRu5/fbb6dGjxz59btiwgSuvvJLt27eTn5/PtddeSwgBSM3O9u3bl8mTJ9OrV6+Ms8ITJkxg0qRJDBkypNL3sXXrVj7++GMATjzxxIxtWrRoAcDbb79d6X5rmiFWkiRJSqrn78j2CDI6btee1C91jyh94rLRNXaNDz/8EIAmTZrUWJ/VtXXrVnbs2MFpp522T4AFaNu2bZX7vP/++9m8eTMjR45k8ODBpc41b96cX/ziF3Tt2pUHH3wwY4i98MILqxRgAbZv/8enkY466qiMbXJyUp992rZtW5X6rkmuiZUkSZKkA5CXl0fLli1ZtmwZt956K2+++eYB9zl9+nQArrrqqoznO3fuTE5ODkuXLuXzzz/f5/wVV1xxwGM4XDkTK0mSJCVVDc5s1qS/f5ia0Wudl3PQrpGXlwekvhN7OHj88cfp378/48aNY9y4ceTl5XH22WfTu3dvBg0axNFHH12l/t566y2gco9Lf/TRRxx//PGljpX3OHBFimdZIfWpnUxjLp6tbdiwYZX7rymGWEmSJEmJ07lzZyZPnszChQsP6XWLiooyHu/evTtvv/0206ZNY9asWcydO5dp06YxdepURo0axYsvvlilF1Dt2ZN6JHvgwIH7ffNyvXr19jlWv379Sl+rWG5uLv/yL//C5s2bWbt2bcZPFq1btw6Ali1bVrn/mlLpEBtCaAdcDHQFugAnAwG4Ksb4uwztewAvV7L7E2OM75aozQcGl9+cv8YYTylnnLWAocD1wCnAHuDPwIQY41MVDSKEcE26thNwBPAm8Cvg4Rhj5n+tqbqLgR+S+rscCbwFPAX8LMa4s6JrSpIkSaq6Pn368MMf/pDly5ezZMmSGntDcd26dYHS60NLWrt2bbm1DRo0YMCAAQwYMABIvZzplltu4ZlnnmHYsGHMnTu30uNo3rw5q1ev5s4776RDhw5VuIMD87WvfY0//elPLFy4MGOIff311wGy+kboqqyJHQrcD3wbaEcqwFbkfeDXFfysTLdbA6wrp4/Xyqn9fabGIYQj0uceAtoCLwJzSAXvJ0MIPy9vsCGE8cATpILoq8BLpIL6Q8Dv0uE4U91I4A9AL+AN4AWgCXA3MCuE0KC8a0qSJEmqnjZt2jBw4EAAhg4dys6dFc8drVmzhg0bNuy33+LHcjOta40xUlBQUOkxNmvWjNGjU498L1u2rNS54rC8e/fujLWXXHIJkPpUzqF02WWXAfDEE0/sc27Pnj08/fTTAPTr1++QjqukqoTYvwA/BQYCbYDZFTWOMb4ZY7yuvJ8STR+Lmb7Sm/JoOfU/Kqf9D4C+wArg5BjjFTHGPkBHYCNwUwjhsrJFIYQrgRtJBe9OMcZLY4z9SAXhlUA/4PsZ6roA9wKfAd1ijBfEGK8CWgGvAGcDh+dCBUmSJCnhHnroIVq1asWCBQvo1asXy5cv36fNp59+yrhx4+jcuTMbN27cb589e/akVq1aFBQU8Nprr+09vmfPHu644469M5ElrV27lkcffZStW7fuc27q1KnAvmtUi8PyypUr96kBGDFiBLm5udxzzz2MHz8+Y9gtLCzkueee2+89VcX1119P06ZNefnllxk/fnypc7fffjtr1qzhjDPO2Buys6HSjxPHGB8tuV/8jaLqCCGcA5xK6lHf/Gp3VLrPI4CR6d2hMca9/0JjjKtCCLelr3UH8HyZ8uJQfFuMcVWJuo0hhKHAL
OD2EMKDZR4rvp3UjPSYGOOCEnXbQwjXA6uAG0MId8UYt9TAbUqSJElKa9SoEXPmzGHAgAHMmTOHTp060b59e0455RTq1q3L+vXref3119m5cyfHHnssjRo12m+fLVq0YOjQoYwfP56ePXvSvXt3cnNzeeONN9i8eTM33XQTDzzwQKmazZs3893vfpdhw4Zx+umnc9JJJ1FUVMSKFSsoLCykTp06jB07tlRNv379yM/PZ8SIEbz00kt7PxU0YsQI2rVrR/PmzZkyZQr9+/dn+PDhjB49mg4dOtCkSRO2bNnC8uXLWbduHQMHDqzRNxHn5OTw9NNPc8kllzB8+HB+9atf0bZtW5YtW8bKlStp3LgxTz311AHlwQOVrU/s/Ft6WxBj/HsN9XkOqcd434sxvpLh/LPAF0DXEMLeV3eFEE4AOgO70m1KiTHOBtYDTUnNrBbX1QWK//PDPnPtMca3gHlAXeAb1bslSZIkSRVp1qwZr776KlOnTuWaa65hx44dFBQU8Nxzz/HWW29xwQUX8Mgjj7BmzRpatGhRqT4feOABxowZQ6tWrZgzZw5z5szhrLPOYtGiRRnXgrZu3Zr77ruPSy65hI8//php06ZRUFDAnj17GDJkCEuXLuXSSy8tVdO3b18mTJjAKaecwowZM/jlL3/JL3/5y1KPPPfs2ZPCwkL+8z//kyZNmjB//nz+93//l8LCQlq1asVPfvKTvY8r16TzzjuPJUuWcM011/Dee+/x3HPPsX37dm644Qb+/Oc/065duxq/ZlWE8p/k3U9hCLOA8yjnxU4V1DUg9dhuQ+CKGOM+61tLvNjpceAjIIfU48BzgJcyvWQphPB94AHg9zHGjP8pIoSwBDgduDTG+EL62DeB/wcsiTF+rZy63wOXA8NjjOPTxzqSemHUxzHGY8qpu4/UI84/izGOyNSmpC5dusRFixbtr1kiDZw0D4BnbjgnyyPRweb/rSVJqhkrV67k1FNPzfYwqmXNIfjEjpKjKv+WQwiLY4xdKmqTjU/sXEUqwH4ATNtP22szHFsRQrg6xlj2gfeT0tvyXxcG75IKsSeVOFbZupJtS/7+LuXLVCdJkiRJqqZsPE5c/Cjx4zHGL8ppsxS4CWhPahb2OOBSYFn62IySjwSnFf9nnk8ruHbxe7JLfpn3UNdJkiRJkqrpkM7EhhDaAOemdx8rr12M8f4yhz4FXgghvETqrchnk3oZ0/CDMMxDKoQwBBgCVPoZfUmSJEn6Z3WoZ2KLZ2HnxRgzv0u6AjHGXcBP0rtlX5ZUPOt5VAVdFM+ebstiXSkxxkdijF1ijF3y8vIq6EqSJEmSdMhCbPoTOMVrXH95AF0Vf3W47OPE76S3J1K+5mXa1kRdRdOnmeokSZIkSdV0KGdie5MKntuBZw6gn+I3AW8vc/yN9LZrpqL0W5G/mt5dUuJU8e8dQgj1y7lm1zJtIRWmdwCNQgity6k7M0OdJEmSJKmaDmWI/ff09rcxxrIBtCoGpLcLyxyfB3wInBBCOJd9XQXUARbGGNcXH4wxriMVgOum25QSQjgPOIHUZ4HmlajbBfwhvfvtDHWtSH27dhfwQmVuTJIkSZJUsUMSYkMIjYFvpncrfJQ4hHB6COHS9OPHJY/XDiHcSuqtxQD3lTwfY9wDjE3vPhxCaFKiti1wb3o309eAi9fZjkm/fKq4rgkwIb17b4bv094LROC2EMKZJepySL24qhYwIca4pYJbliRJkiRVUqXfThxC+Br/CHSQ+tQNwD0hhP+v+GCM8ewM5d8hNQv6Zoxx7n4u1RL4PfBxCOENUt+TPQboSOpTO0XAyBjjHzPU3kfq7cffBFaFEP6Uvu4FwJHAgzHG58sWxRh/F0J4GBgKLA8hzAC+AM4HcoEpwEMZ6haGEG4HxgBzQwgzgS3AeUATYAFwx37uV5IkSZJUSVX5xE4ucFaG420rUXt9elvuZ3VKWAb8nNR60vZAd1Kzne8BvwLGxxgXZyqMMe4JI
VwO3Ji+Zm9gD7CY1Izok+VdNMZ4YwhhDjCMVAg9gtS618eAhzPMwhbXjQ0h/Bm4ldTa2SOBt4AHgJ/FGHdW4p4lSZIkSZVQ6RAbY5wFhOpcJMbYqQpt3wZ+UJ3rpOuLSM2a7jNzWonaJ4Fyg24FdQVAQVXrJEmSJElVc6i/EytJkiRJUrUZYiVJkiRJiWGIlSRJkiQlhiFWkiRJUuJNnz6d73znO7Rp04acnBzq1avHCSecQJ8+fZg4cSLbtm0r1X7UqFGEEBg1alR2BnyYmjVrFiGECn/mz5+f1TFW5e3EkiRJknRY+eCDDxgwYACzZ88G4NRTT+Wiiy6ibt26vPfee8yYMYPp06dz5513smjRIk488cQsjzilR48ezJ49m5dffpkePXpkezj7OPbYY7n44osznsvLyzvEoynNECtJkiQpkbZs2UK3bt1YvXo155xzDhMnTqRTp9IfRtm2bRsPP/wwo0ePZvPmzYdNiD3cnXLKKeTn52d7GBkZYiVJkiQl0vDhw1m9ejVnnnkmM2fO5Mgjj9ynTcOGDRk5ciRXXHEFRx11VBZGqZrmmlhJkiRJibNmzRqeeuopACZOnJgxwJbUpk0bmjVrtt9+97dWNj8/nxAC11133T7nnn76aXr16kWjRo2oU6cOjRs3pmPHjgwbNow1a9YA/1hzWvz4c8+ePUutN501a1apPtetW8fNN99Mu3btqF+/Prm5uXTr1o38/HxijPuMoUePHnv7eeWVV+jTpw+NGzemVq1aTJkyZb/3nwTOxEqSJElKnGnTplFUVETHjh0544wzsj0cRo0axV133UWdOnX4+te/znHHHceWLVt45513mDBhAt27d6d169Y0bdqUwYMHU1BQwMaNG+nduzdNmzbd20/J319++WX69evHJ598Qps2bbj44ovZvn078+fP5/rrr2fmzJk8/vjjGcfz7LPPMnHiRNq3b8+FF17Ipk2bqFOnTqXvZ+PGjdx1112sX7+eo446io4dO3LZZZdxzDHHVP+PVEMMsZIkSZISZ/HixQB07do1yyOBnTt3MnbsWHJycli8eDEnn3xyqfOrVq2idu1U9Cpea9qjRw82btzI7bffnvHFThs2bODKK69k+/bt5Ofnc+211xJCAFKzs3379mXy5Mn06tUr46zwhAkTmDRpEkOGDKnWPb355pv7zEZ///vf59577+X73/9+tfqsKYZYSZIkKakOw7faAhz3xZ7UL3WOKH2izKOyB+LDDz8EoEmTJjXWZ3Vt3bqVHTt2cNppp+0TYAHatm1b5T7vv/9+Nm/ezMiRIxk8eHCpc82bN+cXv/gFXbt25cEHH8wYYi+88MJqBdijjz6aW265hSuuuIK2bdty1FFHsWrVKiZMmMBjjz3GTTfdRP369fmP//iPKvddU1wTK0mSJEkHIC8vj5YtW7Js2TJuvfVW3nzzzQPuc/r06QBcddVVGc937tyZnJwcli5dyueff77P+SuuuKJa1z3jjDMYN24c//qv/8qxxx5LTk4OZ5xxBr/4xS+4//77AbjtttvYuXNntfqvCc7ESpIkSUlVgzObNenvH24HoHVezkG7RvG3Sj/44IODdo2qePzxx+nfvz/jxo1j3Lhx5OXlcfbZZ9O7d28GDRrE0UcfXaX+3nrrLaByj0t/9NFHHH/88aWOHYxPCQ0bNowf//jHbNq0iQULFnDuuefW+DUqwxArSZIkKXE6d+7M5MmTWbhw4SG9blFRUcbj3bt35+2332batGnMmjWLuXPnMm3aNKZOncqoUaN48cUXq/QCqj17Uo9kDxw4cL9vXq5Xr94+x+rXr1/pa1VWrVq1aNu2LZs2bWL9+vU13n9lGWIlSZIkJU6fPn344Q9/yPLly1myZEmNvaG4bt26AGzfvj3j+bVr15Zb26BBAwYMGMCAAQOA1MuZbrnlFp555hmGDRvG3LlzKz2O5s2bs3r1au688046dOhQhTs4uD766CMAcnIO3iz7/rgmV
pIkSVLitGnThoEDBwIwdOjQ/a7RXLNmDRs2bNhvv8WP5WZa1xpjpKCgoNJjbNasGaNHjwZg2bJlpc4Vh+Xdu3dnrL3kkkuA1KdyDhfLli3jb3/7GyEEunTpkrVxGGIlSZIkJdJDDz1Eq1atWLBgAb169WL58uX7tPn0008ZN24cnTt3ZuPGjfvts2fPntSqVYuCggJee+21vcf37NnDHXfcweuvv75Pzdq1a3n00UfZunXrPuemTp0K7LtGtTgsr1y5MuM4RowYQW5uLvfccw/jx4/PGHYLCwt57rnn9ntPVfHAAw/snW0tad68efTv3x9IPeLcrFmzGr1uVfg4sSRJkqREatSoEXPmzGHAgAHMmTOHTp060b59e0455RTq1q3L+vXref3119m5cyfHHnssjRo12m+fLVq0YOjQoYwfP56ePXvSvXt3cnNzeeONN9i8eTM33XQTDzzwQKmazZs3893vfpdhw4Zx+umnc9JJJ1FUVMSKFSsoLCykTp06jB07tlRNv379yM/PZ8SIEbz00kt7PxU0YsQI2rVrR/PmzZkyZQr9+/dn+PDhjB49mg4dOtCkSRO2bNnC8uXLWbduHQMHDqz2m4gz+a//+i9uvfXWvfcRY2TVqlX8+c9/JsZIt27dmDRpUo1drzoMsZIkSZISq1mzZrz66qtMmzaNp556innz5lFQUMDu3bvJy8vjggsu4LLLLuOaa67hqKOOqlSfDzzwAC1atOCxxx5jzpw55Obm0rNnT+6+++6M61pbt27Nfffdx6xZsygsLKSwsJBatWpx/PHHM2TIEG6++Wbat29fqqZv375MmDCBSZMmMWPGDHbs2AHAoEGDaNeuHZCaFS4sLOTBBx/khRdeYP78+XzxxRc0bdqUVq1aceONN5b7CZ7quuOOO3j11VcpLCzkb3/7G5999hmNGjXiwgsv5Fvf+hbf+c53OOKII/bf0UEUYoxZHYD+oUuXLnHRokXZHsZBMXDSPACeueGcLI9EB5v/t5YkqWasXLmSU089NdvDqJY1h+ATO0qOqvxbDiEsjjFWuODWNbGSJEmSpMQwxEqSJEmSEsMQK0mSJElKDEOsJEmSJCkxDLGSJEmSpMQwxEqSJEmSEsMQK0mSJElKDEOsJEmSdJiKMWZ7CNIBORj/hg2xkiRJ0mGodu3a7Nq1K9vDkA7Irl27qF27do32aYiVJEmSDkNHH300H330kbOxSqwYIx999BFHH310jfZriJUkSZIOQ40aNWLnzp289957bNu2jT179hhoddiLMbJnzx62bdvGe++9x86dO2nUqFGNXqNm53UlSZIk1YjatWtz4oknsnnzZjZv3szf//53ioqKsj2sSvlw204Adm2ql+WRKBtq1apF/fr1Oeqoo/iXf/kXatWq2blTQ6wkSZJ0mKpVqxbHHHMMxxxzTLaHUiWjJs0D4JkbTs/uQPSl5OPEkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMSodYkMI7UIIN4cQfhNCeDOEUBRCiCGE/hXU5KfblPfzZgW1tUIIw0IIi0II2
0MIn4QQXg0hfKsSY70m3faTdO2idF8V3m8I4eIQwoshhI9DCJ+FEP4SQrgjhFBvP3VnhRB+H0L4IITweQhhVQhhbAjh6P2NVZIkSZJUebWr0HYocHM1r/MasDrD8Q2ZGocQjgCeA/oCW4EXgXrA+cCTIYSzY4wZxxJCGA/cCHwO/An4Il33EHB+CKF/jLEoQ91IYAywB5gFbAbOA+4GLg0hnB9j/CxD3beAycAR6ftcD5wNjAD6hRC6xRg/yDRWSZIkSVLVVCXE/gX4KbAIWAz8klTIq4xHY4z5VbjWD0gF2BVArxjjRoAQQlvgVeCmEMLMGOPzJYtCCFeSCrDvA+fGGFeljx8LvAz0A74P/LxMXRfgXuCz9PUWpI/nAC8A5wKjgVvK1J1A6u8QgMuLxxNCqA38BhgITEpfV5IkSZJ0gCr9OHGM8dEY48gY429jjGsO1oDSs7Aj07tDiwNsegyrgNvSu3dkKP9RentbcYBN120kNZMMcHuGx4pvJxVExxQH2HTdduB6oAi4MYTwlTJ1PwDqA78uGahjjLuBIaRmkS8PIbSv6J4lSZIkSZVzOL7Y6RygCfBejPGVDOefJfWIcNcQwvHFB9Ozop2BXek2pcQYZ5N61Lcpqcd9i+vqApekd5/IUPcWMA+oC3yjzOnLK6jbCkwt006SJEmSdAAOVYjtGUIYF0J4JITwPyGE3hW8ZOmM9HZhppPpdamF6d3TM9QVxhh3lNP3wjJtAdoBDYCPK5hh3qcuhJALtK5orOVcT5IkSZJUTVVZE3sgrs1wbEUI4eoY4/Iyx09Kb9dW0N+7pALsSSWOVbauZNuSv79L+TLVtUxvt6RnXStbJ0mSJEmqpoM9E7sUuAloD+QAxwGXAsvSx2aUfCQ4LSe9/bSCfrentw0TWFdKCGFI+hNAiz788MMKupIkSZIkHdQQG2O8P8b4YIxxZYzx0xjjhhjjC8CZwHxSa19/VHEvX24xxkdijF1ijF3y8vKyPRxJkiRJOqxl5cVOMcZdwE/Su2VfllQ8e3lUBV0Uz4JuS2CdJEmSJKmasvl24jfT27KPE7+T3p5YQW3zMm1roq5FFeuK195+Jf2Sp8rWSZIkSZKqKZsh9pj0dnuZ42+kt10zFYUQGgBfTe8uKXGq+PcOIYT65Vyza5m2kArTO4BGIYTW+5YAqcefS9XFGD8Bit9mnHGsmeokSZIkSdWXzRA7IL0t+3maecCHwAkhhHMz1F0F1AEWxhjXFx+MMa4jFYDrptuUEkI4DzgBeD99jeK6XcAf0rvfzlDXitS3a3cBL5Q5/XwFdbnAN9O7v89wH5IkSZKkKjpoITaEcHoI4dIQwhFljtcOIdxK6q3FAPeVPB9j3AOMTe8+HEJoUqK2LXBvend0hssWr7MdE0JoU6KuCTAhvXtvjLGoTN29QARuCyGcWaIuB3iM1N9pQoxxS5m6+0nN4g4OIfQteY/AJCAXmBJjXJFhrJIkSZKkKqr0d2JDCF/jH0EQUp/IAbgnhPD/FR+MMZ6d/rUlqRnIj0MIbwAfkHqEuCOpT+0UASNjjH/McLn7gHNJzWSuCiH8idTs6wXAkcCDMcbnyxbFGH8XQngYGAosDyHMAL4AzicdKIGHMtQtDCHcDowB5oYQZgJbgPNIvUF5AXBHhrp1IYR/ByYDU0IIc4C/A2eTWpu7Grghw/1JkiRJkqqh0iGWVAg8K8PxtuW0Xwb8nNS60PZAd1Kzne8BvwLGxxgXZyqMMe4JIVwO3AhcD/QG9gCLSc2IPlneIGOMN6bD5DBSIfQIUuteHwMezjALW1w3NoTwZ+BWUmtcjwTeAh4AfhZj3FlO3VMhhLdIfSqoG6m/0Trgp8Do9NpZSZIkSVINqHSIjTHOAkIV2r8N/KDqQ9pbX0Rq1nSfmdNK1D4JlBt0K6grAAqqUbcAuLyqdZIkSZKkqsnmi50kSZIkSaoSQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ
6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTFqZ3sAkiRJ0oF6csG7PL90fbaHobQVG7YCMHDSvCyPRMUuO/14rjmrRbaHUSOciZUkSVLiPb90/d7gpOxr3yyX9s1ysz0Mpa3YsPVL9R95nImVJEnSl0L7Zrk8c8M52R6GdNj5ss2IOxMrSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxKh0iA0htAsh3BxC+E0I4c0QQlEIIYYQ+pfTvk4I4fwQwv8NISwKIWwNIewKIawPIfwuhNCjgmvlp/su7+fNCmprhRCGpa+5PYTwSQjh1RDCtypxj9ek236Srl2U7qvCv1MI4eIQwoshhI9DCJ+FEP4SQrgjhFBvf9eUJEmSJFVe7Sq0HQrcXIX25wEvpX9/H3gF+BRoD1wJXBlC+J8Y439V0MdrwOoMxzdkahxCOAJ4DugLbAVeBOoB5wNPhhDOjjFmvIcQwnjgRuBz4E/AF+m6h4DzQwj9Y4xFGepGAmOAPcAsYHP63u8GLg0hnB9j/KyCe5QkSZIkVVJVQuxfgJ8Ci4DFwC9JhbXyFAH/C/w8xvhqyRMhhIHAE8CdIYSXY4wvl9PHozHG/CqM8QekAuwKoFeMcWP6em2BV4GbQggzY4zPlxnPlaQC7PvAuTHGVenjxwIvA/2A7wM/L1PXBbgX+Cx9vQXp4znAC8C5wGjglircgyRJkiSpHJV+nDjG+GiMcWSM8bcxxjWVaD8zxti/bIBNn3sGyE/vDqr0aCuQnoUdmd4dWhxg09dbBdyW3r0jQ/mP0tvbigNsum4jqRlogNszPFZ8OxCAMcUBNl23HbieVJC/MYTwlWrdlCRJkiSplGy+2GlJentCDfV3DtAEeC/G+EqG88+SekS4awjh+OKDIYQTgM7ArnSbUmKMs4H1QFPg7BJ1dYFL0rtPZKh7C5gH1AW+Ub1bkiRJkiSVlM0Q2za9zbi+Na1nCGFcCOGREML/hBB6V/CSpTPS24WZTqbXpRamd0/PUFcYY9xRTt8Ly7QFaAc0AD6uYGY6U50kSZIkqZqqsia2xoQQmgLXpXf/t4Km12Y4tiKEcHWMcXmZ4yelt2sr6O9dUgH2pBLHKltXsm3J39+lfJnqJEmSJEnVdMhnYkMItYHfAEcDf4oxTs3QbClwE6k3GecAxwGXAsvSx2aUfCQ4LSe9/bSCy29Pbxtmsa6UEMKQ9Kd8Fn344YcVdCVJkiRJysbjxBNJfbpmHeW81CnGeH+M8cEY48oY46cxxg0xxheAM4H5pNa+/ihTbdLEGB+JMXaJMXbJy8vL9nAkSZIk6bB2SENsCOHnw
L+T+pTN+THG96tSH2PcBfwkvVv2ZUnFs55HVdBF8ezptizWSZIkSZKq6ZCF2BDC/yX1iPCHpALsqv2UlOfN9Lbs48TvpLcnVlDbvEzbmqhrUcU6SZIkSVI1HZIQG0IYC/wQ+Ai4IMa44gC6Oya93V7m+BvpbddyxtAA+Gp6d0mJU8W/dwgh1C/nml3LtIVUmN4BNAohtC6n7swMdZIkSZKkajroITaEcC8wAtgMXBhj/PMBdjkgvS37KZ15pGZ5TwghnJuh7iqgDrAwxri++GCMcR2pAFw33abs+M8j9S3b99PXKK7bBfwhvfvtDHWtSH27dhfwQmVuTJIkSZJUsYMaYkMIdwO3AVtIBdj9zkiGEE4PIVwaQjiizPHaIYRbST2SDHBfyfMxxj3A2PTuwyGEJiVq2wL3pndHZ7hs8TrbMSGENiXqmgAT0rv3xhiLytTdC0TgthDCmSXqcoDHSP19J8QYt1Rwy5IkSZKkSqr0d2JDCF/jH4EOUp+6AbgnhPD/FR+MMZ6dbt8XuCN9eDXw/RBCpq7fjDHeW2K/JfB74OMQwhvAB6QeIe5I6lM7RcDIGOMfM/R1H3Au8E1gVQjhT6RmXy8AjgQejDE+X7Yoxvi7EMLDwFBgeQhhBvAFqbco5wJTgIcy1C0MIdwOjAHmhhBmkgrs55F6g/KCEn8DSZIkSdIBqnSIJRXmzspwvG057RuV+L1L+ieT2fxjlhRS34L9Oan1pO2B7qRmO98DfgWMjzEuztRRjHFPCOFy4EbgeqA3sAdYTGpG9MlyxkCM8cYQwhxgGKkQegSpda+PAQ9nmIUtrhsbQvgzcCuptbNHAm8BDwA/izHuLO+akiRJkqSqqXSIjTHOAjJOpZbTPh/Ir+qAYoxvAz+oal2J+iJSs6b7zJxWovZJoNygW0FdAVBQ1TpJkiRJUtUc0u/ESpIkSZJ0IAyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSoxKh9gQQrsQws0hhN+EEN4MIRSFEGIIoX8laq8JIbwaQvgkhLA9hLAohDAshFDh9UMIF4cQXgwhfBxC+CyE8JcQwh0hhHr7qTsrhPD7EMIHIYTPQwirQghjQwhHV+IefxNC+HsIYWcIYW0I4eEQQrP91B2Xbrc2Xff3EMLkEMLJFdVJkiRJkqqmKjOxQ4H7gW8D7YBQmaIQwnjgCaAL8CrwEnAy8BDwu/KCb
AhhJPAHoBfwBvAC0AS4G5gVQmhQTt23gNeAy4G/Ac8DdYERwKIQQpNy6s4DlqTvbwPwe+Az4HvAsvICaQjhVODP6XafpeveBwYBS0II3TL/ZSRJkiRJVVWVEPsX4KfAQKANMHt/BSGEK4EbSYW6TjHGS2OM/YC2wEqgH/D9DHVdgHtJhcJuMcYLYoxXAa2AV4CzgdEZ6k4AfkkqYF8eY/zXGONAoDXwTHrckzLUHQU8DdQHvh9j7BxjvDrGeCrwf4E84KkQQihTVytddwzwsxjjqem6rwE3AQ2A35YXuCVJkiRJVVPpEBtjfDTGODLG+NsY45pKlv0ovb0txriqRF8bSc3sAtyeYTb2dlJBdEyMcUGJuu3A9UARcGMI4Stl6n5AKoj+Osb4fIm63cAQYCtweQihfZm664GmwMsxxofKnLsNWAN8DbikzLlvAJ2A1ekx7xVjfBCYBRwHXIckSZIk6YAdtBc7pWdFOwO7gGfLno8xzgbWkwqPZ5eoq8s/wuITGereAuaRekT4G2VOX15B3VZgapl2lanbQ2q2taK6p9PtynqiTDtJkiRJ0gE4mG8nPiO9LYwx7iinzcIybSG13rYB8HEFM7771IUQckk9NlzyfGWuV3L/UNVJkiRJkqrhYIbYk9LbtRW0ebdM25K/v0v5MtW1TG+3pGddK1WXDr+N9jPWTNcrub+/usYhhJxy2kiSJEmSKulghtji0PZpBW22p7cND4O6imoz1VXmmttL/F62FoAQwpD0J4cWffjhh+V0I0mSJEmCgxtiVQkxxkdijF1ijF3y8vKyPRxJkiRJOqwdzBBbPAt5VAVtimcytx0GdRXVZqqrzDVLzvKWrZUkSZIkVdHBDLHvpLcnVtCmeZm2JX9vUcW64nWpX0mvc61UXXr97Ob0bnljzXS9kvv7q/so/XkgSZIkSdIBOJghdkl62yGEUL+cNl3LtAV4E9gBNAohtN63BIAzy9bFGD8h9T3Xkv3uty7tjUNcJ0mSJEmqhoMWYmOM60iFvLrAVWXPhxDOA04A3if13dfiul3AH9K7385Q1wo4h9T3Z18oc/r5CupygW+md39fhbojgKv3U3d1ul1Zxf2VrZMkSZIkVcPBfrHTT9LbMSGENsUHQwhNgAnp3XtjjEVl6u4FInBbCOHMEnU5wGOkxj0hxrilTN39pGZxB4cQ+paoqw1MAnKBKTHGFWXqfkUqTPcMIQzLMJbWpGZT/1Dm3AvAn4E2Je61+JrDgR7A34F8JEmSJEkHrHZlG4YQvsY/gidA+/T2nhDC/1d8MMZ4donffxdCeBgYCiwPIcwAvgDOJx0ogYfKXivGuDCEcDswBpgbQpgJbAHOA5oAC4A7MtStCyH8OzAZmBJCmEMqRJ5Nat3qauCGDHXbQwhXkwqpD4UQrgdWAacBpwKbgG/FGGOZuqIQwreAV4ARIYRLgWVAW6AzqUA9MMb4WdlrSpIkSZKqriozsbnAWSV+ir972rbM8VJijDeSeqz2DVIhtDepMDkcuDLGuCfTxWKMY4FLgJdJrTn9Jqkw+X+A88oLhjHGp4BuwP8jFUD7AbuBnwJdYowflFM3GzgDeJLUY85XkHq78CSgU4zxr+XUrQA6pdvlpOuOB54ATo8xzslUJ0mSJEmqukrPxMYYZwGhOheJMT5JKhxWta4AKKhG3QLg8mrU/ZUM62IrUfd34HtVrZMkSZIkVc3BXhMrSZIkSVKNMcRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJk
hLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhKjdrYHIP2zeHLBuzy/dH22h3HQrdiwFYCBk+ZleSQH32WnH881Z7XI9jAkSZL+qTgTKx0izy9dvzfgfZm1b5ZL+2a52R7GQbdiw9Z/iv8oIUmSdLhxJlY6hNo3y+WZG87J9jBUA/4ZZpolSZIOR87ESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEMsZIkSZKkxDDESpIkSZISwxArSZIkSUoMQ6wkSZIkKTEOaogNIfQIIcRK/rQoUZe/n7ZvVnDNWiGEYSGERSGE7SGET0IIr4YQvlWJ8V6TbvtJunZRuq8K/04hhItDCC+GED4OIXwWQvhLCOGOEEK9qv3FJEmSJEkVqX2Q+38f+HUF588ETgXWAOsynH8NWJ3h+IZMnYUQjgCeA/oCW4EXgXrA+cCTIYSzY4w3l1M7HrgR+Bz4E/BFuu4h4PwQQv8YY1GGupHAGGAPMAvYDJwH3A1cGkI4P8b4WaZrSpIkSZKq5qCG2Bjjm8B15Z0PIaxI//pYjDFmaPJojDG/Cpf8AakAuwLoFWPcmL5OW+BV4KYQwswY4/NlxnElqQD7PnBujHFV+vixwMtAP+D7wM/L1HUB7gU+S19vQfp4DvACcC4wGrilCvcgSZIkSSpH1tbEhhDOITULuwfIr4H+jgBGpneHFgdYgHQovS29e0eG8h+lt7cVB9h03UZgaHr39gyPFd8OBGBMcYBN120HrgeKgBtDCF+p1k1JkiRJkkrJ5oud/i29LYgx/r0G+jsHaAK8F2N8JcP5Z0k9Itw1hHB88cEQwglAZ2BXuk0pMcbZwHqgKXB2ibq6wCXp3Scy1L0FzAPqAt+o3i1JkiRJkkrKSogNITQABqZ3f1lB054hhHEhhEdCCP8TQuhdwUuWzkhvF2Y6mV6XWpjePT1DXWGMcUc5fS8s0xagHdAA+DjGuKYKdZIkSZKkajrYL3Yqz1VAQ+ADYFoF7a7NcGxFCOHqGOPyMsdPSm/XVtDfu6QC7EkljlW2rmTbkr+/S/ky1UmSJEmSqilbjxMXP0r8eIzxiwznlwI3Ae2BHOA44FJgWfrYjJKPBKflpLefVnDd7eltwyzWSZIkSZKq6ZDPxIYQ2pB6ay/AY5naxBjvL3PoU+CFEMJLwGxSa1N/BAw/SMM8ZEIIQ4AhAC1atNhPa0mSJEn655aNmdjiWdh5McaVVSmMMe4CfpLeLfuypOJZz6Mq6KJ49nRbFutKiTE+EmPsEmPskpeXV0FXkiRJkqRDGmLTn8EpXuda0QudKvJmelv2ceJ30tsTK6htXqZtTdRVNH2aqU6SJEmSVE2Heia2N6nwuR14ppp9HJPebi9z/I30tmumovQbkb+a3l1S4lTx7x1CCPXLuWbXMm0hFaZ3AI1CCK3LqTszQ50kSZIkqZoOdYj99/T2tzHGsiG0sgakt2U/pTMP+BA4IYRwLvu6CqgDLIwxri8+GGNcRyoA1023KSWEcB5wAvB++hrFdbuAP6R3v52hrhWpb9fuAl6ozI1JkiRJkip2yEJsCKEx8M30brmPEocQTg8hXJp+9Ljk8dohhFtJvbUY4
L6S52OMe4Cx6d2HQwhNStS2Be5N747OcNnidbZj0i+eKq5rAkxI794bYywqU3cvEIHbQghnlqjLIfXSqlrAhBjjlvLuV5IkSZJUeYfy7cTfITUT+maMcW4F7VoCvwc+DiG8QepbsscAHUl9aqcIGBlj/GOG2vtIvfn4m8CqEMKf0te8ADgSeDDG+HzZohjj70IIDwNDgeUhhBnAF8D5QC4wBXgoQ93CEMLtwBhgbghhJrAFOA9oAiwA7qjgXiVJkiRJVXAoQ+z16W3Gz+qUsAz4Oan1pO2B7qRmO98DfgWMjzEuzlQYY9wTQrgcuDF9vd7AHmAxqRnRJ8u7aIzxxhDCHGAYqRB6BKl1r48BD2eYhS2uGxtC+DNwK6m1s0cCbwEPAD+LMe7cz/1KkiRJkirpkIXYGGOnSrZ7G/jBAVyniNSs6T4zp5WofRIoN+hWUFcAFFS1TpIkSZJUNdn4TqwkSZIkSdViiJUkSZIkJYYhVpIkSZKUGIZYSZIkSVJiGGIlSZIkSYlhiJUkSZIkJYYhVpIkSZKUGIZYSZIkSVJiGGIlSZIkSYlhiJUkSZIkJYYhVpIkSZKUGIZYSZIkSVJiGGIlSZIkSYlhiJUkSZIkJYYhVpIkSZKUGIZYSZIkSVJiGGIlSZIkSYlRO9sDkJQcTy54l+eXrs/2MA4LKzZsBWDgpHlZHsnh47LTj+eas1pkexiSJOlLzplYSZX2/NL1e8PbP7v2zXJp3yw328M4bKzYsNX/wCFJkg4JZ2IlVUn7Zrk8c8M52R6GDjPOSEuSpEPFmVhJkiRJUmI4EytJkiRJSfDII/Dkk1Wvaz8wte3xo+pd95prYMiQ6tUeBIZYSZIkSUqCJ5+EpUvh9NOrVPbMimeqf82lS1NbQ6wkSZIkqcpOPx1mzTp01+vR49Bdq5JcEytJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEqJ3tAUiSJCXaij/CqleyPQp91CG1fX5adsfxz67tudC+d7ZHoS85Q6wkSdKBWPUKbHobGp+U7ZH8U3umQ2G2h6BNb6e2hlgdZIZYSZKkA9X4JLhsdLZHIWXX83dkewT6J+GaWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJcdBDbAghP4QQK/h5s5y6WiGEYSGERSGE7SGET0IIr4YQvlWJa16TbvtJunZRuq8K7zeEcHEI4cUQwschhM9CCH8JIdwRQqi3n7qzQgi/DyF8EEL4PISwKoQwNoRw9P7GKkmSJEmqvNqH8FqvAaszHN9Q9kAI4QjgOaAvsBV4EagHnA88GUI4O8Z4c6aLhBDGAzcCnwN/Ar5I1z0EnB9C6B9jLMpQNxIYA+wBZgGbgfOAu4FLQwjnxxg/y1D3LWAycET6HtcDZwMjgH4hhG4xxg/K+ZtIkiRJkqrgUIbYR2OM+ZVs+wNSAXYF0CvGuBEghNAWeBW4KYQwM8b4fMmiEMKVpALs+8C5McZV6ePHAi8D/YDvAz8vU9cFuBf4LH29BenjOcALwLnAaOCWMnUnAL8EA
nB58XhCCLWB3wADgUnp60qSJEmSDtBhtyY2PQs7Mr07tDjAAqRD6W3p3TsylP8ovb2tOMCm6zYCQ9O7t2d4rPh2UkF0THGATddtB64HioAbQwhfKVP3A6A+8OuSgTrGuBsYQmoW+fIQQvuK7lmSJEmSVDmHXYgFzgGaAO/FGF/JcP5ZUo8Idw0hHF98MD0r2hnYlW5TSoxxNqlHfZuSety3uK4ucEl694kMdW8B84C6wDfKnL68grqtwNQy7SRJkiRJB+BQhtieIYRxIYRHQgj/E0LoXc6Lls5Ibxdm6iS9LrUwvXt6hrrCGOOOcsawsExbgHZAA+DjGOOaytaFEHKB1hWNtZzrSZIkSZKq6VCuib02w7EVIYSrY4zLSxw7Kb1dW0Ff75IKsCeVOFbZupJtS/7+LuXLVNcyvd2SnnWtbJ0kSZIkqZoOxUzsUuAmoD2QAxwHXAosSx+bUfKx4HQbgE8r6HN7etswgXWSJEmSpGo66DOxMcb7yxz6FHghhPASMJvU+tQfAcMP9lgORyGEIaReAkWLFi2yPBpJB9vaL1aw/otV+2+YMFuLjgBg7mfP76dl8hxfpy0n1vH9fJIkHS4O5ePEpcQYd4UQfgI8T+kXJhXPXh5VQXnxLOi2BNaVEmN8BHgEoEuXLrGCviR9Caz/YhWfFG3i6FqNsz2UGnXn4D3ZHsJB8UnRJvgCQ6wkSYeRrIXYtDfT25KPE7+T3p5YQV3zMm1roq6iadBMdcVrb78SQsgtZ11spjpJ/+SOrtWYrze4LNvDUCV8GWeWJUlKumx/YueY9HZ7iWNvpLddMxWEEBoAX03vLilxqvj3DiGE+uVcr2uZtpAK0juARiGE1vuWAHBm2boY4ydA8duMM441U50kSZIkqfqyHWIHpLclP1EzD/gQOCGEcG6GmquAOsDCGOP64oMxxnWkAnDddJtSQgjnAScA76evUVy3C/hDevfbGepakfp27S7ghTKni/8Tfaa6XOCb6d3fZ7gPSZIkSVIVHdQQG0I4PYRwaQjhiDLHa4cQbiX11mKA+4rPxRj3AGPTuw+HEJqUqGsL3JveHZ3hkj9Jb8eEENqUqGsCTEjv3htjLCpTdy8QgdtCCGeWqMsBHiP1d5oQY9xSpu5+UrO4g0MIfUveHzAJyAWmxBhXZBirJEmSJKmKDvaa2JakZiE/DiG8AXxA6hHijqQ+tVMEjIwx/rFM3X3AuaRmMleFEP5Eavb1AuBI4MEY4z4LlWKMvwshPAwMBZaHEGYAXwDnkw6UwEMZ6haGEG4HxgBzQwgzgS3AeUATYAFwR4a6dSGEfwcmA1NCCHOAv5N64/KJwGrghsr8oSRJkiRJ+3ewHydeBvwc+Cupb8JeSSoYfgb8CjgzxvjTskXp2djLge+TCoK903WLgW/HGG8qW1Oi9kZSj/e+ka7pne5jOHBluu9MdWOBS4CXSa1x/SawCfg/wHkxxs/KqXsK6Ab8P+BUoB+wG/gp0CXG+EF5Y5UkSZIkVc1BnYmNMb4N/KCatUWkZk33mTmtRO2TwJPVqCsACqpRt4BU6JYkSZIkHUTZfrGTJEmSJEmVZoiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYhhiJUmSJEmJYYiVJEmSJCWGIVaSJEmSlBiGWEmSJElSYtTO9gAkSZIkVcOKP8KqV7I9in/Y9HZq+/wd2R1HWW3Phfa9sz0K1SBnYiVJkqQkWvXKP4Lj4aDxSamfw8mmtw+voK8a4UysJEmSlFSNT4LLRmd7FIevw21WWDXCmVhJkiRJUmIYYiVJkiRJiWGIlSRJkiQlhiFWkiRJkpQYhlhJkiRJUmIYYiVJkiRJieEndv7JLH4Elj956K/7fvvUNr/Hob92x2ug85BDf11JkiRJNc8Q+09m+ZPw/lJoevqhv
e71K845tBdMe39pamuIlSRJkr4cDLH/hJqeDtfNyvYoDo1szPxKkiRJOnhcEytJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhLDECtJkiRJSgxDrCRJkiQpMQyxkiRJkqTEMMRKkiRJkhKjdrYHIElSSWu/WMH6L1ZlexgAfFK0CYC5nz2f5ZGkHF+nLSfWaZ/tYUiSlFXOxEqSDivrv1i1Nzxm29G1GnN0rcbZHgaQCtSHS7iXJCmbnImVJB12jq7VmK83uCzbwzisHC6zwZIkZZszsZIkSZKkxDDESpIkSZISw8eJJUmSJB08K/4Iq17JzrU3vZ3aPn9Hdq7f9lxo3zs71/4ScyZWkiRJ0sGz6pV/hMlDrfFJqZ9s2PR29sL7l5wzsZIkSZIOrsYnwWWjsz2KQytbs7//BJyJlSRJkiQlhiFWkiRJkpQYhlhJkiRJUmIYYiVJkiRJiWGIlSRJkiQlhiFWkiRJkpQYhlhJkiRJUmL4nVhJknR4WvFHWPVKtkexf5veTm2T8E3ItudC+97ZHoUkHRBnYiVJ0uFp1Sv/CIiHs8YnpX4Od5veTsZ/FJCk/XAmVpL+//bePE7O46z3/VbPPiNpJFm75N2yHTtO5DhO4iTYBickQCCEhM34QrjcE27CcuDwOU6AeziXJZAAIYQlAR8CBoLgAMEYCLHJZsvBTuw4lhfJi2zJWkcjzWhmpFm7p7vOH8/zTL3d6p7p2Wek56uPPj39dr21V731q6eqXsdxli7rLoV3fnixY3FusBwsxY7jOHXglljHcRzHcRzHcRxn2eAi1nEcx3Ecx3Ecx1k2uIh1HMdxHMdxHMdxlg0uYh3HcRzHcRzHcZxlg4tYx3Ecx3Ecx3EcZ9ngItZxHMdxHMdxHMdZNsyriA0hNIUQbgshfCyE8I0QwukQQj6EcDSE8I8hhFtr3Hd3CCFO8v+5ScLMhRB+SsMbDCEMhBAeCiH8cB3xvV3dDui931C/Js2nEMLbQwj/EUI4FUIYDiE8E0L45RBCy1RhOo7jOI7jOI7jOPUz3++JvQX4gv59HNgFDAHXAO8G3h1C+PUY46/UuP8/gRerXO+q5jiE0AD8E/A9wGngP4AW4DZgZwjhDTHG/1rj3j8GPgCMAl8CCnrfHwG3hRDeE2MsVbnvTuCjQBF4AOjTdP8G8I4Qwm0xxuEa6XMcx3Ecx3Ecx3GmwXyL2BLwWeATMcaHsj+EEH4Q+Bvgf4QQvhJj/EqV+/8sxnj3NML7OUTA7gW+LcbYrWFtBx4CfjaE8OUY470VcXk3ImCPAzfHGPfp9Y3AV4B3AT8DfKLivtcCHwGGNbyv6/UVwOeAm4EPAz8/jTQ4NXj8Lnh65/TuOb5bPu++dXr3XXc73PC+6d3jOI7jOI7jOM78M6/LiWOMX44xvqdSwOpv/xu4W7/eMduw1Ap7p359vwlYDWsf8EH9+stVbv9F/fygCVi9rxt4v379UJVlxR8CAvBRE7B63yDw44iI/0AIYfWMEuWU8fTOJErrZdMO+T8dju+evlh2HMdxHMdxHGdhmG9L7FQ8oZ/b5sCvm4ANwJEY464qv/8D8L+AG0MIW2OMRwFCCNuAG4C8uikjxvhgCOEosBV4A/Cw3tcMfIc6+5sq9+0PITwCvAn4TsBl0RywaQe894H5DWO6VlvHcRzHcRzHcRaOxT6deLt+Vt3jCnxrCOH3Qgh3hRB+PYTwtkkOWbpePx+r9qPuS92jX3dUuW9PjHGkht+PVbgFuApoB07FGF+axn2O4ziO4ziO4zjODFk0S2wIYRPwXv362RrOfrTKtb0hhB+KMT5dc
f1S/Tw4SbCHEAF7aeZavfdl3Wb/PkRtqt3nnMN8Zm+Be/YVqv72fK+cC/bue6uf8/Wu7U3ccU3TvMXNcRzHcRzHcc4FFsUSG0JoBD4DdAJfijH+a4WT3cDPIqcYrwC2AO8AntRrXwwhbK24Z4V+Dk0S9KB+rlzE+5xzmHv2FdjTc9Yh1gBcde31XHVtdaP8np5STfHrOI7jOI7jOE5isSyxf4K8vuYwVQ51ijH+fsWlIeBzIYQvAA8ie1N/Efjp+Y3m/BNCeB/wPoCLLrpokWPjzAXXrsvx2Xe2T+ueWtZZx3Ecx3Ecx3HKWXBLbAjhE8BPIK+zuS3GeLzee2OMeeC39Ot3VvxsVs+OSbww6+mZRbyvjBjjXTHG18YYX7t+/fpJvHIcx3Ecx3Ecx3EWVMSGED6GLBM+iQjYfVPcUo3n9LNyOfHL+nnxJPdeWOF2Lu6bzHxa7T7HcRzHcRzHcRxnhiyYiA0h/Dbw34Be4C0xxr0z9OoC/RysuP5N/byxRvjtwCv16xOZn+zva0MIbTXCvLHCLYiYHgHWhhAur3Hf66rc5ziO4ziO4ziO48yQBRGxIYSPAP8d6APeGmN8ahbe/YB+Vr5K5xHEwrsthHBzlfu+H2gCHrN3xALEGA8jArhZ3VTG/RbkPbbHNQy7Lw98Xr/+SJX7LkPeXZsHPldPwhzHcRzHcRzHcZzJmXcRG0L4DeCDQD8iYCe1SoYQdoQQ3hFCaKi43hhC+AVkOTLAx7O/xxiLwG/r10+FEDZk7t0OfES/frhKsLbP9qMhhCsy920APqlfPxJjrDx29iNABD4YQnhd5r4VwJ8j+fvJGGP/JEl2HMdxHMdxHMdx6mReTycOIXwP8Mv69UXgZ0II1Zw+F2M0kXkJcA9wKoTwTeAEsoT4OuRVOyXgzhjj/VX8+ThwM/DdwL4QwpcQ6+tbgFbgD2OM91beFGP8xxDCp4D3A0+HEL4IFJATlFcB/wz8UZX7HgshfAj4KPBwCOHLiFi/BdgAfD2TfsdxHMdxHMdxHGeWzPcrdtZm/n6t/q/GgyRL6ZPAJ5D9pNcA34JYO48AfwH8cYzx8WqexBiLIYTvBT4A/DjwNqAIPI5YRHfWimiM8QMhhK8CP4WI0AZk3+ufA5+qYoW1+347hPAU8AvI3tlWYD/wB8DvxhjHaoU5Ux6/C56umZLJOb5bPu++debhX3c73PC+md/vOI7jOI7jOI4zU+ZVxMYY7wbunuY9B4Cfm0WYJcRqepbltI57dwLTlocxxvuA+6Z730x5eqeI0U07pn/vTO7JYiLYRazjOI7jOI7jOIvBfFtinXli0w547wMLH+5sLLiO4ziO4ziO4zizZUHfE+s4juM4juM4juM4s8FFrOM4juM4juM4jrNs8OXEjuM4juM4c8Xe+2HfrsWORXV6DsjnvUv4xQnbb4Zr3rbYsXAcZ4njlljHcRzHcZy5Yt+uJBaXGusulf9LlZ4DS3cCwHGcJYVbYh3HcRzHceaSdZfCOz+82LFYfixlC7HjOEsKt8Q6juM4juM4juM4ywYXsY7jOI7jOI7jOM6ywZcTO47jOI7jOI7jnE/cdRfs3Fmf29275fPWW6d2e/vt8L73zTRWdeOWWMdxHMdxHMdxnPOJnTuTOJ2KHTvk/1Ts3l2/MJ4lbol1HMdxHMdxHMc539ixAx54YO78q8dSO0e4JdZxHMdxHMdxHMdZNriIdRzHcRzHcRzHcZYNLmIdx3Ecx3Ecx3GcZYOLWMdxHMdxHMdxHGfZ4CLWcRzHcRzHcRzHWTa4iHUcx3Ecx3Ecx3GWDf6KHcc5T7iv0MOuQt+s/DhQGgHgl4b3zdiPm5vW8PamdbOKh+M4juM4jnP+4iLWcc4TdhX6OFAa4dJc24z9eMuPXTirOBwojUABF7GO4ziO4zjOjHER6zjnEZfm2vjN9u2LFv5sLLiO4ziO4ziOA74n1nEcx3Ecx3Ecx1lGuIh1HMdxHMdxHMdxlg2+nNhxHMdx5
pCDhb0cLcz90vmBUg8ADw/fO6f+bm3azsVN18ypn47jOI4zn7iIdRxnRszktOP9pWFg+ntj/URjZzlxtLCPgVIPnbm5rbNz7R+oMC7gItZxHMdZVriIdRxnRszktOPLcu3TDmepn2g8XavbTKxpbilbfnTm1vHG9ncudjSmZK6tuo7jOI6zELiIdRxnxizEacdL/UTj6VrdpmtNc0uZ4ziO4zhOOS5infOex++Cp3em78d3y+fdt6Zr190ON7xvIWPlLCfm0+rmljLHcRzHcZxy/HRi57zn6Z1JuAJs2iH/jeO7y0Wu4ziO4ziO4ziLh1tiHQcRre99oPpvWYus4ziO4ziO4ziLi1tiHcdxHMdxHMdxnGWDi1jHcRzHcRzHcRxn2eAi1nEcx3Ecx3Ecx1k2uIh1HMdxHMdxHMdxlg1+sJMzQeWrZqpR7fUzlfjraBzHcRzHcRzHmS9cxDoT2Ktmsq+XqWSy3yCJ3JmI2KlEtAtox3Ecx3Ecx3FcxDplTPaqmXqYzetophLR8ymgHcdxHMdxFp2998O+XfW77zkgn/f+cn3ut98M17xt+vFynCWGi1hnSTEbEe3vc3Ucx1nGVBu81xqg+0B88ZiuyJoO0xVk02G51Jl9uyQf1l1an/t63UHK3+WQD44zBS5iHcdxHMdZfKoN3qsN0H0gvrhMV2RNh/nwE5ZfnVl3Kbzzw3Pv73xMDjjOIuEi1nEcx3GcpUE9g3cfiC8+8yWy5guvM85cU++KhOmsLlguqwWWCC5iHWeB+czeAvfsK5Rd29NTAuDd9w5PXHvX9ibuuKZpQePmOI7jOI7jTEG9KxLqXV2w3FYLLAFcxDrOAnPPvgJ7ekpcuy69pjn7N5ioLbiIdRzHcRxnaTGTfdEz3e+8lK2Tc7kiYTmtFrjrLthZ43Uiu3fL5623Vv/99tvhfXNzAquLWMdZBK5dl+Oz72yv+XvWIus4juM4jrNkmMm+6Jnsd3br5NJk504Rqzt2nP1btWuGCVwXsY7jOI7jOI7jLDgLsS96OVknzzd27IAHHpjePbWsszPERazjLAEq98lW2yMLvk/WcRxnwZjpq2Rm85qYpbx00nEcZwnhItZxlgCV+2Qr98iC75N1lg8HC3s5Wtg34/sHSj0APDx874zu39q0nYubrplx+MuVmeT7TPP6vMjjmb5KZqavifGlk47jOHXjItZxlgi+T/b8ox7RMR2RsVSExdHCPgZKPXTm1s3o/pneB5pfBZZEPiw0M8n3meT1eZXHC/kqGV866ThOLbKHKVUenjSHhyUtJ1zEOo7jLBL1iI56RcZSExaduXW8sf2dCx7uTK235woLke/nex47juMsONnDlLKHJ83xYUnLCRexjuM4i8hciQ4XFo7jOI5zDlPtMKU5PixpOeEi1nHmADuYqfJAJj+IyXEcx3Ecx3HmFhexjjMHVB7MBH4Qk+M4juM4juPMBy5iHWeOqDyYyQ9ichzHcRzHcRad7MFQRuUBUcYyOSjKRewS5vG74OmdZ18/vls+77717N+uux1uWPr1zjmPua/Qw65CX93u95dkMuCXhut/dcjNTWt4e9PMT7h1HMdxnGVNtfcc13qHsb+f+NwnezCUkf3bWEYHRbmIXcI8vVME66Yd5dcrvxsmbl3EOkuZXYU+DpRGuDTXVpf7y3K1XztUjQOlESjgItZxHMc5f6n2nuNq7zD29xOfP1Q7GKqSZXRQlIvYJc6mHfDeB+pzW80y6zhLkUtzbfxm+/Z58Xs6FlvHmQsq3/db692+S+U9vo7jnCfU855jfz/x4mIW86yV3C3jdeEi1nEcx3FmQeX7fqu923epvcfXcRzHWQJUWszdMl43LmIdpwrZ/cjZPci+59hxnGpM9b5ff4+v48wh1fZ7TkatvaCT4dYwZ6HIWszdMl43LmKdWVF5+FStQ6eWm/jL7ke2Pci+59hZLCqXq1aj1hLWSpb7ktap8uJ8yQfHOa+ptt9zMup1Z7g1bH6pd
xJiOpMPPukw/9gJx5WnGtc6zbjyRORqpyHP4iRkF7HOrKg8fKraoVPzKf5qWUyN2Yjnyv3IvufYWSwql6tWY7LfjHNhSetUeXG+5IPjzCuTiYx6hMVCCIp69nvOlPmyhtUj3uoVbstZtNU7CVHv5MN8TDpMVVZLpR0sJNVOOJ7sNONK95WnIc/yJGQXsc6smerwqfkUf9UspoZbTp1ziamWq9bDubKkdbZ5ca7kg+PMG5OJjKmEhVsxa1OPeKtHuJ0LeTyXkxDzMekwVVmdr+2g8oTjqU4znuxE5FmehOwidhlTz1Le5baMdybUEtFuOT03meo9s/W+V3Ym75KttpR1suWrvmT1/MVPLHaWPTMVGb6nb3LmQrx5Hi8MsykrL6N5x0XsMmaqpbxuiXTORaZ6z2w975Wd6btkqy1lrbV81Zesnt/4icWO45wzVC6trVxKe64tm3WWBS5ilzmTLeV1S6RzrjLb98zO5l2y9S5lPVeXrPohU/XjJxY7jnNOULm0NruU9lxdNusseVzEnsdMtRz5fFiKXA+WT/W8auczewvcs68AwJ6eEgDvvneYd21v4o5rmhYmwo4zj/ghU+c2PkkxT8zlgT7gli9n4am1tNaXzTqLhIvY85jJliOfD0uR6xWnlfk0Wd7cs6/Anp4S167Lce26HGBitjArEfuZvYUJUfyZvbPzyzl/mGpf5kxFhh8yde7ikxTzxFwd6ANu+XIcx8FF7HnPYh6KtNgHU01HnGbzaaq8uXZdjs++M+3LfPe9w7OKJzBh3bW/a4nYyQ49qufAo5kcduQsXSbbl+kiw6nFkpukqLYfrzgOn7693F1WBE7HUlnLSlrLMjpTK+hcncbqli9nLrB67/tbnZlQ7Z2xs3jn60xwEessGkvhYKrpiNPFxiy7kzHZoUdTHXg008OOnOmRtY5mLaPztfSyliBxS6izbKi2H6/nABRGoan1bPfTtVTWspJWs4y6FdQ5V6hW75dL/Z7qoClwMT7fVL4DdpbvfJ0JLmKXGFnr5GK8MqfaEtuZhlvp1+N3ne2HH0w198z00KPZHHbk1E/WOmqWUbeKOs4UVFoxbbA6V3v06rWSuhXUOZeo1a6WOpMdNAXLR4wvBPVaTGdiWc2+A3aW73ydCS5ilxhZ6+RiWCYrraOzCTfr1/Hd8n2+99jWmgRYqodU2UFQy2W/a3a5cuXyZF+KXL+VtdI66lbRuWWqw4nqOZhoKR9KVCt9/s5ix3HOGyabeJqOGJ/udgKobeXN+pW9v16r8N770317758bEV6vxXQJWFani4vYJchi7lOtFn61cOs9FMn8Wqi4V5sEqFeEn+mCoW5J21xQKVDtlOIs2YOg9vSUJt3vuhTILlfOLk9eykuRq+0TrrU/eLZCfC6trLUEsbFcRMliCK6pDiea6mCipW4Zr5U+f2fxPFDPskWYu6WLc3WKsS+lrJ9qeT4TAeMsT6aznQAmt/Jm/bL7p2MVztbDfbvmrp7VazFdZMvqdHER68yI6RyKtNDUK8IrBfjKzXDqBXj4d2HoRHI3FXZy8Lq2wMaOMHE9K1AhnVJciR0ENRcHQNVLvcKumqirtlx5KS9FrrZPOCvAT8UC/bHAMCX2jw2zq9B3VrpNiJnwOljYW1MQzJWVtZogNpaTKJlLwXWwsLeuMjD/6zmcaDKRXVl2S2niYDqHL7mlfxZMtWwR5nbp4lycYuxLKadHtTyfiYBZKviBTdNnOoeuTWXlne0S7XpPKXdcxJ5rTLandq6X1M7HoUj1LgeuZQmuFwvDljpnMQGbdTcZ9+wrcDoP164LZ/2WPal4IUXqVEwl7GBurKuzEcu1/JjJMubJ9gn/0vA+BuI4r8ytAKqnOyvEBko9HC3sq0vMTEd0VWOxD2WqJvBmYhWeK8GVjUu9ZTAV1UR2NYE924mDucpLZxGYaoA71/sI6x1QT2a17Tkwf9bihWAqi/RU1ujppnW+9kTPtSW/mkCtvHcmBzZV+jtXS
1mzfmeZ7xUNztxSuV/2rjlasjhLXMSeY9TaU7uULKWTkY1/x0ZZ3nvwwbSn1sRsLUvwdNi0Q/w7vluWEq/cXP5bJfUcVFUP3UORnpHIZ/aebZVdSKY6AGourKu1xPKpWJgQo1kLKJwtSiv9mI9lzNm8qJVuE2LTEZD1iC4Tui3h7BOl54NKq/Jk74ytR+AttFW4nneUzsTPqUT2bCcOlmJezhVzItDnYy/YYlNrfxzMzWD9XD5VeSqL9GTWqqWU1rm25Ff6V+ve6VoDs/72HJjbpaz1WrqXUrlVMpO9rvVMOCwXTMBu3AgvvAB33il/b9485a3ziYvYZcB0xVPWQlpp2ZyPQ46yS3NnKuyyZPfRDnXDxbfI9Uohbu4s/EohWg9P74SxgeqitZpbE84HH4Qv3Kk/bJxemBs7AvsHYtm7X89lai0/HojjZ70KqJYorSW4F3oZc9ayWnlgU7XfYGrRdbSwj3HydIZ1E/5UW7o82R7ZSnFQTaiam2piajLxVE3gVYqW7NLb6VoSq/mVTVuln6NxiLE4UpY388Vcvw5pKrE8lxb2euI+V9bhORHoc7EXbDqDxvkWmHD2/rihPvmeH05CYbZhzZUFcaavLJlPK9tsLNJLyRI915b8rH+zWQUw1Acj/ZJ/WX/r9XM67a2eslzKJyPPZK9rvRMOywU78Km7O33ORMSaVRfKT0KGab9n1kXsMqDaKb9Qn3CcbO9q9iCjWvdX2zta6fbpnVDMw/hIRthVxK8yrGrCt/Ia1Ldk2YQowFg/jI9V9zO75PjooxLnNZdX99PiC2IRNnFs8fnDq2T/7BfuhDO/Jb9V2xdr1No3W/m7/V3tcCez4Fa6mezEYJjZYUX3FXom/Lqv0DPlUt9suPWGlxWm5k8ksr80PGmYi4kN+G1ZsQ3KDxb2snfsEcbJ00gzvaVjDIz1kKOBEkVKFMnRUFPoVoZRbelyrT2y1cRBpaiodDPbfbuVcRmNQwyUehgnz8CYxLleoVcZ16nEjwlYu7de8Z6l3gmBqQ7qmmyvtF1bLOo5ZGwurcNzMtkx271g0xk0VhuUTnXPTKgUHCP9sOXa+QmrGvXuj5zpK0uWgpVtofctz5apJgwWSmx3rIGBY+Vxya6ImMpyOBcibSEmk6YKu970zmSv60wnHLITDJPlwV13Te+VOHNB5enFteJV63U92VOQza+sfy5ilyfDJ+H4i/J3pbCsPOXXhOzYQBKOtYSoLZvNit/H70oHGdmrb6oJy1p7RyvFZkNzis/DHyu3iFaLazXhm91/WrkXtZrAzdLSKf4X26r72bExLUvu2Chhx2ISqnbdsOtjA5AfLL/++F3yGRr02glYtQ1O5wEiGzvCWaLU9s1CnPArK2qzVtlaJxSbgK10U+3E4HoOK5qMXYU+AmHi71r3WdgW7kyX92b9OVAamTRMqE88V3MDtUX9VH6aBTBHAwOlHoqM01s6xucHP01LaGOcPDkaGCdPM23kGZkQsQAlilVFcJasyN3atJ3RvIjDb45+kb5SNyVKE25NAA+M9dBX6j5LsJmoOFjYy8BYDwOlHg4W9p4VXi0RNpnVOeu//T4WR7ggtwVgQsTXopqIzKarmuU5G/eW0MZIHKS31DWplbmaAMtOOHSEzrNE+Kmx4xwt7KM111EWt2x+W3wsvJbQxlAcYO/YIxMCfpx81TyuJuKmEt9TWatrTRjUM1lRyzpcmf8zsbRPZ+JlzpjOoLGadWgmr+awQfDOD4gwqGcgbGKh58D0lk9PNuiH2e2PnOkrS2rdV02sTSVQZnoo0XzuW55roVVNdGet9N374JG/TL9PlldTCbBsfhbH4dO3i7/bb5YwQT471qS4AbSthmN75Ld6830meTzXk0nTqT8LZSmdybLi7ATDZO7Morljx9QicDrvgb3rLnj0Ucjn69v/mhXT2XBqva4newqyMYPTkF3ELiFGTgEN5WIPRGAV8/CHV8LpIyIYG1vkt7VXihC97+dEsB1+G
Jrak9Azcfbwx8RdxwYRXV+4M4k2E7VZYfnwx8qX7dre0eO7oefZZPm87+dSPBua5d7Th+W/xdPiT07u+/L/gOKY3NPSKZ9ZsTnULZZQEH/uvhWOPgbjei7Swx+Tz/4D8JHV5eKzMpz8GbkWAjS2yfVCxflKJoCzFPMSf8tfS4+Vjf0+EaYyVBBx2j2ULv7J7jw9I5HWhiR07e98MU5YZu0U40pMEOeLcP3GHHt7SuztKZ0lukxw3Vfo4e6xowxTYkuQivJMaZBTeVFMds84kR8afHJCgFaKu8qlvrUwi+p9hR72jw1PaUmtZeU1f7JW5KzbrKjMit5TscAzpcGzxHqlwD4VCxyLYxPuLsg10VtKe3NP5QsMxPGqgnw0DjEUTwORRponBAogf+v8gonWPKNl3w0Toq25DvpK3fSWumgkTVi8lN894fdTY7voCKsYJ0/X+P4Jf86U+sgzQu+Y3CuW3hK9pWP0jnWxP/8kw/EMJUoTQsSswc+NPco4+bLfKkWYUWlZ7h3rmhB3lo6Hh+8t+w4q9goiZPeOPUIkUmS8zPKc9dv2AveWuoA4EY5NGmTj1hLaKFGcKAu5TwRzb/FY2QQAJIH/ucG7yNHANS03leXxUBygNXRkamcgEuktHSNXkrJrpo3e0jHOjPVRojhx70v53bSGjglxNhZHqk5OZAUzUGaxBsrEd0toYyyOlE0CVBPnM7F+Vx4yZnGB6pMU2TArw7J4Zf0tUeTzg5+mM7eu6mSKua0UxlOK4mp7ZacavNugsXsflMbPHsjW2n9bzd/JqDYIHjiWfu85AIVR+Nf/CZe/MQmJe39ZxEFxXB4gu++deuCcTVMxD83tIjQ61iSrTX4Yup5Ny5QtDpZ2i7Ndr0c81xqAm1+18rgyf2otp57snmw+wtwvMa/3MKOZLgufrO4Wx1PaQURrfhg6t0hZFkYlnIFjSUhW+lcpOLuehWN7U5l88eOw/xGpL+ZvfiSJ05F+jWiUexoaoaFiAj273HiqfK63TVXL+8rrUL2OTiUIp1t/TNBnw6xHZNYzgdDzcnkcrO4c21M7TlYubavLr09mcTVBOJUInM57YHfuFAFbLMrfXV2yfDi7DzafFz/uuAP+7d9gQAfStl92AV7X4yJ2CZEfgnyuXJiCCMPGNji1D0JOvpvF0RgfEUtjUBGcvReS0BvuTWKumE9/Z4UlQN9L8E93yLLbhuZy4VpUbZMVfyZG7VplPEODCMmIhDM2INdaVsry3+O7RfAO94qF1MS2pSvLULekp1Q4W3xCeTiW/tF+idv4iIj8avdNlMNgEqyGifuJ9DWk/DUxCxBjEqcmOA+ejhQjrGqGUdUzhWRQm7DIAhwbjBRK6Z2yd1zTNGHFXaXhDORh3fYB7h7rYZgSnTQywDh3jx1lT3GQx8YHGFZB0ROT2OqPBf5mrIsBxmkmME4kT5wQd3vHBifE3f7SME3kGKHIOJGPjb5Mb6nAC6UhAH5o8ElubOycEMM/PPiU5H3GegtM/P6ewd00Enhvy9YJK+8QRe4eO1qzHO4r9PBnY0fIE+mkkWdKg+wdG+Rf8ic4EfM0Ergg18SecTGVN5GbEOv/Od7PntIgtnD7glwTL4wPlblrKEERJvJvPOa5MtfBzU1r2FXoo0CJvaVB7iv0sCp0MBTLK02lmC0n1rhaAiLd4wcnRKn5cfbhTnFi6WzWAptEcZwQgiWKE/EZjP1aDpG9Y4+okC5N3GtxeGbsq0T9l4sNE3HJiufyNIq4C6Wc+kGZ2LPvfWPd5GiYsEqjcckusc5arcdjgY6wKpNnGg45IhEiE25LsZiZGAg00iR+kOfY+IviHjJiPU0glCiyd+yRsmsW50AgkMtMChTPmpBIAjZMxLWvlNIKTAj6rAXXwhuLIxN+W74+PfYQRwv76C+dpMg4uVhuyX9qLHV+1ayqZv0ejUNnWb8rrevZyQyrF+PkaaBpotyeGfsqORrY2Hhx2YTA0cI+RuJgWbz3jj3Cx
saL6R4/OJEvJYoTEx7Hxl+amIgw0Vwp6MXtMY4W9vGa0jCtufazl9Ht25XE3iN/KYmrHLzbgNCwgWxDowzmTQyYu6yfu/5Uvnesg4OPJSFhA32zUGWptG5BsmyZsOk7In8DHHlSBIbNfh7bIw/Jpla5NtIvguPgY8m/T9+e0tixRvwbGYCcPnzywyJ0NmxPgirk5CE0IVIGUhx2/Sm0rpRrbZ3yuetPRfQM9dQWICaumttF4GTzxcRSLKU8NiF18DF5uMeY8mffrrSculIIGtVETdZibeU31Aenj0seFkYh1yiW8JH+JEys/tSytk11mFEtoZVdFp4VJVkxm807gAf/BJ78VzhzorwemHg0dyP9Kc+MkX7425+G/qNM9JOP/GWqKyZGYwlefEjvj1IGJXVz+jis2pTqzkh/5n4dEFq8inn5rbld3O7+Z3FTWT+yVGunlsdwdlutlfd2vW211Kevfrp8UiYr9uHs9m9/d6yTeg0Sr65ny63QWbfW59QSmdmJBxOstSzWQ30w2CPu8jLmmGgn22+GR/4qlcGjO6tPBFldsDjuvX96FtfJmI6wXLEiCdPNm+VAJ8P2xQ4MiIDN6zihra38HkgCfHAQ3vrW6mFVWnLrTFuIlY3FmREhhNuB9wOvAhqA54C/AD4VYyxNdq+xNbw2vo9vlInD0CCiLiuest8rrYgmJrPistJd9lp+8Gz/Kq9l75nMT6NaPE302bWsO7PgNmaE+WTpqxbPrH/Vwpkq3pPl0VTp+Y//f5jebSVO56EhQDFCawM0T1hehVXNTIjRoQITwjYrUIcKUJJxO4Fy4Zr186q3HuGCTaPkiTTAxLC8mUCeSDs5VocmjkWZmWgnxzCliU9jS2ihJ+bJZ0SX+ZFD4hEz9wPkgFLGnX1m/WvMiGRzb34DtNFAgRLDlGgmUFT5cU1uBYdLo5xhnDYaGKFIqSL8LNnr1dJociMbx2weDVMqS08jgSZyDDA+cb2dHHeEfcR4EkjiNSvwql3LXq/83ogUbO37Jebl35m4Nvm9lLmpdj17f3k8Q9Uwpopjpduz/QQbfJXnSWU6p87fFFaYsGLXDjt9z96Xzcta907uR3kYJngrLe/V4pJ1nyjPh+w9HaGTkTg44f/ahs1sabx8QpC3hRUTv1t4JlYNE7pZIZ2dDEg9TpyITyDoZEWgmVbyjGau1Y63Wc4r8zIbp0p3ALf983/SPlYSoTQyIAPoi29Mwq5B59zbOmGwV763rU4DeBuADurAtbkNigV1M5AGlCEng/nBnvJlNDZ4t98HjkHnZgmrWJB4NTRKnF58SARDQ3MScc3tOogvSD6afxNZlUvubJDaLKs+JkRifpiy9hJyMitbKib/svdn423Y71l3lWGb2M3G04QtQfLuph+TQXZW9FbmkVl/Qw6a2qAwcnYajbZOsQJaHtqEgbltaC4Xc9nyz6ajScvV8jnXIPkD5X/b/fsfEbdbroHxPJx8KcXRJgi235yE100/JiL8xYe0GErleXrTjyUBY4cgZUVXyMHNPyluju05Oy+q1YNsuVW6sfqRpVr+Zt1W3l8YLf9ey12taxN1O5/yAGQSJASxQDQ0wegZmUxoaJTy7libBNpEuakfMcokxxXfkqzFDc3iT1tnap9Wpm2dcr+lu3IpnJVn5X0NzWe3ycp7cw0Sn84tMHwqk7cBVm8tn3go5lO/UBpPfUW1+jpVmWfjUxqXCZlqed65Be6SicAyi6v93dUlgtFE4I4d5ZbaSrH6wAMiFO+8U77/9m+L20p/d+8WQdrZmU4l7tRBs10fGICGhiR47VpnZxLMV11VLoBvuSWFY1h4Fv8HHiCE8HiM8bVMglti54AQwh8DHwBGgS8hi8luA/4IuC2E8J56hGzIqhGlecXkVsP5ptpS27nGRKx9ngtUCtipyBfFOtuUk3tN5A5kxyYVfjaqGGvJiLZGFWLDlCBWbLisQn8sTIg7E21ZP0zIQhKJVpEtfPvM+pcVxSYEg
Uw8S1UF5zOlQZoJlIAhilVFauXfWSqvt+l3E9WWvsr4WXokTuXxHqZEfyyQmac5h8gKl/mlumBsmsSaPffhzVcY1irKLdC142ICsp44jsWRCeFYokhP8Qinil0TQrRSMGb9C9nl0WqRz2lbLBfSZ09IxMz94i5O1JTJJjmy1anaZMDEfRPuJOxC+wo4c1yFm26tsAFhyCVROtibrEZmgYMkJGxgWswIokpxMNKfLFQTIqoxCVITxiMDaQBsYR58TNxA+QDa4hmCDIgbGsvFZVPr2YPbqsQkMmMJtLzO8s/SUnk9G59qYVcKWbt/9MxEWZAfLrf0FQspDpV5ZP7lh84W6FlRZAKmmE+TACGX3NtvE/4NJ3eQBvwWjqW5sSX5YX+bH9n7zfpt3/PDkr7Tx2XZZ9aKWBxP7gjlfn710ykvvvhxEbwgdc/83/WnItqqMVk9qCZOlwIFWY1CQ3OqGw1NktaYyZ/sMHewFwa6Up5kyy1bL7LWYqsD1t4gWaVNwFoe2WQKpGvZ/sDqQjEzcdnQCCvWpSX/dp+57T9Sfp0o10JFGzzdndI60n92favW/uDssjX/SuMp3tbmIOVLtdUgWbq75X+1ZcJm4ax8Hc7HPla+/LeSri6xnGYtq7UoFs++duYMPPQQrF4tYddD9pCnOnERO0tCCO9GBOxx4OYY4z69vhH4CvAu4GeATyxaJJ0lT3ODLjUuyd8dui1lOkJ4ptQShAvpXw5ozQjO/AIJK8dZDjRUiP2YEc1Zi2ulYGzIWKrTSdn1t83KcOeCuiYXsgM6GxCOnkkDRLPYxVJGtFbxczIxVy/NbeWD7oWg1qB3vsgKThNf9n0+wrF8rFUeWXfZSYZqFufJqGXNrPSzmsCasJ61ld+brW8HH2NiNmakv7zczPJ/LjAxkdBYnkdwdv6UMhbbrGXdqFZu1epBLWvzVJMzpfGz3dvE0kzbU2X8avUDM+0frB+zfKvWl02FWT1NtO7enQ5YMhHa1QVDQ+XC0iynv/u7cOQIEwc5dXeLOG1uhpGMdWlwUKyuU1HSvKgmgO3AqFtvFYuxXWtuhpYW6O2tvey4Ahexs+cX9fODJmABYozdIYT3Aw8AHwoh/GG9y4odx3EcpxbzIS6XFNmBcD0sVQuWs7yotx6dz/VtuaZ9uca7ksq9o5XY3lkQYXriRLKmNjSIuKwmLE+cYOIgpzvvTHtcK6lmda1GdolxJeb3gw+KeAURyiMjIqqLRfjiF+sKZo6n2c4vQgjbgBuAPPAPlb/HGB8EjgKbgDcsbOwcx3Ecx3Ecx1m23P+ECNeurnRq8MCALAnOkl06HALs3y/umpuTm4aG6mEMDoo7s8zWErFzQXOzCNwrrywXrzPALbGz43r93BNjrLWb8zFgq7p9eEFi5TiO4ziO4zjO8mbXnmTRtCW/IEukbWnus8+KBdXcTbWPtRJbOrwQNDdPP341cBE7O/TMdg5O4uZQhVvHcRzHcRzHcZypMQtplhMn0n7V3l4RotXcncP4K3ZmQQjhl4APA38TY7yjhpsPA78E3BVj/Mkqv78P0HOwuQp4fp6i6ziO4ziO4ziOs9S5OMa4fjIHboldZGKMdwFVdmc7juM4juM4juM4lfjBTrNjUD87JnFjZ1Gfmee4OI7jOI7jOI7jnPO4iJ0dL+vnxZO4ubDCreM4juM4juM4jjNDXMTOjif089oQQlsNNzdWuHUcx3Ecx3Ecx3FmiIvYWRBjPAx8E2gGvr/y9xDCLcA24DjwyMLGznEcx3Ecx3Ec59zDRezs+S39/GgI4Qq7GELYAHxSv34kxlha8Jg5juM4juM4juOcY/grduaAEMIngfcDo8AXgQJwG7AK+GfgPTHG4qJF0HEcx3Ecx3Ec5xzBRewcEUK4Hfgp4DqgAXgO+HPgU26FdRzHcRzHcRzHmRtcxC4RQgirgDMxxhhCuBoYAG4F9gNPA98BHI4xPhpC2AJsUTcfAB5Qd2PIEvFjyCt93gg8BawHh
oAfAF6DWIrvBb6MnKy8BtgNXAGcAq7V//vUzQ8AB4EjMcZ9Gt/Vel+fJqEEvBJ4H/CfGt44cH+McUDdrwACEPXegzHG0yGES4C8Xi/GGE+EEALwo5qmazX8ryOvM9qr4eU0nZ3I5EFB/78AXA60Ac8ikwqnNG9XA68HetXvfv0/iFjPh4HHgHcCh4FHgbcAJ4E9McaCxg0Nt6j+twBDMcbBEMIFwIj68TJwBGhHLPU3AieAA3r9jfpbJ/AfmkcbNO/2aHm9oGkZ0u9oXn8BWKdxt3c+F5C6cLHG6zhwWv37EaBV4/Yi8Ga9vx04CvyJ/nZhjHG/1rMh4MfUz26N3zeBbwNeATyOnMD9tP7/Ab2G5vGQllOr/taD1KnjwGqkHu9XNzfqvZcg9SMPvEnL8PPA92je3Qh06f//C6kXj2p+fouWxXOaJ5cDLwEXAY/HGI+GEK5F6q3l8wGk7l0EbM34+WXNl61adgAnY4zDWgferPE+hdTP1ZreVwPPxhi7ALQ9HwWi1o9VpHr788i++WGkPX9c8+Qg0ITUL7QsR5H2M6Rpuwk5MO5yTUcJqbOXaFn1A7fobw9E7exDCFtijMdCCGvUzSuRVSM5zatOzdu3IO3tBeCpGONQCGGFpuFapO70xhifCSFcifQ7DRrHZqBP20sT0BJjHNTwLf1Wt5s1Di9ovXi1pvllTX+bfrYgdTlqfr0aGNfwA7AS2K7+FDPl1Kn+XaJ+HVH/htXdmUybtjiNar69BnhB3TRpXq/X+D+PtN8R4MUY4wlN3zrges3DC5C+45j614z0VYeROtiP1Ou9McZ9Go+b1e2TpDb/AvAO4DLgE1r2XwRuADYBn0Pqn5Wn1Z9TWrb/BenvntX8eEnz8xDwWs33l5B2WtRwhzQul6s/BzT/x4Bv1fJ5WMvhKs2PfVpPtsQYj2l+NGmeDSN1owW4VOP+PPBMpq1s1rh0ann2Iv3/8/qs2IL0d5Y2NI4tWiZfAzYjfXwEnkH6nOv0916knX0X8A1NY3uM8UVtD3mN/9X6Wy/w3cBDwECMcUjDRN03I/3IQaRvbNfwShpeu/p3oZZNY6aMTiH17A16f9DPM5qn40hdvVzTd1rr+jagO/MsejX6rIsx9ocQOoH3aPyPAP0xxmc0ztdqPLdpeT+p+bxS43KZlskZ4F2aV4/r5zVaLoeQdl5E+tv9wHci/cd6pD3sAdYi9ekg8I+avlcjfcmZEMJWddOM1O8b1I+TSH8ZNIw3AK/S+B5S/5/X+xo0zF5SW16tabRXIBYz7bdF8wUybVvzZitSvweR+n1CwytputdpHhTV/1NIHTuCjGV6tGya1P24/l8NDFoZaFiWF2tijF8JIVwdY3xO61S75ksH0of0a/4GpJ2PI/1cG9KvoHlmfdOpGONurcMva9laXC5Bnq2bNQ+ftOdClhDC25DxyA9pmezR/BjR+Lweef49hvQ/r9C8O4T0Ey9ouDnS+Mv62c36vwXpxw5rfncAf488y/NIWxnX+8eBt2ve92kZPYX0De8h9dmPkPqOxzVP+pExypuROnSv5tM7kXr+jzHGsUyZvEbjsAHpI16t6XoVUtb7NF0r9Dl6AfJ8s7KwdrYXqS/Dmu99SFu5nvSss7Hrce3/VyN161JkTD6u+dSu/y9H+rRbNX7DGp9vatvv0P6mifSsKWjePq/RG9ay6NRyuAipR0HT+CAy9v4cUvc3af6N6DVrR5tJ4+cx9buR1FZWI+3rsOZRI3AiMw4vapgB0R6Hdey/JZuXU+EidpEJIXwKGSS3Aa9DGmwDUnnySIUZQTqj65AK04ZUUBvQdiIdiHWcA/obGXcjyGDmeWTAkVN3q5COcrO6j0jl6lH/WjUezRqn/Ugja0UqcpuG3U4SojaY3YBU7lbS4LZD3ZzSOI2qfxeQOrxBpCPcjHS4/eqmpP5F0vt3ezQdTcBGjVOeJC6LGuYA0pg71e2Yfp5AGmnU8HMafknv69d7TpDe9btN82Ofx
rtN3YaMG3uIdiAPmCa91qP32EB/lCRWTOAX1M8C8nBuQDrlJr3WQKoXKzUtlg+bNA9aM3GH9FAoaFiNGf+aSYOrvPqRR+pSIZMe8yNPqjOtGTfPIg/XBnVjghkN42WkwxxB6ktB09ui1xr173H9u0HjZPeXNL0nkQ7SruU1n6wDtfw5ob9bfS1pOEW9vw8pry16rZ00cLVBt/k5qL+XkIfjSg3nCr3Wr7+3kjrmPqT8cyQRNqxxa9C4nFC/ItIW7YHdo9dNxJmItXo/lInzCU1Pm+bFYaTdtGfydZXGZ1R/M0HRShLquUyeFvW/tf2o965D6tlK/Q3138qxgJTzFZnfixrOmF6z+t2r/lm76STV5Qs0rVHzY4zUpttJg5nTWn7Wblv0P5k4/xWwAxmcrCDV/32kfqVNy2UY6bcG1c8hddOgfjZrno6r+yHN+0Yth26kf7V2ckr9XqnXhtWfJlL/Pqa/d2fy2uqSxWMUGUhdTKrzG9U/6wPy6keH5utpddusf6/P5HXQdGXTY20yp3E4qWUSNe6W72c0TTbAPKbpvEbjbPVjlPQsi5n8tXIdUnelini1IAO8DeqfTdCMkfrVIY1D0E/zt0H9PqP5sVn/HiI9syw+7cjz8EpSv2ris0HLdRypZ53IwNbOvVhNaiuWTkh17wSpT7qE1M/2Z9I8pO5Xab4+p/5sybhB42plj6Y5G09rx2Oab5s1r0uZ9B5FBs4bkYGw1b+s0OrT+Fo/fIZU9y7Q71HDtPGG9YPD+v8F5BmQ0/Rcrmlbq9de1PRl222/hn9I3Y8gZWzPn5Uan0b1y573qyjvax9FJgdsAtTatfW7fVoWJ0ltyPqfIkkwWD/TpGGt1nSPkcYnVt6r9Po4Up5R/2/QdFh7DuqmX9N5jfqzldSebJLSxgN9mhbrx2yM0kp5ewqZa9bv2JivRJrwX6FxHcuEtVrDX6/pGVW3lnfDmTR3ksYo1vZtLLdO49xUkQ6Lhz0/m0njApsMuYw07rE+tUPTYWO4Hr3nsky4MVMWrRpuO9IGNpLGDx0a32OaB22aB/ZsH0Lq42pkXLONNK4Z13A71f/WTL536Kc9r7oQsWf1aFTjtyLj1vqmC9Rd0HSu1Li2ksYLgdQ2GjLXQfqXFfq/SeO/R3+7kTTRfYY0HmzS+Ayr+8tJEwVBy+NC/dvCMo1gQjGQ6oKNLWy8Fygfxx3STxuLjOpvo0g9sD6oRfNui5ZLAP4Mmez6C6bAD3ZafE4hsy6bSZXgINIwxpABYQ4ZGB1BCt86/gJJZP4DUjl6EYvYSqSy5dWfY8iDbBMyyN2r4Qdk9nqcZJ0wi+6o/n9Jw2vSe21WxTplG9DZYHMz0vBs8GoDlH7SwK5IGny1IZ3ms0gHNI50JOOkmfMmUifdqnE/onlwGdIJ2wC5R8O0tJxBGr1NEBQ0Xac1bBtQ9SAPDhMvh0gWzh7kAbgx4/cGpPO2B1gRmdm22dsGDb8lE/+1mXw2sVbQ+008dOv3BqQDsAH/CGlSICKdVreG14E8GP8zE2d7eHyTNAA4ofk0QhroRb3+iF5/QvPcBmhdSP05oWX3MlKPTDh062+XapwtLwNSX8wCv0V/fxmpxy1InWjQ+PXopz08xzJlZHUy6v02YBhQP4YzZXFCv5vAMIuG1fFDmu4eZBDRpffY4Hi9pv0Lmfz634h4HURmnS8i1dEhkgAY1e8vAZ/VND6j/owjdfxRzccSUr6rSJMvVg9s0qJRv48j7fckMvDu0Ov24F9BqoNXkurdikw+231B/16lv6/ScJr1/n7K65oN9m3lglmlT2Xybr+m+Rt675Dm32GN5wvqplk/m0iDi1H9u0H9G9C/XyRhD+VWjctpvW4iC6RtrVA/T2s8AnA70keYleclpO6f1ntWIXXMhOwppL2Z5XQAaVcjGr+9pAk3EyYvaxzX6qfVpw6kPx7U/Duk9/YgddQs+SaA7VVtA3rPMaT8BpD204rUq3akr+rV8B/VtG4jr
RCxvGwmTYjYZIBNyHRruo5n0lEg9YFmxbGVAnuQemp9rT1Trtb8+LrGPwf8KkkYniBZwKw+t2mYT2h+2ID9BPKMMZF4XPPNROsZdbtW42gD2T5SG9mv7vuQ51u2HZkQ70Xav1mXniUNhMdJljzrvzYgdcUEyqiW3TdI4u5Rjf9azRNrW12ZfLFBd3sm3kUtu4s1XOu7h5E2tFrdnMyU2WH1v1n9z4oHE2lmYW9EVlZcQlrNYBO8JqJXahnZs9z+t2V+tzrfmcnPLsQiV1L/1yICdgdp1YJNcq0mTRCgZWDp7UDq59dIE+RoPtnk+ABJsFtf26Vh36xhW3/SitSbAkm4jgB/SKrbJaRO28TWSXXfp/lnFr4XSJYpE3GtGqdnkXaTfV4fJLWrg6TnGchYLmo+HtWwxkgipEASwNaOIYnxk5oPNsEyoPnYh7Sjr5PEp02SWd9mE6stJPE+qp9vRyb2IjKOsWdov7o/ShoH2gSnTYZZGvKkvmNY455t67YqbJhkUHhB03dS/XgcqTM9em0caTdmbOgH/kDdNJL6tNFM/vST+hv0/guQepcjjXusTlg/vJo0KTFCEtcF0hilH/hXzfP/QOonpImlIZKxICJtwdpyEWlvw6Q+18baf0ISemagakPq11H16zAynjDDifVjecTC20kaE4xrvCweNqZbj9TJfr3/uF5fq27GSEafFSSdmNfrQyShewwpU5sEfIw0lrAJYhOuJ9SffpIYP6m/rUSe9z1If3UJsvp0StwSu8iEEH4WKUBbavk0aXC8E6mMzyGD5mHE3P9dyFKD9YjwOIJYLn8b6UQPIVbbg0hleg1SUR5Clh6vQypSs/7/e3VvFoMbkQr158hyjZuRgeMa4H6N258hYvVfkIfVUWT55ZeR5aePa5pakOVAp5BO5CCyDO5LSMd3M9JxXKDpeAKZDXocWQrZi3TKb0dmjjYhjeY20qDhCWRZyl7kAf9vwI8jD5kLkUbxKNLobyBZzvqRAT9I4/t/gI8gHcV/A35H8/x3gL9V9xdr+k8iywD/XcvEOhGzjD2BPKxe1nseQTqM6zU+fcis8QkNa1jzvBHpKC4nPei+hHQY20kD+T6N688AH0WW91yOLJW5CelAnkGWEX4cWa7zHmRZ7hP6298hS3IgPUBuRcp1rZbZF/R/Hqkjo8jSsNvU3RhpYN+GdKK9wPdqPu5Clh4fQQbhNsiMpJnAp5ClPSeRur5d/bsMqX+vUL8sbx4jiaBLkTpog+AOzffDpFncb0Xq7xOI0N8LvFvz+0ktg+vVr0GNA3r9dUh7eTvy8PlN9eOzwE8jbfXzmua3IZ25LbU5htSvceB/aT58GanXh5A2fAgZXP4eMhi+Tj+/QmpX65Hy/gpSf/8FuEOv79R8f7PG8xRpKXwT0p5XIBYkE/cPIQ+utXrPHi3Xq7W8Ltf09Orfq4EPavq2IfX4R5G6+VWk/m/Vctupfx/WNDUhy1WvRuqRWQXeo/G/SPPqi0gd+1Wk7nwcqTOrkPq+Gxlg/bHm8Ss13f+KLPe6CJlg6gTuQR60l2tefp/GuaTxaSOt8BjW6y8B70X60ddpmNeRlqWdRtqyTWD1If3SmzXeu7WsjiDt/bsQ4XurpsEm5FB/f13z4VuQPnAv8Blk6eotGv4OzbMe4Fe0rH4SKf8WpM+8AhFV/6T5v1v9fAdpNcJ/kCZKrd59n4b5eUTkv6BpHwf+Blkm/Fnk2WEDyTchfU2D5tUZ5HT+70P6n6PA7yODj59E6sj3aP6+AinvXydZxC/TeD2G9I9v0Ly5V8tgBOmnViHPrcs0nE1aDruR/uA2kvD+AlIXvgR8u5bPV5C6+1WSwLT03ojUpT1aHq9A+pkrgb9Wt69H6tkhzQN7Vl2i5fmMfn+D/r9N8/WfkOfdG/X/m4D7NB0PIe3kNFL3ntM8+W699h6Now2iP6HpeQBpg6uRNrZW43EJ0p/dpeX5ZmQFwlqkrt6nfgwi9WiVxm8PUl9Wk
NrzryLt5kmNxzDSd28nWXxakUMrX0XaHnQcefZ+h6bvB5EB+B9pmv8LIgSf1vz9QaTOQ1oFtUHz49tIE7pXa97fh9SLL5K2flxDskr+i8bnAqSOXqt58nmkL74YeFeM8X+GEH4GqdP7NN7vQOrBryDtb7umoQlpn7aS5Wua95uR+vdqvX6PuvkVLa/Paj4f0XT9JKk/2YOMuUyAX6hlsF5/fwNp69Qw0jbej2w9uULz5mLNqwJS55/RtJzRcr0Z6ZfWIM+2a5G6fz/Sfi7XvFqPtIF8Jo+e0fBfp587SYaL59XNa5C2dz9SJ1Zq/l2P9Dct6m+j5tH/p/nxBPLctAnSw0h7GUD6zINa1m9FnpU5pN/9FFKHjpEmxtZpOgaQ9jWO9CcDmu9/jyyJ79B03IYslf26xvk1pOfzZg3rrZrnv4a014eQPnwr0ufYWOQjyPj2Ob02hPSnNsbciPSNt6n/mzQd34X0iY9quW1E2lqHxu19pKX+f6G/r0GeAbdomf8WUgf/K0mIDiHjqOe03I5o/m7Q+B1C2tJJpK/4uvrxeqSu70LGrjbRsRGp6zdovq9CDGWv1bzo0PAu0/jZc/3rSF1+AqkztpJoh8ahT/PhzaTJ5DUI/4aMsw7qEvu3xRjvZwpcxC4yIYRfRCpdL7JnsQEZiLaSlkjlSLMqNpN8EmkYZqUKpCXAttTAZlhthtPcrUAGHKv1WjtpeanNcJ5Q/1dk3JgF+KtIxdsD/DBpKWQBGRhsUXfrNH6H9doR0vKnDerfaaQiWzirSdYZs2Su0HQ8hXS6p5EO0wSHPbjypJmdi0gDgGbSDOEoaSneCNIJf7fGoRtp0ObePm1WyyyGqD8jGt8BzeNDGv71pBnj0xp/m2leoffb0g5bJmKz3rZMJZAeKsf0t05Np820HdZrLaQl0/3IQ9EsejYj30Fa2mSzyWNIndlGmmVrQQY+kGY3G0kP1DxpaWyDurNla/2aJrMyrCYtT7JZxTF1Y9aMVZpH9vB6LeX11ixw3Ujn20NaYttOWmlgS5XGSDPq40g5W/lZ/V2p/tkM/wq9brP4jaQlVcOk1QQ2m96gbm1mtJM0MLOJILM+2YD7qJbdCS2/bSSLwjBp+c0m9buXZFU5Rdp/0oXU92F136x58hrSsv4cUp/NcmIW7I8i77Pu1HLrJA0IXiYNys1CaNaWLmRg8gAyILXlS2OkWeVVJAvWQdI+3TzJEm79yTipv4oka18z8vC7VuPfTqpLx5B6PUayQn9Tw70kExdr35bmHHA30h/cpuHaKhILA9LytX7SgL5P4zaCtI2LSPX5DGlJVE79tdlrC2ONfu7SPGon7fs7gfSPDaQJh68hfZGt2rA6vJk0MXadpv15ZODWRGrXRf1uVpkuZABj+Vgi7aNbR1ppsYJkJRgmLTfv0jTb86RIWsrXQXqGDCLtrJ3yfnE/aRmprUY5SbKGjJHqUptes5U0vwb8Aml7QiQttRskLYPcpW6snTVr+C2aPvP7UaTdrCbVtbFMnoyS+p0iInzMcmMTisY40l7aSdt8zF+zxjRrHoPUiyeRycWAtOcO0p5esxpa+ZlVqBMRFK8iDfbypC0HVubmRwlpZ53qtqjf92v6Xk1axWPP22zf2UtafhtJ5fkoMnC1lR22oqdX4zaq+fBK0vL500g9u1bdr1H31ibHkDIza7eVgbUpe3aPk/axniH1V1lrYinjpp1kYbQJqlWkCSwQYTuCCJ5IqntmkbqA1Cas384jZTyi/hwkrRZYrfeuIS2NXJGJ7wWaHzYZYBYxy8uAtNegcbHnjVm6LlJ3hUz+9WXK8QBpj6xZ8Fq1LGz8g7rdo2VWIi0DbUQEk01m5BCBYX1VK+m536Z/Wz9o44IVpDFTj+Z5P9K3Xah+rCY9T+w5tV/jfpIkJK3/WqfpWUlqE9af5EjL2G0cuVavHUTawBFEZFl9sH66kPlu4zkbT60i1a0cafWKjaEL+ndPJu3PadqGNbw20tjrm
MbF4nYSWQlwMyKSbQwD0r9foXFA82+Npq1J3Vp77Sdty7B+eZ/mdSOpDtgzzMadZl3tJW1xsfHnHqQft/Ff0LK6RNPbQnoWbiI9d8Y1PkHj2oWMgbtJhqKSfl+vcVir/vVpvn0F2ffdqXF/Icb4JuogV48jZ165F6nEbyQtWzuAdIAF/ezSzxFk4BqQBt9L2ssxhgyMi0hHuQaZjbblAZ9EGsnLyOzSPerXCtJygENIR7EGebgMkixL3fr7MWRG+Hqk0jUhHX0X0jj7NF4v6f+jpCUPNoCxzuc40gHYUpcO0lLEcY2niZY/Rh6IlyAPy1bSrHpB02WzQe16bY/+j8BvIAO/dUhDOaP3vp0k0q4lLZVD496g+XOaNBi3B9Ix0nKbbmSw8wrN83Wkh+wKDfMFTcsxzdvH1c8XSEtwrPOLSIcyqHmzjTRQttnJC5GOZIjUydgDc5S0v2cDUge6SUvujmqZWNn8kebpAf3tLvWzCekcTUyaKO8g7Vc1Ad6q8bTBz5jGs580OWLLpOzBfVrDWUuaGR7TOJWQOnBYw/96pqyKpCVVVt7Pa3ijSH20znwQeVieQsSmCVgbIBVJB1C9SLLQ5fXePtIDaD0yc2ptc73mx5tJ+2hN/L6sfhzQMmhD6o+1rzHE4tWlnx3Ig+UbpOVpDyIPDBDLyuWkpXGbkDryGtJemyZN392a3sdIWwl+mLQUrANpP11IPbyUVBeD5kEPaW/ue5E6Yvc1kCyMLeruedIM+3EN57SmdT3SBr6macmRHq4vkw6vuJLUXvpID7kB0hLiXVpeV2l8bBlZv8bBBsD96u+PIALWJnSGNA1FjVsPaRnbqMb9Zc2b/YjFpRupf0c0LFtqFklnENjkm01k5NX9DUif+UaS1QhN72mkr7gImRXPITP0gTTJ0azpeQtS3ocQC1Qzqa2bgB5Cynob8ozoQOrTAFIPxpDnwFimHCMy8IO0JNX8MGv8ZtK+6yJpSamV2VG9t4ckNtuQet9CGvQWSIM8G5yWKN/7dxXS/1xD2odnz4yCltnLmqa3a55AmoR9kbRPvkXjalahoPc/o/c0kpZIF5HVA0eQQeZrSG2vl2SBzCPPocsQYWp1yMJfqX49gfSdT6v7UaRdmkV+P2mC0epvP1K+XXrNnicmHJ5B6l5e8wDSpEZExg/WN34KGQuYhWw16SyMk6QJxG7NX2v7z2s4pxExfyFS/jbZ14iI2utJA9JLSJMxX0Ta+6s0rQ+R9qs3kyaOTiNtzSaPRtT9V0lLb79GEmIrSVZWS/cBzasiMsaxfvm3NC6tpAldy9cWpF41k1brHCP1qUeQvvNBTUdR47yPtGfzStKSVJu4GiJNJDZpmkqarv2kJbY7SdtHDpC2hdmze0TD7tYysv5qUK/vJS2HtsnKsUw+d5MOn7L2M6h+bNO4HQA+rOE9h/QtXaR99KsRo0Ee6YO7NV59mTJrROp9F2lLhNXTopblDtL+yYOkiZMmpJ3u1fQ/g1hen9LfryAZPKKG8ZJes3HLiKYD0jjsuPqXJx2Y2KTxPE7ah24CvkPz5jhpld2/axzMunqIdGDoYf37z0kTKq9D+sjLkP7usObBatKWkkFNbytSN68mLcsd1bi/AekHT5K2ng2QhKaNGexve073afou0nyKSF1qRdquTWxaHR/PlOEAslqipP7ZRN6LSF1YQ9piRsadCdQRLbMV6nYTIs47kT5jO2ni7Er1/1AmDqNaTreSzgb5PLAhhPBr1IFbYpcAIYQfRZYYvJI0o3sBaQ/BZUjFHUYq4zDlB9hYRz2GDCbWkZamXkyaDQxIA21DZmW3qJsRUudrQtBm7k1EtZFOBQVpaGbtGEUa3BhSMa9Rf09pPMzicQBZ2rmaZGEpIIPD11O+p+c0aZbSBp+dJOuOWcECabbNRHAeaSzW0UI65MDSZtaYkyQhbIMvm23NWrHMklokWTFPkfYfrUIadiPysLuFNKPcjnQyo+qHidVG0jKlbk3PS
qThr0Y6mDXILH4ReSDYwNsGZPYwtFk3m0lvJNUFi3O7pmNE/e0mzZz2a5jPIA+QTaQH4ShpBs1m+LKzoVY3T2l45r+JyY2Un+I8QDqQwCycNqPXQbJotJIeHgPIQHRYy+wK0t5Jm2XsI7ULs7zb5EiJdACVPRQKJJFgA5XszPUJLZ/jJPF7KzKQuASpNzZ7fg/Sxm5WP7pIp0N/DmkT34VM9LwdGQQ2IQ+WW5EByB8gltKXtUx+DhlgbCUtjT1DmtCwQZ8JvlUazkqknd9AEtn/oP7/iKZzi5bFlcgAyx58NglmD9bVpP2qh/Q3s4R0IW3aBkY24/0UItisnY2Q9sf3aN62IoMSq8M5ZMCwUdPfBvxTjPFzIYS/QwTET2gae5GBpk2K7UKWV1v9Xo3UmUtIA/+1pEPFbNDXjfQzF2i6Hsyka5Om86eRbQbNpJPEX0nar72RNLBaSWqDJnbyyEB8g4YPaW/bukyenNbf9yP1egPpkKSrNbznScvC9yHtwWbVR0nWyD1IXzKmfrYjE3Q2mD2GTFp0kVZ99CH9DsiseI+m1ybB+jU/nkH6tqDl9ASyesL6F+uHmxHL/ZtJ7XRMw7lS49Gv19ZrGIE0MTmEDGrfTNrz3ajhN2ueWBsfrwj3FKkutJP2iq8jrWKx55hZZc26bBPDL2mcNpMG012kCRkTZS8iQraPJBDNCrgJqWPbkHbSpmk7TDqVfT3y/LI+r4C05+fUn82kfsmsTja5YRPOI/r7BkQAvok0qWX98BmkXewjbbuwlUEtSD/1GPJ8Hif1YRci9SUi7X+73jOg6TpMOhPDVi+YFexS0t7MizQvDiCW6zGk3R3Sz3bSM8DKwvr+C0jC09J1Rsv1Es2PLmRlxk3q7lnS6rBWktXzRfWvg3Ry+FrS5KMJoK0a9gvIZH0jad/6Zv1vFr2C5sPVGq9T6nevpmcd6Y0E25EJy5tIk0I28feclsEPa7zuQdrgatIY7lHSpNDXSGO4b9U8trIzi6OJjqDxWUE6dG03kIsxviWE8D9Ih/hYfO4gTRyMI/VsLUkAbSGtCjDLpln1YiaeVq42udOA1JMjSB16Qv26BhnrPIMsSbdJFrO0NiJtqkCafCDj/wotB7Pm2zjoIOlQqJUkI0meVP/PkCYzr1Z/zeJtfeuLyHjaBPgV6q5AEv8bSJZTMzqsJ60QgjTpdLGm29JwmiR4zQDSgrS/DaR+FKQ+bSEdjPQiaaVQUdPZpu4HSLrCymiUdHCYjbWOaZ7bxIdN5GctvvZMM0OEWZttVYJZ+HOkQ7XsOdZMOsjUJnDXa3pOUn7AaBfwxzHGv2UKXMQuAUII/x3ZszOGNOQSaTnHYdLDD6Sxvai/WcMxa2qO9PBbjXSC70eWivw0slfiKqSyH0Eq1reQKncLacnGYxqGdYg9SAWzeIwiD4obkU7oUtIAqkn93I50Ss8hs9qtGvY6DcesF6tIwqEfaVhdJCHdgHRC9t2E5VG9/lrSnrfV6scqUmdp10AaaK7iejW31dxVXhtXP1cgVu83IR3weqRRd2ta+5BO6AgiRoYoX5r8KNIhrkH2Gb0deeCtID007IFsotUa7jbSg3icJIYv1Xu+gQwEbfBe0Li+FRHFn9N4fB15sL4D2Vv0h4j1aDtSfm9EOqSXSce995NOtbxY42wPmeeRgbPNRK9FBiytlC+lW0OqX2jeZS1cLyEPnwKyV++DpFcWmVXKJgu2IA+sBtIhT2ZF3UKaBDqE1JlmkvXAJo5KGrcNpEkOa5P2ALHlU9mHQj4TD0jLkGyZXFvGHSSLqFmofhcRsDbzv4Y0eB4nTbrYvRaO+Wmzwi9pOr4GvCLGeFUI4SktmweQ/eo247xC/f8y0v/0kwajX0EGRmdIA4YzJOvbN0knmpsF22aIr9C4bCHVvycQqxKkB/ZzSP16QsNqIS072oiU819pnPeqP4NIHWslnYRo+VEknbhtD3ezlIySTpi0QY5NctnACs2XAZLV/K+Qf
ec9iPjKZdwVSBNS1iedoHyrxAXqz0Wav1H9tomoPtLhQtn61EN52+9E2urbSa9KsHo5or/3k/pAswQWEYtZv96ziXRCvU2A9GoYZ0hL7DpIKzeuJq3sCEjZv57UH9ngxvpnW7UxjJTjcdJka9TfNyMibitpsGQWWpsQ26BpOUr50tNWpAyPkJalNpOWtY0h4usppJ1/GXmWmejrQCalXoG0k1tJA0WQMreJJrNW2+C3Vd1Yesf073ZSe7X69BRijRzUMlqHCIfrNU/W6W8XkFZQdJNeY9OiabqSJNJ79e+1pFd4PEeylJ/SPD9Jmnx6CmmL9ixape4uIR3UeIS0RcKeWzaJ97eIoPl79LUwmgebNd+uRqyg36/58EfA/4v0Rc1I3be6YpOc+zX8bP15Tv2PJEtTkTQxaau0Tuq1bSTLqk1am2XfRMxxje8Z0imrReQ5djlp+4nVrVUa7klSu2whLYVs0HhuQ9rohaSVO1uR5/+16lc/6cDDvyPtBdyPvMatgXQA4h6N51ZNxxfUn61IPftbZH/nMWR/9nfHGL8thPAvGtYK0jYTEx32jG1AxmpvIFlun9FwGtSfnwc2xBh/Ucej48jrGwdJJ/raBP8g8kw+SppAulD9fhqp3zYxXCIdQrpO49dGWiH4JXVvk/IPkbaHrSEdumeCvUnz/iBSX59AxpbHSRPa1v7WaDhm/DCRdUjz9Vmk7vaQttg1I+XcpN8vQ55tl2lcb1L3DchzdIBkyLFJFpu8vgjZhnMT0haPaVoO6L39pAmsbB7ZeMGu2XNsBdJ+15NOYA6aB28iTTatJI3pXk9aUbaR9Codm7C2CVibdFlDGt+eVr/MOAFpNZ61qXHSdhdbWTNA2v72lMZhiHTSv/XDJZIwL2h+PY5MEj8P/HuM8deZAhexS4AQwhuRJQlrSLNgNyEFeh0iFJ9BlumMIZX6dUjlfDNpf9hlSAN7kiSctpL2xz6LLDG6GXlYrCI9iF6NVJx/QGbBViAD0FciAqYRqbR/Rnowbkc62k8is/PDGq8rkIfLpxEhdBGyaXuP+rdOw/5uRLRdiXREHyUdVrINach/jTwYvzPz3RrUC3rvRcgBETdrfPbotcNIo7Zr1Lg+02vPk/a4/A1yIMYJLaM7SMttDiLWohs1nYeRzvo69fNP1E0R6Yy+HfhLDecO5DCUH9T0Zx/0IB35RtLgxmaIn1P3W5El65ciHfNqkrX7ONLpPKJx+lmNVwkRuDbrf0rvHyO9WuJl0us+IjJhsgGZFLHO7K/1vteR6uOAhnWF+neStBQLpKxXkvbMPKxxehsy6HhY89GsNRuQB+uTSN25nPRKqnFksH6GtKfSRMhh5GGwiTSoskGWWR3HkfJ9GLHSRdL7MK/SvP0yUlePZH4D6ehX6bXPI23T3KHXv4Ysc309ckhDC2Jt/DN1/xZN4yNIXXk6c6+FY36+hSQ0T+n/m/S++zQNWzTNJ5AH80b161HkMJgNGueVyETGLZoHL+m9TyAD/jYN5wFkILqJs0/BzU64dGj+rtI8A3moPqlx+H0N/1qkzzAR+3iM8akQwvcih8Jcob8HpN/r1/y3/LgaKf9jJIHzFmSw/52krRRo3I4gA6gn1M93IRN9jyOix8L/cY3/qzX+Zk17o+bFnyOTP/1aRu/XPH0QqfuXkSwgJ/Tv/SQx8jhiIb+PNJj6T6ROPI0c1NUdY3xdCOFD6nfUvP8W0smba5DBbhMyKfAXiBhr08+XtFwu1jw2C7EJyas0/X+NHBL4DVKfMowIwi8gA/AnNV+bNY6vR/rz41oWo5r/r0LqyTipn7qVtJqoU+N5CWlP3CHk8LgPaRzuRw5UezLG+AE9lOcHkL7jIc2/n0Pq9m7N7y5N51rkQJwPafrfSDqlfhXSDk6RtkF0I9bKTi27o6SDTn5Hy/kEaaLmaUS8/QPp/cYDiEC2vPhxpJ5uJ1kuLyENzAcRgWGD+3HEojqocVyLvktS82cPUj93IW30G
FLuHyYdOGUisJ10iNV1SH/+NfV/C3KoYF4nu27RfOxD+vOfR9rJx5Dnz0nS/tm85t2F6s/nSMuWn0bK/iZkIH2GdIbFq5G+8UvIs+EI0vb7kH7yZWRV2lbkMLxnkXr9ZuQAMVtZMaJumhFR9wNI+zGr+2PIhOcnSYcegtThLqROvkrzeq+Wz41IPSion69DDlEaUL+bkPemNqi7nwX+by2T9ZrOOxHx9wbkMKOfQOrKV0gTQ48h7f0S0iqHw0i9a1O//w4ZO+wgbbu4BRkDtmn6n0PGPD1IPf0+kvi2CXKQvqJH73kFaf/kYWB9jPF7Qgjfp+neTHo1zGuRfuRCpHxfQdqa8+2kU7f7NW2vUr8f1/y8Fak/uzNxeTXS3n4PKbOXkPb9CdL7rvciZX4F0pZH9LtZSI+TXiM1pP7Z+PQbmsevR+rgYdIbEd6kfo4g7eogUn82A2Mxxhu0DdxJeovEoxqH1fr3tyFjvcc1/q9B+oOPIxPRL2v4l2rav4mMXU6SltzuQ/qLA8iYd7wij7Yg9WK3ujfDz3rSmQYntTyeR/riN5BOB75Nw35S87SfNJ42y+4LSP+4jXSmi4lMa98FpH1ejZT71/V7l5bjMb33rZqXo6R9xa9F+vhtmgedyPOlQDozYAPp3IIXSQeVPaXp+c0Y478zBS5iHcdxHMdxHMdxnGVDbmonjuM4juM4juM4jrM0cBHrOI7jOI7jOI7jLBtcxDqO4ziO4ziO4zjLBhexjuM4juM4juM4zrLBRazjOI7jOI7jOI6zbPg/SQ45Gwr0KU8AAAAASUVORK5CYII=", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "cluster = hierarchy_clustering(linked, labelList, n_clusters=6)\n", + "cluster.head()" + ] + }, + { + "cell_type": "markdown", + "id": "48b62135-409c-45a9-b604-6e98ccf059fd", + "metadata": {}, + "source": [ + "

    The above dendrogram shows only 6 clusters, with the colors representing the different clusters. Now, we plot the Resistance curves for each cluster.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "d31bafdc-9f43-4083-9677-ef7d94c18eb1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(2,3,figsize=(20,10))\n", + "colors = cluster[['cluster','leaves_color_list']].copy().drop_duplicates()\n", + "for k in range(6):\n", + " plt.subplot(2,3,k+1)\n", + " img = plotcurves( subset_zoom,\n", + " field='RESISTANCE',\n", + " row_axis='TIME_MS',\n", + " series_id='WELDING_ID',\n", + " select_id=list(cluster[cluster.cluster ==k].CURVE_ID.values),\n", + " noplot=True)\n", + " plt.imshow(img)\n", + " plt.title('cluster : ' +str(k) + '\\n' + str(cluster.groupby('cluster').count()['CURVE_ID'][k]) + ' obs.',fontdict = {'fontsize' : 10, 'color':colors.leaves_color_list.values[k]})\n", + " plt.axis('off')" + ] + }, + { + "cell_type": "markdown", + "id": "f50fab99-9231-410d-bdd3-1132fc98575f", + "metadata": {}, + "source": [ + "

    When we plot the curves per cluster, we spot the curves with a sharp drop (cluster 4); these are the curves we are interested in, i.e. the curves exhibiting the anomaly we are looking for. We also note that the other clusters look more or less similar. By monitoring the resistance over time and calculating its derivative, you can detect any sudden changes or anomalies. Anomalies might indicate a problem with the welding process, such as a sudden drop in current or a sudden increase in resistance.

    " + ] + }, + { + "cell_type": "markdown", + "id": "9b99a7ac-6a99-4c9e-9ead-0f6d6e5c4759", + "metadata": {}, + "source": [ + "
    \n", + "

    5.3 Create the anomaly dataset

    \n", + "

    Now we create a table containing the anomaly flag that will be the target of a supervised machine learning model or a relevant KPI to monitor in production dashboards.

    \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "cec5b577-b0dd-45c8-8fad-fee1fb1f952a", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "target = cluster.copy().drop('leaves_color_list',axis=1)\n", + "target = target[target.cluster.isin([1,2])]\n", + "target['WELDING_ID'] = target['CURVE_ID']\n", + "target['anomaly'] = 0\n", + "target.loc[target.cluster==2,'anomaly'] = 1\n", + "target.drop(['cluster','CURVE_ID'],axis=1, inplace=True)\n", + "target.groupby('anomaly').count().plot(y='WELDING_ID',kind='bar',figsize=(10,10))\n", + "copy_to_sql( target,\n", + " table_name = 'Anomaly_Target',\n", + " if_exists='replace',\n", + " primary_index='WELDING_ID')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "3ac7c451-2fb3-45fa-895d-e881cc88a9ba", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dbe9eda0991244729344c7a9cebb4f65", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDanomaly
    3261
    1830
    5300
    9991
    3871
    " + ], + "text/plain": [ + " anomaly\n", + "WELDING_ID \n", + "326 1\n", + "183 0\n", + "530 0\n", + "999 1\n", + "387 1" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anomalies = DataFrame('Anomaly_Target')\n", + "anomalies" + ] + }, + { + "cell_type": "markdown", + "id": "da6297fd-6f49-4619-af30-791db2af90da", + "metadata": {}, + "source": [ + "

    The above anomaly data has the welding ID and the anomaly flag.

    \n", + "
    \n", + "

    5.4 Build the analytical dataset

    \n", + "\n", + "

    We prepare the analytical dataset by joining the feature table with the anomaly table using the Welding ID so that we get the anomalies for the weldings.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "fe4cfcfb-7d91-47e5-a4cc-e44428e51cfe", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "62175dfc79d94e4bbd5646fd055c4db3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDcount_RESISTANCEsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEpercentile_RESISTANCEunique_RESISTANCEmedian_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEanomaly
    570637108597.7712408661676.02029807955392352.4795596378072170.4831573639971286.51173633708574115.06655015743611637115.066550157436117484.280524057440.7249096707674543-1.00640153415498950
    1831280180122.75820553369.41522063694657414.25927750342277140.7209048480726586.13303343939174104.77905816931691280104.77905816931697418.8994494713761.90635898437782552.2020918670737320
    530888142793.8025924895580.33864364172554352.84994686956196160.8038317482990581.96568876456104110.63194606046804888110.631946060468046718.3741346488870.9804755013182304-0.48659350734068280
    9991009167017.7252387674578.48183376623636339.3274318454616165.5279734774702383.15916845787675111.430836619855941009111.430836619855946915.44729860552250.7309353059569027-1.11772798221020731
    3871629252327.6695428601867.734531657172385.3317739232777154.897280259582789.27281066015279107.33433184613271629107.33433184613277969.6347231634891.31458406785533070.234406433063343381
    " + ], + "text/plain": [ + " WELDING_ID count_RESISTANCE sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE percentile_RESISTANCE unique_RESISTANCE median_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE anomaly\n", + "0 570 637 108597.771241 76.020298 352.479560 170.483157 86.511736 115.066550 637 115.066550 7484.280524 0.724910 -1.006402 0\n", + "1 183 1280 180122.758206 69.415221 414.259278 140.720905 86.133033 104.779058 1280 104.779058 7418.899449 1.906359 2.202092 0\n", + "2 530 888 142793.802592 80.338644 352.849947 160.803832 81.965689 110.631946 888 110.631946 6718.374135 0.980476 -0.486594 0\n", + "3 999 1009 167017.725239 78.481834 339.327432 165.527973 83.159168 111.430837 1009 111.430837 6915.447299 0.730935 -1.117728 1\n", + "4 387 1629 252327.669543 67.734532 385.331774 154.897280 89.272811 107.334332 1629 107.334332 7969.634723 1.314584 0.234406 1" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ADS = features[['WELDING_ID']+feature_names].join(other=anomalies, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "ADS = ADS.assign(WELDING_ID=ADS.WELDING_ID_l).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1).select(['WELDING_ID']+feature_names+['anomaly'])\n", + "ADS" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "20a2163c-9fea-4f3d-ab0b-696b3cccaad9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(391, 14)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ADS.shape" + ] + }, + { + "cell_type": "markdown", + "id": "c03b26f4-0fa4-4478-922e-9cb850acbe34", + "metadata": {}, + "source": [ + "

    The analytical dataset we created has 14 columns and 391 rows which will be used to build the model below.

    " + ] + }, + { + "cell_type": "markdown", + "id": "09b3168b-8c53-4ffd-ba75-b26f40608654", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    6. Build the model

    \n", + "

    We have datasets in which different columns have different units – for example, one column can be in kilograms, while another column can be in centimetres. If we feed these features to the model as is, there is every chance that one feature will influence the result more than the others simply because of the magnitude of its values. But this doesn’t necessarily mean it is more important as a predictor. So, to give equal importance to all the features, we need feature scaling.

    \n", + " \n", + "

    Here, we apply standard scaling using the ScaleFit and ScaleTransform functions in Vantage. The ScaleFit() function outputs statistics that are passed as input to the ScaleTransform() function, which scales the specified input DataFrame columns.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "d5d0898e-53a7-4aca-9f24-2e2f06ac73dc", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ScaleFit , ScaleTransform\n", + "scaler = ScaleFit(\n", + " data=ADS,\n", + " target_columns=feature_names,\n", + " scale_method=\"STD\",\n", + " global_scale=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "76af7c0a-b1cf-4914-a099-aeaeeb0c4977", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c3445e103d204f1ea97a9885e3fab53a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    anomalycount_RESISTANCEsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEpercentile_RESISTANCEunique_RESISTANCEmedian_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCE
    1-0.44549580137880120.491755163242834470.0582665013373593961.11502008560513621.35397556322812871.82006190703546040.9194382247575821-0.44549580137880120.91943822475758212.0817677950852636-1.0994434098745625-1.0566279679947848
    00.7935251829744480.523797308954638-0.471238281677336331.9938112179994238-0.8067834082724330.4409365865810209-0.57771809233581870.793525182974448-0.57771809233581870.388474702117049041.39880231152042531.5933929191283127
    10.00346709648096439830.191506855383746230.007189680804109719-0.572992840326859-0.0239394435629014250.21488164140523353-0.429876020597413530.0034670964809643983-0.429876020597413530.14170515277163295-0.3998375077234471-0.7211087815327962
    11.81098006189041022.354620840583509-0.55992504846580251.0028942996347474-0.35941535773196330.679603160858965-0.52092471639160841.8109800618904102-0.52092471639160840.658420306806380.493265058504567260.22156809424649065
    0-0.3492894661231371-0.422713518102620530.10517001290583916-0.10977661846737265-0.17302057428269490.12416064757468764-0.4476321194017249-0.3492894661231371-0.44763211940172490.04510877161800008-0.017989719458924348-0.2810964068413063
    " + ], + "text/plain": [ + " anomaly count_RESISTANCE sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE percentile_RESISTANCE unique_RESISTANCE median_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE\n", + "0 1 -0.445496 0.491755 0.058267 1.115020 1.353976 1.820062 0.919438 -0.445496 0.919438 2.081768 -1.099443 -1.056628\n", + "1 0 0.793525 0.523797 -0.471238 1.993811 -0.806783 0.440937 -0.577718 0.793525 -0.577718 0.388475 1.398802 1.593393\n", + "2 1 0.003467 0.191507 0.007190 -0.572993 -0.023939 0.214882 -0.429876 0.003467 -0.429876 0.141705 -0.399838 -0.721109\n", + "3 1 1.810980 2.354621 -0.559925 1.002894 -0.359415 0.679603 -0.520925 1.810980 -0.520925 0.658420 0.493265 0.221568\n", + "4 0 -0.349289 -0.422714 0.105170 -0.109777 -0.173021 0.124161 -0.447632 -0.349289 -0.447632 0.045109 -0.017990 -0.281096" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ADS_scaled = ScaleTransform(data=ADS,\n", + " object=scaler.output,\n", + " accumulate=\"anomaly\").result\n", + "ADS_scaled" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "5cc1ed77-bd6e-4476-9b76-abb448c7199b", + "metadata": {}, + "outputs": [], + "source": [ + "df = ADS_scaled.to_pandas()" + ] + }, + { + "cell_type": "markdown", + "id": "3b3a8548-555a-48fd-88e4-795abaff2cc5", + "metadata": {}, + "source": [ + "
    \n", + "

    6.1 Create a model file using the python libraries.

    \n", + "\n", + "

    The Vantage Bring Your Own Model (BYOM) package gives data scientists and analysts the ability to operationalize predictive models in Vantage. Predictive models trained in external tools with sample data can be used to score data stored in Vantage using the BYOM Predict. Create or convert your predictive model using a supported model interchange format (PMML, MOJO, ONNX, Dataiku, and DataRobot are currently available), store it in a Vantage table, and use the BYOM PMMLPredict, H2OPredict, ONNXPredict, DataikuPredict, or DataRobotPredict to score your data with the model.

    \n", + "\n", + "

    A problem with imbalanced classification is that there are too few examples of the minority class for a model to effectively learn the decision boundary. One way to solve this problem is to oversample the examples in the minority class. The most widely used approach to synthesizing new examples is called the Synthetic Minority Oversampling Technique, or SMOTE for short. SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.

    \n", + "\n", + "

    Then we use the RandomForestClassifier to create the model. A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The random forest classifier creates a set of decision trees (DT), each from a randomly selected subset of the training set, and then collects the votes from the different decision trees to decide the final prediction.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "d847d16a-9735-4482-953d-66c80faf0bdc", + "metadata": {}, + "outputs": [], + "source": [ + "X_train = df[feature_names]\n", + "y_train = df['anomaly']" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "4350a66c-2ff9-483c-ae30-8f17c5d375b3", + "metadata": {}, + "outputs": [], + "source": [ + "# Balance the training set using SMOTE\n", + "smote = imblearn.over_sampling.SMOTE(random_state=42)\n", + "X_train, y_train = smote.fit_resample(X_train, y_train)\n", + "\n", + "\n", + "# Create a random forest classifier\n", + "model = RandomForestClassifier(n_estimators=10,max_depth= 3, random_state=42)\n", + "\n", + "# Create a pipeline that includes the SMOTE transformer and the model\n", + "pipeline = PMMLPipeline([ ('model', model)])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "455a3ff5-e8ee-4c9b-909e-3e1a79fa6612", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "duration : 0.02437281608581543 s\n" + ] + } + ], + "source": [ + "# Train the pipeline\n", + "start = time.time()\n", + "pipeline.fit(X_train, y_train)\n", + "end = time.time()\n", + "print('duration : ', end-start, 's')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "61ff634a-aea7-4966-bf38-30b77547f0a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 94.02%\n", + "Precision: 93.85%\n", + "AUC: 98.74%\n", + "F1-Score: 94.03%\n" + ] + } + ], + "source": [ + "# make predictions on the training set\n", + "y_train_pred = pipeline.predict(X_train)\n", + "\n", + "# calculate and print the accuracy score\n", + "acc = accuracy_score(y_train, y_train_pred)\n", + "print(\"Accuracy: {:.2f}%\".format(acc * 100))\n", + "\n", + "# calculate and print precision, AUC and F1-score\n", + "prec = precision_score(y_train, y_train_pred)\n", + "print(\"Precision: 
{:.2f}%\".format(prec * 100))\n", + "\n", + "# calculate AUC, AUC requires probability for positive class\n", + "prob = pipeline.predict_proba(X_train)[:, 1]\n", + "auc = roc_auc_score(y_train, prob)\n", + "print(\"AUC: {:.2f}%\".format(auc * 100))\n", + "\n", + "f1 = f1_score(y_train, y_train_pred)\n", + "print(\"F1-Score: {:.2f}%\".format(f1 * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "60a0b3c9-4a3f-478c-a9f9-2ddd786aa332", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    ModelAccuracyPrecisionF1-Score
    0PMML using BYOM0.9401540.9384620.94027
    \n", + "
    " + ], + "text/plain": [ + " Model Accuracy Precision F1-Score\n", + "0 PMML using BYOM 0.940154 0.938462 0.94027" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pmml_metrics=pd.DataFrame([{'Model':'PMML using BYOM','Accuracy':acc, 'Precision':prec, 'F1-Score':f1}])\n", + "pmml_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "da084cfa-5c7b-4899-9c9b-41b065546bf6", + "metadata": {}, + "outputs": [], + "source": [ + "sklearn2pmml(pipeline, \"my_model.pmml\", with_repr = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "c35b23c2-c4c4-4601-b374-9d021a4845b0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "source": [ + "additional_columns = {\"Description\": type(\"RandomForestClassifier model\"),\n", + " \"UserId\": type('demo_user'),\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": prec,\n", + " \"ModelAUC\": auc,\n", + " \"Modelf1Score\": f1,\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": end-start,\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + "for k in additional_columns.keys():\n", + " print(type(additional_columns[k]))" + ] + }, + { + "cell_type": "markdown", + "id": "8351d68c-fed5-4034-b00f-fe0379625090", + "metadata": {}, + "source": [ + "
    \n", + "

    6.2 Save the model file

    " + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "7ffc1be2-d980-4468-9fc9-58ef30e5cb27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model is deleted.\n", + "Model is saved.\n" + ] + } + ], + "source": [ + "try:\n", + " save_byom(model_id = 'model_anomaly1',\n", + " model_file = 'my_model.pmml',\n", + " table_name = 'BYOM_PMMLMODELS_REPOSITORY',\n", + " additional_columns={\"Description\": \"RandomForestClassifier model\",\n", + " \"UserId\": 'demo_user',\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": float(prec),\n", + " \"ModelAUC\": float(auc),\n", + " \"Modelf1Score\": float(f1),\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": float(end-start),\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + " )\n", + "except Exception as e: \n", + " # if our model exists, delete and rewrite \n", + " if str(e.args).find('TDML_2200') >= 1: \n", + " delete_byom(model_id = 'model_anomaly1', table_name = 'BYOM_PMMLMODELS_REPOSITORY') \n", + " save_byom(model_id = 'model_anomaly1',\n", + " model_file = 'my_model.pmml',\n", + " table_name = 'BYOM_PMMLMODELS_REPOSITORY',\n", + " additional_columns={\"Description\": \"RandomForestClassifier model\",\n", + " \"UserId\": 'demo_user',\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": float(prec),\n", + " \"ModelAUC\": float(auc),\n", + " \"Modelf1Score\": float(f1),\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": float(end-start),\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + " )\n", + " else: \n", + " raise ValueError(f\"Unable to save the model due to the following error: {e}\")\n", + "# pass \n", + "# else: \n", + "# raise \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": 
"60c0f97c-52b2-407e-921c-75a61ca2d3fa", + "metadata": {}, + "source": [ + "

    The model file is saved and can be found in the left navigation pane under /UseCases/Anomaly_Detection.

    \n", + "\n", + "

    We create new scaled data on which to apply this model and make predictions. The new dataset is created by joining the features with the anomalies.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "60fe7dff-a0fa-43a6-aa03-d11aeed2904e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d07a6981a51e4db19c7fdae5e2a71f36", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDcount_RESISTANCEsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEpercentile_RESISTANCEunique_RESISTANCEmedian_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEanomaly
    3871629252327.6695428601867.734531657172385.3317739232777154.897280259582789.27281066015279107.33433184613271629107.33433184613277969.6347231634891.31458406785533070.234406433063343381
    3451572225618.0533032045676.78441855749419340.6338108095452143.5229346712497172.66592162969769106.393124612773551572106.393124612773555280.3361662933671.40374749654955780.43970636043189191
    326855178859.0644359209779.44978256514692388.6050241978192209.19188822914734104.27611140847034172.13980338906399855172.1398033890639910873.5074104717190.27373796485881025-1.59898200536082661
    1411707217554.7085115232669.20055931911273385.9221118724905127.4485697196972967.96165098216868102.934973854092181707102.934973854092184618.7860042221082.38698574811646224.5244079283770940
    570637108597.7712408661676.02029807955392352.4795596378072170.4831573639971286.51173633708574115.06655015743611637115.066550157436117484.280524057440.7249096707674543-1.00640153415498950
    " + ], + "text/plain": [ + " WELDING_ID count_RESISTANCE sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE percentile_RESISTANCE unique_RESISTANCE median_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE anomaly\n", + "0 387 1629 252327.669543 67.734532 385.331774 154.897280 89.272811 107.334332 1629 107.334332 7969.634723 1.314584 0.234406 1\n", + "1 345 1572 225618.053303 76.784419 340.633811 143.522935 72.665922 106.393125 1572 106.393125 5280.336166 1.403747 0.439706 1\n", + "2 326 855 178859.064436 79.449783 388.605024 209.191888 104.276111 172.139803 855 172.139803 10873.507410 0.273738 -1.598982 1\n", + "3 141 1707 217554.708512 69.200559 385.922112 127.448570 67.961651 102.934974 1707 102.934974 4618.786004 2.386986 4.524408 0\n", + "4 570 637 108597.771241 76.020298 352.479560 170.483157 86.511736 115.066550 637 115.066550 7484.280524 0.724910 -1.006402 0" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "newdata = features[['WELDING_ID']+feature_names].join(other=anomalies, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "newdata = newdata.assign(WELDING_ID=newdata.WELDING_ID_l).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1).select(['WELDING_ID']+feature_names+['anomaly'])\n", + "newdata" + ] + }, + { + "cell_type": "markdown", + "id": "bd7108ab-49b6-411a-a919-4ab7f859252e", + "metadata": {}, + "source": [ + "

    We transform the new data with ScaleTransform, reusing the same ScaleFit object we used earlier, to get the scaled version of this new data.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "099b4d80-3bb8-4e96-ba57-c85c84ae990a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dd54b8613eb14b16835e3bd03d9495bf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDanomalycount_RESISTANCEsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEpercentile_RESISTANCEunique_RESISTANCEmedian_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCE
    3261-0.44549580137880120.491755163242834470.0582665013373593961.11502008560513621.35397556322812871.82006190703546040.9194382247575821-0.44549580137880120.91943822475758212.0817677950852636-1.0994434098745625-1.0566279679947848
    99910.00346709648096439830.191506855383746230.007189680804109719-0.572992840326859-0.0239394435629014250.21488164140523353-0.429876020597413530.0034670964809643983-0.429876020597413530.14170515277163295-0.3998375077234471-0.7211087815327962
    18300.7935251829744480.523797308954638-0.471238281677336331.9938112179994238-0.8067834082724330.4409365865810209-0.57771809233581870.793525182974448-0.57771809233581870.388474702117049041.39880231152042531.5933929191283127
    5300-0.3492894661231371-0.422713518102620530.10517001290583916-0.10977661846737265-0.17302057428269490.12416064757468764-0.4476321194017249-0.3492894661231371-0.44763211940172490.04510877161800008-0.017989719458924348-0.2810964068413063
    38711.81098006189041022.354620840583509-0.55992504846580251.0028942996347474-0.35941535773196330.679603160858965-0.52092471639160841.8109800618904102-0.52092471639160840.658420306806380.493265058504567260.22156809424649065
    " + ], + "text/plain": [ + " WELDING_ID anomaly count_RESISTANCE sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE percentile_RESISTANCE unique_RESISTANCE median_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE\n", + "0 326 1 -0.445496 0.491755 0.058267 1.115020 1.353976 1.820062 0.919438 -0.445496 0.919438 2.081768 -1.099443 -1.056628\n", + "1 999 1 0.003467 0.191507 0.007190 -0.572993 -0.023939 0.214882 -0.429876 0.003467 -0.429876 0.141705 -0.399838 -0.721109\n", + "2 183 0 0.793525 0.523797 -0.471238 1.993811 -0.806783 0.440937 -0.577718 0.793525 -0.577718 0.388475 1.398802 1.593393\n", + "3 530 0 -0.349289 -0.422714 0.105170 -0.109777 -0.173021 0.124161 -0.447632 -0.349289 -0.447632 0.045109 -0.017990 -0.281096\n", + "4 387 1 1.810980 2.354621 -0.559925 1.002894 -0.359415 0.679603 -0.520925 1.810980 -0.520925 0.658420 0.493265 0.221568" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "newdata_scaled = ScaleTransform(data=newdata,\n", + " object=scaler.output,\n", + " # DataFrame(in_schema('demo_user','scaler_anomaly')),\n", + " accumulate=[\"WELDING_ID\",\"anomaly\"]).result\n", + "newdata_scaled" + ] + }, + { + "cell_type": "markdown", + "id": "46bb63a9-35eb-40e9-a4d4-d1aa558b19d1", + "metadata": {}, + "source": [ + "
    \n", + "

    6.3 Retrieve the model file and use it to predict

    \n", + "

    We use the PMMLPredict function from the teradataml library to predict the anomalies.

    \n", + "

    Predictive Model Markup Language (PMML) is an XML-based standard established by the Data Mining Group (DMG) for defining statistical and data-mining models. PMML models can be shared between PMML-compliant platforms and across organizations so that business analysts and developers are unified in designing, analyzing, and implementing PMML-based assets and services.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "16f0c6bb-3551-4337-a4e3-8c2a79fd55cc", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "29a988d766e44b49b4522e5cb63ea9aa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDpredictionprobability(0)probability(1)
    1410.95529042972566050.04470957027433943
    1830.72171872490195710.27828127509804285
    5300.84364078115604110.15635921884395887
    3870.08717783349119830.9128221665088017
    9990.198361185252098780.8016388147479013
    " + ], + "text/plain": [ + " WELDING_ID prediction probability(0) probability(1)\n", + "0 141 0.955290 0.044710\n", + "1 183 0.721719 0.278281\n", + "2 530 0.843641 0.156359\n", + "3 387 0.087178 0.912822\n", + "4 999 0.198361 0.801639" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from teradataml import PMMLPredict\n", + "modeldata_anomaly = retrieve_byom(\"model_anomaly1\", table_name=\"BYOM_PMMLMODELS_REPOSITORY\")\n", + "result=PMMLPredict(\n", + " modeldata = modeldata_anomaly,\n", + " newdata = newdata_scaled,\n", + " accumulate = ['WELDING_ID'],\n", + " model_output_fields=['probability(0)','probability(1)'],\n", + " overwrite_cached_models = '*'\n", + " )\n", + "pmml_predict=result.result\n", + "pmml_predict" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "1f03ec30-32a9-4b13-af64-78eaa88b79e1", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f33dc97734874a128e4be620a57ce1df", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDanomalyprob_0prob_1prediction
    11600.95529042972566050.044709570274339430
    58100.74351152412960840.25648847587039160
    28100.75225481005983020.247745189940169780
    99910.198361185252098780.80163881474790131
    85600.79198281008558370.208017189914416170
    " + ], + "text/plain": [ + " WELDING_ID anomaly prob_0 prob_1 prediction\n", + "0 116 0 0.955290 0.044710 0\n", + "1 581 0 0.743512 0.256488 0\n", + "2 281 0 0.752255 0.247745 0\n", + "3 999 1 0.198361 0.801639 1\n", + "4 856 0 0.791983 0.208017 0" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pmml_predict_result = pmml_predict.join(other=newdata_scaled, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "pmml_predict_result = pmml_predict_result.assign(prob_0=pmml_predict_result['probability(0)'])\n", + "pmml_predict_result = pmml_predict_result.assign(prob_1=pmml_predict_result['probability(1)'])\n", + "pmml_predict_result = pmml_predict_result.assign(WELDING_ID=pmml_predict_result.WELDING_ID_l)\n", + "pmml_predict_result = pmml_predict_result.assign(prediction=case([(pmml_predict_result.prob_1>pmml_predict_result.prob_0, 1 )],else_ = 0))\n", + "pmml_predict_result = pmml_predict_result.select(['WELDING_ID']+['anomaly']+['prob_0']+['prob_1']+['prediction'])\n", + "pmml_predict_result" + ] + }, + { + "cell_type": "markdown", + "id": "220bb477-2d63-4672-98a1-cb50d40f960f", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    7. Random Forest using Teradata OpenSource ML functions

    \n", + " \n", + "

    We start by creating a subset of the data, because the most interesting part of the curve lies between 40 and 400 ms from its start.

    \n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "cf8a84c6-2c67-43c7-86e2-1f31c6bd1c18", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dc6d18df9e254ce8882ec7453be1ab40", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    PLANTROBOT_IDWELDING_TYPEWELDING_DAYWELDING_IDTIME_MSRESISTANCE
    14192025-06-18833184225.50114877088427
    14152025-07-12489287230.52740760760418
    14192025-06-1881276178.16102130409436
    14122025-07-30131122314.408712136953
    14172025-06-3060169198.1165335758247
    " + ], + "text/plain": [ + " PLANT ROBOT_ID WELDING_TYPE WELDING_DAY WELDING_ID TIME_MS RESISTANCE\n", + "0 1 41 9 2025-06-18 833 184 225.501149\n", + "1 1 41 5 2025-07-12 489 287 230.527408\n", + "2 1 41 9 2025-06-18 812 76 178.161021\n", + "3 1 41 2 2025-07-30 131 122 314.408712\n", + "4 1 41 7 2025-06-30 601 69 198.116534" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DF_curves_zoom = welding_dataset_new[(welding_dataset_new.TIME_MS > 40) & (welding_dataset_new.TIME_MS < 400) ]\n", + "DF_curves_zoom" + ] + }, + { + "cell_type": "markdown", + "id": "58c9f479-f2ff-4863-b969-b9b8a873e6d4", + "metadata": {}, + "source": [ + "

    We create the features in two steps. First, we use a window function on the resistance, partitioned by WELDING_ID and ordered by TIME_MS, to compute the difference between the current and the previous resistance value. Then we apply aggregation functions to both the resistance and this resistance difference.

    \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "a227337c-3b57-443c-a256-dd5230ed98dd", + "metadata": {}, + "outputs": [], + "source": [ + "DF_curves_zoom = DF_curves_zoom.assign(\n", + " resistance_diff = DF_curves_zoom.RESISTANCE \n", + " - DF_curves_zoom.RESISTANCE.window(\n", + " partition_columns=['WELDING_ID'],\n", + " order_columns=[\"TIME_MS\"]\n", + " ).lag(1)\n", + ")\n", + "# DF_curves_zoom[DF_curves_zoom.WELDING_ID==138].sort(\"TIME_MS\")" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "cb8c00e7-c465-46ba-99ae-c094969a2eed", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2e2da15e86ab4b1385219f642e37e5dc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diff
    62088342.64044827396137.30900299564482314.06522413849495246.0797784074483445.091061561972292033.203832785575-0.5593553778837088-0.6737088126622688-29.856946521200086
    7282132.21433826616104.70467014938446312.7132264322771228.7805413322177257.0992485544289253260.324185480454-0.4527050989025892-0.9189959880168315-27.171428796617818
    75985850.394882182895.93586249651608349.2521127143311239.1375902010662771.379670292554745095.057331073822-0.40798080630716765-1.042277705763781-34.2552297092069
    71177910.2939722271899.59721397994052308.17260229102055217.020317471384958.5803330215977953431.6554169213014-0.28157863829571916-1.0842604382366274-25.294217256768533
    73586983.36964263133105.16013699875654328.7554675173849242.293508753847756.51508057070153193.954331912882-0.5469935212726376-0.7309877203258219-31.729098532553564
    " + ], + "text/plain": [ + " WELDING_ID sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff\n", + "0 620 88342.640448 137.309003 314.065224 246.079778 45.091062 2033.203833 -0.559355 -0.673709 -29.856947\n", + "1 72 82132.214338 104.704670 312.713226 228.780541 57.099249 3260.324185 -0.452705 -0.918996 -27.171429\n", + "2 759 85850.394882 95.935862 349.252113 239.137590 71.379670 5095.057331 -0.407981 -1.042278 -34.255230\n", + "3 711 77910.293972 99.597214 308.172602 217.020317 58.580333 3431.655417 -0.281579 -1.084260 -25.294217\n", + "4 735 86983.369643 105.160137 328.755468 242.293509 56.515081 3193.954332 -0.546994 -0.730988 -31.729099" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DF_features = DF_curves_zoom.groupby(\"WELDING_ID\").agg({\n", + " 'RESISTANCE':['sum', 'min', 'max', 'mean', 'std', 'var','skew','kurtosis'],\n", + " 'resistance_diff':['min']\n", + "})\n", + "DF_features" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "f6498373-8b50-49fb-ac0b-b0db7b0cb522", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['sum_RESISTANCE',\n", + " 'min_RESISTANCE',\n", + " 'max_RESISTANCE',\n", + " 'mean_RESISTANCE',\n", + " 'std_RESISTANCE',\n", + " 'var_RESISTANCE',\n", + " 'skew_RESISTANCE',\n", + " 'kurtosis_RESISTANCE',\n", + " 'min_resistance_diff']" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_names = DF_features.columns[1:]\n", + "feature_names" + ] + }, + { + "cell_type": "markdown", + "id": "57712977-e195-4ce9-9867-a7cdbc772279", + "metadata": {}, + "source": [ + "
    \n", + "

    7.1 Build the analytical dataset.

    \n", + "

    We create the analytical dataset by joining the anomaly table created above with the dataset of features we just created.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "55686241-b413-45eb-a495-9888c946c634", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DF_target = DataFrame('Anomaly_Target')" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "d4f6e7dc-7a1e-447f-918c-fe675f5d597f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DF_ADS = DF_features[['WELDING_ID']+feature_names].join(\n", + " other=DF_target, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "41bca2b7-9260-46f2-afed-f0d611fd232a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5f9efcfa58094da8a6ed2b379fdabc9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_ID_lWELDING_ID_rsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diffanomaly
    75975985850.394882182895.93586249651608349.2521127143311239.1375902010662771.379670292554745095.057331073822-0.40798080630716765-1.042277705763781-34.25522970920690
    52152186311.1620955712983.63565446899322371.03290669363076240.4210643330676586.594690177420827498.6403669235015-0.22559943677069424-1.2315781754798603-48.0002499112636660
    12312383759.9491939642582.6451402512838368.3012871516082233.314621710206882.768855744613536850.683481272646-0.2867184557621234-1.3075810926072349-39.5413438501544760
    342342106500.14073297645178.45785277776122367.5445073395069296.6577736294608545.551502888489852074.9394154000993-0.41900879613466196-0.8516234489970328-31.2662921561825441
    14414483121.0162270333478.52177145197345374.24517113960087231.5348641421541395.914049297260429199.504852597303-0.17056032020583595-1.5247519859543917-35.8187551850936640
    " + ], + "text/plain": [ + " WELDING_ID_l WELDING_ID_r sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff anomaly\n", + "0 759 759 85850.394882 95.935862 349.252113 239.137590 71.379670 5095.057331 -0.407981 -1.042278 -34.255230 0\n", + "1 521 521 86311.162096 83.635654 371.032907 240.421064 86.594690 7498.640367 -0.225599 -1.231578 -48.000250 0\n", + "2 123 123 83759.949194 82.645140 368.301287 233.314622 82.768856 6850.683481 -0.286718 -1.307581 -39.541344 0\n", + "3 342 342 106500.140733 178.457853 367.544507 296.657774 45.551503 2074.939415 -0.419009 -0.851623 -31.266292 1\n", + "4 144 144 83121.016227 78.521771 374.245171 231.534864 95.914049 9199.504853 -0.170560 -1.524752 -35.818755 0" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DF_ADS" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "f1e3ee20-796a-46b9-ad63-4c4cc685a23b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "71f0699d7bd64abfb869d598edbfa25c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + 
"\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diffanomalyanomaly_int
    16881231.3199315335878.20584184189954383.797723458992226.2710861602606792.612598349474038577.0933730410.05391970417306466-1.3925390208869324-36.40555852086231400
    52186311.1620955712983.63565446899322371.03290669363076240.4210643330676586.594690177420827498.6403669235015-0.22559943677069424-1.2315781754798603-48.00024991126366600
    342106500.14073297645178.45785277776122367.5445073395069296.6577736294608545.551502888489852074.9394154000993-0.41900879613466196-0.8516234489970328-31.26629215618254411
    14483121.0162270333478.52177145197345374.24517113960087231.5348641421541395.914049297260429199.504852597303-0.17056032020583595-1.5247519859543917-35.81875518509366400
    12383759.9491939642582.6451402512838368.3012871516082233.314621710206882.768855744613536850.683481272646-0.2867184557621234-1.3075810926072349-39.54134385015447600
    " + ], + "text/plain": [ + " WELDING_ID sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff anomaly anomaly_int\n", + "0 168 81231.319932 78.205842 383.797723 226.271086 92.612598 8577.093373 0.053920 -1.392539 -36.405559 0 0\n", + "1 521 86311.162096 83.635654 371.032907 240.421064 86.594690 7498.640367 -0.225599 -1.231578 -48.000250 0 0\n", + "2 342 106500.140733 178.457853 367.544507 296.657774 45.551503 2074.939415 -0.419009 -0.851623 -31.266292 1 1\n", + "3 144 83121.016227 78.521771 374.245171 231.534864 95.914049 9199.504853 -0.170560 -1.524752 -35.818755 0 0\n", + "4 123 83759.949194 82.645140 368.301287 233.314622 82.768856 6850.683481 -0.286718 -1.307581 -39.541344 0 0" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "DF_ADS = DF_ADS.assign(WELDING_ID=DF_ADS.WELDING_ID_l\n", + " ).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1\n", + " ).select(['WELDING_ID']+feature_names+['anomaly']\n", + " ).assign(anomaly_int = DF_ADS.anomaly.cast(INTEGER()))\n", + "DF_ADS" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "0199e5db-a881-4a2e-92df-0fcc0a54158b", + "metadata": {}, + "outputs": [], + "source": [ + "# Sample 5% of data for model validation.\n", + "DF_ADS=DF_ADS.drop('anomaly', axis=1)\n", + "# df_sample = DF_ADS.sample(frac=[0.75, 0.25], randomize=False, seed=20)\n", + "# df_sample\n", + "\n", + "TrainTestSplit_out = TrainTestSplit(\n", + " data = DF_ADS,\n", + " id_column = \"WELDING_ID\",\n", + " train_size = 0.80,\n", + " test_size = 0.20,\n", + " seed = 42\n", + " )\n", + "df_sample = TrainTestSplit_out.result" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "d14d6db3-06c1-42a2-9a2f-93c17d30fa6c", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "88801a5c9e6e437eb480a662606af2ab", + 
"version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    TD_IsTrainRowWELDING_IDsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diffanomaly_int
    11095127.1795645449144.88744123637863356.36420112865824264.9782160572281754.93373787706333017.7155571458993-0.2307254513118324-1.1858367926106634-32.03203701690951
    18100195.44141325781153.1729060302975369.01676378573256279.095937084283653.341607900425462845.3271334027318-0.4776006642407464-0.7252063427887376-30.307566523428051
    135101079.20526104877187.26783471658626358.73415881062687281.5576748218628649.841284282326292484.153618911666-0.2591425375150598-1.3078073722006127-45.3142813421208641
    1982501.0050051482291.14168264526084365.18561530292624229.8078133848139889.144566161632147946.7536761456095-0.18283169109922082-1.6351004381704022-33.845520531483430
    12187651.6631041502100.06115552653354370.50674200883566244.1550504293877377.731398445349386042.170304269665-0.10823631050168349-1.2142935096694525-35.924492655011360
    " + ], + "text/plain": [ + " TD_IsTrainRow WELDING_ID sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff anomaly_int\n", + "0 1 10 95127.179565 144.887441 356.364201 264.978216 54.933738 3017.715557 -0.230725 -1.185837 -32.032037 1\n", + "1 1 8 100195.441413 153.172906 369.016764 279.095937 53.341608 2845.327133 -0.477601 -0.725206 -30.307567 1\n", + "2 1 35 101079.205261 187.267835 358.734159 281.557675 49.841284 2484.153619 -0.259143 -1.307807 -45.314281 1\n", + "3 1 9 82501.005005 91.141683 365.185615 229.807813 89.144566 7946.753676 -0.182832 -1.635100 -33.845521 0\n", + "4 1 21 87651.663104 100.061156 370.506742 244.155050 77.731398 6042.170304 -0.108236 -1.214294 -35.924493 0" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sample" + ] + }, + { + "cell_type": "markdown", + "id": "f38cc3c9-6828-4c65-9b72-53ea02a172cd", + "metadata": {}, + "source": [ + "
    \n", + "

    7.2 Train RandomForest Classifier

    \n", + "

    The train dataset is created by selecting the rows where TD_IsTrainRow = 1.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "5eef1a68-6211-4b4a-a870-083f6aff1633", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bead7343638f493fa1e329c078fee65f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diffanomaly_int
    982501.0050051482291.14168264526084365.18561530292624229.8078133848139889.144566161632147946.7536761456095-0.18283169109922082-1.6351004381704022-33.845520531483430
    12106817.32190479447191.7494296610409385.8402275855924297.541286642881552.559225977857672762.4722353915085-0.1999867594892224-1.3022494693454283-31.8261162239902551
    6105201.78708839459199.73777915742855369.66316282209505293.041189661266348.9378547716320542394.91362964935-0.2972221842013988-1.2708881859616288-44.075540597504471
    1095127.1795645449144.88744123637863356.36420112865824264.9782160572281754.93373787706333017.7155571458993-0.2307254513118324-1.1858367926106634-32.03203701690951
    376967.210140556104.14666690623334287.7835576436938214.3933430099052751.091158700261672610.306497335324-0.4833111981907075-0.8441517264101307-22.4506294806287770
    " + ], + "text/plain": [ + " WELDING_ID sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff anomaly_int\n", + "0 9 82501.005005 91.141683 365.185615 229.807813 89.144566 7946.753676 -0.182832 -1.635100 -33.845521 0\n", + "1 12 106817.321905 191.749430 385.840228 297.541287 52.559226 2762.472235 -0.199987 -1.302249 -31.826116 1\n", + "2 6 105201.787088 199.737779 369.663163 293.041190 48.937855 2394.913630 -0.297222 -1.270888 -44.075541 1\n", + "3 10 95127.179565 144.887441 356.364201 264.978216 54.933738 3017.715557 -0.230725 -1.185837 -32.032037 1\n", + "4 3 76967.210141 104.146667 287.783558 214.393343 51.091159 2610.306497 -0.483311 -0.844152 -22.450629 0" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create train dataset from sample 1 by filtering on \"sampleid\" and drop \"sampleid\" column as it is required for training model.\n", + "data_train = df_sample[df_sample.TD_IsTrainRow == \"1\"].drop(\"TD_IsTrainRow\", axis = 1)\n", + "data_train" + ] + }, + { + "cell_type": "markdown", + "id": "4564b34a-cc44-4d61-945a-696ae04ab384", + "metadata": {}, + "source": [ + "

    The validation (test) dataset is created from the rows where TD_IsTrainRow = 0.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "be644b0e-dcfd-40ff-ba0a-b5800cfd0875", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "492a163cd1f34097ab58c864fbea4d6c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diffanomaly_int
    2380968.3856646060691.32685504056975362.0288544976556225.538678731493276.160922442548545800.486107299894-0.06229470076093798-1.2994883640937822-38.240981405083060
    26103807.76297225684152.857497527659383.578384675018289.1581141288491557.6083228236750243318.7188585567565-0.21002500953619418-1.3168188493421034-42.3252110300654751
    6997411.15857106655168.72998702644077362.43335495393376271.34027457121655.080923684897663033.9081539815206-0.19776981850595154-1.3735577645369346-33.541965004148611
    8294048.76279519273114.03951750210967366.35161875486074261.9742696244922568.100843494377364637.72488464568-0.4827546065748303-0.8556540622845826-49.745322557524760
    3393185.74584100883124.65019295111625351.7031928317579259.570322676904859.3547381126017653522.9849364155407-0.3229589598739873-1.046640890827172-40.224341015516271
    " + ], + "text/plain": [ + " WELDING_ID sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff anomaly_int\n", + "0 23 80968.385665 91.326855 362.028854 225.538679 76.160922 5800.486107 -0.062295 -1.299488 -38.240981 0\n", + "1 26 103807.762972 152.857498 383.578385 289.158114 57.608323 3318.718859 -0.210025 -1.316819 -42.325211 1\n", + "2 69 97411.158571 168.729987 362.433355 271.340275 55.080924 3033.908154 -0.197770 -1.373558 -33.541965 1\n", + "3 82 94048.762795 114.039518 366.351619 261.974270 68.100843 4637.724885 -0.482755 -0.855654 -49.745323 0\n", + "4 33 93185.745841 124.650193 351.703193 259.570323 59.354738 3522.984936 -0.322959 -1.046641 -40.224341 1" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create validation dataset from sample 2 by filtering on \"sampleid\" and drop \"sampleid\" column as it is required for validating model.\n", + "data_val = df_sample[df_sample.TD_IsTrainRow == \"0\"].drop(\"TD_IsTrainRow\", axis = 1)\n", + "data_val" + ] + }, + { + "cell_type": "markdown", + "id": "17422855-50cb-4c77-bcea-89cf782e0116", + "metadata": {}, + "source": [ + "

    Copy the train and validation datasets into Vantage.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "130cdab5-841a-4e65-b4f3-090414df65c3", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(data_train, table_name='data_train', if_exists='replace')\n", + "copy_to_sql(data_val, table_name='data_val', if_exists='replace')" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "f717da10-a24f-4345-a36e-8554fefcdc2e", + "metadata": {}, + "outputs": [], + "source": [ + "data_train=DataFrame('data_train')\n", + "data_val= DataFrame('data_val')" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "5217c805-010b-4184-b312-b22c7f0b1d49", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import td_sklearn as osml\n", + "X_train = data_train.drop(['anomaly_int','WELDING_ID'], axis = 1)\n", + "y_train = data_train.select([\"anomaly_int\"])\n", + "X_test = data_val.drop(['anomaly_int','WELDING_ID'], axis = 1)\n", + "y_test = data_val.select([\"anomaly_int\"])" + ] + }, + { + "cell_type": "markdown", + "id": "139fef94-8a5b-4cfc-964a-14c1498d8ba5", + "metadata": {}, + "source": [ + "

    Set the session to use the Analytic compute group and cluster to execute the OpenSourceML function.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "72c6f350-9dd5-4e61-af02-b368b8014414", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compute group set to GPUGroup\n" + ] + } + ], + "source": [ + "gpu_compute_group = env_vars.get(\"gpu_compute_group\")\n", + "execute_sql(f\"SET SESSION COMPUTE GROUP {gpu_compute_group};\")\n", + "print(f\"Compute group set to {gpu_compute_group}\")" + ] + }, + { + "cell_type": "markdown", + "id": "30978fe8-6c08-47b8-8b79-3157a74151e7", + "metadata": {}, + "source": [ + "

    Check the existing user environments and create an environment for this use case.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "bb089556-3f28-4ad8-8de7-1506cf7a4412", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No user environment(s) found.\n" + ] + } + ], + "source": [ + "list_user_envs()" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "8dcea3eb-0d96-45e4-b525-76b59dba9b98", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User environment 'osml_env' created.\n" + ] + }, + { + "data": { + "text/plain": [ + "\n", + "================================================\n", + "Environment Name: osml_env\n", + "Base Environment: python_3.9\n", + "Description: OAF Demo env for Anomaly OSML\n", + "\n", + "############ Libraries installed in User Environment ############\n", + "\n", + " name version\n", + "0 pip 25.0.1\n", + "1 setuptools 78.1.0\n", + "\n", + "================================================" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "try:\n", + " env = create_env(\n", + " env_name=\"osml_env\",\n", + " base_env=\"python_3.9\",\n", + " desc=\"OAF Demo env for Anomaly OSML\",\n", + " )\n", + "except:\n", + " remove_env(\"osml_env\")\n", + " env = create_env(\n", + " env_name=\"osml_env\",\n", + " base_env=\"python_3.9\",\n", + " desc=\"OAF Demo env for Anomaly OSML\",\n", + " )\n", + " \n", + "env " + ] + }, + { + "cell_type": "markdown", + "id": "c51c6334-d126-4668-94a3-e8e5f9c76b13", + "metadata": {}, + "source": [ + "

    Confirm that the library versions in the local environment are the same as those in the virtual environment.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "f529092a-43da-4a42-9eb1-9beceec29792", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "scikit-learn 1.1.3\n", + "scipy 1.11.2\n", + "numpy 1.24.2\n", + "geopandas 0.12.2\n", + "pandas 2.1.3\n", + "sklearn-pandas 2.2.0\n" + ] + } + ], + "source": [ + "!pip list | grep scikit-learn\n", + "!pip list | grep scipy\n", + "!pip list | grep numpy\n", + "!pip list | grep pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "7a82e027-864c-4590-ba76-dc493311fdf4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Request to install libraries initiated successfully in the remote user environment osml_env. Check the status using status() with the claim id '68add8e1-7c66-406d-90f9-1ede4695e870'.\n" + ] + } + ], + "source": [ + "claim_id = env.install_lib([\"pandas==2.1.3\",\n", + " \"scipy==1.11.2\",\n", + " \"scikit-learn==1.1.3\",\n", + " \"numpy==1.24.2\"], asynchronous=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "606ca6eb-845c-4fea-8bd6-ec2e9094b43f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    Claim IdFile/Libs/ModelMethod NameStageTimestampAdditional Details
    068add8e1-7c66-406d-90f9-1ede4695e870pandas==2.1.3, scipy==1.11.2, scikit-learn==1....install_libStarted2025-08-11T13:43:37Z
    168add8e1-7c66-406d-90f9-1ede4695e870pandas==2.1.3, scipy==1.11.2, scikit-learn==1....install_libFinished2025-08-11T13:47:04Z
    \n", + "
    " + ], + "text/plain": [ + " Claim Id \\\n", + "0 68add8e1-7c66-406d-90f9-1ede4695e870 \n", + "1 68add8e1-7c66-406d-90f9-1ede4695e870 \n", + "\n", + " File/Libs/Model Method Name Stage \\\n", + "0 pandas==2.1.3, scipy==1.11.2, scikit-learn==1.... install_lib Started \n", + "1 pandas==2.1.3, scipy==1.11.2, scikit-learn==1.... install_lib Finished \n", + "\n", + " Timestamp Additional Details \n", + "0 2025-08-11T13:43:37Z \n", + "1 2025-08-11T13:47:04Z " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    nameversion
    0joblib1.5.1
    1numpy1.24.2
    2pandas2.1.3
    3pip25.0.1
    4python-dateutil2.9.0.post0
    5pytz2025.2
    6scikit-learn1.1.3
    7scipy1.11.2
    8setuptools78.1.0
    9six1.17.0
    10threadpoolctl3.6.0
    11tzdata2025.2
    \n", + "
    " + ], + "text/plain": [ + " name version\n", + "0 joblib 1.5.1\n", + "1 numpy 1.24.2\n", + "2 pandas 2.1.3\n", + "3 pip 25.0.1\n", + "4 python-dateutil 2.9.0.post0\n", + "5 pytz 2025.2\n", + "6 scikit-learn 1.1.3\n", + "7 scipy 1.11.2\n", + "8 setuptools 78.1.0\n", + "9 six 1.17.0\n", + "10 threadpoolctl 3.6.0\n", + "11 tzdata 2025.2" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Check the status of installation using status() API.\n", + "# Create a loop here for demo purposes\n", + "\n", + "ipydisplay(env.status(claim_id))\n", + "stage = env.status(claim_id)['Stage'].iloc[-1]\n", + "while stage == 'Started':\n", + " stage = env.status(claim_id)['Stage'].iloc[-1]\n", + " clear_output()\n", + " ipydisplay(env.status(claim_id))\n", + " sleep(5)\n", + " \n", + "# Verify the Python libraries have been installed correctly.\n", + "ipydisplay(env.libs)" + ] + }, + { + "cell_type": "markdown", + "id": "0c50b35c-9929-499a-82ed-249fc0954e80", + "metadata": {}, + "source": [ + "

    Set the user environment to the created virtual environment and execute the RandomForestClassifier.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "e775073f-81b6-47bd-8696-efee242e3baf", + "metadata": {}, + "outputs": [], + "source": [ + "configure.openml_user_env = env" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "95aa7d9d-dadd-4267-9f13-2526d2d6989b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    RandomForestClassifier(max_depth=2, max_features='auto', max_leaf_nodes=2,\n",
    +       "                       n_estimators=10)
    In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
    On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
    " + ], + "text/plain": [ + "RandomForestClassifier(max_depth=2, max_features='auto', max_leaf_nodes=2,\n", + " n_estimators=10)" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "RF_classifier = osml.RandomForestClassifier(n_estimators=10,max_leaf_nodes=2,max_features='auto',max_depth=2)\n", + "#,random_state=42\n", + "RF_classifier.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "id": "aae045a4-5820-4f72-a4f9-d4c4487a9ed9", + "metadata": {}, + "source": [ + "

    Check the params for the Classifier.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "ef508629-f7e7-4210-9f8d-9d2f21530a85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'bootstrap': True,\n", + " 'ccp_alpha': 0.0,\n", + " 'class_weight': None,\n", + " 'criterion': 'gini',\n", + " 'max_depth': 2,\n", + " 'max_features': 'auto',\n", + " 'max_leaf_nodes': 2,\n", + " 'max_samples': None,\n", + " 'min_impurity_decrease': 0.0,\n", + " 'min_samples_leaf': 1,\n", + " 'min_samples_split': 2,\n", + " 'min_weight_fraction_leaf': 0.0,\n", + " 'n_estimators': 10,\n", + " 'n_jobs': None,\n", + " 'oob_score': False,\n", + " 'random_state': None,\n", + " 'verbose': 0,\n", + " 'warm_start': False}" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "RF_classifier.get_params()" + ] + }, + { + "cell_type": "markdown", + "id": "aca1ef54-8f11-48af-9d9f-ffe19a08b050", + "metadata": {}, + "source": [ + "
    \n", + "

    7.3 Predict and Evaluate model

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "12ca4daf-6b7f-453b-b690-3ca59df0fb6b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7dba28565d8d45aea0fea458c4379def", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    sum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diffanomaly_intrandomforestclassifier_predict_1
    110341.49664250232205.03522927809342380.74742538916405307.3579293663017345.52098194783542072.159797495157-0.3214105274578798-1.1819447978660633-49.2578659100591411
    96649.6900333734122.17637448667617366.32142645670035269.219192293519264.876011476483584208.896865096829-0.5753868635026909-0.6683953472392237-41.07628045826919600
    84099.21496523272118.69683849065697355.3352831442057234.2596517137401770.968908185968475036.5859291084230.15449954336904378-1.383727210170674-31.55429269268452700
    107360.61084072788201.71277038171522377.2026817832662299.0546262972921545.1582850104936852039.2707050889785-0.46398574196047354-1.0083162755064483-34.0409350215294911
    119047.29002453877207.8252169086635410.920900680535331.608050207628949.557942003245072455.989615597002-0.6231397861594759-0.8036020482778766-33.4393135710980711
    " + ], + "text/plain": [ + " sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff anomaly_int randomforestclassifier_predict_1\n", + "0 110341.496643 205.035229 380.747425 307.357929 45.520982 2072.159797 -0.321411 -1.181945 -49.257866 1 1\n", + "1 96649.690033 122.176374 366.321426 269.219192 64.876011 4208.896865 -0.575387 -0.668395 -41.076280 0 0\n", + "2 84099.214965 118.696838 355.335283 234.259652 70.968908 5036.585929 0.154500 -1.383727 -31.554293 0 0\n", + "3 107360.610841 201.712770 377.202682 299.054626 45.158285 2039.270705 -0.463986 -1.008316 -34.040935 1 1\n", + "4 119047.290025 207.825217 410.920901 331.608050 49.557942 2455.989616 -0.623140 -0.803602 -33.439314 1 1" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#model predictions\n", + "predict_RF =RF_classifier.predict(X_test,y_test)\n", + "predict_RF" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "a6fb0a47-70ed-4c6d-8974-b185f22b5ddf", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "389a0955589845c1b8c4a67bd7749e9c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\n", + "
    score
    0.9746835443037974
    " + ], + "text/plain": [ + " score\n", + "0 0.974684" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#accuracy of the model\n", + "accuracy_RF = RF_classifier.score(X_test, y_test)\n", + "accuracy_RF" + ] + }, + { + "cell_type": "markdown", + "id": "cda02bba-235d-4f1a-b2a7-3e2ea619cce2", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    8. Compare PMML and OpenSource ML model

    \n", + "
    \n", + "

    8.1 Show AUC-ROC Curve

    \n", + "\n", + "

    The ROC curve shows the performance of a binary classification model as its discrimination threshold varies. For a range of thresholds, the curve plots the true-positive rate against the false-positive rate.

    \n", + "\n", + "

    The ROC function accepts a set of prediction-actual pairs as input and calculates the following values for a range of discrimination thresholds.

    \n", + "
      \n", + "
    • True-positive rate (TPR)
    • \n", + "
    • False-positive rate (FPR)
    • \n", + "
    • The area under the ROC curve (AUC)
    • \n", + "
    • Gini coefficient
    • \n", + "
    • Other details are mentioned in the documentation
    • \n", + "
    \n", + "\n", + "

    ROC for PMML

    " + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "7c4b179b-a334-4dc0-b3f8-71c35f87283e", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ROC \n", + "roc_pmml = ROC(data = pmml_predict_result, \n", + " probability_column = \"prob_1\",\n", + " observation_column = \"anomaly\",\n", + " positive_class=\"1\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "32b946fb-e09e-4e62-b78a-c5325d84c175", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    threshold_valuetprfpr
    90.1836731.00.366795
    80.1632651.00.447876
    70.1428571.00.490347
    60.1224491.00.548263
    50.1020411.00.594595
    40.0816331.00.687259
    30.0612241.00.698842
    20.0408161.00.949807
    10.0204081.01.000000
    00.0000001.01.000000
    \n", + "
    " + ], + "text/plain": [ + " threshold_value tpr fpr\n", + "9 0.183673 1.0 0.366795\n", + "8 0.163265 1.0 0.447876\n", + "7 0.142857 1.0 0.490347\n", + "6 0.122449 1.0 0.548263\n", + "5 0.102041 1.0 0.594595\n", + "4 0.081633 1.0 0.687259\n", + "3 0.061224 1.0 0.698842\n", + "2 0.040816 1.0 0.949807\n", + "1 0.020408 1.0 1.000000\n", + "0 0.000000 1.0 1.000000" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_data_pmml = roc_pmml.output_data.to_pandas().sort_values(\"fpr\", ascending=True)\n", + "roc_data_pmml.tail(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "de67ebb4-b0f9-4a8c-9559-e6a44f1c9a21", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9715982215982211" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auc_pmml = roc_pmml.result.to_pandas().iloc[0,0]\n", + "auc_pmml" + ] + }, + { + "cell_type": "markdown", + "id": "baf0989e-387a-4ee9-b99e-0687d5a97799", + "metadata": {}, + "source": [ + "

    ROC for tdmlOpenSource RandomForestClassifier

    " + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "02a1c9e2-be8c-44da-9e0a-9056a2ec8243", + "metadata": {}, + "outputs": [], + "source": [ + "roc_obj = ROC(data = predict_RF, \n", + " probability_column = \"randomforestclassifier_predict_1\",\n", + " observation_column = \"anomaly_int\",\n", + " positive_class=\"1\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "27834036-13cc-49e9-a34e-b2bcb2c192b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
    \n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    threshold_valuetprfpr
    150.3061220.9750.025641
    160.3265310.9750.025641
    170.3469390.9750.025641
    180.3673470.9750.025641
    190.3877550.9750.025641
    200.4081630.9750.025641
    210.4285710.9750.025641
    220.4489800.9750.025641
    120.2448980.9750.025641
    00.0000001.0001.000000
    \n", + "
    " + ], + "text/plain": [ + " threshold_value tpr fpr\n", + "15 0.306122 0.975 0.025641\n", + "16 0.326531 0.975 0.025641\n", + "17 0.346939 0.975 0.025641\n", + "18 0.367347 0.975 0.025641\n", + "19 0.387755 0.975 0.025641\n", + "20 0.408163 0.975 0.025641\n", + "21 0.428571 0.975 0.025641\n", + "22 0.448980 0.975 0.025641\n", + "12 0.244898 0.975 0.025641\n", + "0 0.000000 1.000 1.000000" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roc_data = roc_obj.output_data.to_pandas().sort_values(\"fpr\", ascending=True)\n", + "roc_data.tail(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "ab90afd6-b0c1-4edd-9492-c97b16c8d4e0", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9243589743589744" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auc = roc_obj.result.to_pandas().iloc[0,0]\n", + "auc" + ] + }, + { + "cell_type": "markdown", + "id": "abb98428-872c-41d5-b8b1-79804c772a8a", + "metadata": {}, + "source": [ + "

    Plot ROC Curves

    " + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "93ab97d1-cbd3-4044-8546-0f170a5ca9ce", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plot 1\n", + "plt.plot(roc_data_pmml['fpr'], roc_data_pmml['tpr'], color='orange', label='PMML ROC. AUC = {}'.format(str(auc_pmml)), drawstyle='steps') \n", + "# Plot 2\n", + "plt.plot(roc_data['fpr'], roc_data['tpr'], color='green', label='RandomForest ROC. AUC = {}'.format(str(auc)), drawstyle='steps') \n", + "# Plot the diagonal dashed line\n", + "plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--') \n", + "# Set labels and title\n", + "plt.xlabel('False Positive Rate',fontsize=12) \n", + "plt.ylabel('True Positive Rate',fontsize=12) \n", + "plt.title('Receiver Operating Characteristic (ROC) Curve',fontsize=16) \n", + "# Add legend\n", + "plt.legend(loc=\"lower right\",fontsize=10) \n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c721c745-be69-4eee-a8e2-9faa4ecff46e", + "metadata": {}, + "source": [ + "

    The closer the ROC curve is to the upper left corner of the graph, the higher the accuracy of the test, because in the upper left corner the sensitivity = 1 and the false positive rate = 0 (specificity = 1). The ideal ROC curve thus has an AUC = 1.0. As seen in the above graph, the AUC for both models is close to 1, so the accuracy of both models is very good.

    \n", + "\n", + "
    \n", + "

    8.2 Show Confusion Matrix

    \n", + "\n", + "

    A confusion matrix is a performance measurement for machine learning classification problems where the output can be two or more classes. For a binary classification problem, it is a table with 4 different combinations of predicted and actual values.

    \n", + "\n", + "

    Confusion matrices represent counts from predicted and actual values. The output “TN” stands for True Negative which shows the number of negative examples classified accurately. Similarly, “TP” stands for True Positive which indicates the number of positive examples classified accurately. The term “FP” shows False Positive value, i.e., the number of actual negative examples classified as positive; and “FN” means a False Negative value which is the number of actual positive examples classified as negative.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "0cac3275-2854-464a-b240-03e7b836b96d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
    " + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Calculate confusion matrix for PMML\n", + "DF_result=predict_RF.to_pandas().reset_index()\n", + "pmml_result=pmml_predict_result.to_pandas()\n", + "cm_pmml = confusion_matrix(pmml_result['anomaly'], pmml_result['prediction']) \n", + "# Calculate confusion matrix for DecisionForest\n", + "cm_df = confusion_matrix(DF_result['anomaly_int'], DF_result['randomforestclassifier_predict_1']) \n", + "# Create figure and axes objects\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8)) \n", + "# Plot PMML confusion matrix\n", + "disp_pmml = ConfusionMatrixDisplay(confusion_matrix=cm_pmml, display_labels=['No Anomaly', 'Anomaly']) \n", + "disp_pmml.plot(ax=ax1, cmap='Blues', colorbar=False) \n", + "ax1.set_title('PMML Confusion Matrix') \n", + "ax1.set_xlabel('Predicted Label') \n", + "ax1.set_ylabel('True Label') \n", + "ax1.set_xticks([0, 1]) \n", + "ax1.set_yticks([0, 1]) \n", + "ax1.set_xticklabels(['No Anomaly', 'Anomaly']) \n", + "ax1.set_yticklabels(['No Anomaly', 'Anomaly'])\n", + "\n", + "# Add text to the plot to show the actual values of the confusion matrix\n", + "for i in range(cm_pmml.shape[0]): \n", + " for j in range(cm_pmml.shape[1]): \n", + " ax1.text(j, i, f'{cm_pmml[i, j]}', ha='center', va='center', color='white' if cm_pmml[i, j] > cm_pmml.max() / 2 else 'black') \n", + "\n", + "# Plot DecisionForest confusion matrix\n", + "disp_df = ConfusionMatrixDisplay(confusion_matrix=cm_df, display_labels=['No Anomaly', 'Anomaly']) \n", + "disp_df.plot(ax=ax2, cmap='Blues', colorbar=False) \n", + "ax2.set_title('RandomForest Confusion Matrix') \n", + "ax2.set_xlabel('Predicted Label') \n", + "ax2.set_ylabel('True Label') \n", + "ax2.set_xticks([0, 1]) \n", + "ax2.set_yticks([0, 1]) \n", + "ax2.set_xticklabels(['No Anomaly', 'Anomaly']) \n", + "ax2.set_yticklabels(['No Anomaly', 'Anomaly'])\n", + "\n", + "# Add text to the plot to show 
the actual values of the confusion matrix\n", + "for i in range(cm_df.shape[0]): \n", + " for j in range(cm_df.shape[1]): \n", + " ax2.text(j, i, f'{cm_df[i, j]}', ha='center', va='center', color='white' if cm_df[i, j] > cm_df.max() / 2 else 'black') \n", + "\n", + "# Adjust layout and spacing\n", + "plt.tight_layout() \n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "6f7bd547-6020-42c0-b2a7-d1938a9bdb30", + "metadata": {}, + "source": [ + "

    The confusion matrix for this binary class classification problem has the below 4 quadrants:

    \n", + "\n", + "
  • True Positive (TP) refers to a sample belonging to the positive class being classified correctly.
  • \n", + "
  • True Negative (TN) refers to a sample belonging to the negative class being classified correctly.
  • \n", + "
  • False Positive (FP) refers to a sample belonging to the negative class but being classified wrongly as belonging to the positive class.
  • \n", + "
  • False Negative (FN) refers to a sample belonging to the positive class but being classified wrongly as belonging to the negative class.
  • \n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "43be6263-22d8-43d2-94e2-1f58d730f567", + "metadata": {}, + "source": [ + "

    Conclusion

    \n", + "

    We have seen an end-to-end exploration process for labelling anomalous time series using ClearScape Analytics on Teradata Vantage. Thanks to the in-database capabilities offered by Teradata Vantage with ClearScape Analytics, we were able to run this exploration with the smallest notebook instance. The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.

    \n", + "

    In this particular use case, we have observed that with large volume of machine sensor data millions of ML models were created to derive analytic features that ultimately deployed tens of thousands of models for real-time scoring. This extent of scale is only possible by combining the power of Vantage with native ClearScape Analytic functions.

    " + ] + }, + { + "cell_type": "markdown", + "id": "4cb4409a-847a-4501-95bb-8268958315ec", + "metadata": {}, + "source": [ + "
    \n", + "

    9. Model Explainability

    \n", + "

    Trusted AI

    \n", + "\n", + "

    Trusted AI is important for the in-database functions and data pipelines used in predictive AI/ML, providing significant benefits when applied. One way to enhance the benefits: Teradata VantageCloud, the only platform to offer the massively parallel processing (MPP) architecture that enables best-in-class vertical and horizontal scaling of models.

    \n", + "\n", + "

    LIME stands for Local Interpretable Model-agnostic Explanations. LIME focuses on training local surrogate models to explain individual predictions. Local surrogate models are interpretable models that are used to explain individual predictions of black box machine learning models. Surrogate models are trained to approximate the predictions of the underlying black box model. Instead of training a global surrogate model, LIME focuses on training local surrogate models.

    \n", + "\n", + "

    In practice, LIME only optimizes the loss part. The user has to determine the complexity, e.g. by selecting the maximum number of features that the linear regression model may use.

    \n", + "\n", + "

    So, the recipe for training local surrogate models is as follows:

    \n", + "\n", + "
  • Select your instance of interest for which you want to have an explanation of its black box prediction.
  • \n", + "
  • Perturb your dataset and get the black box predictions for these new points.
  • \n", + "
  • Weight the new samples according to their proximity to the instance of interest.
  • \n", + "
  • Train a weighted, interpretable model on the dataset with the variations.
  • \n", + "
  • Explain the prediction by interpreting the local model.
  • \n", + "\n", + "

    Here we will use the model created with the teradataml open-source ML functions to create the explainer and explain the model parameters. LIME has an attribute lime_tabular that can interpret how the features correlate to the target outcome. We can also specify the mode as classification, training_label as the target outcome (Anomaly), and the features that we selected in the training process.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "908bc562-13e9-4af4-893f-bf0097d22cc9", + "metadata": {}, + "outputs": [], + "source": [ + "import lime.lime_tabular\n", + "explainer = lime.lime_tabular.LimeTabularExplainer(X_train.get_values(), feature_names=X_train.columns, \n", + " class_names=['Anomaly','NoAnomaly'], verbose=True, mode='classification')" + ] + }, + { + "cell_type": "markdown", + "id": "dcfdd23a-a708-4954-9499-16da43b8c2ae", + "metadata": {}, + "source": [ + "

    We will choose 1 instance of the data and use it to explain the predictions.

    \n", + "

    Note: Please replace the WELDING_ID with the ID for which we need an explanation.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "56dc3b86-06ba-4599-838b-37e2c5b193fa", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "13cf6ec9c9da4a76af1198f2f374eb6f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + 
"\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    WELDING_IDsum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diffanomaly_int
    3186754.86978245218118.8968714267842362.48279195869617241.6570188926244569.878426705688414882.9945188622680.08507576069901807-1.42337112306267-35.309703780996130
    4991432.8122418440288.86290355992934363.8854353719825254.6874992809025670.823638156064755015.98772166119-0.407115172919324-0.9634199725189564-35.265341369459120
    66103051.29313790884135.40482535648476380.15990390532056287.0509558159020662.962737284257513964.3062863264304-0.6235690534090123-0.5482607552102269-40.703648564590711
    6997411.15857106655168.72998702644077362.43335495393376271.34027457121655.080923684897663033.9081539815206-0.19776981850595154-1.3735577645369346-33.541965004148611
    13494420.899597615894.11009786670994375.05861641361037263.0108623889019680.601478628631096496.598357121674-0.4659654036466217-1.005563091732258-33.0383809672692340
    15982057.3797522587190.58182588355331336.17284608259644228.572088446403176.269572808218345817.0477363481195-0.30761862548752666-1.254185147630323-30.428410194935340
    16184858.5233496950178.97502599970588398.92248591342815236.3747168515181498.295322133555549661.970353339455-0.047871194645529774-1.3935480197759396-42.871267181087490
    18177288.8501368628286.43693248109098347.45875485406475215.289276147250282.095484982158576739.668654455822-0.00877229297708187-1.3780222101024648-32.233925131194380
    18384737.5484590543869.41522063694657414.25927750342277236.0377394402629111.302014883651712388.1385171606220.04790406345700506-1.5726990344048148-41.1714954969646240
    19170922.0380749592285.22354420250932324.8828641407507197.5544236071287473.602267487526465417.2937793053940.0890401373922395-1.390612017613927-28.396842346793620
    19789400.06752725711101.29585436346969361.75865663789249.0252577360922173.948677595399775468.40691810838-0.24937134877639008-1.150863210615582-37.8314976805794460
    21377113.9208639003595.5138494285472310.05832297655536214.802007977438362.36366694420543889.2269547277774-0.18709413696807978-1.201959178021247-31.099755380094480
    22080431.8067954102999.8457585154082318.63240984263155224.0440300707807462.147565041356873862.319840569682-0.3778431806772692-1.0024887103133715-34.435338121402250
    24675325.5810988189287.41145030453134328.385988902767209.82056016384172.693500841315065284.345064566274-0.16540605044478487-1.3743463677707606-34.407945466649890
    28882600.6500639613105.11201031767013321.9040522651045230.085376222733460.6754745811199143681.5132156441286-0.2672838739091899-1.1058119595257525-39.327263993346180
    21283454.39580858205100.12825272685797361.5405819021655232.463498074044775.202711931799735655.4478818972530.024830506918099376-1.293111687332468-32.12402975534860
    8294048.76279519273114.03951750210967366.35161875486074261.9742696244922568.100843494377364637.72488464568-0.4827546065748303-0.8556540622845826-49.745322557524760
    3393185.74584100883124.65019295111625351.7031928317579259.570322676904859.3547381126017653522.9849364155407-0.3229589598739873-1.046640890827172-40.224341015516271
    26103807.76297225684152.857497527659383.578384675018289.1581141288491557.6083228236750243318.7188585567565-0.21002500953619418-1.3168188493421034-42.3252110300654751
    2380968.3856646060691.32685504056975362.0288544976556225.538678731493276.160922442548545800.486107299894-0.06229470076093798-1.2994883640937822-38.240981405083060
    " + ], + "text/plain": [ + " WELDING_ID sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff anomaly_int\n", + "0 31 86754.869782 118.896871 362.482792 241.657019 69.878427 4882.994519 0.085076 -1.423371 -35.309704 0\n", + "1 49 91432.812242 88.862904 363.885435 254.687499 70.823638 5015.987722 -0.407115 -0.963420 -35.265341 0\n", + "2 66 103051.293138 135.404825 380.159904 287.050956 62.962737 3964.306286 -0.623569 -0.548261 -40.703649 1\n", + "3 69 97411.158571 168.729987 362.433355 271.340275 55.080924 3033.908154 -0.197770 -1.373558 -33.541965 1\n", + "4 134 94420.899598 94.110098 375.058616 263.010862 80.601479 6496.598357 -0.465965 -1.005563 -33.038381 0\n", + "5 159 82057.379752 90.581826 336.172846 228.572088 76.269573 5817.047736 -0.307619 -1.254185 -30.428410 0\n", + "6 161 84858.523350 78.975026 398.922486 236.374717 98.295322 9661.970353 -0.047871 -1.393548 -42.871267 0\n", + "7 181 77288.850137 86.436932 347.458755 215.289276 82.095485 6739.668654 -0.008772 -1.378022 -32.233925 0\n", + "8 183 84737.548459 69.415221 414.259278 236.037739 111.302015 12388.138517 0.047904 -1.572699 -41.171495 0\n", + "9 191 70922.038075 85.223544 324.882864 197.554424 73.602267 5417.293779 0.089040 -1.390612 -28.396842 0\n", + "10 197 89400.067527 101.295854 361.758657 249.025258 73.948678 5468.406918 -0.249371 -1.150863 -37.831498 0\n", + "11 213 77113.920864 95.513849 310.058323 214.802008 62.363667 3889.226955 -0.187094 -1.201959 -31.099755 0\n", + "12 220 80431.806795 99.845759 318.632410 224.044030 62.147565 3862.319841 -0.377843 -1.002489 -34.435338 0\n", + "13 246 75325.581099 87.411450 328.385989 209.820560 72.693501 5284.345065 -0.165406 -1.374346 -34.407945 0\n", + "14 288 82600.650064 105.112010 321.904052 230.085376 60.675475 3681.513216 -0.267284 -1.105812 -39.327264 0\n", + "15 212 83454.395809 100.128253 361.540582 232.463498 75.202712 5655.447882 0.024831 
-1.293112 -32.124030 0\n", + "16 82 94048.762795 114.039518 366.351619 261.974270 68.100843 4637.724885 -0.482755 -0.855654 -49.745323 0\n", + "17 33 93185.745841 124.650193 351.703193 259.570323 59.354738 3522.984936 -0.322959 -1.046641 -40.224341 1\n", + "18 26 103807.762972 152.857498 383.578385 289.158114 57.608323 3318.718859 -0.210025 -1.316819 -42.325211 1\n", + "19 23 80968.385665 91.326855 362.028854 225.538679 76.160922 5800.486107 -0.062295 -1.299488 -38.240981 0" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_test_df = data_val\n", + "X_test_df.head(20)" + ] + }, + { + "cell_type": "markdown", + "id": "b7e30288-56ee-46a5-9c14-63a079aba522", + "metadata": {}, + "source": [ + "

    Please replace the IDs in the below cell with any 2 WELDING_IDs from the above output dataframe.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "6ab9cc59-a9a7-4445-be5d-09665230b782", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ID1: 31\n", + "ID2: 23\n" + ] + } + ], + "source": [ + "ID1=input('ID1:')\n", + "ID2=input('ID2:')" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "ba156ce1-34a1-4a24-a04d-882d0cd1a082", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4d95d72933ad4cdf9900b837a844f5ed", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    sum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diff
    86754.86978245218118.8968714267842362.48279195869617241.6570188926244569.878426705688414882.9945188622680.08507576069901807-1.42337112306267-35.30970378099613
    " + ], + "text/plain": [ + " sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff\n", + "0 86754.869782 118.896871 362.482792 241.657019 69.878427 4882.994519 0.085076 -1.423371 -35.309704" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = X_test_df[X_test_df.WELDING_ID==ID1]\n", + "df = df.drop(columns=[\"WELDING_ID\",\"anomaly_int\"])\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "97a9253d-2459-4185-bf31-7613c6049976", + "metadata": {}, + "source": [ + "

    Next, we call the explainer using the selected instance and the model object created using the RandomForestClassifier.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "1e463fab-ef93-42e7-888a-93d560fec61c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intercept 0.3255261119033892\n", + "Prediction_local [0.16654005]\n", + "Right: 0.049446324561220036\n" + ] + } + ], + "source": [ + "exp = explainer.explain_instance(df.get_values().flatten(), RF_classifier.modelObj.predict_proba, num_features=9)" + ] + }, + { + "cell_type": "markdown", + "id": "36c413e2-fbea-4c14-9a6d-c57439d02db7", + "metadata": {}, + "source": [ + "

    We display the results using the show_in_notebook function of the explainer

    " + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "eace0bf3-be59-47ab-a99f-007c2c2829b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + "
    \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython import display\n", + "warnings.simplefilter(action='ignore', category=DeprecationWarning)\n", + "warnings.simplefilter(action='ignore', category=ResourceWarning)\n", + "exp.show_in_notebook(show_table=True)" + ] + }, + { + "cell_type": "markdown", + "id": "1fce7fdd-7877-442b-ad05-6bcf911d7fd8", + "metadata": {}, + "source": [ + "

    This gives a result as shown in the image above. There are three parts to the explanation :

    \n", + "\n", + "
  • The leftmost section displays the prediction probabilities.
  • \n", + "
  • The middle section returns the features. For the binary classification task, they are shown in two colors, orange and blue. Attributes in orange support class 0 and those in blue support class 1.
  • \n", + "
  • Floating-point numbers on the horizontal bars represent the relative importance of these features. The color-coding is consistent across sections. It contains the actual values of the variables.
  • \n", + "\n", + "

    We will repeat the same steps for 1 more instance

    " + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "c7900aca-d181-41a0-a114-5a812273d657", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3dddb984c51840a68d35ff77a82aa51a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\t\n", + "\n", + "\n", + "\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\t\n", + "\t\n", + "
    sum_RESISTANCEmin_RESISTANCEmax_RESISTANCEmean_RESISTANCEstd_RESISTANCEvar_RESISTANCEskew_RESISTANCEkurtosis_RESISTANCEmin_resistance_diff
    80968.3856646060691.32685504056975362.0288544976556225.538678731493276.160922442548545800.486107299894-0.06229470076093798-1.2994883640937822-38.24098140508306
    " + ], + "text/plain": [ + " sum_RESISTANCE min_RESISTANCE max_RESISTANCE mean_RESISTANCE std_RESISTANCE var_RESISTANCE skew_RESISTANCE kurtosis_RESISTANCE min_resistance_diff\n", + "0 80968.385665 91.326855 362.028854 225.538679 76.160922 5800.486107 -0.062295 -1.299488 -38.240981" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = X_test_df[X_test_df.WELDING_ID==ID2]\n", + "df = df.drop(columns=[\"WELDING_ID\",\"anomaly_int\"])\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "4b55d286-5420-4e04-96bb-5d23cb55581d", + "metadata": {}, + "source": [ + "

    Next, we call the explainer using the selected instance and the model object created using the RandomForestClassifier.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "1ee238a5-3122-44c8-8bc3-286df4f8216f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intercept 0.3665620653425659\n", + "Prediction_local [0.04139662]\n", + "Right: 0.049446324561220036\n" + ] + } + ], + "source": [ + "exp = explainer.explain_instance(df.get_values().flatten(), RF_classifier.modelObj.predict_proba, num_features=9)" + ] + }, + { + "cell_type": "markdown", + "id": "521cc53a-4afb-498a-84ae-4fe45de60c53", + "metadata": {}, + "source": [ + "

    We display the results using the show_in_notebook function of the explainer

    " + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "e7844ab3-87ab-44d6-a1c9-eb43818d5ad7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + "
    \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from IPython import display\n", + "warnings.simplefilter(action='ignore', category=DeprecationWarning)\n", + "warnings.simplefilter(action='ignore', category=ResourceWarning)\n", + "exp.show_in_notebook(show_table=True)" + ] + }, + { + "cell_type": "markdown", + "id": "66c2c88e-1aac-46e4-8fff-789f603ff7e7", + "metadata": {}, + "source": [ + "

    Similar to the previous example, the above image shows three graphs that each show essential information about the anomaly.

    \n", + "\n", + "

    The left graph shows the prediction probabilities, and the middle and rightmost graphs show the features and their contribution towards the prediction.

    \n", + "

    Thus, with the explainer functions, we obtain explanations, based on the different feature values, of why the weldings have or do not have an anomaly.

    " + ] + }, + { + "cell_type": "markdown", + "id": "29e90d19-1b71-44e8-b6d5-aa53e3b673c1", + "metadata": {}, + "source": [ + "
    \n", + "

    10. Cleanup

    \n", + "

    Work Tables

    " + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "48a959e6-319f-4592-93af-482d391224b4", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['ADS_train_data', 'ADS_test_data','DF_train', 'DF_Predict', 'DF_Predict_test','additional_metrics_test']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "c233387e-cff1-4e6e-81a7-2e3b3221b957", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User environment 'osml_env' removed.\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "remove_env(\"osml_env\")" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "bbf8f9bc-9f3a-47e9-b2d4-81fd00291bc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No user environment(s) found.\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "c5cea44c-e3e0-4634-bfa9-efa65c42ac44", + "metadata": {}, + "source": [ + "

    If you have updated the teradataml package, reinstall the package by uncommenting and running the below code cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "93311aa2-79b1-44bd-926d-5c5bc23a1999", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# !pip install teradataml==17.20.0.6 --force-reinstall\n", + "!pip install scikit-learn==1.0.2 --force-reinstall\n", + "!pip install numpy==1.24.2 --force-reinstall" + ] + }, + { + "cell_type": "markdown", + "id": "d51fd98f-b9b2-48b9-b639-16cc51f9116f", + "metadata": {}, + "source": [ + "
    \n", + "

    11. Exploring the Versatility of this Analytical Approach in Alternative Use Case Settings

    \n", + "

    How this analytic approach can be leveraged in other use case settings

    \n", + "\n", + "

    The analytical approach of leveraging clustering followed by classification for anomaly detection in short time series data is highly adaptable and can be broadly applied across various industries, especially in settings where operations or processes are characterized by short, continuous time series with a defined start and end and where ground truth labels are not initially available.

    \n", + "

    This method begins with unsupervised learning to explore and understand the data, identifying patterns, similarities, and potential outliers through techniques like Dynamic Time Warping (DTW). Such exploration is crucial in settings where anomalies are not predefined or where the data’s inherent complexity requires initial unsupervised insight to develop an understanding of what constitutes normal behavior versus an anomaly. Following the clustering phase, supervised classification models are trained on the newly identified labels to predict anomalies. This generic approach is particularly effective for short time series data, where each sequence represents a process or event whose normal operational parameters need to be defined through exploratory analysis before precise anomaly detection can occur.

    \n", + "

    Potential Use Cases Across Industries:

    \n", + "
  • Telco & Utilities - Power Grid Load Monitoring: Analyzing short time series of electricity load during peak usage times to identify anomalies that could indicate equipment failure, energy theft, or inefficiencies in power distribution. Each series could represent the load profile for a brief, high-demand period.
  • \n", + "
  • Healthcare - ECG or EEG Analysis: Short segments of electrocardiogram (ECG) or electroencephalogram (EEG) readings can be analyzed to detect anomalies indicating cardiac arrhythmias or neurological issues, respectively. Each segment represents a complete heartbeat or a brief brain activity pattern.
  • \n", + "
  • Manufacturing - CNC Machine Operations: Monitoring the torque and force profiles of a CNC (Computer Numerical Control) machine during a single machining operation. Anomalies could indicate tool wear, material inconsistency, or operational errors.
  • \n", + "
  • Travel & Transport - Aircraft Engine Test Runs: Analyzing the time series data of engine parameters (e.g., temperature, pressure, vibration) during short test runs to identify deviations from normal operational profiles, suggesting maintenance or safety issues.
  • \n", + "
  • Hospitality & Entertainment - Theme Park Ride Operations: Analyzing sensor data from individual rides, where each ride cycle produces a time series of mechanical or operational parameters. Anomalies in these series could indicate safety concerns or maintenance needs.
  • \n", + "

    Conclusion

    \n", + "

    In each of these scenarios, the focus is on analyzing the shape or behavior of a curve within a short time frame, similar to observing a spot welding curve. These curves are shaped by the specific activity taking place, whether it’s a machine at work, a health test running, financial trades happening, or people interacting with a service. The method begins by sorting these curves into groups based on their patterns, without needing to know ahead of time which ones are out of the ordinary. Then, it moves on to use a more detailed approach to pinpoint which curves don’t fit the expected pattern, labeling them as either normal or not normal. This way of doing things is great for quickly finding and addressing issues, and it also helps in getting a better grasp of how these processes work. This can lead to making things run more smoothly and keeping equipment in good shape before problems even start.

    " + ] + }, + { + "cell_type": "markdown", + "id": "91bd8857-19e0-4200-b3ae-b2efdbca73d3", + "metadata": {}, + "source": [ + "
    \n", + "Resources\n", + "

    Let’s look at the elements we have available for reference for this notebook:

    \n", + "Filters: \n", + "
  • Industry: Manufacturing
  • \n", + "
  • Functionality: Machine Learning
  • \n", + "
  • Use Case: Anomaly Detection
  • \n", + "Related Resources:\n", + "
  • Hyper-scale time series forecasting done right
  • \n", + "
  • Stay Ahead of Continuous and Rapid Change with a Dynamic Supply Chain
  • \n", + "
  • Achieve industry 4.0 using advanced manufacturing analytics at scale
  • \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "1da48da7-d4de-4693-9365-5d5f63810673", + "metadata": { + "tags": [] + }, + "source": [ + "
    \n", + "
    ClearScape Analytics™
    \n", + "
    \n", + "
    \n", + " Copyright © Teradata Corporation - 2023, 2024. All Rights Reserved\n", + "
    \n", + "
    \n", + "
    " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Anomaly_Detection/images/AnomalyWelding.png b/VantageCloud_Lake/UseCases/Anomaly_Detection/images/AnomalyWelding.png new file mode 100644 index 00000000..45fec58c Binary files /dev/null and b/VantageCloud_Lake/UseCases/Anomaly_Detection/images/AnomalyWelding.png differ diff --git a/VantageCloud_Lake/UseCases/EFS_Demo/VCL_Telco_Churn_using_EFS.ipynb b/VantageCloud_Lake/UseCases/EFS_Demo/VCL_Telco_Churn_using_EFS.ipynb new file mode 100644 index 00000000..bef893af --- /dev/null +++ b/VantageCloud_Lake/UseCases/EFS_Demo/VCL_Telco_Churn_using_EFS.ipynb @@ -0,0 +1,934 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7f2f3523-c5f5-42c1-8e78-eb8a818fd487", + "metadata": {}, + "source": [ + "
    \n", + "

    \n", + " Telco Churn using Enterprise Feature Store in Vantage\n", + "
    \n", + " \"Teradata\"\n", + "

    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "eff89c2c-c21b-40c0-b60f-24dedde6270b", + "metadata": {}, + "source": [ + "

    Introduction

    \n", + "\n", + "

    \n", + "Customer churn is a concern for all companies, but the complexity makes it difficult to track. Customers may leave due to various reasons such as dissatisfaction with service quality, pricing, customer service, or finding better alternatives from competitors. Although some churn may be expected, companies aim to retain their customers to avoid using additional resources to find new customers. Thus, with the help of Teradata Vantage, companies can attain their goal of identifying the factors contributing to the churn, so they can take appropriate measures to retain customers. Vantage’s capabilities allow companies to analyze large amounts of customer data, such as usage patterns, billing information, demographics, and interactions, to find patterns that may indicate customers who are at risk of churning. Plus, Teradata’s machine learning and predictive analytics can be used to build models to predict which customers are likely to churn in the future. This information will give companies the chance to intervene, including sending targeted marketing campaigns, personalized offers, improved customer service, or addressing customer concerns.

    \n", + "\n", + "

    Successful AI/ML implementations face three main challenges:

    \n", + "
  • The Data Problem: Quality data and feature engineering consume 80% of the implementation time. Even when different use cases share the same source data and features, organizations often handle data preparation separately.
  • \n", + "
  • The Scale Problem: Real-world use cases often require multiple models. In production, these models require fresh features engineered in the same way as during training. Ensuring the auditability of these features behind model decisions is crucial.
  • \n", + "
  • The Deployment Problem: Transitioning prototypes to production, especially operationalizing data prep pipelines, is often problematic.
  • \n", + "\n", + "

    Addressing these challenges requires strategic planning, skilled talent, and integration with existing systems. Organizations with a history in Data Management recognize the benefits of reusable Data Products, making Enterprise Feature Stores a valuable investment.

    \n", + "\n", + "

    A Feature Store is a curated repository of pre-calculated features, simplifying the journey from data to actionable insights. An Enterprise Feature Store extends across domains/teams, incorporating a Governance Framework for predictable feature delivery.

    \n", + " \n", + "

    While most features are reusable, some need model-specific calculations before integration into a unified dataset.

    \n", + " \n", + "

    The key difference between a Feature Store (FS) and an Enterprise Feature Store (EFS) is the scope across multiple domains/teams, along with the Governance Framework (which gives assurance that features are delivered under predictable SLAs and also defines the operating model for how the EFS is used across different teams/domains and how the feature lifecycle is managed). Although most Features are considered reusable, a small number of Features must still be calculated as model-specific (e.g., scaled variables, principal components, etc.) and then combined with the rest of the pre-calculated Features into a single data set (ADS). The figure below describes this co-existence of model-specific ADS(es) and model-independent EFS.

    \n", + "\n", + "\n", + "\n", + "\n", + "

    Business Values

    \n", + "\n", + "
  • Rapid model creation and deployment through enterprise feature reuse.
  • \n", + "
  • Flexible creation and usage of new features without extensive engineering support.
  • \n", + "
  • Consistent definitions ensure accuracy and quick deployment.
  • \n", + "
  • Collaboration and sharing of features among teams.
  • \n", + "
  • Maintained feature lifecycle for compliance and auditability.
  • \n", + "

    \n", + "\n", + "

    Why Vantage?

    \n", + "

    There are several reasons why EFS naturally fits to Teradata Vantage:

    \n", + "
  • Utilizes Teradata Vantage with its powerful Analytical Library and SQL Engine.
  • \n", + "
  • Primary Index enables efficient single-row access for online feature use.
  • \n", + "
  • Single platform for both online and offline feature stores.
  • \n", + "
  • Macros reduce parsing overhead from API access.
  • \n", + "
  • R and Python code execution within SQL Engine.
  • \n", + "
  • Bi-temporal querying capability.
  • \n", + "
  • Scalable MPP power for feature computation.
  • \n", + "
  • Industry-specific Logical Data Model as a feature source.
  • \n", + "
  • Connectivity to Object Storage via NOS for feature data sourcing.
  • \n", + "
  • Query Grid facilitates access to multiple data sources.
  • \n", + "
  • Close-to-real-time feature delivery using Query Services and Teradata Intelligent Memory.
  • \n", + "
  • Workload management prioritizes tasks effectively.
  • \n", + "

    The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.

    \n", + "\n", + "\n", + "

    Methodology

    \n", + "

    In this demo we have used a methodology which involves analyzing a time series of data, where each data point represents the outstanding amount at the end of each month. To detect anomalies, we use the following steps:

    \n", + "\n", + "
  • Model the Distribution: We assume that the historical data of monthly balances follow a normal distribution. This distribution is characterized by two parameters: the mean (μ) and the standard deviation (σ). These are the features of the Entity.
  • \n", + "\n", + "
  • Compute the Z-Score: For the most recent monthly balance (the latest data point in the time series), we compute its Z-score. The Z-score is a statistical measure that describes a value's relationship to the mean of a group of values. It is calculated using the formula:
  • \n", + "\n", + "

    Z = (X - μ) / σ

    \n", + "\n", + "

    where X is the value in question, μ is the mean, and σ is the standard deviation.

    \n", + "\n", + "
  • Threshold for Anomaly Detection: We set a threshold for the Z-score. If the absolute value of the Z-score for the latest monthly balance exceeds this threshold, it is flagged as an anomaly.
  • \n", + "\n", + "

    It's important to note that the computation of the Z-score and the anomaly flag is dependent on the values of the mean and standard deviation. These dependent features are not computed at the same time as the static features but are derived later, once the latest outstanding amount (the new data point) becomes available.

    \n", + "\n", + "

    Feature Engineering

    \n", + "

    Feature engineering is a crucial step in the entity-feature paradigm, as it involves creating and transforming features to better represent the underlying problem for predictive modeling. In our case, the feature engineering process is twofold, each with its specific inputs and outputs. Below are the processes that are a part of this feature engineering:

    \n", + "\n", + "
  • Process 1: Computing Mean and Standard Deviation
  • \n", + "
  • Process 2: Computing Z-Score and Anomaly Flag
  • \n", + "
  • Roll Out: Feature Engineering rollout\n", + "
      \n", + "
    • Addressing Circular Dependency
    • \n", + "
    • Roll out after adjusting circular dependency
    \n", + "
  • Validation: Feature Store Validation
  • \n", + "

    " + ] + }, + { + "cell_type": "markdown", + "id": "fe80ce62-d761-4ffe-b171-93b036739e92", + "metadata": {}, + "source": [ + "
    \n", + "

    1. Configure the environment

    \n", + "

    \n", + "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n", + "
    \n", + "Here's how we can do this:

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d409c76-27ef-4940-98c0-c541aa9cb1a0", + "metadata": {}, + "outputs": [], + "source": [ + "# '%%capture' suppresses the display of installation steps of the following packages\n", + "%%capture\n", + "!pip install dotenv\n", + "!pip install --upgrade teradataml" + ] + }, + { + "cell_type": "markdown", + "id": "0f0df2d7-7b1c-413c-bd59-5f87035f9a90", + "metadata": {}, + "source": [ + "
    \n", + "

    Note: Please execute the above pip install to get the latest version of the required library. Be sure to restart the kernel after executing those lines to bring the installed libraries into memory. The simplest way to restart the Kernel is by typing zero zero: 0 0

    \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48fb9b4e-8690-4de6-ab49-211f6413cfd1", + "metadata": {}, + "outputs": [], + "source": [ + "# Standard libraries\n", + "import json\n", + "import warnings\n", + "import getpass\n", + "from dotenv import load_dotenv, dotenv_values\n", + "\n", + "# Suppress warnings\n", + "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter(action='ignore', category=DeprecationWarning)\n", + "warnings.simplefilter(action='ignore', category=RuntimeWarning)\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "\n", + "# Teradata libraries\n", + "from teradataml import *\n", + "display.max_rows = 5\n", + "\n", + "# Data manipulation and visualization libraries\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "1bcca830-84d7-452b-9a4a-21853933afd4", + "metadata": {}, + "source": [ + "
    \n", + "

    2. Connect to VantageCloud Lake

    \n", + "

    Connect to VantageCloud using create_context from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cbf3a421-cfeb-4c21-978f-d2f9bea77bf9", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=VCL_Telco_Customer_Churn_using_EFS.ipynb;' UPDATE FOR SESSION; ''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a4794150-a65b-4c3f-8d24-d42f34b6167b", + "metadata": {}, + "source": [ + "

    Setup a Feature Store

    " + ] + }, + { + "cell_type": "markdown", + "id": "0776e2ee-bdb8-4927-9d14-5cff5583b6ee", + "metadata": {}, + "source": [ + "

    We can now set up the feature store using the FeatureStore class.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e72cc2f-2e36-47cd-a2da-19da50f892d2", + "metadata": {}, + "outputs": [], + "source": [ + "username=env_vars.get(\"username\")\n", + "fs = FeatureStore(repo=username)\n", + "fs.setup()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d7f40c3-8efe-4042-9ede-a767681f0fcf", + "metadata": {}, + "outputs": [], + "source": [ + "# List whether FeatureStore is setup or not.\n", + "fs.list_repos()" + ] + }, + { + "cell_type": "markdown", + "id": "e80e18e4-d009-4d88-8340-72636ca8f0dd", + "metadata": {}, + "source": [ + "
    \n", + "

    3. Load the data

    \n", + "\n", + "

    We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_Telco\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").

    \n", + " \n", + "

    **Note: The tables are available in DEMO_Telco_DB database and we have created views in DEMO_Telco database which are used in the cells below

    " + ] + }, + { + "cell_type": "markdown", + "id": "d8540286-8309-47c6-9aff-fe153700ee9d", + "metadata": {}, + "source": [ + "
    \n", + "

    4. Feature Engineering

    " + ] + }, + { + "cell_type": "markdown", + "id": "c4992424-3837-4a9f-b532-2e8d188d8c02", + "metadata": {}, + "source": [ + "

    The code creates a DataFrame named df using the DataFrame function. The in_schema function specifies the schema, which in this case is \"DEMO_Telco\", and the table name \"Customer_Churn\". Let us now start with feature engineering.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "271b6d33-7792-442d-8c46-7e4d659e5920", + "metadata": {}, + "outputs": [], + "source": [ + "df = DataFrame(in_schema(\"DEMO_Telco\", \"Customer_Churn\"))\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "4c1d8130-0f2a-409f-be7e-6b9c32649208", + "metadata": {}, + "source": [ + "

    This code performs the following operations:

    \n", + "
      \n", + "
    1. Assigning New Values: The df.assign() function is used to create new columns or modify existing ones in the DataFrame df.
    2. \n", + "
    3. Replacing Values:\n", + "
        \n", + "
      • MultipleLines: Replaces \"No phone service\" with \"No\".
      • \n", + "
      • OnlineSecurity, OnlineBackup, DeviceProtection, TechSupport, StreamingTV, StreamingMovies: Replaces \"No internet service\" with \"No\" for each of these columns.
      • \n", + "
      \n", + "
    4. \n", + "
    5. Converting Churn Values:\n", + "
        \n", + "
      • Churn: Uses the case function to convert \"Yes\" to 1 and \"No\" to 0. If the value is neither \"Yes\" nor \"No\", it defaults to 0.
      • \n", + "
      \n", + "
    6. \n", + "
    7. Displaying the DataFrame: The final df statement displays the modified DataFrame.
    8. \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4cb4956-2260-4c65-a6fa-b91ec4048cc2", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(\n", + " MultipleLines = df.MultipleLines.replace(\"No phone service\",\"No\"),\n", + " OnlineSecurity = df.OnlineSecurity.replace(\"No internet service\",\"No\"),\n", + " OnlineBackup = df.OnlineBackup.replace(\"No internet service\",\"No\"),\n", + " DeviceProtection = df.DeviceProtection.replace(\"No internet service\",\"No\"),\n", + " TechSupport = df.TechSupport.replace(\"No internet service\",\"No\"),\n", + " StreamingTV = df.StreamingTV.replace(\"No internet service\",\"No\"),\n", + " StreamingMovies = df.StreamingMovies.replace(\"No internet service\",\"No\"),\n", + " Churn = case({ \"Yes\" : 1, \"No\" : 0}, value=df.Churn,else_=0)\n", + ")\n", + "\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b383aab-7165-4858-9e21-df0a9297b146", + "metadata": {}, + "outputs": [], + "source": [ + "print(df.show_query())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83827624-1f49-4b05-898f-13aa03b346c3", + "metadata": {}, + "outputs": [], + "source": [ + "df = ConvertTo(\n", + " data=df,\n", + " target_columns=['CustomerID', 'Gender', 'Partner', 'Dependents', 'PhoneService',\n", + " 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup',\n", + " 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',\n", + " 'Contract', 'PaperlessBilling', 'PaymentMethod'],\n", + " target_datatype=[\"VARCHAR(charlen=10,charset=UNICODE,casespecific=NO)\"]\n", + ").result" + ] + }, + { + "cell_type": "markdown", + "id": "c8f2207c-e7e9-4ffa-ac7a-37a78b129eb5", + "metadata": {}, + "source": [ + "

    Let's store the transformed data in a table.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df39f7d9-37b5-445f-bb29-4f4883c0d021", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(\n", + " df=df,\n", + " table_name='transformed_data',\n", + " if_exists='replace'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3cf33828-2e9e-42e5-883b-469cb47dd515", + "metadata": {}, + "source": [ + "
    \n", + "

    5. Save feature and feature processing to Feature Store

    " + ] + }, + { + "cell_type": "markdown", + "id": "2ec83605-4d29-47d5-93c3-7c05791d1782", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "98794d87-4d1e-4e58-ac39-1337beeace57", + "metadata": {}, + "source": [ + "

    Now we will proceed to save the features as well as the feature processing logic in the feature store.

    \n", + "

    This will allow us to re-use the features and processing later on, avoiding the need to re-write the processing logic.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19557186-c5be-4be6-a452-1ff6869865e3", + "metadata": {}, + "outputs": [], + "source": [ + "df = DataFrame('transformed_data')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edbe4e7e-0b44-4624-9657-da2f5b7aff02", + "metadata": {}, + "outputs": [], + "source": [ + "# Create FeatureGroup for this DataFrame.\n", + "fg = FeatureGroup.from_DataFrame(name='telcom', df=df, entity_columns='CustomerID')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0565b334-f84e-4843-92b2-7e784494ac07", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's look at Features\n", + "fg.features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cb9c82f-8e99-4cca-bd7a-5d77c7764bd9", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's look at Entity.\n", + "fg.entity.columns" + ] + }, + { + "cell_type": "markdown", + "id": "191c113e-08af-4eca-bf7c-fcab24be71ec", + "metadata": {}, + "source": [ + "

    Here we will save the features and processing with additional metadata, such as the project name (churn).

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c5c15a2-ddca-4af5-919b-bc7cfd5fd11e", + "metadata": {}, + "outputs": [], + "source": [ + "# upload the features in the physical feature store\n", + "fs.apply(fg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37931c07-6843-4835-9b33-edd8b8fbc131", + "metadata": {}, + "outputs": [], + "source": [ + "fs.list_features()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3a60da1-b8ea-4461-a953-2da276a11d8f", + "metadata": {}, + "outputs": [], + "source": [ + "fs.list_feature_groups()" + ] + }, + { + "cell_type": "markdown", + "id": "beda7104-0cc0-48f9-b98f-ad573be558ea", + "metadata": {}, + "source": [ + "
    \n", + "

    6. Re-using features for machine learning

    " + ] + }, + { + "cell_type": "markdown", + "id": "57de3efb-2956-4a2a-9fc9-9c9ddd7bc155", + "metadata": {}, + "source": [ + "

    Now that our features have been stored in the feature store, let us re-use them to train a machine learning model.

    \n", + "

    We now only need to specify the feature name; we do not need to specify the processing logic.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dce440d3-7c72-4400-82b0-2ff09e8fc914", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = fs.get_dataset('telcom')\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "fcaca5d6-2e5a-400f-a653-cbf70c1564c8", + "metadata": {}, + "source": [ + "

    Our training dataset has now been created, with all the feature engineering applied.

    \n", + "

    We can see from the output that the column MultipleLines has only two values, Yes and No. The same features can also be re-used across multiple use-cases and models without any data preparation.

    " + ] + }, + { + "cell_type": "markdown", + "id": "46f6bf8f-b9e1-441a-b089-c1ceebdbc059", + "metadata": {}, + "source": [ + "

    We split the dataset into training and testing datasets with an 80:20 split ratio.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a5f3a51-60a2-4b80-8f82-a77d8adcc322", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Performing sampling to get 80% for trainning and 20% for testing\n", + "tdf_sample = df.sample(frac = [0.8, 0.2])\n", + "\n", + "# Fetching train and test data\n", + "tdf_train= tdf_sample[tdf_sample['sampleid'] == 1].drop('sampleid', axis=1)\n", + "tdf_test = tdf_sample[tdf_sample['sampleid'] == 2].drop('sampleid', axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "c86ae3c5-c44c-4c1d-bf08-c8e8d3d83d58", + "metadata": {}, + "source": [ + "
    \n", + "7. AutoML Training" + ] + }, + { + "cell_type": "markdown", + "id": "337e3e67-6a98-4f1f-8479-45a637ca0bb5", + "metadata": {}, + "source": [ + "

    AutoML (Automated Machine Learning) is an approach that automates the process of building, training, and validating machine learning models. It involves various algorithms to automate various aspects of the machine learning workflow, such as data preparation, feature engineering, model selection, hyperparameter tuning, and model deployment. It aims to simplify the process of building machine learning models, by automating some of the more time-consuming and labor-intensive tasks involved in the process.

    \n", + "\n", + "

    We create an AutoClassifier instance, which is a special-purpose AutoML feature to run classification-specific tasks. We use the exclude parameter to specify model algorithms to be excluded from the model training phase. Here we exclude the 'knn', 'svm', and 'glm' models. The max_runtime_secs parameter specifies the time limit in seconds for model training.\n", + "

    \n", + "verbose: specifies the detailed execution steps based on verbose level as follows:\n", + "

    \n", + "\n", + "
      \n", + "
    • 0: prints the progress bar and leaderboard
    • \n", + "
    • 1: prints the execution steps of AutoML.
    • \n", + "
    • 2: prints the intermediate data between the execution of each step of AutoML.
    • \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "ddce4f43-4035-44af-ba24-4944f7705a9c", + "metadata": {}, + "source": [ + "
    \n", + "4. AutoML Training" + ] + }, + { + "cell_type": "markdown", + "id": "e3b86886-ba91-4e36-9d0f-b0415dd59ccd", + "metadata": {}, + "source": [ + "

    AutoML (Automated Machine Learning) is an approach that automates the process of building, training, and validating machine learning models. It involves various algorithms to automate various aspects of the machine learning workflow, such as data preparation, feature engineering, model selection, hyperparameter tuning, and model deployment. It aims to simplify the process of building machine learning models, by automating some of the more time-consuming and labor-intensive tasks involved in the process.

    \n", + "\n", + "

    We create an AutoClassifier instance, which is a special-purpose AutoML feature to run classification-specific tasks. We use the exclude parameter to specify model algorithms to be excluded from the model training phase. Here we exclude the 'knn', 'svm', and 'glm' models. The max_runtime_secs parameter specifies the time limit in seconds for model training.\n", + "

    \n", + "verbose: specifies the detailed execution steps based on verbose level as follows:\n", + "

    \n", + "\n", + "
      \n", + "
    • 0: prints the progress bar and leaderboard
    • \n", + "
    • 1: prints the execution steps of AutoML.
    • \n", + "
    • 2: prints the intermediate data between the execution of each step of AutoML.
    • \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb8137fc-7250-427f-92c7-6070a8ffddea", + "metadata": {}, + "outputs": [], + "source": [ + "# Creating AutoClassifier Instance\n", + "# Selecting 'Auto' mode for AutoML training\n", + "# Excluding knn,glm and svm model from default model list for training\n", + "# Used early stopping timer criteria with value 600 sec\n", + "\n", + "aml = AutoClassifier(\n", + " exclude = ['knn','svm','glm'],\n", + " verbose = 2,\n", + " max_runtime_secs = 600\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "46e64a2e-648d-4321-8bd6-5bdd8051cbb6", + "metadata": {}, + "source": [ + "

    Note: Since the AutoML functionality does a lot of steps like Feature exploration and Data Preparation along with Model Training and Evaluating to select the Best model the below step may take anywhere between 12-15 minutes

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "378743d2-daae-4a91-af06-fbd566072902", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fitting train data \n", + "aml.fit(data = tdf_train, target_column = 'Churn')" + ] + }, + { + "cell_type": "markdown", + "id": "3ea7b521-d7c1-4994-b380-b0308c85743d", + "metadata": {}, + "source": [ + "
    \n", + "8. Model Leaderboard Generation" + ] + }, + { + "cell_type": "markdown", + "id": "4223d717-dd65-449f-8c2b-c415e1471ac3", + "metadata": {}, + "source": [ + "

    Here, we generate model leaderboard and leader for a given dataset. Leaderboard is a ranked table with a list of models with all their evaluation metrics.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22d69751-fe5e-46dd-9c24-3322a5bd1487", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching leaderboard\n", + "\n", + "aml.leaderboard()" + ] + }, + { + "cell_type": "markdown", + "id": "7a330d76-3671-42f0-a2e3-e0b7f2138048", + "metadata": {}, + "source": [ + "
    \n", + "9. Best Performing Model" + ] + }, + { + "cell_type": "markdown", + "id": "59cbeea7-d5d8-4333-b25a-8a02aa4b4cff", + "metadata": {}, + "source": [ + "

    The following function displays the best performing model.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2054141f-3c2e-47be-b9db-aef8b4c3424d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching best performing model\n", + "aml.leader()" + ] + }, + { + "cell_type": "markdown", + "id": "55a1096f-7c8b-4e2f-b028-aa7f89c22f15", + "metadata": {}, + "source": [ + "
    \n", + "10. Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "41a2ffe3-91f5-4da0-ac6c-6276323e4eb3", + "metadata": {}, + "source": [ + "

    The predict function generates predictions using either the default test data or any specified dataset, based on the model's rank in the leaderboard, and displays the performance metrics of the chosen model. If the test data contains a target column, both predictions and performance metrics are displayed; otherwise, only the predictions are shown.\n", + "

    \n", + "You can also use the rank parameter in the predict function. The rank parameter specifies the model's rank in the leaderboard to be used for prediction. By default, the rank is set to 1, meaning the best-performing model is used.

    " + ] + }, + { + "cell_type": "markdown", + "id": "0c2ea7e4-e02b-47d9-a4eb-18493f90d104", + "metadata": {}, + "source": [ + "
    \n", + "10.1 Generating prediction on external test data" + ] + }, + { + "cell_type": "markdown", + "id": "bff63293-7104-4308-ab81-a6527e9e1a4f", + "metadata": {}, + "source": [ + "

    Here, we specify the tdf_test dataset for prediction. When using external data instead of the default test data, the predict function applies all the data transformation steps performed during the training phase on the external data before passing the data to the model for prediction.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68dd9ecd-0b1c-4428-8911-61e4f5719d81", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching prediction and metrics on test data\n", + "prediction = aml.predict(tdf_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f563395c-b381-4fe4-ab20-7b1a7a4882bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Printing prediction\n", + "prediction" + ] + }, + { + "cell_type": "markdown", + "id": "d841fcd4-df97-4d31-bd25-28cce37c1a47", + "metadata": {}, + "source": [ + "
    \n", + "11. Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "d74e6e4a-ae3a-4249-a6ca-8d644f592db9", + "metadata": {}, + "source": [ + "

    We used the feature store to store features as well as their processing. We re-used them in model training. The features and processing can be re-used across multiple machine learning models and use-cases, helping to improve data science productivity.

    \n", + "\n", + "

    Teradata's AutoML functionality plays a crucial role in this context by automating the complex process of building and deploying machine learning models. AutoML ensures the most optimal preparation and training of models, delivering high-quality machine learning models in minutes. Through hyperparameter tuning (HPT), Teradata's AutoML can automatically select the best parameters for machine learning algorithms using grid search and random search techniques, significantly enhancing model performance.\n", + "

    \n", + "By leveraging Teradata's AutoML, companies can save time and reduce costs associated with manual model building and tuning. The technology not only improves the accuracy of predictive models but also democratizes the power of machine learning, allowing customers to utilize advanced analytics without requiring extensive coding or data science expertise. This capability enables companies to swiftly and effectively analyze customer churn data, develop predictive models, and implement proactive strategies to retain customers and enhance their satisfaction.\n", + "

    \n", + "In conclusion, Teradata's AutoML functionality is a vital tool for telecom companies aiming to reduce customer churn. By automating and optimizing the machine learning process, Teradata empowers various industries to make data-driven decisions that improve customer retention and drive long-term profitability.

    " + ] + }, + { + "cell_type": "markdown", + "id": "3fafdffb-2cde-4d99-8682-9ae64c74497d", + "metadata": {}, + "source": [ + "
    \n", + "12. Cleanup

    \n", + "

    Work Tables

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ff3fb9b-4d13-4628-988d-f82463d96537", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['transformed_data']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21fac6bb-3e3a-488c-848b-41473d6156e7", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "72bfa61c-3daa-4d47-b0d7-0a69ef13dc1a", + "metadata": {}, + "source": [ + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "0ff7e25c-dc4e-45d7-a67f-8c70e2c517f4", + "metadata": {}, + "source": [ + "Required Materials\n", + "

    Let’s look at the elements we have available for reference for this notebook:

    " + ] + }, + { + "cell_type": "markdown", + "id": "fc4938d2-5ce6-412e-a665-5d62a3b1a1b5", + "metadata": {}, + "source": [ + "

    Filters:

    \n", + "
      \n", + "
    • Industry: Telco
    • \n", + "
    • Functionality: Feature Store and AutoML
    • \n", + "
    • Use Case: Customer Retention
    • \n", + "
    \n", + "

    Related Resources:

    \n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "b4fd5272-ceb7-4d47-bd5c-c3aea31e471a", + "metadata": {}, + "source": [ + "

    Reference Links:

    \n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "0d6b60cb-d919-4daf-bc2a-35d7bf17eec7", + "metadata": {}, + "source": [ + "Dataset:\n", + "\n", + "- `CustomerID`: unique id of customer\n", + "- `Gender`: Whether the customer is a male or a female\n", + "- `SeniorCitizen`:Whether the customer is a senior citizen or not (1, 0)\n", + "- `Partner`:Whether the customer has a partner or not (Yes, No)\n", + "- `Dependents`:Whether the customer has dependents or not (Yes, No)\n", + "- `Tenure`:Number of months the customer has stayed with the company\n", + "- `PhoneService`:Whether the customer has a phone service or not (Yes, No)\n", + "- `MultipleLines`:Whether the customer has multiple lines or not (Yes, No, No phone service)\n", + "- `InternetService`:Customer’s internet service provider (DSL, Fiber optic, No)\n", + "- `OnlineSecurity`:Whether the customer has online security or not (Yes, No, No internet service)\n", + "- `OnlineBackup`:Whether the customer has online backup or not (Yes, No, No internet service)\n", + "- `DeviceProtection`:Whether the customer has device protection or not (Yes, No, No internet service)\n", + "- `TechSupport`:Whether the customer has tech support or not (Yes, No, No internet service)\n", + "- `StreamingTV`:Whether the customer has streaming TV or not (Yes, No, No internet service)\n", + "- `StreamingMovies`:Whether the customer has streaming movies or not (Yes, No, No internet service)\n", + "- `Contract`:The contract term of the customer (Month-to-month, One year, Two year)\n", + "- `PaperlessBilling`:Whether the customer has paperless billing or not (Yes, No)\n", + "- `PaymentMethod`:The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))\n", + "- `MonthlyCharges`:The amount charged to the customer monthly\n", + "- `TotalCharges`:The total amount charged to the customer\n", + "- `Churn`:Whether the customer churned or not (Yes or No)" + ] + }, + { + "cell_type": 
"markdown", + "id": "d7e28609-20ff-47e0-a640-48db6a7fa523", + "metadata": {}, + "source": [ + "
    \n", + "
    ClearScape Analytics™
    \n", + "
    \n", + "
    \n", + " Copyright © Teradata Corporation - 2025. All Rights Reserved\n", + "
    \n", + "
    \n", + "
    " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/EFS_Demo/VCL_Telco_Churn_using_tdfs4ds_Feature_Store.ipynb b/VantageCloud_Lake/UseCases/EFS_Demo/VCL_Telco_Churn_using_tdfs4ds_Feature_Store.ipynb new file mode 100644 index 00000000..cd8b5e3c --- /dev/null +++ b/VantageCloud_Lake/UseCases/EFS_Demo/VCL_Telco_Churn_using_tdfs4ds_Feature_Store.ipynb @@ -0,0 +1,1027 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7f2f3523-c5f5-42c1-8e78-eb8a818fd487", + "metadata": {}, + "source": [ + "
    \n", + "

    \n", + " Telco Churn using Feature Store in Vantage\n", + "
    \n", + " \"Teradata\"\n", + "

    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "eff89c2c-c21b-40c0-b60f-24dedde6270b", + "metadata": {}, + "source": [ + "

    Introduction

    \n", + "\n", + "

    \n", + "Customer churn is a concern for all companies, but the complexity makes it difficult to track. Customers may leave due to various reasons such as dissatisfaction with service quality, pricing, customer service, or finding better alternatives from competitors. Although some churn may be expected, companies aim to retain their customers to avoid using additional resources to find new customers. Thus, with the help of Teradata Vantage, companies can attain their goal of identifying the factors contributing to the churn, so they can take appropriate measures to retain customers. Vantage’s capabilities allow companies to analyze large amounts of customer data, such as usage patterns, billing information, demographics, and interactions, to find patterns that may indicate customers who are at risk of churning. Plus, Teradata’s machine learning and predictive analytics can be used to build models to predict which customers are likely to churn in the future. This information will give companies the chance to intervene, including sending targeted marketing campaigns, personalized offers, improved customer service, or addressing customer concerns.

    \n", + "\n", + "

    Successful AI/ML implementations face three main challenges:

    \n", + "
  • The Data Problem: Quality data and feature engineering consume 80% of the implementation time. Even when different use cases share the same source data and features, organizations often handle data preparation separately.
  • \n", + "
  • The Scale Problem: Real-world use cases often require multiple models. In production, these models require fresh features engineered in the same way as during training. Ensuring the auditability of these features behind model decisions is crucial.
  • \n", + "
  • The Deployment Problem: Transitioning prototypes to production, especially operationalizing data prep pipelines, is often problematic.
  • \n", + "\n", + "

    Addressing these challenges requires strategic planning, skilled talent, and integration with existing systems. Organizations with a history in Data Management recognize the benefits of reusable Data Products, making Enterprise Feature Stores a valuable investment.

    \n", + "\n", + "

    A Feature Store is a curated repository of pre-calculated features, simplifying the journey from data to actionable insights. An Enterprise Feature Store extends across domains/teams, incorporating a Governance Framework for predictable feature delivery.

    \n", + " \n", + "

    While most features are reusable, some need model-specific calculations before integration into a unified dataset.

    \n", + " \n", + "

    The key difference between Feature Store (FS) and Enterprise Feature Store (EFS) is the scope across multiple domains/teams along with the Governance Framework (which gives an assurance that features are delivered under predictable SLAs, and also defines the operating model for how the EFS is used across different teams/domains and how the feature lifecycle is managed). Although most Features are considered re-usable, there is still a small portion of Features that must be calculated as model-specific (e.g., scaled variables, principal components, etc.) and then combined with the rest of the pre-calculated Features into a single data set (ADS). The figure below describes this co-existence of model-specific ADS(es) and model-independent EFS.

    \n", + "\n", + "\n", + "\n", + "\n", + "

    Business Values

    \n", + "\n", + "
  • Rapid model creation and deployment through enterprise feature reuse.
  • \n", + "
  • Flexible creation and usage of new features without extensive engineering support.
  • \n", + "
  • Consistent definitions ensure accuracy and quick deployment.
  • \n", + "
  • Collaboration and sharing of features among teams.
  • \n", + "
  • Maintained feature lifecycle for compliance and auditability.
  • \n", + "

    \n", + "\n", + "

    Why Vantage?

    \n", + "

    There are several reasons why EFS naturally fits to Teradata Vantage:

    \n", + "
  • Utilizes Teradata Vantage with its powerful Analytical Library and SQL Engine.
  • \n", + "
  • Primary Index enables efficient single-row access for online feature use.
  • \n", + "
  • Single platform for both online and offline feature stores.
  • \n", + "
  • Macros reduce parsing overhead from API access.
  • \n", + "
  • R and Python code execution within SQL Engine.
  • \n", + "
  • Bi-temporal querying capability.
  • \n", + "
  • Scalable MPP power for feature computation.
  • \n", + "
  • Industry-specific Logical Data Model as a feature source.
  • \n", + "
  • Connectivity to Object Storage via NOS for feature data sourcing.
  • \n", + "
  • Query Grid facilitates access to multiple data sources.
  • \n", + "
  • Close-to-real-time feature delivery using Query Services and Teradata Intelligent Memory.
  • \n", + "
  • Workload management prioritizes tasks effectively.
  • \n", + "

    The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.

    \n", + "\n", + "\n", + "

    Methodology

    \n", + "

    In this demo we have used a methodology which involves analyzing a time series of data, where each data point represents the outstanding amount at the end of each month. To detect anomalies, we use the following steps:

    \n", + "\n", + "
  • Model the Distribution: We assume that the historical data of monthly balances follow a normal distribution. This distribution is characterized by two parameters: the mean (μ) and the standard deviation (σ). These are the features of the Entity
  • \n", + "\n", + "
  • Compute the Z-Score: For the most recent monthly balance (the latest data point in the time series), we compute its Z-score. The Z-score is a statistical measure that describes a value's relationship to the mean of a group of values. It is calculated using the formula:
  • \n", + "\n", + "

    Z = (X - μ) / σ

    \n", + "\n", + "

    where X is the value in question, μ is the mean, and σ is the standard deviation.

    \n", + "\n", + "
  • Threshold for Anomaly Detection: We set a threshold for the Z-score. If the absolute value of the Z-score for the latest monthly balance exceeds this threshold, it is flagged as an anomaly.
  • \n", + "\n", + "

    It's important to note that the computation of the Z-score and the anomaly flag is dependent on the values of the mean and standard deviation. These dependent features are not computed at the same time as the static features but are derived later, once the latest outstanding amount (the new data point) becomes available.

    \n", + "\n", + "

    Feature Engineering

    \n", + "

    Feature engineering is a crucial step in the entity-feature paradigm, as it involves creating and transforming features to better represent the underlying problem for predictive modeling. In our case, the feature engineering process is twofold, each with its specific inputs and outputs. Below are the processes that are a part of this feature engineering

    \n", + "\n", + "
  • Process 1: Computing Mean and Standard Deviation
  • \n", + "
  • Process 2: Computing Z-Score and Anomaly Flag
  • \n", + "
  • Roll Out: Feature Engineering rollout\n", + "
      \n", + "
    • Addressing Circular Dependency
    • \n", + "
    • Roll out after adjusting circular dependency
    \n", + "
  • Validation: Feature Store Validation
  • \n", + "

    " + ] + }, + { + "cell_type": "markdown", + "id": "1ab6f86e-5e58-435a-a8f2-a4eb3176aa3d", + "metadata": {}, + "source": [ + "
    \n", + "

    1. Configure the environment

    \n", + "

    \n", + "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n", + "
    \n", + "Here's how we can do this:

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d409c76-27ef-4940-98c0-c541aa9cb1a0", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install dotenv\n", + "# # '%%capture' suppresses the display of installation steps of the following packages\n", + "!pip install tdfs4ds --upgrade" + ] + }, + { + "cell_type": "markdown", + "id": "0f0df2d7-7b1c-413c-bd59-5f87035f9a90", + "metadata": {}, + "source": [ + "
    \n", + "

    Note: Please execute the above pip install to get the latest version of the required library. Be sure to restart the kernel after executing those lines to bring the installed libraries into memory. The simplest way to restart the Kernel is by typing zero zero: 0 0

    \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48fb9b4e-8690-4de6-ab49-211f6413cfd1", + "metadata": {}, + "outputs": [], + "source": [ + "#import libraries\n", + "import matplotlib.pyplot as plt \n", + "import getpass\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter(action='ignore', category=DeprecationWarning)\n", + "warnings.simplefilter(action='ignore', category=RuntimeWarning)\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "\n", + "from teradataml import *\n", + "import pandas as pd\n", + "import json\n", + "from sqlalchemy import func\n", + "from dotenv import load_dotenv, dotenv_values\n", + "import tdfs4ds\n", + "from tdfs4ds.utils.lineage import crystallize_view\n", + "from tdfs4ds.feature_store.feature_query_retrieval import get_feature_versions\n", + "\n", + "display.max_rows=5" + ] + }, + { + "cell_type": "markdown", + "id": "1bcca830-84d7-452b-9a4a-21853933afd4", + "metadata": {}, + "source": [ + "
    \n", + "

    2. Connect to VantageCloud Lake

    \n", + "

    Connect to VantageCloud using create_context from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5aae25d0-f786-45d2-8e2a-32c342d353ad", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=VCL_Telco_Churn_using_tdfs4ds_Feature_Store.ipynb;' UPDATE FOR SESSION; ''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a4794150-a65b-4c3f-8d24-d42f34b6167b", + "metadata": {}, + "source": [ + "

    Setup a Feature Store

    " + ] + }, + { + "cell_type": "markdown", + "id": "0776e2ee-bdb8-4927-9d14-5cff5583b6ee", + "metadata": {}, + "source": [ + "

    We can now set up the feature store using the tdfs4ds library.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e72cc2f-2e36-47cd-a2da-19da50f892d2", + "metadata": {}, + "outputs": [], + "source": [ + "username=env_vars.get(\"username\")\n", + "tdfs4ds.setup(database=username)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d7f40c3-8efe-4042-9ede-a767681f0fcf", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.connect(database=username)" + ] + }, + { + "cell_type": "markdown", + "id": "e80e18e4-d009-4d88-8340-72636ca8f0dd", + "metadata": {}, + "source": [ + "
    \n", + "

    3. Load the data

    \n", + "\n", + "

    We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_Telco\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").

    \n", + " \n", + "

    **Note: The tables are available in DEMO_Telco_DB database and we have created views in DEMO_Telco database which are used in the cells below

    " + ] + }, + { + "cell_type": "markdown", + "id": "d8540286-8309-47c6-9aff-fe153700ee9d", + "metadata": {}, + "source": [ + "
    \n", + "

    4. Feature Engineering

    " + ] + }, + { + "cell_type": "markdown", + "id": "c4992424-3837-4a9f-b532-2e8d188d8c02", + "metadata": {}, + "source": [ + "

    Let us now start with feature engineering. We will replace the multiple values which indicate the absence of a service with No.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "271b6d33-7792-442d-8c46-7e4d659e5920", + "metadata": {}, + "outputs": [], + "source": [ + "df = DataFrame(in_schema(\"DEMO_Telco\", \"Customer_Churn\"))\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4cb4956-2260-4c65-a6fa-b91ec4048cc2", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(\n", + " oreplace_MultipleLines = func.oreplace(\n", + " df.MultipleLines.expression,\"No phone service\",\"No\"\n", + " ),\n", + " oreplace_OnlineSecurity = func.oreplace(\n", + " df.OnlineSecurity.expression, \"No internet service\",\"No\"\n", + " ),\n", + " oreplace_OnlineBackup = func.oreplace(\n", + " df.OnlineBackup.expression, \"No internet service\",\"No\"\n", + " ),\n", + " oreplace_DeviceProtection = func.oreplace(\n", + " df.DeviceProtection.expression, \"No internet service\",\"No\"\n", + " ), \n", + " oreplace_TechSupport = func.oreplace(\n", + " df.TechSupport.expression, \"No internet service\",\"No\"\n", + " ),\n", + " oreplace_StreamingTV = func.oreplace(\n", + " df.StreamingTV.expression, \"No internet service\",\"No\"\n", + " ),\n", + " oreplace_StreamingMovies = func.oreplace(\n", + " df.StreamingMovies.expression, \"No internet service\",\"No\"\n", + " )\n", + ")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "b702973d-8a23-4070-94cc-b031013a11e1", + "metadata": {}, + "source": [ + "

    We will also convert the Churn column values from Yes / No to 1 / 0.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e491f404-c614-4c13-a2f0-31daa7305750", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(\n", + " drop_columns = True,\n", + " CustomerID = df.CustomerID,\n", + " Gender = df.Gender,\n", + " SeniorCitizen = df.SeniorCitizen,\n", + " Partner = df.Partner,\n", + " Dependents = df.Dependents,\n", + " Tenure = df.Tenure,\n", + " PhoneService = df.PhoneService,\n", + " MultipleLines = df.oreplace_MultipleLines,\n", + " InternetService = df.InternetService,\n", + " OnlineSecurity = df.oreplace_OnlineSecurity,\n", + " OnlineBackup = df.oreplace_OnlineBackup,\n", + " DeviceProtection = df.oreplace_DeviceProtection,\n", + " TechSupport = df.oreplace_TechSupport,\n", + " StreamingTV = df.oreplace_StreamingTV,\n", + " StreamingMovies = df.oreplace_StreamingMovies,\n", + " Contract = df.Contract,\n", + " PaperlessBilling = df.PaperlessBilling,\n", + " PaymentMethod = df.PaymentMethod,\n", + " MonthlyCharges = df.MonthlyCharges,\n", + " TotalCharges = df.TotalCharges,\n", + " Churn = case({ \"Yes\" : 1, \"No\" : 0},value=df.Churn,else_=0)\n", + ") \n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "3cf33828-2e9e-42e5-883b-469cb47dd515", + "metadata": {}, + "source": [ + "
    \n", + "

    5. Save feature and feature processing to Feature Store

    " + ] + }, + { + "cell_type": "markdown", + "id": "2ec83605-4d29-47d5-93c3-7c05791d1782", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "98794d87-4d1e-4e58-ac39-1337beeace57", + "metadata": {}, + "source": [ + "

    Now we will proceed to save the features as well as the feature processing logic in the feature store.

    \n", + "

    This will allow us to re-use the features and processing later on, avoiding the need to rewrite the processing logic.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edbe4e7e-0b44-4624-9657-da2f5b7aff02", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.DATA_DOMAIN='efs_telco'\n", + "tdfs4ds.VARCHAR_SIZE=50" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ed9af79-d838-4ac3-8d67-3bb80f380bb9", + "metadata": {}, + "outputs": [], + "source": [ + "df = crystallize_view(df, view_name = 'PROC_FEATURE_ENGINEERING', schema_name = username,output_view=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68668b95-3b30-4f34-99e0-a19e37a52aec", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f61ac745-3c56-43c0-8e24-b32ed6c24246", + "metadata": {}, + "outputs": [], + "source": [ + "# define the set of columns defining the entity id\n", + "entity_id = ['CustomerID']\n", + "# list the columns dealing with the features\n", + "features = df.columns[1::]\n", + "features" + ] + }, + { + "cell_type": "markdown", + "id": "cc8da6b1-fa36-44f8-ab70-bc0aa0329897", + "metadata": {}, + "source": [ + "

    We will create a Data Domain for the feature store

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9665f2d9-4968-4ac6-b880-fc3f6fa46b12", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.DATA_DOMAIN" + ] + }, + { + "cell_type": "markdown", + "id": "191c113e-08af-4eca-bf7c-fcab24be71ec", + "metadata": {}, + "source": [ + "

    Here we will save the features and processing with additional metadata, such as the project name churn.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c5c15a2-ddca-4af5-919b-bc7cfd5fd11e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# upload the features in the physical feature store\n", + "tdfs4ds.upload_features(\n", + " df,\n", + " entity_id = entity_id,\n", + " feature_names = features,\n", + " metadata = {'project': 'churn'}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "81390658-0330-423d-87b4-001311a51289", + "metadata": {}, + "source": [ + "

    We can now use the feature catalog command to visualize all features which have been saved in the feature store

    \n", + "

    All features are time dependent, as seen by the column validity start and end

    \n", + "

    This means you can change the processing logic, but still keep the history of the features

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37931c07-6843-4835-9b33-edd8b8fbc131", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.feature_catalog()" + ] + }, + { + "cell_type": "markdown", + "id": "beda7104-0cc0-48f9-b98f-ad573be558ea", + "metadata": {}, + "source": [ + "
    \n", + "

    6. Re-using features for machine learning

    " + ] + }, + { + "cell_type": "markdown", + "id": "57de3efb-2956-4a2a-9fc9-9c9ddd7bc155", + "metadata": {}, + "source": [ + "

    Now that our features have been stored in the feature store, let us re-use them to train a machine learning model.

    \n", + "

    We now need to just specify the feature name, we do not need to specify the processing logic

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dce440d3-7c72-4400-82b0-2ff09e8fc914", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.connect(database=username)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "537bc561-8519-41d3-88f9-37ebfa215836", + "metadata": {}, + "outputs": [], + "source": [ + "entity_id = ['CustomerID']\n", + "features = ['Gender',\n", + " 'SeniorCitizen',\n", + " 'Partner',\n", + " 'Dependents',\n", + " 'Tenure',\n", + " 'PhoneService',\n", + " 'MultipleLines',\n", + " 'InternetService',\n", + " 'OnlineSecurity',\n", + " 'OnlineBackup',\n", + " 'DeviceProtection',\n", + " 'TechSupport',\n", + " 'StreamingTV',\n", + " 'StreamingMovies',\n", + " 'Contract',\n", + " 'PaperlessBilling',\n", + " 'PaymentMethod',\n", + " 'MonthlyCharges',\n", + " 'TotalCharges',\n", + " 'Churn']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a64b16-9913-4b10-a3f7-1f1be67fef8d", + "metadata": {}, + "outputs": [], + "source": [ + "selected_features = get_feature_versions(entity_name=entity_id,features=features)\n", + "selected_features" + ] + }, + { + "cell_type": "markdown", + "id": "f5dd9918-f5d1-4cf3-9f4e-cbf1241ff47f", + "metadata": {}, + "source": [ + "

    We can now build our training dataset by specifying the build_dataset command

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8d3d05c-2d1d-4307-bce3-b879a8624f38", + "metadata": {}, + "outputs": [], + "source": [ + "df = tdfs4ds.build_dataset(\n", + " entity_id = entity_id,\n", + " selected_features = selected_features,\n", + " view_name = 'mydataset',\n", + " schema_name = username,\n", + " comment = 'dataset for churn prediction'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "fcaca5d6-2e5a-400f-a653-cbf70c1564c8", + "metadata": {}, + "source": [ + "

    Our training dataset has now been created, with all the feature engineering applied.

    \n", + "

    We can see that the MultipleLines column now has only two values: Yes and No. The same features can also be re-used across multiple use cases and models without any additional data preparation.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb1ff7df-5216-4c26-95ff-641e16d9ba9c", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "df = DataFrame(in_schema(username , 'mydataset'))\n", + "copy_to_sql(df, table_name='fs_dataset', if_exists ='replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73057d11-c8b9-4ce5-9cc3-1f7375d0f3f7", + "metadata": {}, + "outputs": [], + "source": [ + "df = DataFrame('fs_dataset')\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "46f6bf8f-b9e1-441a-b089-c1ceebdbc059", + "metadata": {}, + "source": [ + "

    We split the dataset into training and testing datasets with an 80:20 split ratio.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a5f3a51-60a2-4b80-8f82-a77d8adcc322", + "metadata": {}, + "outputs": [], + "source": [ + "# Performing sampling to get 80% for trainning and 20% for testing\n", + "tdf_sample = df.sample(frac = [0.8, 0.2])\n", + "\n", + "# Fetching train and test data\n", + "tdf_train= tdf_sample[tdf_sample['sampleid'] == 1].drop('sampleid', axis=1)\n", + "tdf_test = tdf_sample[tdf_sample['sampleid'] == 2].drop('sampleid', axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "c86ae3c5-c44c-4c1d-bf08-c8e8d3d83d58", + "metadata": {}, + "source": [ + "
    \n", + "7. AutoML Training" + ] + }, + { + "cell_type": "markdown", + "id": "337e3e67-6a98-4f1f-8479-45a637ca0bb5", + "metadata": {}, + "source": [ + "

    AutoML (Automated Machine Learning) is an approach that automates the process of building, training, and validating machine learning models. It involves various algorithms to automate various aspects of the machine learning workflow, such as data preparation, feature engineering, model selection, hyperparameter tuning, and model deployment. It aims to simplify the process of building machine learning models, by automating some of the more time-consuming and labor-intensive tasks involved in the process.

    \n", + "\n", + "

    We create an AutoClassifier instance, which is a special-purpose AutoML feature to run classification-specific tasks. We use the exclude parameter to specify model algorithms to be excluded from the model training phase. Here we exclude the 'knn', 'svm' and 'glm' models. The max_runtime_secs parameter specifies the time limit in seconds for model training.\n", + "

    \n", + "verbose: specifies the detailed execution steps based on verbose level as follows:\n", + "

    \n", + "\n", + "
      \n", + "
    • 0: prints the progress bar and leaderboard
    • \n", + "
    • 1: prints the execution steps of AutoML.
    • \n", + "
    • 2: prints the intermediate data between the execution of each step of AutoML.
    • \n", + "
    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb8137fc-7250-427f-92c7-6070a8ffddea", + "metadata": {}, + "outputs": [], + "source": [ + "# Creating AutoClassifier Instance\n", + "# Selecting 'Auto' mode for AutoML training\n", + "# Excluding knn,glm and svm model from default model list for training\n", + "# Used early stopping timer criteria with value 600 sec\n", + "\n", + "aml = AutoClassifier(\n", + " exclude = ['knn','svm','glm'],\n", + " verbose = 2,\n", + " max_runtime_secs = 600\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "46e64a2e-648d-4321-8bd6-5bdd8051cbb6", + "metadata": {}, + "source": [ + "

    Note: Since the AutoML functionality does a lot of steps like Feature exploration and Data Preparation along with Model Training and Evaluating to select the Best model the below step may take anywhere between 12-15 minutes

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "378743d2-daae-4a91-af06-fbd566072902", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fitting train data \n", + "aml.fit(data = tdf_train,target_column = 'Churn')" + ] + }, + { + "cell_type": "markdown", + "id": "3ea7b521-d7c1-4994-b380-b0308c85743d", + "metadata": {}, + "source": [ + "
    \n", + "8. Model Leaderboard Generation" + ] + }, + { + "cell_type": "markdown", + "id": "4223d717-dd65-449f-8c2b-c415e1471ac3", + "metadata": {}, + "source": [ + "

    Here, we generate model leaderboard and leader for a given dataset. Leaderboard is a ranked table with a list of models with all their evaluation metrics.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22d69751-fe5e-46dd-9c24-3322a5bd1487", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching leaderboard\n", + "\n", + "aml.leaderboard()" + ] + }, + { + "cell_type": "markdown", + "id": "7a330d76-3671-42f0-a2e3-e0b7f2138048", + "metadata": {}, + "source": [ + "
    \n", + "9. Best Performing Model" + ] + }, + { + "cell_type": "markdown", + "id": "59cbeea7-d5d8-4333-b25a-8a02aa4b4cff", + "metadata": {}, + "source": [ + "

    The following function displays the best performing model.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2054141f-3c2e-47be-b9db-aef8b4c3424d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching best performing model\n", + "aml.leader()" + ] + }, + { + "cell_type": "markdown", + "id": "55a1096f-7c8b-4e2f-b028-aa7f89c22f15", + "metadata": {}, + "source": [ + "
    \n", + "10. Prediction" + ] + }, + { + "cell_type": "markdown", + "id": "41a2ffe3-91f5-4da0-ac6c-6276323e4eb3", + "metadata": {}, + "source": [ + "

    The predict function generates predictions using either the default test data or any specified dataset, based on the model's rank in the leaderboard, and displays the performance metrics of the chosen model. If the test data contains a target column, both predictions and performance metrics are displayed; otherwise, only the predictions are shown.\n", + "

    \n", + "You can also use the rank parameter in the predict function. The rank parameter specifies the model's rank in the leaderboard to be used for prediction. By default, the rank is set to 1, meaning the best-performing model is used.

    " + ] + }, + { + "cell_type": "markdown", + "id": "0c2ea7e4-e02b-47d9-a4eb-18493f90d104", + "metadata": {}, + "source": [ + "
    \n", + "10.1 Generating prediction on external test data" + ] + }, + { + "cell_type": "markdown", + "id": "bff63293-7104-4308-ab81-a6527e9e1a4f", + "metadata": {}, + "source": [ + "

    Here, we specify the tdf_test dataset for prediction. When using external data instead of the default test data, the predict function applies all the data transformation steps performed during the training phase on the external data before passing the data to the model for prediction.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68dd9ecd-0b1c-4428-8911-61e4f5719d81", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching prediction and metrics on test data\n", + "prediction = aml.predict(tdf_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f563395c-b381-4fe4-ab20-7b1a7a4882bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Printing prediction\n", + "prediction" + ] + }, + { + "cell_type": "markdown", + "id": "d841fcd4-df97-4d31-bd25-28cce37c1a47", + "metadata": {}, + "source": [ + "
    \n", + "11. Conclusion" + ] + }, + { + "cell_type": "markdown", + "id": "d74e6e4a-ae3a-4249-a6ca-8d644f592db9", + "metadata": {}, + "source": [ + "

    We used the feature store to store features as well as their processing, and re-used them in model training. The features and processing can be re-used across multiple machine learning models and use cases, helping to improve data science productivity.

    \n", + "\n", + "

    Teradata's AutoML functionality plays a crucial role in this context by automating the complex process of building and deploying machine learning models. AutoML ensures the most optimal preparation and training of models, delivering high-quality machine learning models in minutes. Through hyperparameter tuning (HPT), Teradata's AutoML can automatically select the best parameters for machine learning algorithms using grid search and random search techniques, significantly enhancing model performance.\n", + "

    \n", + "By leveraging Teradata's AutoML, companies can save time and reduce costs associated with manual model building and tuning. The technology not only improves the accuracy of predictive models but also democratizes the power of machine learning, allowing customers to utilize advanced analytics without requiring extensive coding or data science expertise. This capability enables companies to swiftly and effectively analyze customer churn data, develop predictive models, and implement proactive strategies to retain customers and enhance their satisfaction.\n", + "

    \n", + "In conclusion, Teradata's AutoML functionality is a vital tool for telecommunications companies aiming to reduce customer churn. By automating and optimizing the machine learning process, Teradata empowers various industries to make data-driven decisions that improve customer retention and drive long-term profitability.

    " + ] + }, + { + "cell_type": "markdown", + "id": "3fafdffb-2cde-4d99-8682-9ae64c74497d", + "metadata": {}, + "source": [ + "
    \n", + "12. Cleanup

    \n", + "

    Work Tables

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a07f45cf-ff45-4b95-810c-061fb4d1e528", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "list_of_tables = db_list_tables()\n", + "[execute_sql(f\"DROP VIEW {username}.{t}\") for t in list_of_tables.TableName if t.startswith('FS_V')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e0c61f8-ce5a-4d77-a8dc-ed1628b499d0", + "metadata": {}, + "outputs": [], + "source": [ + "list_of_tables = db_list_tables()\n", + "[execute_sql(f\"DROP TABLE {username}.{t}\") for t in list_of_tables.TableName if t.startswith('FS_T')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c9de55f-8b22-40f6-8044-8b63cf5617e6", + "metadata": {}, + "outputs": [], + "source": [ + "list_of_tables = db_list_tables()\n", + "[execute_sql(f\"DROP TABLE {username}.{t}\") for t in list_of_tables.TableName if t.startswith('FS_')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc241d13-6921-4582-8a9c-d1f5da3ec360", + "metadata": {}, + "outputs": [], + "source": [ + "[execute_sql(f\"DROP TABLE {username}.{t}\") for t in list_of_tables.TableName if t in ['temp','tdfs__fgjnojnsmdoignmosnig']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21fac6bb-3e3a-488c-848b-41473d6156e7", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "72bfa61c-3daa-4d47-b0d7-0a69ef13dc1a", + "metadata": {}, + "source": [ + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "0ff7e25c-dc4e-45d7-a67f-8c70e2c517f4", + "metadata": {}, + "source": [ + "Required Materials\n", + "

    Let’s look at the elements we have available for reference for this notebook:

    " + ] + }, + { + "cell_type": "markdown", + "id": "fc4938d2-5ce6-412e-a665-5d62a3b1a1b5", + "metadata": {}, + "source": [ + "

    Filters:

    \n", + "
      \n", + "
    • Industry: Telco
    • \n", + "
    • Functionality: Feature Store and AutoML
    • \n", + "
    • Use Case: Customer Retention
    • \n", + "
    \n", + "

    Related Resources:

    \n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "b4fd5272-ceb7-4d47-bd5c-c3aea31e471a", + "metadata": {}, + "source": [ + "

    Reference Links:

    \n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "0d6b60cb-d919-4daf-bc2a-35d7bf17eec7", + "metadata": {}, + "source": [ + "Dataset:\n", + "\n", + "- `CustomerID`: unique id of customer\n", + "- `Gender`: Whether the customer is a male or a female\n", + "- `SeniorCitizen`:Whether the customer is a senior citizen or not (1, 0)\n", + "- `Partner`:Whether the customer has a partner or not (Yes, No)\n", + "- `Dependents`:Whether the customer has dependents or not (Yes, No)\n", + "- `Tenure`:Number of months the customer has stayed with the company\n", + "- `PhoneService`:Whether the customer has a phone service or not (Yes, No)\n", + "- `MultipleLines`:Whether the customer has multiple lines or not (Yes, No, No phone service)\n", + "- `InternetService`:Customer’s internet service provider (DSL, Fiber optic, No)\n", + "- `OnlineSecurity`:Whether the customer has online security or not (Yes, No, No internet service)\n", + "- `OnlineBackup`:Whether the customer has online backup or not (Yes, No, No internet service)\n", + "- `DeviceProtection`:Whether the customer has device protection or not (Yes, No, No internet service)\n", + "- `TechSupport`:Whether the customer has tech support or not (Yes, No, No internet service)\n", + "- `StreamingTV`:Whether the customer has streaming TV or not (Yes, No, No internet service)\n", + "- `StreamingMovies`:Whether the customer has streaming movies or not (Yes, No, No internet service)\n", + "- `Contract`:The contract term of the customer (Month-to-month, One year, Two year)\n", + "- `PaperlessBilling`:Whether the customer has paperless billing or not (Yes, No)\n", + "- `PaymentMethod`:The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))\n", + "- `MonthlyCharges`:The amount charged to the customer monthly\n", + "- `TotalCharges`:The total amount charged to the customer\n", + "- `Churn`:Whether the customer churned or not (Yes or No)" + ] + }, + { + "cell_type": 
"markdown", + "id": "d7e28609-20ff-47e0-a640-48db6a7fa523", + "metadata": {}, + "source": [ + "
    \n", + "
    ClearScape Analytics™
    \n", + "
    \n", + "
    \n", + " Copyright © Teradata Corporation - 2025. All Rights Reserved\n", + "
    \n", + "
    \n", + "
    " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/EFS_Demo/images/EFS.png b/VantageCloud_Lake/UseCases/EFS_Demo/images/EFS.png new file mode 100644 index 00000000..bdb2f059 Binary files /dev/null and b/VantageCloud_Lake/UseCases/EFS_Demo/images/EFS.png differ diff --git a/VantageCloud_Lake/UseCases/EFS_Demo/images/EFS_process.png b/VantageCloud_Lake/UseCases/EFS_Demo/images/EFS_process.png new file mode 100644 index 00000000..8d0d921d Binary files /dev/null and b/VantageCloud_Lake/UseCases/EFS_Demo/images/EFS_process.png differ diff --git a/VantageCloud_Lake/UseCases/Financial_Fraud_Detection/VCL_Financial_Fraud_Detection_Python.ipynb b/VantageCloud_Lake/UseCases/Financial_Fraud_Detection/VCL_Financial_Fraud_Detection_Python.ipynb new file mode 100644 index 00000000..efc4ed50 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Financial_Fraud_Detection/VCL_Financial_Fraud_Detection_Python.ipynb @@ -0,0 +1,990 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    \n", + " Financial Fraud Detection with Python and TeradataML\n", + "
    \n", + " \"Teradata\"\n", + "

    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Introduction

    \n", + "

    \n", + " In recent years we have seen a massive increase in Fraud attempts, making fraud detection imperative for Banking and Financial Institutions. Despite countless efforts and human supervision, hundreds of millions of dollars are lost due to fraud. Fraud can happen using various methods, i.e., stolen credit cards, misleading accounting, phishing emails, etc. Due to small cases in significant populations, fraud detection has become more and more challenging. \n", + "
    \n", + "
    \n", + " With ClearScape Analytics, data scientists can use their preferred language, tools and platform to develop models to identify this fraud. Even in large scale operations, users have the guarantee that Vantage can scale to their needs and reduce fraud.

    \n", + " \n", + "

    Business Values

    \n", + "
      \n", + "
    • Identification of financial fraud in multiple accounts
    • \n", + "
    • Pattern recognition of fraudulent versus normal transactions
    • \n", + "
    • Reduction of money lost due to recovering fraudulent charges
    • \n", + "
    • Improved customer satisfaction and reduction of customer churn
    • \n", + "
    \n", + "\n", + "

    Why Vantage?

    \n", + "

    To maximize the business value of advanced analytic techniques including Machine Learning and Artificial Intelligence, it is estimated that organizations must scale their model development and deployment pipelines to 100s or 1000s of times greater amounts of data, models, or both.\n", + "
    \n", + "
    \n", + " ClearScape Analytics provides powerful, flexible end-to-end data connectivity, feature engineering, model training, evaluation, and operational functions that can be deployed at scale as enterprise data assets; treating the products of ML and AI as first-class analytic processes in the enterprise.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "1. Configuring the Environment\n", + "

    Here, we import the required libraries, set environment variables and environment paths (if required).

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Standard Libraries\n", + "import os\n", + "import json\n", + "import getpass\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "# Teradata Libraries\n", + "from teradataml import *\n", + "\n", + "from dotenv import load_dotenv, dotenv_values\n", + "# Configuration\n", + "spacing_large = \" \"*95\n", + "spacing_small = \" \"*12\n", + "display.max_rows = 5\n", + "configure.val_install_location = 'td_val'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "2. Connect to Vantage\n", + "

    We will be prompted to provide the password. We will enter the password, press the Enter key, and then use the down arrow to go to the next cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql(\"SET query_band='DEMO=VCL_Financial_Fraud_Detection_Python.ipynb;' UPDATE FOR SESSION;\")\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    We begin running steps with Shift + Enter keys.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    3. Load the data and Data Exploration

    \n", + "\n", + "

    We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_GLM_Fraud\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").

    \n", + " \n", + "

    Note: The tables are available in the DEMO_GLM_Fraud_DB database, and we have created views in the DEMO_GLM_Fraud database, which are used in the cells below.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

    We loaded the data from https://www.kaggle.com/code/georgepothur/4-financial-fraud-detection-xgboost/data into Vantage in a table named \"transaction_data\". We checked the data size and printed sample rows: 63k rows and 12 columns.

    \n", + "

    *Please scroll down to the end of the notebook for detailed column descriptions of the dataset.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "txn_data = DataFrame(in_schema('DEMO_GLM_Fraud', 'transaction_data'))\n", + "\n", + "print(txn_data.shape)\n", + "txn_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    In this simulated scenario, deceptive agents engage in transactions with the objective of taking control of customers' accounts, transferring funds to another account, and ultimately cashing out for profit.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    3.1 How many fraudulent transactions do we have in our dataset?

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# There are 92 fraud transactions i.e. 0.14% of fraud transactions in the dataset.\n", + "print(\"No of fraud transactions: %d\\nPercentage of fraud transactions: %.2f%%\"%(\n", + " txn_data.loc[txn_data.isFraud == 1].shape[0],\n", + " txn_data.loc[txn_data.isFraud == 1].shape[0]/txn_data.shape[0]*100)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    3.2 How many transactions do we have, grouped by transaction type?

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter data for fraud transactions and group by 'type'\n", + "transactions_by_type = txn_data.groupby('type').count().get(['type','count_txn_id'])\n", + "\n", + "\n", + "# Sort by 'count_step' column in descending order\n", + "transactions_by_type = transactions_by_type.sort('count_txn_id', ascending = False)\n", + "\n", + "transactions_by_type = transactions_by_type.assign(\n", + " type_int = case([\n", + " (transactions_by_type.type == 'CASH_IN', 0),\n", + " (transactions_by_type.type == 'CASH_OUT', 1),\n", + " (transactions_by_type.type == 'DEBIT', 2),\n", + " (transactions_by_type.type == 'PAYMENT ', 3),\n", + " (transactions_by_type.type == 'TRANSFER', 4),\n", + " ])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transactions_by_type.plot(\n", + " x = transactions_by_type.type_int,\n", + " y = transactions_by_type.count_txn_id,\n", + " kind = 'bar',\n", + " legend = ['Count by Type'],\n", + " ylabel = 'Count of Transactions',\n", + " xlabel = spacing_small.join(sorted(list(transactions_by_type[['type']].get_values().flatten()))),\n", + " title = \"Number of Transactions per Transaction Type\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    3.3 How many fraudulent transactions do we have, grouped by transaction type?

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Filter data for fraud transactions and group by 'type'\n", + "fraud_transactions_by_type = txn_data.loc[txn_data.isFraud == 1].groupby('type').count().get(['type','count_txn_id'])\n", + "\n", + "# Sort by 'count_step' column in descending order\n", + "fraud_transactions_by_type = fraud_transactions_by_type.sort('count_txn_id', ascending = False)\n", + "\n", + "fraud_transactions_by_type = fraud_transactions_by_type.assign(\n", + " total_fraud = txn_data.loc[txn_data.isFraud == 1].shape[0],\n", + " type_int = case([(fraud_transactions_by_type.type == 'TRANSFER', 0)], else_ = 1)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fraud_transactions_by_type.plot(\n", + " x = fraud_transactions_by_type.type_int,\n", + " y = [fraud_transactions_by_type.total_fraud, fraud_transactions_by_type.count_txn_id],\n", + " kind = 'bar',\n", + " figsize = (800, 500),\n", + " legend = ['Total Fraud', 'Count by Type'],\n", + " ylabel = 'Count of Fraud Transactions',\n", + " xlabel = 'TRANSFER' + spacing_large + 'CASH_OUT',\n", + " title = \"Number of Fraud Transactions by Transaction Type\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    From the above result, we can see that out of the 92 fraud transactions, 47 are from transaction type \"TRANSFER\" and 45 are from \"CASH_OUT\".

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    3.4 What percentage of fraudulent transactions do we have where transaction amount is equal to old balance in the origin account?

    \n", + "\n", + "

    This might be the case where the fraudster emptied the account of the victim.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"No of cleanout fraud transactions: %d\\nPercentage of cleanout fraud transactions: %.2f%%\"%(\n", + " txn_data.loc[txn_data['amount'] == txn_data.oldbalanceOrig].loc[txn_data['isFraud'] == 1].shape[0],\n", + " txn_data.loc[txn_data['amount'] == txn_data.oldbalanceOrig].loc[txn_data['isFraud'] == 1].shape[0] / txn_data.loc[txn_data.isFraud == 1].shape[0]*100)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    From the above result, we can see that out of 92 Fraud transactions, the amount involved in 90 fraud transactions was equal to the total balance in the account.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    Below are some insights about the dataset:

    \n", + "
      \n", + "
    1. We have 92 fraud transactions, which account for 0.14% of the dataset.
    2. \n", + "
    3. Out of these 92 fraud transactions, 47 are of type TRANSFER, and 45 are of type CASH_OUT.
    4. \n", + "
    5. Approximately 97.83% of our fraud transactions have a transaction amount equal to oldbalanceOrig, indicating account cleanout.
    6. \n", + "
    7. About 71.74% of our fraud transactions have the recipient's old balance as zero.
    8. \n", + "
    9. The isFlaggedFraud indicator is correct only two times among our 92 fraud transactions.
    10. \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    3.5 Univariate statistics

    \n", + "\n", + "

    The describe function computes the count, mean, std, min, percentiles, and max for numeric columns.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "txn_data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    3.6 Checking for Null Values

    \n", + "

    The ColumnSummary() function can be used to take a quick look at the columns, their datatypes, and summary of NULLs/non-NULLs for a given table.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "colsum = ColumnSummary(\n", + " data = txn_data,\n", + " target_columns = [':']\n", + ")\n", + "colsum.result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    3.7 Checking for Outliers

    \n", + "

    The OutlierFilterFit() function calculates the lower percentile, upper percentile, count of rows and median for all the \"target_columns\" provided by the user. These metrics for each column help the function OutlierTransform() detect outliers in data.

    \n", + "\n", + "

    Here we are using teradataml syntax for the function. The same can be achieved using the following SQL as well.

    \n", + "\n", + "SELECT * FROM TD_OutlierFilterFit(\n", + " ON \"DEMO_GLM_Fraud\".\"transaction_data\" AS InputTable\n", + " OUT TABLE OutputTable(\"DEMO_USER\".\"Outlier_output\")\n", + " USING\n", + " TargetColumns('amount','newbalanceOrig','oldbalanceDest','newbalanceDest','oldbalanceOrig')\n", + ") as dt;\n", + "\n", + "

    *Please note that both the versions run in-database and there is no data transfer involved.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fit_object = OutlierFilterFit(\n", + " data = txn_data,\n", + " target_columns = ['amount','newbalanceOrig', 'oldbalanceDest','newbalanceDest','oldbalanceOrig']\n", + ")\n", + "\n", + "res = fit_object.transform(data = txn_data).result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Rows before removing outliers: {txn_data.shape[0]}\\n\\\n", + "Rows after removing outliers: {res.shape[0]}\\n\\\n", + "Total outliers: {txn_data.shape[0] - res.shape[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "outliers = td_minus([txn_data, res])\n", + "outliers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "4. Data Preparation\n", + "\n", + "

    We'll perform the following steps:

    \n", + "
      \n", + "
    • We will one-hot encode the categorical \"type\" column.
    • \n", + "
    • We will perform feature scaling using ScaleFit and ScaleTransform on numerical columns.
    • \n", + "
    • We will split the data into training and testing datasets (80:20 split).
    • \n", + "
    \n", + "\n", + "

    We perform feature scaling during data pre-processing to handle highly varying magnitudes, values, or units. If feature scaling is not done, then a machine learning algorithm tends to weigh greater values higher and consider smaller values as lower ones, regardless of the unit of the values.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    4.1 Drop redundant columns

    \n", + "

    We don't need nameDest, nameOrig, and isFlaggedFraud for model training as they do not impact the outcome. We have txn_id to uniquely identify each transaction.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "txn_data = txn_data.drop(['nameDest', 'nameOrig', 'isFlaggedFraud'], axis = 1)\n", + "txn_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    4.2 One-hot encoding

    \n", + "

    \n", + "Here, we are one-hot encoding the \"type\" column. We find one-hot encoding necessary in many cases to represent categorical variables as binary values, enable numerical processing, ensure feature independence, handle non-numeric data, and improve the performance and interpretability of our machine learning models.\n", + "

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "txn_type_encoder = OneHotEncoder(\n", + " values = [\"CASH_IN\", \"CASH_OUT\", \"DEBIT\", \"PAYMENT\", \"TRANSFER\"],\n", + " columns = \"type\"\n", + ")\n", + "\n", + "retain = Retain(\n", + " columns = ['step', 'amount','newbalanceOrig','oldbalanceDest','newbalanceDest','oldbalanceOrig', 'isFraud']\n", + ")\n", + "\n", + "obj = valib.Transform(\n", + " data = txn_data,\n", + " one_hot_encode = txn_type_encoder,\n", + " retain = retain,\n", + " index_columns = 'txn_id'\n", + ")\n", + "txn_trans = obj.result\n", + "txn_trans" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above output shows the transformed dataset, in which the type column has been one-hot encoded.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(txn_trans, table_name = 'clean_data', if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "5. Create training and testing datasets in Vantage\n", + "

    We'll create two datasets for training and testing in the ratio of 80:20.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TrainTestSplit_out = TrainTestSplit(\n", + " data = txn_trans,\n", + " id_column = \"txn_id\",\n", + " train_size = 0.80,\n", + " test_size = 0.20,\n", + " seed = 25\n", + ")\n", + "\n", + "df_train = TrainTestSplit_out.result[TrainTestSplit_out.result['TD_IsTrainRow'] == 1].drop(['TD_IsTrainRow'], axis = 1)\n", + "df_test = TrainTestSplit_out.result[TrainTestSplit_out.result['TD_IsTrainRow'] == 0].drop(['TD_IsTrainRow'], axis = 1)\n", + "\n", + "print(\"Training Set = \" + str(df_train.shape[0]) + \". Testing Set = \" + str(df_test.shape[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "copy_to_sql(df_train, table_name = 'clean_data_train', if_exists = 'replace')\n", + "copy_to_sql(df_test, table_name = 'clean_data_test', if_exists = 'replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df_train" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above output shows that we have transformed the data into a scaled dataset. Scaling our data makes it easy for our model to learn and understand the problem.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "6. In-Database XGBoost model training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The XGBoost() function, also known as eXtreme Gradient Boosting, is an implementation of the gradient boosted decision tree algorithm designed for speed and performance. It has recently been dominating applied machine learning.

    \n", + "

    In gradient boosting, each iteration fits a model to the residuals (errors) of the previous iteration to correct the errors made by existing models. The predicted residual is multiplied by a learning rate (the shrinkage factor) and then added to the previous prediction. Models are added sequentially until no further improvements can be made. It is called gradient boosting because it uses a gradient descent algorithm to minimize the loss when adding new models.

    \n", + "\n", + "

    Here we are using teradataml syntax for the function. The same can be achieved using the following SQL as well.

    \n", + "\n", + "SELECT * FROM TD_XGBoost(\n", + "\tON \"DEMO_USER\".\"clean_data_train\" AS \"input\"\n", + "\tPARTITION BY ANY\n", + "\tUSING InputColumns('amount','newbalanceOrig','oldbalanceDest','newbalanceDest','oldbalanceOrig','CASH_IN_type','CASH_OUT_type','DEBIT_type','PAYMENT_type','TRANSFER_type')\n", + "\tResponseColumn('isFraud')\n", + "\tMaxDepth(7)\n", + "\tSeed(42)\n", + "\tModelType('Classification')\n", + "\tRegularizationLambda(120.0)\n", + "\tShrinkageFactor(0.1)\n", + ") as sqlmr\n", + "\n", + "

    *Please note that both the versions run in-database and there is no data transfer involved.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cols = df_train.columns\n", + "cols.remove('txn_id')\n", + "cols.remove('step')\n", + "cols.remove('isFraud')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "XGBoost_out = XGBoost(\n", + " data=df_train,\n", + " input_columns=cols,\n", + " response_column = 'isFraud',\n", + " lambda1 = 120.0,\n", + " model_type='Classification',\n", + " seed=42,\n", + " shrinkage_factor=0.1,\n", + " max_depth=7\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "XGBoost_out.output_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The function output is a trained XGBoost model, and we can input it to the XGBoostPredict() function for prediction.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "7. In-Database XGBoost model scoring" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The XGBoostPredict() function runs the predictive algorithm based on the model generated by XGBoost(). The XGBoost() function, also known as eXtreme Gradient Boosting, performs classification or regression analysis on datasets.

    \n", + "

    \n", + "When using the function, we should provide only numeric features. We need to convert the categorical features to numeric values before prediction.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "XGBoostPredict_out = XGBoostPredict(\n", + " newdata=df_test,\n", + " object=XGBoost_out.result,\n", + " model_type='Classification',\n", + " id_column='txn_id',\n", + " object_order_column=['task_index', 'tree_num',\n", + " 'iter', 'tree_order'],\n", + " accumulate='isFraud',\n", + " output_prob=True,\n", + " output_responses=['0', '1']\n", + ").result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "XGBoostPredict_out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The output above shows prob_1, the probability that the transaction is fraudulent, and prob_0, the probability that it is not. These probabilities are used to assign the class label in the Prediction column.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "combined_df = df_test.join(XGBoostPredict_out, on='txn_id', lsuffix='test', rsuffix='pred')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "combined_df[combined_df['Prediction']==1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "out = XGBoostPredict_out.assign(Prediction = XGBoostPredict_out.Prediction.cast(type_ = BYTEINT))\n", + "out = out.assign(Prediction = out.Prediction.cast(type_ = VARCHAR(2)))\n", + "out = out.assign(isFraud = out.isFraud.cast(type_ = VARCHAR(2)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ClassificationEvaluator_obj = ClassificationEvaluator(\n", + " data = out,\n", + " observation_column = 'isFraud',\n", + " prediction_column = 'Prediction',\n", + " labels = ['0', '1']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ClassificationEvaluator_obj.output_data.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "8. Visualize the results (ROC curve and AUC)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    We create the ROC curve, which is a graph between TPR (True Positive Rate) and FPR (False Positive Rate). We use the area under the ROC curve as a metric to evaluate how well our model can distinguish between positive and negative classes. A higher AUC indicates better performance in distinguishing between the positive and negative categories. We generally consider an AUC above 0.75 as decent.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from teradataml import ROC\n", + "\n", + "roc_out = ROC(\n", + " probability_column = '\"Prob_1\"',\n", + " observation_column = \"isFraud\",\n", + " positive_class = \"1\",\n", + " data = XGBoostPredict_out,\n", + " num_thresholds=300\n", + ")\n", + "\n", + "roc_out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Assigning new index column\n", + "roc_out.result = roc_out.result.assign(row = 1)\n", + "# Changing the index label.\n", + "roc_out.result._index_label = [\"row\"]\n", + "auc = roc_out.result.get_values()[0][0]\n", + "\n", + "figure = Figure(width=500, height=400, heading=\"Receiver Operating Characteristic (ROC) Curve\")\n", + "\n", + "plot = roc_out.output_data.plot(\n", + " x=roc_out.output_data.fpr,\n", + " y=[roc_out.output_data.tpr, roc_out.output_data.fpr],\n", + " xlabel='False Positive Rate',\n", + " ylabel='True Positive Rate',\n", + " color='carolina blue',\n", + " figure=figure,\n", + " legend=[f'XGBoost AUC = {round(auc, 4)}', 'AUC Baseline'],\n", + " legend_style='lower right',\n", + " grid_linestyle='--',\n", + " grid_linewidth=0.5,\n", + " linestyle = ['-', '--']\n", + ")\n", + "\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Looking at the above ROC Curve, we can confidently say that our model has performed well on testing data. The AUC value is above 0.75 and resonates with our understanding that the model is performing well.

    \n", + "\n", + "

    Conclusion

    \n", + "\n", + "

    In this demonstration, we have illustrated a simplified - but complete - overview of how we can implement a typical machine learning workflow completely inside the database using Vantage. This allows us to leverage Vantage's operational scale, power, and stability.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "9. Cleanup" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Work Tables

    \n", + "

    We need to clean up our work tables to prevent errors next time.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['clean_data', 'clean_data_train', 'clean_data_test']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name = table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "Required Materials\n", + "

    Let’s look at the elements we have available for reference for this notebook:

    \n", + "\n", + "

    Filters:

    \n", + "
      \n", + "
    • Industry: Finance
    • \n", + "
    • Functionality: Machine Learning
    • \n", + "
    • Use Case: Fraud Detection
    • \n", + "
    \n", + "\n", + "

    Related Resources:

    \n", + "\n", + "\n", + "\n", + "Dataset:\n", + "\n", + "- `txn_id`: transaction id\n", + "- `step`: maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (31 days simulation).\n", + "- `type`: CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER\n", + "- `amount`: amount of the transaction in local currency\n", + "- `nameOrig`: customer who started the transaction\n", + "- `oldbalanceOrig`: customer's balance before the transaction\n", + "- `newbalanceOrig`: customer's balance after the transaction\n", + "- `nameDest`: customer who is the recipient of the transaction\n", + "- `oldbalanceDest`: recipient's balance before the transaction\n", + "- `newbalanceDest`: recipient's balance after the transaction\n", + "- `isFraud`: identifies a fraudulent transaction (1) and non fraudulent (0)\n", + "- `isFlaggedFraud`: flags illegal attempts to transfer more than 200,000 in a single transaction\n", + "\n", + "

    Links:

    \n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "
    ClearScape Analytics™
    \n", + "
    \n", + "
    \n", + " Copyright © Teradata Corporation - 2025. All Rights Reserved\n", + "
    \n", + "
    \n", + "
    " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_ARIMA/VCL_Store_Sales_Forecasting_ARIMA_Python.ipynb b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_ARIMA/VCL_Store_Sales_Forecasting_ARIMA_Python.ipynb new file mode 100644 index 00000000..a410cf42 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_ARIMA/VCL_Store_Sales_Forecasting_ARIMA_Python.ipynb @@ -0,0 +1,917 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    \n", + " Store Sales Forecasting with In-Database Time Series\n", + "
    \n", + " \"Teradata\"\n", + "

    \n", + "
    \n", + "

    Introduction

    \n", + "\n", + "\n", + "

    Retail stores rely on sales and an accurate amount of inventory to support these sales. However, demand can be ever-changing, leading to stores being overstocked or out of stock. In these situations, retail stores need to quickly adjust to increase revenues and avoid additional unnecessary costs. The best way to keep ROI up is with retail demand forecasting in Teradata Vantage and ClearScape Analytics. Teradata’s capabilities allow users to combine and analyze sales and inventory data across all stores, while taking into consideration seasonal events, such as holidays or the weather. Bringing together all the components that influence customers to buy products allows retail stores to accurately predict sales and demand to ensure precise inventory.

    \n", + "\n", + "\n", + "

    Good Eats Grocery is a renowned retail corporation that operates a chain of hypermarkets. Here, Good Eats Grocery has provided a dataset combining data from 45 stores, including store information and monthly sales. The data is provided on a weekly basis. Good Eats Grocery wants to find the impact of holidays on store sales. For this purpose, the dataset includes four holiday weeks: Christmas, Thanksgiving, Super Bowl, and Labor Day.
    \n", + "
    \n", + "Our main objective is to predict the sales of a store in a given week. Because size and time-related data are given as features in the dataset, we analyze whether sales are impacted by time-based and space-based factors. Most importantly, how does the inclusion of holidays in a week boost the sales of a store?\n", + "
    \n", + " \n", + "

    Business Value

    \n", + "
  • Predict sales over a specified period of time.
  • \n", + "
  • Identify seasonal trends in sales and demand to improve inventory management.
  • \n", + "
  • Plan for historic increase and decrease in sales unrelated to the calendar year.
  • \n", + "
  • Increase customer satisfaction.
  • \n", + "

    \n", + "

    Why Vantage?

    \n", + "

    Unbounded Array Framework (UAF) is the Teradata framework for building end-to-end time series forecasting pipelines. It also provides functions for digital signal processing and 4D spatial analytics. The series can reside in any Teradata supported or Teradata accessible table or in an analytic result table (ART). The UAF architecture provides a range of unique benefits including:

    \n", + "\n", + "
  • Rapid data exploration, preparation, and testing functions that can analyze massive amounts of data across an unlimited number of forecasts in parallel; drastically reducing the development and testing times.
  • \n", + "
  • The creation of a nearly unlimited number of forecasts in parallel, unlocking value in hyper-segmented (per-store-per-SKU inventory demand, per-household energy consumption) predictions, based on individualized models.
  • \n", + "
  • The ability to deploy the preparation and forecasting functions into automated pipelines that can run in near-real-time, eliminating the gaps between preparation, development, and deployment. \n", + "
  • \n", + "

    UAF provides data scientists with the tools for all phases of forecasting:

    \n", + "
  • Data preparation functions
  • \n", + "
  • Data exploration functions
  • \n", + "
  • Model coefficient estimation functions
  • \n", + "
  • Model validation functions
  • \n", + "
  • Model scoring functions
  • \n", + "

    Plus, with Teradata Vantage, users can perform these functions at scale and analyze and forecast hundreds or thousands of series at once. Time series analysis requires significant effort in analyzing, preparing, and testing forecast models. Traditional approaches require users to perform these laborious tasks multiple times for each prediction, so scaling forecasting efforts beyond a small number of different forecasts becomes prohibitive.

    \n", + " \n", + "\n", + "

    Data

    \n", + "

    The dataset contains historical sales data for 45 Good Eats Grocery stores located in different regions. Each store contains a number of departments, and you are tasked with predicting the department-wide sales for each store.

    \n", + "\n", + "

    In addition, Good Eats Grocery runs several promotional markdown events throughout the year. These markdowns precede prominent holidays, the four largest of which are the Super Bowl, Labor Day, Thanksgiving, and Christmas. The weeks including these holidays are weighted five times higher in the evaluation than non-holiday weeks. Part of the challenge presented by this competition is modelling the effects of markdowns on these holiday weeks in the absence of complete/ideal historical data.

    \n", + "\n", + "

    The basic idea of analyzing the Good Eats Grocery Forecasting dataset is to get a fair idea about the factors affecting the Sales of the Good Eats Grocery Store.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    1. Configure the environment

    \n", + "

    \n", + "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n", + "
    \n", + "Here's how we can do this:

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from teradataml import * \n", + "from teradataml.context.context import *\n", + "from teradataml.dataframe.dataframe import DataFrame\n", + "\n", + "from dotenv import load_dotenv, dotenv_values\n", + "from teradataml.dataframe.copy_to import copy_to_sql\n", + "import getpass\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "display.max_rows=5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    2. Connect to VantageCloud Lake

    \n", + "

    Connect to VantageCloud using create_context from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=VCL_Store_Sales_Forecasting_ARIMA_Python.ipynb;' UPDATE FOR SESSION; ''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    3. Load the data

    \n", + "\n", + "

    We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_SalesForecasting\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").

    \n", + " \n", + "

    **Note: The tables are available in the DEMO_SalesForecasting_DB database, and we have created views in the DEMO_SalesForecasting database, which are used in the cells below.

    \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    3.1 Prepare data to do some basic Analysis of the Sales data.

    \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Let us start by creating a \"Virtual DataFrame\" that points directly to the dataset in Vantage. We begin our analysis by obtaining the necessary data types for the columns and extracting values such as Sales_Week, Sales_Year, etc., from the Sales_Date column. These extracted values will be used in our subsequent analysis.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df=DataFrame(in_schema('DEMO_SalesForecasting','Weekly_Sales'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml.dataframe.sql_functions import case\n", + "from teradatasqlalchemy import TIMESTAMP, VARCHAR, INTEGER\n", + "from sqlalchemy import func\n", + "df = df.assign(IsHoliday = case([(df.IsHoliday == 0, 'False')], else_ = 'True'))\n", + "df = df.assign(Sales_Week = func.td_week_of_year(df.Sales_Date.expression))\n", + "df = df.assign(Sales_Date = df.Sales_Date.cast(type_=TIMESTAMP))\n", + "df = df.assign(Sales_Year = df.Sales_Date.cast(type_=VARCHAR(10)))\n", + "df = StrApply(data=df,\n", + " target_columns='Sales_Year',\n", + " string_operation='SUBSTRING',\n", + " string_length = 4,\n", + " accumulate = ['Store', 'Dept', 'Sales_Date', 'Weekly_Sales', 'IsHoliday','Sales_Week'],\n", + " in_place=True).result\n", + "df = df.assign(Sales_Year = df.Sales_Year.cast(type_=INTEGER))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "testdf=df\n", + "testdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

    Let's do some basic analysis of the dataset

    \n", + "

    We group the weekly sales by Sales Date and calculate the Average Sales based on Sales date. Alongside aggregating the data, we leverage the InDB plot() function for teradataml dataframes to visualize the data. This allows us to avoid transferring data to the client side even for visualizations.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df=testdf.select(['Sales_Date','Weekly_Sales']).groupby('Sales_Date')\n", + "df_plot=df.avg()\n", + "df_plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=800, heading=\"Average Weekly Sales\")\n", + "plot = df_plot.plot(x=df_plot.Sales_Date, y=df_plot.avg_Weekly_Sales,\n", + " xtick_format='YYYY-MM',\n", + " xlabel='Week', ylabel='Sales', color=\"blue\",figure=figure)\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above graph shows the Average Sales per week. We can see that there are peaks mainly during the Year end period.

    \n", + "

    Next, we get the average sales for each store. For that, we group the weekly sales by store.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "weekly_sales = testdf.select(['Store','Weekly_Sales']).groupby('Store')\n", + "ws_plot=weekly_sales.avg()\n", + "ws_plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=800, heading=\"Average Sales per Store\")\n", + "plot = ws_plot.plot(x=ws_plot.Store, y=ws_plot.avg_Weekly_Sales,\n", + " kind='bar',\n", + " xlabel='Store', ylabel='Sales', figure=figure)\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above graph shows the Average Weekly Sales for each store. We can see that Store 4 shows highest weekly sales while Store 5 shows the lowest weekly sales.

    \n", + "

    Next, we get the weekly sales for each year separately. For this, we put the sales of each of the 3 years (2010, 2011, and 2012) into its own column and group the data by sales week.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "week_df = testdf.select(['Sales_Week','Sales_Year','Weekly_Sales'])\n", + "week_df = week_df.assign(Weekly_Sales_2010 = case([(week_df.Sales_Year == 2010, week_df.Weekly_Sales)], else_ = 0))\n", + "week_df = week_df.assign(Weekly_Sales_2011 = case([(week_df.Sales_Year == 2011, week_df.Weekly_Sales)], else_ = 0))\n", + "week_df = week_df.assign(Weekly_Sales_2012 = case([(week_df.Sales_Year == 2012, week_df.Weekly_Sales)], else_ = 0))\n", + "week_df = week_df.select(['Sales_Week','Weekly_Sales_2010','Weekly_Sales_2011','Weekly_Sales_2012'])\n", + "week_df = week_df.groupby('Sales_Week')\n", + "week_df = week_df.avg()\n", + "week_df = week_df[((week_df.avg_Weekly_Sales_2010 != 0.0 ) & (week_df.avg_Weekly_Sales_2011 != 0.0) &\n", + " (week_df.avg_Weekly_Sales_2012 != 0.0))]\n", + "week_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=600, heading=\"Average Weekly Sales per Year\")\n", + "week_df.plot(x=week_df.Sales_Week, y=[week_df.avg_Weekly_Sales_2010, week_df.avg_Weekly_Sales_2011, week_df.avg_Weekly_Sales_2012], \n", + " style=['dark orange', 'green','blue'], xlabel='Week', ylabel='Sales', grid_color='black',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", legend=['2010','2011','2012'],figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above graph shows the average weekly sales for the different years. We can see that there are peaks mainly between weeks 10 and 15 and between weeks 20 and 30.

    \n", + "

    We try to get the comparison of Sales during Holidays and Other Working Days. We do a grouping of data for Sales based on whether the Sale is on Holiday or Working Day

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "week_holiday_df = testdf.select(['Sales_Date','Sales_Week','IsHoliday','Weekly_Sales'])\n", + "week_holiday_df = week_holiday_df.assign(Weekly_Sales_True = case([(week_holiday_df.IsHoliday == 'True', week_holiday_df.Weekly_Sales)], else_ = 0))\n", + "week_holiday_df = week_holiday_df.assign(Weekly_Sales_False = case([(week_holiday_df.IsHoliday == 'False', week_holiday_df.Weekly_Sales)], else_ = 0))\n", + "week_holiday_df = week_holiday_df.select(['Sales_Date','Sales_Week','Weekly_Sales_True','Weekly_Sales_False'])\n", + "week_holiday_df = week_holiday_df.groupby(['Sales_Date','Sales_Week'])\n", + "week_holiday_df = week_holiday_df.sum()\n", + "week_holiday_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=600, heading=\"Total Sales per Week\")\n", + "week_holiday_df.plot(x=week_holiday_df.Sales_Week, y=[week_holiday_df.sum_Weekly_Sales_True, week_holiday_df.sum_Weekly_Sales_False], \n", + " style=['blue','brown'], xlabel='Week', ylabel='Sales', grid_color='black',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", legend=['Holidays','Week Days'],kind='bar', figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above graph shows the total sales per week. The brown bars show weekly sales during working days, while the blue bars show weekly sales during holidays.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    4. Preparing Dataset by joining the datasets.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    \n", + "

  • Weekly_Sales is our variable of interest.
  • \n", + "
  • Type, Size, Temperature, IsHoliday, Fuel_Price, MarkDown1, MarkDown2, MarkDown3, MarkDown4, and MarkDown5 are exogenous variables.
  • \n", + "

    \n", + "\n", + "

    We prepare the dataset by creating a view that joins data from Weekly Sales, Stores, and Features. The view is created using SQL to reduce the number of join and data preprocessing steps, and it is used in the further steps.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query2='''REPLACE VIEW Weekly_Sales_Details AS\n", + "SELECT\n", + " w.Sales_date AS times,\n", + " CAST('2012-02-03' AS DATE) AS cutoff_date,\n", + " w.Dept,\n", + " w.Store,\n", + " CAST(w.Sales_Date AS TIMESTAMP) AS Sales_Date,\n", + " ZEROIFNULL(Weekly_Sales) AS Weekly_Sales,\n", + " ZEROIFNULL(Store_Size) AS Store_Size,\n", + " Store_Type AS Store_Type,\n", + " w.IsHoliday,\n", + " ZEROIFNULL(Temperature) AS Temperature,\n", + " ZEROIFNULL(MarkDown1) AS MarkDown1,\n", + " ZEROIFNULL(MarkDown2) AS MarkDown2,\n", + " ZEROIFNULL(MarkDown3) AS MarkDown3,\n", + " ZEROIFNULL(MarkDown4) AS MarkDown4,\n", + " ZEROIFNULL(MarkDown5) AS MarkDown5,\n", + " ZEROIFNULL(CPI) AS CPI,\n", + " ZEROIFNULL(Unemployment) AS Unemployment,\n", + " ZEROIFNULL(Fuel_Price) AS Fuel_Price,\n", + " CAST(TRIM(w.Dept) || TRIM(w.Store) AS INT) AS idcols\n", + "FROM\n", + " Demo_SalesForecasting.Weekly_Sales w\n", + "LEFT JOIN\n", + " Demo_SalesForecasting.Stores s ON w.Store = s.Store\n", + "LEFT JOIN\n", + " Demo_SalesForecasting.Features f ON w.Store = f.store AND w.Sales_Date = f.Sales_Date\n", + "WHERE\n", + " w.Store IN (20, 4);\n", + "'''\n", + "\n", + "execute_sql(query2)\n", + "modeldf=DataFrame.from_query('select * from Weekly_Sales_Details;')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dfacheck = modeldf.groupby([\"idcols\"])\n", + "dfacheck=dfacheck.count().select([\"idcols\",\"count_Sales_Date\"])\n", + "\n", + "dfa4=modeldf.join(dfacheck, on = 'idcols', how = \"left\", lsuffix = 't1', rsuffix = 't2').drop(['idcols_t2'],axis=1)\n", + "dfa4=dfa4.assign(idcols = dfa4['idcols_t1'])\n", + "dfa4=dfa4.drop(['idcols_t1'],axis=1)\n", + "\n", + "# filter out incomplete time series \n", + "\n", + "modeldf1 = dfa4[dfa4.count_Sales_Date == 143]\n", + "modeldf1.shape" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "modeldf1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    5. Checking for Stationarity of Time Series using the Dickey Fuller Test

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    To be modeled, a time series needs to be stationary. ARIMA models deal with non-stationary time series by differencing (the \"d\" parameter in ARIMA determines the number of differences needed to make a series stationary).

    \n", + "

    Here we will check for stationarity of all time series using the Dickey-Fuller Test. For more info on the test, see here. \n", + "

    The null hypothesis for the test is that the data is non-stationary. We want to REJECT the null hypothesis for this test. So, we want a p-value of less than 0.05 (or smaller) and a negative coefficient value for the lag term in our regression model.

    \n", + "

    The DickeyFuller function needs series data, so we use the TDSeries function to create a series and apply DickeyFuller to check the stationarity of the data.

    \n", + "

    We use the OutlierFilterFit and the OutlierFilterTransform functions to remove the outliers in the series and then use the resampled data to check the stationarity of the data using the DickeyFuller function.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_df=modeldf1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The OutlierFilterFit() function calculates the lower_percentile, upper_percentile, count of rows and median for all the \"target_columns\" provided by the user. These metrics for each column help the function OutlierFilterTransform() detect outliers in the input table. It also stores parameters from arguments into a FIT table used during transformation. The lower_percentile specifies the lower range of percentile used to detect whether a value is an outlier, and the upper_percentile specifies the upper range of percentile used to detect whether a value is an outlier.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import OutlierFilterFit\n", + "OutlierFilterFit_out = OutlierFilterFit(data = sales_df,\n", + " target_columns = \"Weekly_Sales\",\n", + " )\n", + "out_df=OutlierFilterFit_out.output_data\n", + "out_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    \n", + "

    OutlierFilterFit creates a fit table with the values that need to be applied to the data to get the transformed data.

    \n", + "

    \n", + "

    OutlierFilterTransform() function filters the outliers from the input teradataml DataFrame.

    \n", + "

    OutlierFilterTransform() uses the result DataFrame from OutlierFilterFit() function to get statistics like median, count of rows, lower percentile and upper percentile for every column specified in target columns argument and filters the outliers in the input data.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import OutlierFilterFit, OutlierFilterTransform\n", + "obj = OutlierFilterTransform(data=sales_df,\n", + " object=OutlierFilterFit_out.result)\n", + "out_transform_df = obj.result\n", + "out_transform_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    \n", + "

    The OutlierFilterTransform transforms the data and creates the output data after applying the Fit Table details on the data.

    \n", + "

    \n", + "

    The Resample() function transforms an irregular time series into a regular time series. It can also be used to alter the sampling interval for a time series. The Resample function requires a series as input, for which we use the TDSeries function.

    \n", + "\n", + "

    A TDSeries object is created from a teradataml DataFrame and represents a SERIES in time series, which is used as input to the Unbounded Array Framework time series functions. A series is a one-dimensional array. Series are the basic input of UAF functions. A series is identified by its series ID, i.e., the \"id\" argument, and indexed by the \"row_index\" argument. A series is passed to and returned from UAF functions as wavelets. Wavelets are collections of rows, grouped by one or more fields, and ordered on the \"row_index\" argument.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Resample\n", + "data_series_df = TDSeries(data=obj.result,\n", + " id=\"idcols\",\n", + " row_index=(\"Sales_Date\"),\n", + " row_index_style= \"TIMECODE\",\n", + " payload_field=\"Weekly_Sales\",\n", + " payload_content=\"REAL\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "uaf_out1 = Resample(data=data_series_df,\n", + " interpolate='LINEAR',\n", + " timecode_start_value=\"TIMESTAMP '2010-02-05 00:00:00'\",\n", + " timecode_duration=\"WEEKS(1)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df=uaf_out1.result\n", + "df1=df.select(['idcols','ROW_I', 'Weekly_Sales']).assign(Sales_Date=df.ROW_I)\n", + "df1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    \n", + "

    The DickeyFuller() function tests for the presence of one or more unit roots in a series to determine if the series is non-stationary. When a series contains unit roots, it is non-stationary. When a series contains no unit roots, whether the series is stationary is based on other factors.

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import DickeyFuller\n", + "data_series_df_1 = TDSeries(data=df1,\n", + " id=\"Sales_Date\",\n", + " row_index=(\"idcols\"),\n", + " row_index_style= \"SEQUENCE\",\n", + " payload_field=\"Weekly_Sales\",\n", + " payload_content=\"REAL\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_out = DickeyFuller( data=data_series_df_1,\n", + " algorithm='NONE')\n", + "\n", + "# Print the result DataFrame.\n", + "print(df_out.result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    \n", + "

    In the above output the p-value corresponding to the calculated test statistic is less than 0.05. It means that the series is stationary. The output column NULL_HYP which means NULL HYPOTHESIS can have 2 values \n", + "

  • ACCEPT means the null hypothesis is accepted. Unit roots are present, and therefore the process is non-stationary.
  • \n", + "
  • REJECT means the null hypothesis is rejected. No unit roots are present, and the process may or may not be stationary, depending on other factors.
  • \n", + "

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    6. ARIMA Modelling

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    ARIMA stands for Autoregressive Integrated Moving Average. It is a statistical method used for time series forecasting and analysis. ARIMA is a form of regression analysis that gauges the strength of one dependent variable relative to other changing variables. ARIMA models are popular in various fields, including finance, economics, and environmental science, for predicting future points in a time series based on its historical values.

    \n", + "

    The ArimaEstimate() function estimates the coefficients corresponding to an ARIMA (AutoRegressive Integrated Moving Average) model and fits a series with an existing ARIMA model. The function can also provide the \"goodness of fit\" and the residuals of the fitting operation. The function generates the model layer used as input for the ArimaValidate() and ArimaForecast() functions. This function is for univariate series.

    \n", + "\n", + "

    The following procedure is an example of how to use ArimaEstimate() function:

    \n", + "
  • Run the ArimaEstimate() function to get the coefficients for the ARIMA model.\n", + "
  • [Optional] Run ArimaValidate() function to validate the 'goodness of fit' of the ARIMA model, when \"fit_percentage\" argument value is not 100 in ArimaEstimate() function.\n", + "
  • Run the ArimaForecast() function with input from step 1 or step 2 to forecast the future periods beyond the last observed period.
  • \n", + "

    \n", + "\n", + "

    Here the input series to the ArimaEstimate is the output series of the Resample function. The series is created by using the output of Resample function and passed to ArimaEstimate.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ArimaEstimate\n", + "# Execute ArimaEstimate function.\n", + "arima_est_out = ArimaEstimate(data1=data_series_df_1,\n", + " nonseasonal_model_order=[2,1,1],\n", + " constant=False,\n", + " algorithm=\"MLE\",\n", + " coeff_stats=True,\n", + " fit_metrics=True,\n", + " residuals=True,\n", + " fit_percentage=80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "est_result=arima_est_out.fitresiduals\n", + "est_result = est_result.groupby('Sales_Date').avg()\n", + "est_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    We plot the Actual Value of Weekly Sales vs the Calculated Value of the ArimaEstimate function.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=700, heading=\"Comparison of Actual vs Predicted Sales\")\n", + "est_result.plot(x=est_result.Sales_Date, y=[est_result.avg_ACTUAL_VALUE, est_result.avg_CALC_VALUE], \n", + " style=['dark orange', 'green'], xlabel='Sales Date', ylabel='Sales', grid_color='black',xtick_format='YYYY-MM',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", legend=['Actual Value','Predicted Value'],figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The ArimaValidate() function performs an in-sample forecast for both seasonal and non-seasonal auto-regressive (AR), moving-average (MA), ARIMA models and Box-Jenkins seasonal ARIMA model formula followed by an analysis of the produced residuals. The aim is to provide a collection of metrics useful to select the model and expose the produced residuals such that multiple model validation and statistical tests can be conducted.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ArimaValidate\n", + "data_art_df = TDAnalyticResult(data=arima_est_out.result)\n", + "\n", + "\n", + "arima_val_out = ArimaValidate(data=data_art_df, fit_metrics=True, residuals=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "val_result=arima_val_out.fitresiduals\n", + "val_result = val_result.groupby('Sales_Date').avg()\n", + "val_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    We plot the Actual Value of Weekly Sales vs the Calculated Value of the ArimaValidate function.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=700, heading=\"Comparison of Actua vs Predicted\")\n", + "val_result.plot(x=val_result.Sales_Date, y=[val_result.avg_ACTUAL_VALUE, val_result.avg_CALC_VALUE], \n", + " style=['dark orange', 'green'], xlabel='Sales Date', ylabel='Sales', grid_color='black',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", legend=['Actual Value','Predicted Value'],figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The ArimaForecast() function is used to forecast a user-defined number of periods based on models fitted from the ArimaEstimate() function.

    \n", + "

    Here we are considering 7 periods (forecast_periods=7)

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ArimaForecast\n", + "arima_estimate_op = ArimaEstimate(data1=data_series_df_1,\n", + " nonseasonal_model_order=[2,1,1],\n", + " constant=False,\n", + " algorithm=\"MLE\",\n", + " coeff_stats=True,\n", + " fit_metrics=True,\n", + " residuals=True,\n", + " fit_percentage=100)\n", + "\n", + "# Create teradataml TDAnalyticResult object over the result attribute of 'arima_estimate_op'\n", + "data_art_df = TDAnalyticResult(data=arima_estimate_op.result)\n", + " \n", + "arima_forcast_out = ArimaForecast(data=data_art_df, forecast_periods=7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "forecast_result=arima_forcast_out.result\n", + "forecast_result = forecast_result.groupby('ROW_I').avg()\n", + "forecast_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    We plot the Forecasted Value of Weekly Sales for the defined number of periods.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=700, heading=\"Forecast Sales\")\n", + "forecast_result.plot(x=forecast_result.ROW_I, y=forecast_result.avg_FORECAST_VALUE, \n", + " xlabel='Forecast Period', ylabel='Forecast Sales', grid_color='black',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    7. Conclusion:

    \n", + "

    We have trained and validated the ARIMA model on the Weekly Sales dataset, and the results closely match the actual data. The goodness of fit metrics calculated in the estimate and validate phase also resonate with our understanding that the model is well-trained to forecast. This can be observed in the Estimate and the Validate function graphs. So, we can say that the model is well trained to forecast the Weekly Sales.

    \n", + "\n", + "

    Thus, with Teradata Vantage we can run rapid data exploration, preparation, and testing functions that analyze massive amounts of data across an unlimited number of forecasts in parallel, drastically reducing development and testing times. We can create an unlimited number of forecasts in parallel, unlocking value in hyper-segmented predictions (per-store-per-SKU inventory demand, per-household energy consumption) based on individualized models.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    8. Cleanup

    \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " db_drop_view('Weekly_Sales_Details')\n", + "except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "Required Materials\n", + "

    Let’s look at the elements we have available for reference for this notebook:

    \n", + "\n", + "

    Dataset

    \n", + "

    This is the historical data that covers sales from 2010-02-05 to 2012-11-01. Within this file you will find the following fields:

    \n", + "\n", + "
  • Store - the store number
  • \n", + "
  • Date - the week of sales
  • \n", + "
  • Weekly_Sales - sales for the given store
  • \n", + "
  • Holiday_Flag - whether the week is a special holiday week (1 = holiday week, 0 = non-holiday week)
  • \n", + "
  • Temperature - Temperature on the day of sale
  • \n", + "
  • Fuel_Price - Cost of fuel in the region
  • \n", + "
  • CPI – Prevailing consumer price index
  • \n", + "
  • Unemployment - Prevailing unemployment rate
  • \n", + "
  • Holiday Events: Super Bowl, Labour Day, Thanksgiving, Christmas
  • \n", + "

    \n", + "\n", + "

    Filters:

    \n", + "
  • Industry: Retail
  • \n", + "
  • Functionality: ARIMA Estimate and Forecast
  • \n", + "
  • Use Case: Sales Forecasting
  • \n", + "

    \n", + "

    Related Resources:

    \n", + "
  • In the fight to improve customer experience, NPS is a metric, not the goal
  • \n", + "
  • Hyper-scale time series forecasting done right
  • \n", + "
  • Crystal Ball, Black Box or Advanced Forecasting and Demand Planning in Retail and CPG
  • " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "
    ClearScape Analytics™
    \n", + "
    \n", + "
    \n", + " Copyright © Teradata Corporation - 2023, 2024, 2025. All Rights Reserved\n", + "
    \n", + "
    \n", + "
    " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/VCL_Store_Sales_Forecasting_Prophet_OAF.ipynb b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/VCL_Store_Sales_Forecasting_Prophet_OAF.ipynb new file mode 100644 index 00000000..32ee0d49 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/VCL_Store_Sales_Forecasting_Prophet_OAF.ipynb @@ -0,0 +1,1226 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    \n", + " Store Sales Forecasting with Prophet using Script Table Operator\n", + "
    \n", + " \"Teradata\"\n", + "

    \n", + "
    \n", + "

    Introduction

    \n", + "\n", + "

    Rossmann operates over 3,000 drug stores in 7 European countries. Currently, Rossmann store managers are tasked with predicting their daily sales for up to six weeks in advance. Store sales are influenced by many factors, including promotions, competition, school and state holidays, seasonality, and locality. With thousands of individual managers predicting sales based on their unique circumstances, the accuracy of results can be quite varied.

    \n", + "
    \n", + "

    Our main objective is to predict the sales of a store for a week. We use the Python Prophet model together with the Open Analytics Framework (OAF) of VantageCloud Lake to forecast the store sales.

    \n", + " \n", + "

    The Open Analytics Framework builds on the existing Vantage facilities for data scientists and analysts to do the following:

    \n", + "
  • Score multiple models concurrently in parallel with minimal effort.
  • \n", + "
  • Train single or multiple micro models based on data stored in Vantage.
  • \n", + "
  • Enable scripting and the use of open source resources to experiment and iterate with analytics, machine learning (ML), and artificial intelligence (AI) use cases.
  • \n", + "

    The APPLY table operator is the VantageCloud Lake successor to the Vantage Enterprise SCRIPT and ExecR table operators. The APPLY table operator bears more similarities to the SCRIPT operating mode, in that APPLY takes an external language script as input to run, rather than ingesting external language statements in a contract function as ExecR does. The APPLY table operator is nevertheless designed to expand its features in the future in a way that encompasses additional key features from both the SCRIPT and ExecR table operators. The fastpath APPLY table operator runs a user-installed script or any Linux command inside the remote user environment using the Open Analytics Framework. The installed script runs in parallel with data from the Analytics Database.

    \n", + "\n", + "

    An overview of the steps for using the Open Analytics Framework follows.

    \n", + "
  • Connect to your target VantageCloud Lake system.
  • \n", + "
  • Assume you use the Vantage Python client library, teradataml, as the software tool to connect.
  • \n", + "
  • Upon connecting, create a user environment with the desired configuration of interpreter and libraries using Open Analytics Framework APIs.
  • \n", + "
  • Upload the language script, model, and any other relevant files to your target user environment.
  • \n", + "
  • Run the script by invoking the APPLY table operator inside the Analytics Database in your system.
  • \n", + "
  • Run the corresponding APPLY table operator query in the primary cluster to retrieve data, then send the data to a compute cluster to run with your language script in your user environment.
  • \n", + "

    \n", + "
    \n", + "\n", + "\n", + "

    Hence as a data science consultant, we are showcasing the complete approach about how we can make prediction of sales for different stores in advance. We are demonstrating how we can train our models and use them for scoring using the ClearScape Analytics platform. The data we are using is a sample dataset and the results and predictions may not be entirely accurate.\n", + "

    \n", + "

    Data

    \n", + "

    The dataset contains historical sales data for 1,115 Rossmann stores. The task is to forecast the \"Sales\" column for the test set. Note that some stores in the dataset were temporarily closed for refurbishment.

    \n", + "\n", + "

    Most of the fields are self-explanatory. The following are descriptions for those that aren't.

    \n", + "\n", + "
  • Store - a unique Id for each store
  • \n", + "
  • Sales - the turnover for any given day (this is what you are predicting)
  • \n", + "
  • Customers - the number of customers on a given day
  • \n", + "
  • Open - an indicator for whether the store was open: 0 = closed, 1 = open
  • \n", + "
  • StateHoliday - indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays. Note that all schools are closed on public holidays and weekends. a = public holiday, b = Easter holiday, c = Christmas, 0 = None
  • \n", + "
  • SchoolHoliday - indicates if the (Store, Date) was affected by the closure of public schools
  • \n", + "
  • StoreType - differentiates between 4 different store models: a, b, c, d
  • \n", + "
  • Assortment - describes an assortment level: a = basic, b = extra, c = extended
  • \n", + "
  • CompetitionDistance - distance in meters to the nearest competitor store
  • \n", + "
  • CompetitionOpenSince[Month/Year] - gives the approximate year and month of the time the nearest competitor was opened
  • \n", + "
  • Promo - indicates whether a store is running a promo on that day
  • \n", + "
  • Promo2 - Promo2 is a continuing and consecutive promotion for some stores: 0 = store is not participating, 1 = store is participating
  • \n", + "
  • Promo2Since[Year/Week] - describes the year and calendar week when the store started participating in Promo2
  • \n", + "
  • PromoInterval - describes the consecutive intervals Promo2 is started, naming the months the promotion is started anew. E.g., \"Feb,May,Aug,Nov\" means each round starts in February, May, August, November of any given year for that store.
  • \n", + "

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    1. Configure the environment

    \n", + "

    \n", + "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n", + "
    \n", + "Here's how we can do this:

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install dotenv\n", + "!pip install scikit-learn==1.1.3" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    Note: After installing the above libraries, please restart the kernel. The simplest way is to press the zero key twice: 0 0

    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    In this section, we import the required libraries and set environment variables and environment paths (if required).

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "import getpass\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from time import time\n", + "import os\n", + "# from prophet import Prophet\n", + "import warnings\n", + "import itertools\n", + "from dotenv import load_dotenv, dotenv_values\n", + "import pickle\n", + "import base64\n", + "import time\n", + "from teradataml import *\n", + "from IPython.display import display as ipydisplay\n", + "from IPython.display import clear_output\n", + "from time import sleep\n", + "\n", + "display.max_rows=5\n", + "display.suppress_vantage_runtime_warnings = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    2. Connect to VantageCloud Lake

    \n", + "

    Connect to VantageCloud using create_context from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=VCL_Store_Sales_Forecasting_Prophet_OAF.ipynb;' UPDATE FOR SESSION; ''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    3. Load the data

    \n", + "\n", + "

    We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_ProphetSTO\". Your user should have read access to the database. In case of any issues, please send an email to the support group (\"SC230208@teradata.com\").

    \n", + " \n", + "

    **Note: The tables are available in the DEMO_ProphetSTO_DB database, and we have created views in the DEMO_ProphetSTO database, which are used in the cells below.

    \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    3.1 Prepare data to do some basic Analysis of the Sales data.

    \n", + "\n", + "

    We create DataFrames for the Store and Sales data using tables from Vantage. To gain insights into the data's characteristics, we display a sample of 5 rows each.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "store=DataFrame(in_schema('DEMO_ProphetSTO','Store'))\n", + "store " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The Store dataset contains a description of each store, such as StoreType, the distance to the nearest competitor store, and various promotion codes and details.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales=DataFrame(in_schema('DEMO_ProphetSTO','Sales_Data'))\n", + "sales " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The Store Sales dataset contains the Store, DayofWeek, the sales date, the sales amount, the number of customers, SalesOpen (a flag indicating whether the store is open or closed), and the promotion code applied to the sale.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    4. Data Analysis and Transformation

    \n", + "

    In this first section we go through the Sales and store data, handle missing values and create new features for further analysis.

    \n", + "

    We check for missing values in the CompetitionDistance column and replace them with the median value.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import SimpleImputeFit, SimpleImputeTransform\n", + "fit_obj = SimpleImputeFit(data=store,\n", + " stats_columns=\"CompetitionDistance\",\n", + " stats=\"median\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "obj = SimpleImputeTransform(data=store,\n", + " object=fit_obj.output)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "store=obj.result\n", + "store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    We join the Store and Sales dataset to get the required columns for our analysis.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales.merge(right = store, how = \"inner\", on = \"store=store\",lsuffix='l', rsuffix='r')\n", + "sales_store=sales_store.assign(Store=sales_store.Store_l)\n", + "sales_store=sales_store.drop(['Store_l', 'Store_r'], axis=1)\n", + "sales_store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The final dataset used for analysis contains 18 columns and 91,256 rows.

    \n", + "

    Based on the data available, we apply some transformations and create various features. From SalesDate we generate columns such as Year, Month, Day, DayOfWeek, and WeekOfYear. Using the competition-related columns CompetitionOpenSinceYear and CompetitionOpenSinceMonth, we calculate how many months the competing store has been open (CompetitionOpen). Similarly, we process the promotion columns to calculate how long the promotion has been running (PromoOpen).

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales_store.assign(CompetitionOpenSinceYear = \n", + " case([(sales_store.CompetitionOpenSinceYear.isnull() == True, '0')], else_ = sales_store.CompetitionOpenSinceYear),\n", + " CompetitionOpenSinceMonth = \n", + " case([(sales_store.CompetitionOpenSinceMonth.isnull() == True, '0')], else_ = sales_store.CompetitionOpenSinceMonth),\n", + " Promo2SinceYear = \n", + " case([(sales_store.Promo2SinceYear.isnull() == True, '0')], else_ = sales_store.Promo2SinceYear),\n", + " Promo2SinceWeek = \n", + " case([(sales_store.Promo2SinceWeek.isnull() == True, '0')], else_ = sales_store.Promo2SinceWeek)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales_store.assign(Year = sales_store.SalesDate.year(),\n", + " Month = sales_store.SalesDate.month(),\n", + " Day = sales_store.SalesDate.day_of_month(),\n", + " DayOfWeek = sales_store.SalesDate.day_of_week(),\n", + " WeekOfYear = sales_store.SalesDate.week_of_year())\n", + "\n", + "sales_store = sales_store.assign(CompetitionOpen = 12 * (sales_store.Year - sales_store.CompetitionOpenSinceYear)+\n", + " (sales_store.Month - sales_store.CompetitionOpenSinceMonth),\n", + " PromoOpen = 12 * (sales_store.Year - sales_store.Promo2SinceYear)+\n", + " (sales_store.WeekOfYear - sales_store.Promo2SinceWeek) / 4.0)\n", + "\n", + "\n", + "sales_store = sales_store.assign(CompetitionOpen = case([(sales_store.CompetitionOpen > 0, sales_store.CompetitionOpen)], else_ = 0),\n", + " PromoOpen = case([(sales_store.PromoOpen > 0, sales_store.PromoOpen)], else_ = 0))\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales_store.assign(StoreType = case([(sales_store.StoreType == '0', 0),(sales_store.StoreType == 'a', 1),\n", + " 
(sales_store.StoreType == 'b', 2),(sales_store.StoreType == 'c', 3),\n", + " (sales_store.StoreType == 'd', 4)]),\n", + " Assortment = case([(sales_store.Assortment == '0', 0),(sales_store.Assortment == 'a', 1),\n", + " (sales_store.Assortment == 'b', 2),(sales_store.Assortment == 'c', 3),\n", + " (sales_store.Assortment == 'd', 4)]),\n", + " StateHoliday = case([(sales_store.StateHoliday == '0', 0),(sales_store.StateHoliday == 'a', 1),\n", + " (sales_store.StateHoliday == 'b', 2),(sales_store.StateHoliday == 'c', 3),\n", + " (sales_store.StateHoliday == 'd', 4)])\n", + " \n", + " ) \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales_store.assign(monthStr = case([(sales_store.Month == 1, 'Jan'),(sales_store.Month == 2, 'Feb'),\n", + " (sales_store.Month == 3, 'Mar'),(sales_store.Month == 4, 'Apr'),\n", + " (sales_store.Month == 5, 'May'),(sales_store.Month == 6, 'Jun'),\n", + " (sales_store.Month == 7, 'Jul'),(sales_store.Month == 8, 'Aug'),\n", + " (sales_store.Month == 9, 'Sep'),(sales_store.Month == 10, 'Oct'),\n", + " (sales_store.Month == 11,' Nov'),(sales_store.Month == 12, 'Dec')]),\n", + " IsPromoMonth = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_sales = sales_store.select(['Month','Sales']).groupby('Month').mean()\n", + "plot = plot_sales.plot(x=plot_sales.Month, y=plot_sales.mean_Sales,\n", + " kind='bar', xlabel='Month', ylabel='Sales', color=\"orange\")\n", + " \n", + "# Display the plot.\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above graph shows the average sales per month across all stores. We can see that sales are highest in December, which is the holiday season.

    \n", + "

    Now we will see the same metrics across different Store types and also based on whether there was any Promotion available(Promo=1) or not (Promo=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Catplot month Vs Sales\n", + "features_df = sales_store.to_pandas(all_rows=True)\n", + "sns.catplot(data = features_df, x = 'Month', y = \"Sales\", \n", + " col = 'StoreType', # per store type in cols\n", + " palette = 'plasma',\n", + " # hue = 'StoreType',\n", + " row = 'Promo' # per promo in the store in rows\n", + " # color ='Year'\n", + " ) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above graph shows the Sales per Month for each of the 4 StoreTypes(a,b,c,d) for all the 1,115 Stores. The Top row shows the sales for Promo=0 and the bottom row is for Promo=1. Each dot represents the sum of sales for a particular store in a month depending on the Store Type and Promo Code. We can see that there are peaks mainly during the Year end period.

    \n", + "

    All store types follow the same trend but at different scales depending on the presence of the promotion `Promo` and `StoreType` except for the StoreType = b.\n", + "

    \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "

    Next, we pick one store from each store type to represent its group:

    \n", + "
  • Store number 2 for `StoreType` A
  • \n", + "
  • Store number 85 for `StoreType` B
  • \n", + "
  • Store number 1 for `StoreType` C
  • \n", + "
  • Store number 15 for `StoreType` D
  • \n", + "\n", + "

    It also makes sense to down sample the data from days to weeks using the `resample` method to see the present trends more clearly.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df = sales_store.select(['Store','SalesDate','Sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_a = train_df[train_df.Store == 2].select(['SalesDate','Sales']).groupby('SalesDate').mean()\n", + "sales_b = train_df[train_df.Store == 85].select(['SalesDate','Sales']).groupby('SalesDate').sum()\n", + "# .sort_index(ascending = True) # solve the reverse order\n", + "sales_c = train_df[train_df.Store == 1].select(['SalesDate','Sales']).groupby('SalesDate').sum()\n", + "sales_d = train_df[train_df.Store == 15].select(['SalesDate','Sales']).groupby('SalesDate').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = subplots(nrows=4, ncols=1)\n", + " \n", + "plot = sales_a.plot(x=sales_a.SalesDate, y=sales_a.mean_Sales,\n", + " ax=axes[0], figure=fig, kind=\"line\",xlabel='Sales Date', ylabel='Sales',\n", + " title=\"Sales for Store 2\", color=\"blue\",figsize=(1200, 1600))\n", + " \n", + "plot = sales_b.plot(x=sales_b.SalesDate, y=sales_b.sum_Sales,\n", + " ax=axes[1],figure=fig, kind=\"line\",xlabel='Sales Date', ylabel='Sales',\n", + " title=\"Sales for Store 85\", color=\"blue\")\n", + " \n", + "plot = sales_c.plot(x=sales_c.SalesDate, y=sales_c.sum_Sales,\n", + " ax=axes[2],figure=fig, kind=\"line\",xlabel='Sales Date', ylabel='Sales',\n", + " title=\"Sales for Store 1\", color=\"blue\")\n", + "\n", + "plot = sales_d.plot(x=sales_d.SalesDate, y=sales_d.sum_Sales,\n", + " ax=axes[3],figure=fig, kind=\"line\",xlabel='Sales Date', ylabel='Sales',\n", + " title=\"Sales for Store 15\", color=\"blue\")\n", + " \n", + "# Display the plot.\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Retail sales for all store types tend to peak for the Christmas season and then decline after the holidays.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Next we check the yearly trend for these store types to see whether a trend is present in the series. Time series decomposition is the process of separating time series data into its core components. These components include a potential trend (overall rise or fall in the mean), seasonality (a recurring cycle), and the remaining random residual. Python’s statsmodels library has a method for time series decomposition called seasonal_decompose(). The model type parameter can be either additive or multiplicative; here we use additive, which is appropriate when the seasonality’s amplitude is independent of the level. The \"period\" parameter is the number of observations in a seasonal cycle; in this notebook it is set to 1.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sorting with 'date'\n", + "pd_sales_store = features_df\n", + "train_df = pd_sales_store.set_index('SalesDate')\n", + "# Sales datacheck\n", + "train_df['Sales'] = train_df['Sales'] * 1.0\n", + "# storewise sales data\n", + "sales_a = train_df[train_df.Store == 2]['Sales']\n", + "sales_b = train_df[train_df.Store == 85]['Sales']\n", + "# .sort_index(ascending = True) # solve the reverse order\n", + "sales_c = train_df[train_df.Store == 1]['Sales']\n", + "sales_d = train_df[train_df.Store == 15]['Sales']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Seasonal decompose\n", + "from statsmodels.tsa.seasonal import seasonal_decompose\n", + "\n", + "f, (ax1, ax2, ax3, ax4) = plt.subplots(4, figsize = (15, 15))\n", + "\n", + "# monthly\n", + "decomposition_a = seasonal_decompose(sales_a, model = 'additive', extrapolate_trend='freq', period=1)\n", + "decomposition_a.trend.plot(ax = ax1)\n", + "\n", + "decomposition_b = seasonal_decompose(sales_b, model = 'additive', extrapolate_trend='freq', period=1)\n", + "decomposition_b.trend.plot( ax = ax2)\n", + "\n", + "decomposition_c = seasonal_decompose(sales_c, model = 'additive', extrapolate_trend='freq', period=1)\n", + "decomposition_c.trend.plot( ax = ax3)\n", + "\n", + "decomposition_d = seasonal_decompose(sales_d, model = 'additive', extrapolate_trend='freq', period=1)\n", + "decomposition_d.trend.plot( ax = ax4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Overall sales follow a similar trend for all StoreTypes, as seen above. There are spikes around the year end, which indicate higher sales over the year-end holiday season.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    5. Creating the model and forecasting using Prophet in python (stoSalesForecastnew.py).

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.

    \n", + "

    Prophet follows the sklearn model API. We create an instance of the Prophet class and then call its fit and predict methods.

    \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    All the steps below, including the Prophet model, are executed in the Python file stoSalesForecastnew.py. We then use this file in the Script command to get the forecasted values.

    \n", + "

    The input to Prophet is always a dataframe with two columns: ds and y. The ds (datestamp) column should be of a format expected by Pandas, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp. The y column must be numeric and represents the measurement we wish to forecast.

    \n", + "\n", + "

    The below code shows the creation of the Sales DataFrame and the holidays Dataframe which are used in the model creation and model fit.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Create Sales data dataframe using data from Vantage

    \n", + "\n", + "```python \n", + "# create Sales data \n", + "sales = pd_sales_store.rename(columns = {'SalesDate': 'ds','Sales': 'y'})\n", + "``` " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Create holidays dataframe

    \n", + "\n", + "```python\n", + "#create holidays dataframe\n", + " \n", + "\n", + "school_dates = df[df.SchoolHoliday == 1].loc[:, 'Date'].values\n", + "\n", + "school = pd.DataFrame({'holiday': 'school_holiday',\n", + " 'ds': pd.to_datetime(school_dates)})\n", + "\n", + "holidays = school \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    We fit the model by instantiating a new Prophet object. Any settings for the forecasting procedure are passed into the constructor. We then call its fit method and pass in the historical dataframe (sales).

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Instantiate and fit model using Prophet

    \n", + "\n", + "```python\n", + "\n", + "# Prophet implementation \n", + "my_model = Prophet(interval_width = 0.95, \n", + " holidays = holidays.head(50000))\n", + "my_model.fit(sales) \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Predictions are then made on a dataframe with a column ds containing the dates for which a prediction is to be made. You can get a suitable dataframe that extends into the future a specified number of days using the helper method Prophet.make_future_dataframe. By default, it will also include the dates from the history, so we will see the model fit as well.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Create future dates for forecasting

    \n", + "\n", + "```python\n", + "dt = min(sales['ds'].values)\n", + "date1 = datetime.datetime.strptime(dt, \"%y/%m/%d\").date()\n", + "\n", + "\n", + "\n", + "# # Subtract one month\n", + "start_date = date1 - relativedelta(months=1)\n", + "\n", + "# Get man date and then get future dates for 1 month\n", + "dt1 = max(sales['ds'].values)\n", + "date2 = datetime.datetime.strptime(dt1, \"%y/%m/%d\").date()\n", + "# date2 = datetime.datetime.strptime(datetime_str, \"%Y/%m/%dT%H:%M:%S.%f\").date()\n", + "end_date = date2 + relativedelta(months=1)\n", + "# end_date= str(end_value)\n", + "\n", + "\n", + "# # date_range = pd.date_range(start_date, periods=num_days)\n", + "date_range = pd.date_range(str(start_date), str(end_date))\n", + "\n", + "future_dates = pd.DataFrame({'ds': date_range}) \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The predict method will assign each row in future a predicted value which it names yhat. If you pass in historical dates, it will provide an in-sample fit. The forecast object here is a new dataframe that includes the \"yhat\" column, which is the forecast values for sales, as well as columns for components and uncertainty intervals.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Create dataframe with forecast values

    \n", + "\n", + "```python\n", + "# forecast\n", + "forecast = my_model.predict(future_dates.head(10000)) \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The forecasted values will be sent back to Vantage using the Returns clause of the Script function as seen in the section below.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "


    \n", + "\n", + "

    6. Using APPLY Command to get the forecasted values back to Vantage.

    \n", + "
    \n", + "\n", + "

    6.1 Create virtual environment for executing the script

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Function to set the Authentication token to connect to User Environment Service in VantageCloud Lake.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We've already loaded all the values into our environment variables and into a dictionary, env_vars.\n", + "# username=env_vars.get(\"username\") isn't required when using base_url, pat and pem.\n", + "\n", + "if set_auth_token(base_url=env_vars.get(\"ues_uri\"),\n", + " pat_token=env_vars.get(\"access_token\"), \n", + " pem_file=env_vars.get(\"pem_file\"),\n", + " valid_from=int(time.time())\n", + " ):\n", + " print(\"UES Authentication successful\")\n", + "else:\n", + " print(\"UES Authentication failed. Check credentials.\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Set the session to use the Analytic compute group and cluster to execute the OpenSourceML function.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpu_compute_group = env_vars.get(\"gpu_compute_group\")\n", + "execute_sql(f\"SET SESSION COMPUTE GROUP {gpu_compute_group};\")\n", + "print(f\"Compute group set to {gpu_compute_group}\") " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Check the user environments and create an environment for the use case.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list_user_envs()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " env = create_env(\n", + " env_name=\"oaf_demo_env\",\n", + " base_env=\"python_3.9\",\n", + " desc=\"OAF Demo env for SalesForecasting Prophet\"\n", + " )\n", + "except:\n", + " remove_env(\"oaf_demo_env\")\n", + " env = create_env(\n", + " env_name=\"oaf_demo_env\",\n", + " base_env=\"python_3.9\",\n", + " desc=\"OAF Demo env for SalesForecasting Prophet\"\n", + " )\n", + " \n", + "env " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Confirm that the library versions in the local environment are the same as those in the virtual environment.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip list | grep scikit-learn\n", + "!pip list | grep scipy\n", + "!pip list | grep numpy\n", + "!pip list | grep pandas\n", + "!pip list | grep prophet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "claim_id = env.install_lib([\"pandas==2.1.3\",\n", + " \"scipy==1.11.2\",\n", + " \"scikit-learn==1.1.3\",\n", + " \"numpy==1.24.2\",\n", + " \"sklearn-pandas==2.2.0\", \n", + " \"prophet==1.1.4\"], asynchronous=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the status of installation using status() API.\n", + "# Create a loop here for demo purposes\n", + "\n", + "ipydisplay(env.status(claim_id))\n", + "stage = env.status(claim_id)['Stage'].iloc[-1]\n", + "while stage == 'Started':\n", + " stage = env.status(claim_id)['Stage'].iloc[-1]\n", + " clear_output()\n", + " ipydisplay(env.status(claim_id))\n", + " sleep(5)\n", + " \n", + "# Verify the Python libraries have been installed correctly.\n", + "ipydisplay(env.libs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Set the user environment to the created virtual environment for the execution of the python script.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "configure.openml_user_env = env" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    6.2 Install the file and any additional artifacts

    \n", + "\n", + "

    Use the install_file() method to install this python file to the container. As a reminder, this container is persistent, so these steps need only be done infrequently.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env.install_file(\"stoSalesForecastnew.py\", replace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "\n", + "

    6.3 APPLY using Python

    \n", + "

    The process is as follows

    \n", + "\n", + "
  • Construct a dictionary that will define the return columns and data types
  • \n", + "
  • Construct a teradataml DataFrame representing the data to be processed - note this is a \"virtual\" object representing data and logic in-database
  • \n", + "
  • Execute the module function. This constructs the function call in the database, but does not execute anything. Note the Apply function takes several arguments - the input data, environment name, and the command to run
  • \n", + "
  • In order to execute the function, an \"execute_script()\" method must be called. This method returns the server-side DataFrame representing the complete operation. This DataFrame can be used in further processing, stored as a table, etc.
  • \n", + "

    \n", + "\n", + " \n", + "

    First we will create a dataset which can be passed to the Apply function.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qry='''CREATE SET TABLE Store_Sales_ID \n", + " (\n", + " SlsID INTEGER,\n", + " Store INTEGER,\n", + " DayOfWeek INTEGER,\n", + " SalesDate DATE FORMAT 'yyyy/mm/dd',\n", + " Sales INTEGER,\n", + " Customers INTEGER,\n", + " SalesOpen INTEGER,\n", + " Promo INTEGER,\n", + " StateHoliday CHAR(1) CHARACTER SET LATIN NOT CASESPECIFIC,\n", + " SchoolHoliday INTEGER)\n", + " PRIMARY INDEX ( SlsID ); '''\n", + "qry1='''insert into Store_Sales_ID select 1, Store ,\n", + " DayOfWeek ,\n", + " SalesDate ,\n", + " Sales ,\n", + " Customers ,\n", + " SalesOpen ,\n", + " Promo ,\n", + " StateHoliday,\n", + " SchoolHoliday from DEMO_prophetSTO.Sales_Data where Store <= 5;'''\n", + "try:\n", + " execute_sql(qry)\n", + " execute_sql(qry1) \n", + "except:\n", + " db_drop_table('Store_Sales_ID')\n", + " execute_sql(qry)\n", + " execute_sql(qry1) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_table_df2 = DataFrame('Store_Sales_ID')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Install the user script file on Vantage. On a rerun, if the file already exists, it is replaced with the newly installed copy.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# return types\n", + "types_dict = OrderedDict({})\n", + "types_dict[\"ds\"] = VARCHAR(100)\n", + "types_dict[\"yhat\"] = VARCHAR(100)\n", + "types_dict[\"yhat_lower\"] = VARCHAR(100)\n", + "types_dict[\"yhat_upper\"] = VARCHAR(100)\n", + "types_dict[\"trend\"] = VARCHAR(100)\n", + "types_dict[\"trend_lower\"] = VARCHAR(100)\n", + "types_dict[\"trend_upper\"] = VARCHAR(100)\n", + "\n", + "# \"ds\":TIMESTAMP(0), \"yhat\": FLOAT(), \"yhat_lower\": FLOAT(), \"yhat_upper\": FLOAT() , \n", + "# \"trend\": FLOAT(), \"weekly\": FLOAT(), \"yearly\": FLOAT()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env.install_file(\"stoSalesForecastnew.py\", replace=True)\n", + "apply_obj = Apply(\n", + " data=final_table_df2,\n", + " apply_command=\"python stoSalesForecastnew.py\",\n", + " # returns={\"ds\": VARCHAR(100)},\n", + " returns=types_dict,\n", + " env_name=env,\n", + " delimiter=\"\\t\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    Execute the script in SQL using APPLY command with the following SQL code:

    \n", + "

    Because the entire process of model training, fitting, and scoring takes place in the .py file, the query below may take some time to run (approximately 50-60 seconds).

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_forecast_df = apply_obj.execute_script()\n", + "sales_forecast_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_forecast_df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The output contains 5005 rows(1 for each date) and 7 columns.\n", + "

    The forecasting output contains information for:\n", + "

    \n", + "
  • The forecasted value (yhat)
  • \n", + "
  • Range for the forecasted values (yhat_lower and yhat_upper)
  • \n", + "
  • The overall trend for a given date (also incorporates seasonality)
  • \n", + "
  • Additive terms to adjust the trend to get the forecasted value
  • " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    To plot the forecast values, we select only the required columns and convert the teradataml DataFrame to a pandas DataFrame.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_output = sales_forecast_df.to_pandas(all_rows=True).reset_index()\n", + "plot_output[\"ds\"] = pd.to_datetime(plot_output['ds']).dt.date\n", + "plot_output[\"yhat\"] = pd.to_numeric(plot_output['yhat'])\n", + "plot_output[\"yhat_lower\"] = pd.to_numeric(plot_output['yhat_lower'])\n", + "plot_output[\"yhat_upper\"] = pd.to_numeric(plot_output['yhat_upper'])\n", + "plot_output[\"trend\"] = pd.to_numeric(plot_output['trend'])\n", + "plot_output[\"trend_lower\"] = pd.to_numeric(plot_output['trend_lower'])\n", + "plot_output[\"trend_upper\"] = pd.to_numeric(plot_output['trend_upper'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_output_forecast = plot_output[['ds','yhat','yhat_lower','yhat_upper']].sort_values('ds', ascending=True)\n", + "# .tail(300)\n", + "plot_output_forecast = plot_output_forecast.reset_index()\n", + "plot_output_forecast.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    To plot the forecast Values and the confidence level we set the lower and upper bounds of the confidence interval to yhat_lower and yhat_upper.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "\n", + "# Create the data for the line graph, including the x-values and the corresponding upper and lower bounds\n", + "x_values = plot_output_forecast['ds'].values\n", + "y_values = plot_output_forecast['yhat'].values\n", + "lower_bounds = plot_output_forecast['yhat_lower'].values\n", + "upper_bounds = plot_output_forecast['yhat_upper'].values\n", + "\n", + " \n", + "plt.figure(figsize=(12, 8))\n", + "# Plot the line graph\n", + "plt.plot(x_values, y_values, color='black', label='Forecast Values')\n", + "plt.fill_between(x_values, lower_bounds, upper_bounds, color='lightblue', alpha=0.3, label='Confidence Interval')\n", + "\n", + " \n", + "\n", + "# Customize the plot\n", + "\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Forecast Values')\n", + "plt.title('Forecast Sales Values with Confidence Interval')\n", + "plt.legend()\n", + "\n", + " \n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    The above graph shows the forecast values (black line); the light blue area is the range between the lower (yhat_lower) and upper (yhat_upper) limits of the forecasted values.

    \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    7. Conclusion:

    \n", + "

    We have trained and validated the Prophet model using the python script and used the APPLY Operator using OAF and data from Vantage. We get the forecasted data in Vantage using the python script.

    " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "

    8. Cleanup

    \n", + "

    Work Tables

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db_drop_table(table_name='Store_Sales_ID')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remove_env(\"oaf_demo_env\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "

    If you have updated the teradataml package, reinstall the package by uncommenting and running the below code cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# !pip install teradataml==17.20.0.6 --force-reinstall\n", + "!pip install scikit-learn==1.0.2 --force-reinstall\n", + "!pip install numpy==1.24.2 --force-reinstall" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
    \n", + "
    ClearScape Analytics™
    \n", + "
    \n", + "
    \n", + " Copyright © Teradata Corporation - 2023, 2024, 2025. All Rights Reserved\n", + "
    \n", + "
    \n", + "
    " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_Steps.png b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_Steps.png new file mode 100644 index 00000000..632c32c9 Binary files /dev/null and b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_Steps.png differ diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_flow.png b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_flow.png new file mode 100644 index 00000000..4dac803c Binary files /dev/null and b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_flow.png differ diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/STO.png b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/STO.png new file mode 100644 index 00000000..8916da63 Binary files /dev/null and b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/STO.png differ diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/stoSalesForecastnew.py b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/stoSalesForecastnew.py new file mode 100644 index 00000000..aee10640 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/stoSalesForecastnew.py @@ -0,0 +1,150 @@ +# ####################################################################################################################### +# The code in the file gets input from Vantage 
table and creates prophet model and forecats sales using the forecast +# function of the Prophet model. These forecasted values are passed back to Vantage when this script is called using the +# Vantage Script command. +# ####################################################################################################################### +# Import the necessary libraries +import sys +import numpy as np +import pandas as pd +import subprocess + +# Prophet Library +from prophet import Prophet +import pickle +import base64 +import sys, os + +from contextlib import contextmanager +import logging +import datetime +# from datetime import date +from dateutil.relativedelta import relativedelta + +logging.basicConfig(format='%(process)d-%(levelname)s-%(message)s') + +# create a class which will be used to supress the output of the model.fit function +class suppress_stdout_stderr(object): + """ + Filter out Prophet logs from stdout and stderr + + from https://stackoverflow.com/questions/11130156/suppress-stdout-stderr-print-from-python-functions + Update: https://github.com/facebook/prophet/issues/223 randlet, 2017-09-31 + """ + def __init__(self): + self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] + self.save_fds = [os.dup(1), os.dup(2)] + + def __enter__(self): + os.dup2(self.null_fds[0], 1) + os.dup2(self.null_fds[1], 2) + + def __exit__(self, *_): + os.dup2(self.save_fds[0], 1) + os.dup2(self.save_fds[1], 2) + for fd in self.null_fds + self.save_fds: + os.close(fd) + + +### +### Read input +### + + +delimiter = '\t' +inputData = [] + +for line in sys.stdin.read().splitlines(): + line = line.split(delimiter) + inputData.append(line) + +### +### If no data received, gracefully exit rather than producing an error later. +### + +if not inputData: + sys.exit() + +### +### Set up input DataFrame according to input schema +### + +# Know your data: You must know in advance the number of incoming columns from the database! 
+ +columns = ['SlsID','Store','DayOfWeek', 'SalesDate', 'Sales', 'Customers', 'SalesOpen','Promo', 'StateHoliday', + 'SchoolHoliday'] + +df = pd.DataFrame(inputData, columns=columns).copy() + +del inputData + +# create sales dataframe using the SalesDate as 'ds' and Sales as 'y' which is needed as input to the Prophet model +sales = df.rename(columns = {'SalesDate': 'ds', + 'Sales': 'y'}) + +sales=sales[['ds','y']] + +# Get dates for school holidays + +school_dates_df=df[['SalesDate','SchoolHoliday']] +school_dates_df['SchoolHoliday'] = pd.to_numeric(school_dates_df['SchoolHoliday']) +school_dates = school_dates_df.loc[school_dates_df.SchoolHoliday == 1, 'SalesDate'].values + + +school = pd.DataFrame({'holiday': 'school_holiday', + 'ds': pd.to_datetime(school_dates)}) + + +holidays = school + + +# # Prophet implementation +# Train model +my_model = Prophet(interval_width = 0.70, changepoint_prior_scale=0.05,seasonality_prior_scale=0.03,holidays_prior_scale=0.03, + holidays = holidays.head(1000)) + + +# Fit model using the Sales data +with suppress_stdout_stderr(): + my_model.fit(sales) + + + +# dataframe that extends into future and history +# future_dates = my_model.make_future_dataframe(periods=365) + +# Get min date and then go back 1 month +dt = min(sales['ds'].values) +# date1 = datetime.datetime.strptime(dt, "%y/%m/%d").date() +date1 = datetime.datetime.strptime(dt, "%Y-%m-%d").date() + +# Subtract one month +start_date = date1 - relativedelta(months=1) + +# Get max date and then get future dates for 1 month +dt1 = max(sales['ds'].values) +# date2 = datetime.datetime.strptime(dt1, "%y/%m/%d").date() +date2 = datetime.datetime.strptime(dt1, "%Y-%m-%d").date() + +# Add one month +end_date = date2 + relativedelta(months=1) +# end_date= str(end_value) + +# Create date range using start date and end date +date_range = pd.date_range(str(start_date), str(end_date)) + +# Create data frame for the dates to be passed to predict function +future_dates = 
pd.DataFrame({'ds': date_range}) + +# forecast +forecast_df = my_model.predict(future_dates) +# df_5 = forecast_df.head(5) +# for index, row in sales.iterrows(): +# print(row['ds']) +# Export results to Advanced SQL Engine through standard output in expected format. +# for index, row in future_dates.iterrows(): +# print(row['ds']) +# for ind, column in enumerate(forecast_df.columns): +# print(column) +for index, row in forecast_df.iterrows(): + print(row['ds'], delimiter, row['yhat'], delimiter,row['yhat_lower'], delimiter, row['yhat_upper'], delimiter, row['trend'], delimiter, row['trend_lower'], delimiter, row['trend_upper']) diff --git a/VantageCloud_Lake/UseCases/Telco_Customer_Churn/VCL_Telco_Customer_Churn_Python.ipynb b/VantageCloud_Lake/UseCases/Telco_Customer_Churn/VCL_Telco_Customer_Churn_Python.ipynb new file mode 100644 index 00000000..eb4db4d3 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Telco_Customer_Churn/VCL_Telco_Customer_Churn_Python.ipynb @@ -0,0 +1,1576 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "78ad8a32", + "metadata": {}, + "source": [ + "
    \n", + "

    \n", + " Telco Customer Churn\n", + "
    \n", + " \"Teradata\"\n", + "

    \n", + "
    " + ] + }, + { + "cell_type": "markdown", + "id": "454c6ae9", + "metadata": {}, + "source": [ + "

    Introduction

    \n", + "\n", + "

    \n", + "Customer churn is a concern for all companies, but the complexity makes it difficult to track. Customers may leave due to various reasons such dissatisfaction with service quality, pricing, customer service, or finding better alternatives from competitors. Although some churn may be expected, companies aim to retain their customers to avoid using additional resources to find new customers. Thus, with the help of Teradata Vantage, companies can attain their goal of identifying the factors contributing to the churn, so they can take appropriate measures to retain customers. Vantage’s capabilities allow companies to analyze large amounts of customer data, such as usage patterns, billing information, demographics, and interactions, to find patterns that may indicate customers who are at risk of churning. Plus, Teradata’s machine learning and predictive analytics can be used to build models to predict customers which are likely to churn in the future. This information will give companies the chance to intervene, including sending targeted marketing campaigns, personalized offers, improved customer service, or addressing customer concern.

    \n", + "

    Business Values

    \n", + "
      \n", + "
    • Determine characteristics of the company that are less favorable to consumers.
    • \n", + "
    • Identify customers at risk of leaving.
    • \n", + "
    • Identify customer behavior before churning to allow for time to intervene to save customers.
    • \n", + "
    • Determine when to begin targeted marketing or offer promotions.
    • \n", + "
    • Increase customer retention and reduce churn rate.
    • \n", + "
    \n", + "

    Why Vantage?

    \n", + "

    \n", + "Traditional ML and AI development and deployment pipelines require users to manually combine various tools and techniques across the lifecycle. This leads to lengthy, fragile, manual, error-prone processes that are, in many cases, impossible to migrate out of the lab and into production in order to realize business value.
    ClearScape Analytics helps to solve this “development to deployment gap” by providing highly scalable, performant, and easy-to-use analytic capabilities that address all aspects of the development lifecycle. The same tools and techniques that data scientists use in development can be seamlessly deployed into production using the same code, platform, and operational pipeline.

    \n", + "\n", + "

    \n", + "Managing telco churn is complex and requires continuous monitoring, analysis, and proactive customer engagement strategies. By using data and advanced analytics, telecom companies can better understand customer behavior and preferences, and take proactive measures to retain customers and maintain profitability.

    \n", + "\n", + "

    \n", + "Let's demonstrate this use case with sample data using InDb analytics in Vantage which can pre-process and analyze huge amounts of data and at scale. \n", + "

    " + ] + }, + { + "cell_type": "markdown", + "id": "22173c1b-a4eb-4cd7-b0ae-ba68bc39aba2", + "metadata": {}, + "source": [ + "
    \n", + "

    1. Configure the environment

    \n", + "

    \n", + "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n", + "
    \n", + "Here's how we can do this:

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8873e644-f907-45b7-bf14-6472fe4637d2", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5199b325-11f8-4dbd-a13f-fadf334c58c6", + "metadata": {}, + "outputs": [], + "source": [ + "#import libraries\n", + "import matplotlib.pyplot as plt \n", + "import getpass\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter(action='ignore', category=DeprecationWarning)\n", + "warnings.simplefilter(action='ignore', category=RuntimeWarning)\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "\n", + "from teradataml import *\n", + "\n", + "import plotly.express as px\n", + "from plotly.subplots import make_subplots\n", + "import plotly.graph_objects as go\n", + "from dotenv import load_dotenv, dotenv_values\n", + "\n", + "from sklearn.metrics import mean_absolute_error\n", + "from sklearn.metrics import roc_auc_score\n", + "from sklearn.metrics import roc_curve\n", + "display.max_rows=5" + ] + }, + { + "cell_type": "markdown", + "id": "113ada1d-a5ff-4d5e-9145-f0f03b26b3f2", + "metadata": {}, + "source": [ + "
    \n", + "

    2. Connect to VantageCloud Lake

    \n", + "

    Connect to VantageCloud using create_context from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb0165cd-c7eb-40cc-8eac-84ad0ec3ba52", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=VCL_Telco_Customer_Churn_Python.ipynb;' UPDATE FOR SESSION; ''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "6d8f8388-e318-4bf4-84c1-7dfcf2d1dd40", + "metadata": {}, + "source": [ + "
    \n", + "

    3. Load the data

    \n", + "\n", + "

    We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_Telco\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").

    \n", + " \n", + "

    **Note: The tables are available in DEMO_Telco_DB database and we have created views in DEMO_Telco database which are used in the cells below

    " + ] + }, + { + "cell_type": "markdown", + "id": "bdd6dd8c", + "metadata": {}, + "source": [ + "
    \n", + "

    4. Data Exploration

    " + ] + }, + { + "cell_type": "markdown", + "id": "13288769-f1b3-40a5-8cad-95e5f4ae92fd", + "metadata": {}, + "source": [ + "

    Customer Churn

    \n", + "

    Let us start by creating a \"Virtual DataFrame\" that points directly to the dataset in Vantage. We then begin our analysis by checking the shape of the DataFrame and examining the data types of all its columns.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d40df274-d9cb-439b-93bb-343d317f052c", + "metadata": {}, + "outputs": [], + "source": [ + "tdf = DataFrame(in_schema(\"DEMO_Telco\", \"Customer_Churn\"))\n", + "tdf" + ] + }, + { + "cell_type": "markdown", + "id": "1d620292-c936-4546-89eb-59fd50c35221", + "metadata": {}, + "source": [ + "

    We can check the size and column data types of the data using the shape attribute and the info method.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "279ee6be-9288-41ae-b21a-f2389add4623", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Shape of the data: \", tdf.shape)\n", + "tdf.info()" + ] + }, + { + "cell_type": "markdown", + "id": "4d9927f9-f0f6-4f45-966d-7b7b2ca36f84", + "metadata": {}, + "source": [ + "

    As we can see from the above result, our dataset has 7043 rows and 21 columns.

    " + ] + }, + { + "cell_type": "markdown", + "id": "b03454cf-d47a-4edc-aea4-5b517b7da9d6", + "metadata": {}, + "source": [ + "

    Summary of Columns
    \n", + "

    We can use the ColumnSummary function for quickly examining the columns, their datatypes, and summary of NULLs/non-NULLs for a given table.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21eece2c-533a-40e3-bcad-4ed4bb2b6cf3", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ColumnSummary\n", + "obj = ColumnSummary(data=tdf,\n", + " target_columns=[':']\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eeb1e86-c00a-48d5-8e3b-363118c847ce", + "metadata": {}, + "outputs": [], + "source": [ + "obj.result.head(21)" + ] + }, + { + "cell_type": "markdown", + "id": "a91e3850-12c5-4b74-b17d-852092e81925", + "metadata": {}, + "source": [ + "
    \n", + "

    4.1 Exploratory Data Analysis

    " + ] + }, + { + "cell_type": "markdown", + "id": "befaaeb8-fab9-43f6-8a0a-efdb0e486377", + "metadata": {}, + "source": [ + "

    \n", + "Exploratory Data Analysis (EDA) is a process where we visually and statistically examine, analyze, and summarize data to comprehend its characteristics, patterns, and relationships. This approach is crucial for gaining insights and a deeper understanding of the dataset at hand.
    First let us analyse the Gender and Churn distributions in our data.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e342ab59-0673-4b4d-b017-b14b278a5277", + "metadata": {}, + "outputs": [], + "source": [ + "d1=tdf.select(['Gender','CustomerID']).groupby('Gender').count()\n", + "d1 = d1.assign(drop_columns=True,\n", + " Gender=d1.Gender,\n", + " Count=d1.count_CustomerID)\n", + "d1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75cadab6-8ad8-46bb-914c-24985f0a64a6", + "metadata": {}, + "outputs": [], + "source": [ + "d2=tdf.select(['Churn','CustomerID']).groupby('Churn').count()\n", + "d2 = d2.assign(drop_columns=True,\n", + " Churn=d2.Churn,\n", + " Count=d2.count_CustomerID)\n", + "d2" + ] + }, + { + "cell_type": "markdown", + "id": "25a968ef-4b70-4b6b-a31d-f185be1f2609", + "metadata": {}, + "source": [ + "

    \n", + "We can see that the aggregated data is available to us in a teradataml DataFrame. Let's visualize this data to better understand the Churn and Gender distributions. Clearscape Analytics can easily integrate with 3rd party visualization tools like Tableau, PowerBI or many python modules available like plotly, seaborn etc. We can do all the calculations and pre-processing on Vantage and pass only the necessary information to visualization tools; this will not only make the calculation faster but also reduce the overall time due to less data movement between tools.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d38b54fe-bbb5-45dd-b685-c82be89211f3", + "metadata": {}, + "outputs": [], + "source": [ + "d1=d1.to_pandas().reset_index()\n", + "d2=d2.to_pandas().reset_index()\n", + "#Gender and Churn percentage distribution\n", + "# Create subplots: use 'domain' type for Pie subplot\n", + "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n", + "fig.add_trace(go.Pie(labels=d1['Gender'], values=d1['Count'], name=\"Gender\"),\n", + " 1, 1)\n", + "fig.add_trace(go.Pie(labels=d2['Churn'], values=d2['Count'], name=\"Churn\"),\n", + " 1, 2)\n", + "\n", + "# Use `hole` to create a donut-like pie chart\n", + "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\", textfont_size=16)\n", + "\n", + "fig.update_layout(\n", + " title_text=\"Gender and Churn Distributions\",\n", + " # Add annotations in the center of the donut pies.\n", + " annotations=[dict(text='Gender', x=0.16, y=0.5, font_size=20, showarrow=False),\n", + " dict(text='Churn', x=0.84, y=0.5, font_size=20, showarrow=False)])\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "6d394641-ba1c-44de-9db3-be2286aa3d13", + "metadata": {}, + "source": [ + "

    From the above plot we can see that 26.6 % of customers switched to another firm.
    And of total customers 49.5 % are female and 50.5 % are male.

    " + ] + }, + { + "cell_type": "markdown", + "id": "58fc0a42-d5d8-404c-bafd-49f40457bd2a", + "metadata": {}, + "source": [ + "

    Now, let us see the churn with respect to gender.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56d127df-8eea-4d51-9f68-7179a3969884", + "metadata": {}, + "outputs": [], + "source": [ + "d3=tdf.select(['Churn','Gender','CustomerID']).groupby(['Churn','Gender']).count()\n", + "d3 = d3.assign(drop_columns=True,\n", + " Churn=d3.Churn,\n", + " Gender=d3.Gender, \n", + " Count=d3.count_CustomerID)\n", + "d3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8075c4b0-f668-4924-bf32-ef8c3a375c92", + "metadata": {}, + "outputs": [], + "source": [ + "d3=d3.to_pandas().reset_index()\n", + "fig2=px.sunburst(d3,path=['Churn','Gender'],values='Count')\n", + "fig2.update_layout(\n", + " title_text=\"Churn Distribution w.r.t Gender\")\n", + "fig2.show()" + ] + }, + { + "cell_type": "markdown", + "id": "eedfa546-3b86-4aa8-a4b9-1c47f922c5db", + "metadata": {}, + "source": [ + "

    We can see that there is a negligible difference between genders in the number of customers who changed service provider. Both genders behaved in a similar fashion when it comes to migrating to another service provider.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43b27db1-3a04-440b-a382-da72669babb3", + "metadata": {}, + "outputs": [], + "source": [ + "d4=tdf.select(['Churn','Contract','CustomerID']).groupby(['Churn','Contract']).count()\n", + "d4 = d4.assign(drop_columns=True,\n", + " Churn=d4.Churn,\n", + " Contract=d4.Contract, \n", + " Count=d4.count_CustomerID)\n", + "d4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0d4bd6b-f9fa-4930-ba88-379b1308c795", + "metadata": {}, + "outputs": [], + "source": [ + "d4=d4.to_pandas().reset_index()\n", + "fig4 = px.bar(d4,x=\"Churn\",y=\"Count\", color=\"Contract\", barmode=\"group\", title=\"Customer contract distribution\")\n", + "fig4.update_layout(width=700, height=500, bargap=0.1)\n", + "fig4.show()" + ] + }, + { + "cell_type": "markdown", + "id": "cb97cf05-b143-43e9-bf3a-b7ed267c1ad7", + "metadata": {}, + "source": [ + "

    We can see that about 75% of customers with Month-to-Month Contract opted to move out as compared to 13% of customers with One Year Contract and 3% with Two Year Contract.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57466f9f-ab11-4e32-ae9f-7c0fce65ed4d", + "metadata": {}, + "outputs": [], + "source": [ + "d5=tdf.select(['PaymentMethod','CustomerID']).groupby('PaymentMethod').count()\n", + "d5 = d5.assign(drop_columns=True,\n", + " PaymentMethod=d5.PaymentMethod,\n", + " Count=d5.count_CustomerID)\n", + "d5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21d214d3-db22-4602-9fa2-9c694282d056", + "metadata": {}, + "outputs": [], + "source": [ + "d5=d5.to_pandas().reset_index()\n", + "fig5 = go.Figure(data=[go.Pie(labels=d5['PaymentMethod'], values=d5['Count'], hole=.3)])\n", + "fig5.update_layout(title_text=\"Payment Method Distribution\")\n", + "fig5.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea725ada-aa8c-4dec-9648-1cdc26a17cd9", + "metadata": {}, + "outputs": [], + "source": [ + "d6=tdf.select(['Churn','PaymentMethod','CustomerID']).groupby(['Churn','PaymentMethod']).count()\n", + "d6 = d6.assign(drop_columns=True,\n", + " Churn=d6.Churn,\n", + " PaymentMethod=d6.PaymentMethod, \n", + " Count=d6.count_CustomerID)\n", + "d6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a576fade-bb95-4c62-ab6f-94c06d0b5ddf", + "metadata": {}, + "outputs": [], + "source": [ + "d6=d6.to_pandas().reset_index()\n", + "fig6 = px.bar(d6,x=\"Churn\",y=\"Count\", color=\"PaymentMethod\", barmode=\"stack\", title=\"Customer Payment Method distribution w.r.t. Churn\")\n", + "fig6.update_layout(width=700, height=500, bargap=0.1)\n", + "fig6.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8fd84375-3d85-47cd-9076-6a6b8ea3e496", + "metadata": {}, + "source": [ + "

    Major customers who moved out were having Electronic Check as Payment Method.\n", + "
    Customers who opted for Credit-Card automatic transfer or Bank Automatic Transfer and Mailed Check as Payment Method were less likely to move out.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eb375b22-f520-4bc9-a154-3ff05514d85a", + "metadata": {}, + "outputs": [], + "source": [ + "d7=tdf.select(['Churn','InternetService','Gender','CustomerID']).groupby(['Churn','InternetService','Gender']).count()\n", + "d7 = d7.assign(drop_columns=True,\n", + " Churn=d7.Churn,\n", + " InternetService=d7.InternetService, \n", + " Gender=d7.Gender,\n", + " Count=d7.count_CustomerID)\n", + "d7" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c909c210-8342-44a1-be13-b648198251a9", + "metadata": {}, + "outputs": [], + "source": [ + "d7.sort([\"InternetService\"]).head(21)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4945610-0f80-4bc2-8bfd-cc1e0901a999", + "metadata": {}, + "outputs": [], + "source": [ + "d7=d7.to_pandas().reset_index()\n", + "fig7 = go.Figure()\n", + "\n", + "for t in d7['Churn'].unique():\n", + " dfp = d7[d7['Churn']==t]\n", + " fig7.add_traces(go.Bar(x=[dfp['InternetService'], dfp['Gender']],\n", + " y=dfp['Count'],\n", + " width=0.75,\n", + " customdata=d7['Churn'],\n", + " name='Churn :' +str(dfp['Churn'].values[0]) \n", + " )\n", + " )\n", + "\n", + "fig7.update_layout(barmode='stack',\n", + " title_text=\"Churn Distribution w.r.t. Internet Service and Gender\")\n", + "fig7.show()" + ] + }, + { + "cell_type": "markdown", + "id": "1bc9f214-7877-43ad-853a-c780e6e22dba", + "metadata": {}, + "source": [ + "

    We can see that a lot of customers choose the Fiber optic service as compared to DSL but it's also evident that the customers who use Fiber optic have high churn rate, this might suggest a dissatisfaction with this type of internet service.\n", + "
    Customers having DSL service have less churn rate compared to Fiber optic service.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c63e5181-b134-4335-9c25-fd2e02b0a5f5", + "metadata": {}, + "outputs": [], + "source": [ + "d8=tdf.select(['Churn','Dependents','CustomerID']).groupby(['Churn','Dependents']).count()\n", + "d8 = d8.assign(drop_columns=True,\n", + " Churn=d8.Churn,\n", + " Dependents=d8.Dependents,\n", + " Count=d8.count_CustomerID)\n", + "d8" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8de41aa2-1e2c-43aa-ac1b-cb38daa44861", + "metadata": {}, + "outputs": [], + "source": [ + "d8=d8.to_pandas().reset_index()\n", + "color_map = {\"Yes\": \"#FF97FF\", \"No\": \"#AB63FA\"}\n", + "fig8 = px.bar(d8, x=\"Churn\",y=\"Count\", color=\"Dependents\", barmode=\"group\", title=\"Dependents distribution\", color_discrete_map=color_map)\n", + "fig8.update_layout(width=700, height=500, bargap=0.1)\n", + "fig8.show()" + ] + }, + { + "cell_type": "markdown", + "id": "4af82750-0c91-41e9-9ec6-06f3e82eefc1", + "metadata": {}, + "source": [ + "

    Customers without dependents are more likely to churn.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4ca94e9-b31f-4fb5-824b-d21f58ef3055", + "metadata": {}, + "outputs": [], + "source": [ + "d9=tdf.select(['Churn','Partner','CustomerID']).groupby(['Churn','Partner']).count()\n", + "d9 = d9.assign(drop_columns=True,\n", + " Churn=d9.Churn,\n", + " Partner=d9.Partner,\n", + " Count=d9.count_CustomerID)\n", + "d9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e587308-8648-406b-9aca-f8906d8abe60", + "metadata": {}, + "outputs": [], + "source": [ + "d9=d9.to_pandas().reset_index()\n", + "color_map = {\"Yes\": '#FFA15A', \"No\": '#00CC96'}\n", + "fig9 = px.bar(d9, x=\"Churn\",y=\"Count\", color=\"Partner\", barmode=\"group\", title=\"Chrun distribution w.r.t. Partners\", color_discrete_map=color_map)\n", + "fig9.update_layout(width=700, height=500, bargap=0.1)\n", + "fig9.show()" + ] + }, + { + "cell_type": "markdown", + "id": "649567b1-3232-49bf-840a-8518b38c29b4", + "metadata": {}, + "source": [ + "

    Customers that don't have partners are more likely to churn.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5efe37b2-f14b-4499-abde-0f155cb8e3f0", + "metadata": {}, + "outputs": [], + "source": [ + "d10=tdf.select(['Churn','PaperlessBilling','CustomerID']).groupby(['Churn','PaperlessBilling']).count()\n", + "d10 = d10.assign(drop_columns=True,\n", + " Churn=d10.Churn,\n", + " PaperlessBilling=d10.PaperlessBilling,\n", + " Count=d10.count_CustomerID)\n", + "d10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f24940fb-b15f-4460-803a-8c82ffbef623", + "metadata": {}, + "outputs": [], + "source": [ + "d10=d10.to_pandas().reset_index()\n", + "color_map = {\"Yes\": '#FFA15A', \"No\": '#00CC96'}\n", + "fig10 = px.bar(d10, x=\"Churn\",y=\"Count\", color=\"PaperlessBilling\", title=\"Chrun distribution w.r.t. Paperless Billing\", color_discrete_map=color_map)\n", + "fig10.update_layout(width=700, height=500, bargap=0.1)\n", + "fig10.show()" + ] + }, + { + "cell_type": "markdown", + "id": "fd6697e7-1dcc-44b1-9428-290856a1cb0c", + "metadata": {}, + "source": [ + "

    Customers with Paperless Billing are more likely to churn.

    " + ] + }, + { + "cell_type": "markdown", + "id": "fa5ea57f-b8ba-44eb-9d9b-cf07d37b77b5", + "metadata": {}, + "source": [ + "
    \n", + "

    5. Data Preprocessing

    " + ] + }, + { + "cell_type": "markdown", + "id": "b58490d2-1f7a-4941-a62a-e99a4b7f7543", + "metadata": {}, + "source": [ + "

    Before the data can be used for model creation, we will need to do some data cleansing and transformation on it. We can do this InDb with Teradata Vantage's inbuilt functions.
    We will use the CategoricalSummary function to showcase the distinct values and their corresponding counts for each specified column in the input DataFrame. This function provides a concise summary of categorical data, aiding in a quick understanding of the distribution of values within the specified columns.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddf75f76-eb84-4f9d-856b-4051f0df6d70", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import CategoricalSummary\n", + "CatSum = CategoricalSummary(data=tdf,target_columns=[\"MultipleLines\",\"InternetService\",\"OnlineSecurity\",\"OnlineBackup\",\"DeviceProtection\",\"TechSupport\",\"StreamingTV\",\"StreamingMovies\"])\n", + "CatSum.result.sort(\"ColumnName\")" + ] + }, + { + "cell_type": "markdown", + "id": "84680551-fbca-45e4-83e9-c5f90fa3078b", + "metadata": {}, + "source": [ + "

    \n", + "As we can see from the sample data above and the categorical summary values, the columns

    \n", + "
    • OnlineSecurity
    • \n", + "
    • OnlineBackup
    • \n", + "
    • DeviceProtection
    • \n", + "
    • TechSupport
    • \n", + "
    • StreamingTV
    • \n", + "
    • StreamingMovies
    • \n", + "

    are related to InternetService: wherever the InternetService value is \"No\", these columns have the value \"No internet service\". For our model, let us replace \"No internet service\" with \"No\" in these columns. We will do a similar operation on the MultipleLines column, replacing \"No phone service\" with \"No\".
    We will use the Teradata oreplace function, called through sqlalchemy's func, to replace the respective strings with the desired values.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f0f4876-7f8d-4177-8afc-f7c474b6ac4a", + "metadata": {}, + "outputs": [], + "source": [ + "from sqlalchemy import func\n", + "\n", + "\n", + "tdf = tdf.assign(oreplace_MultipleLines=func.oreplace(tdf.MultipleLines.expression, \"No phone service\",\"No\"),\n", + " oreplace_OnlineSecurity=func.oreplace(tdf.OnlineSecurity.expression, \"No internet service\",\"No\"),\n", + " oreplace_OnlineBackup=func.oreplace(tdf.OnlineBackup.expression, \"No internet service\",\"No\"),\n", + " oreplace_DeviceProtection=func.oreplace(tdf.DeviceProtection.expression, \"No internet service\",\"No\"), oreplace_TechSupport=func.oreplace(tdf.TechSupport.expression, \"No internet service\",\"No\"),\n", + " oreplace_StreamingTV=func.oreplace(tdf.StreamingTV.expression, \"No internet service\",\"No\"),\n", + " oreplace_StreamingMovies=func.oreplace(tdf.StreamingMovies.expression, \"No internet service\",\"No\"))\n", + "tdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5439705-8a1a-4060-b344-2f3245b18e57", + "metadata": {}, + "outputs": [], + "source": [ + "# now lets drop the extra columns, rename the columns in dataframe\n", + "from teradataml.dataframe.sql_functions import case\n", + "\n", + "tdf2 = tdf.assign(drop_columns=True\n", + " ,CustomerID=tdf.CustomerID \n", + " ,Gender=tdf.Gender \n", + " ,SeniorCitizen=tdf.SeniorCitizen\n", + " ,Partner=tdf.Partner\n", + " ,Dependents=tdf.Dependents\n", + " ,Tenure=tdf.Tenure\n", + " ,PhoneService=tdf.PhoneService \n", + " ,MultipleLines=tdf.oreplace_MultipleLines \n", + " ,InternetService=tdf.InternetService \n", + " ,OnlineSecurity=tdf.oreplace_OnlineSecurity \n", + " ,OnlineBackup=tdf.oreplace_OnlineBackup \n", + " ,DeviceProtection=tdf.oreplace_DeviceProtection \n", + " ,TechSupport=tdf.oreplace_TechSupport \n", + " ,StreamingTV=tdf.oreplace_StreamingTV \n", + " ,StreamingMovies=tdf.oreplace_StreamingMovies \n", + " ,Contract=tdf.Contract 
\n", + " ,PaperlessBilling=tdf.PaperlessBilling \n", + " ,PaymentMethod=tdf.PaymentMethod \n", + " ,MonthlyCharges=tdf.MonthlyCharges \n", + " ,TotalCharges=tdf.TotalCharges \n", + " ,Churn = case({ \"Yes\" : 1, \"No\" : 0},value=tdf.Churn,else_=0) ) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5bddfa5-6a90-477b-8c38-aa2985736fe6", + "metadata": {}, + "outputs": [], + "source": [ + "tdf2" + ] + }, + { + "cell_type": "markdown", + "id": "d256d17f-1c10-4bc2-978c-4b9a2a184437", + "metadata": {}, + "source": [ + "

    Onehotencoding & Ordinal encoding

    \n", + "

    From our categorical attributes we can see that there are limited distinct values in each of these columns. We will use Teradata's OneHotEncodingFit and Transform and OrdinalEncodingFit and Transform functions to convert the categorical attributes to numerical.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc112884-d0bf-462d-9024-003bb4c2640a", + "metadata": {}, + "outputs": [], + "source": [ + "onehotfit_df = OneHotEncodingFit(data=tdf2,\n", + " is_input_dense=True,\n", + " approach=\"auto\",\n", + " target_column=[\"Gender\",\"Partner\",\"Dependents\",\"PhoneService\",\"MultipleLines\",\"OnlineSecurity\"\n", + " ,\"OnlineBackup\",\"DeviceProtection\",\"TechSupport\",\"StreamingTV\",\"StreamingMovies\",\n", + " \"PaperlessBilling\"],\n", + " category_counts=[2,2,2,2,2,2,2,2,2,2,2,2])" + ] + }, + { + "cell_type": "markdown", + "id": "2c136b69-41c1-42f1-bf47-89d1dae800a3", + "metadata": {}, + "source": [ + "

    \n", + "The other categorical columns

    \n", + "
      \n", + "
    • InternetService
    • \n", + "
    • Contract
    • \n", + "
    • PaperlessBilling
    • \n", + "
    • PaymentMethod
    • \n", + "

    have more distinct values, so we can apply ordinal encoding to them.

    \n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e6b2946-b138-4ad6-aaa7-dff619fcf1e7", + "metadata": {}, + "outputs": [], + "source": [ + "ordinalfit_df = OrdinalEncodingFit(target_column=['InternetService','Contract','PaperlessBilling','PaymentMethod'],\n", + " default_value=-1,\n", + " data=tdf2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffe66188-9511-485c-84e5-2638f758f4c6", + "metadata": {}, + "outputs": [], + "source": [ + "ordinalfit_df.result" + ] + }, + { + "cell_type": "markdown", + "id": "7a437c68-b650-4300-a3e8-cd39a5e21054", + "metadata": {}, + "source": [ + "

    Scale the numerical values

    For the numerical attributes we will use the ScaleFit and ScaleTransform functions to scale the specified input table columns, i.e., apply a specific scale method such as standard deviation, mean, etc. to the input columns.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "059dd73a-3751-46a8-85bf-7a11a7eacb7c", + "metadata": {}, + "outputs": [], + "source": [ + "scalefit_df = ScaleFit(data=tdf2,\n", + " target_columns=['MonthlyCharges','TotalCharges'],\n", + " scale_method=\"MIDRANGE\",\n", + " miss_value=\"KEEP\",\n", + " global_scale=False)" + ] + }, + { + "cell_type": "markdown", + "id": "faec04de-8197-4c10-bcc5-3bc3f605e81e", + "metadata": {}, + "source": [ + "

    Putting it all together

    We will use the ColumnTransformer function to apply all the transformations from the fit tables created above in one go.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d108b118-0970-4c84-9371-df4ef125ce69", + "metadata": {}, + "outputs": [], + "source": [ + "ColumnTransformer_out = ColumnTransformer(fillrowid_column_name=\"output_value\",\n", + " input_data=tdf2,\n", + " onehotencoding_fit_data=onehotfit_df.result,\n", + " ordinalencoding_fit_data=ordinalfit_df.result,\n", + " scale_fit_data=scalefit_df.output)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4da18dae-fec0-4bba-b87c-ab7045fd21e1", + "metadata": {}, + "outputs": [], + "source": [ + "Transformed_data= ColumnTransformer_out.result.assign(drop_columns=True,\n", + " Churn=tdf2.Churn,\n", + " CustomerID=ColumnTransformer_out.result.CustomerID,\n", + " SeniorCitizen=ColumnTransformer_out.result.SeniorCitizen,\n", + " Tenure=ColumnTransformer_out.result.Tenure,\n", + " InternetService=ColumnTransformer_out.result.InternetService,\n", + " Contract=ColumnTransformer_out.result.Contract,\n", + " PaperlessBilling=ColumnTransformer_out.result.PaperlessBilling,\n", + " PaymentMethod=ColumnTransformer_out.result.PaymentMethod,\n", + " MonthlyCharges=ColumnTransformer_out.result.MonthlyCharges,\n", + " TotalCharges=ColumnTransformer_out.result.TotalCharges,\n", + " Gender_0=ColumnTransformer_out.result.Gender_0,\n", + " Gender_1=ColumnTransformer_out.result.Gender_1,\n", + " Partner_0=ColumnTransformer_out.result.Partner_0,\n", + " Partner_1=ColumnTransformer_out.result.Partner_1,\n", + " Dependents_0=ColumnTransformer_out.result.Dependents_0,\n", + " Dependents_1=ColumnTransformer_out.result.Dependents_1,\n", + " PhoneService_0=ColumnTransformer_out.result.PhoneService_0,\n", + " PhoneService_1=ColumnTransformer_out.result.PhoneService_1,\n", + " MultipleLines_0=ColumnTransformer_out.result.MultipleLines_0,\n", + " MultipleLines_1=ColumnTransformer_out.result.MultipleLines_1,\n", + " OnlineSecurity_0=ColumnTransformer_out.result.OnlineSecurity_0,\n", + " 
OnlineSecurity_1=ColumnTransformer_out.result.OnlineSecurity_1,\n", + " OnlineBackup_0=ColumnTransformer_out.result.OnlineBackup_0,\n", + " OnlineBackup_1=ColumnTransformer_out.result.OnlineBackup_1,\n", + " DeviceProtection_0=ColumnTransformer_out.result.DeviceProtection_0,\n", + " DeviceProtection_1=ColumnTransformer_out.result.DeviceProtection_1,\n", + " TechSupport_0=ColumnTransformer_out.result.TechSupport_0,\n", + " TechSupport_1=ColumnTransformer_out.result.TechSupport_1,\n", + " StreamingTV_0=ColumnTransformer_out.result.StreamingTV_0,\n", + " StreamingTV_1=ColumnTransformer_out.result.StreamingTV_1,\n", + " StreamingMovies_0=ColumnTransformer_out.result.StreamingMovies_0,\n", + " StreamingMovies_1=ColumnTransformer_out.result.StreamingMovies_1,\n", + " PaperlessBilling_0=ColumnTransformer_out.result.PaperlessBilling_0,\n", + " PaperlessBilling_1=ColumnTransformer_out.result.PaperlessBilling_1)\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e4c30b0-4989-4540-bf60-3e4631afeacd", + "metadata": {}, + "outputs": [], + "source": [ + "Transformed_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f38c7f0f-6da8-4ef9-acaf-0774d29c92e8", + "metadata": {}, + "outputs": [], + "source": [ + "Transformed_data.shape" + ] + }, + { + "cell_type": "markdown", + "id": "0edef2c8-e568-4626-9377-e189d66e3350", + "metadata": {}, + "source": [ + "

    We can see from above how our data is transformed from the original values.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05c50e66-97a9-4fc9-8bd1-20654dc318fc", + "metadata": {}, + "outputs": [], + "source": [ + "# Copying the intermediate table to database\n", + "Transformed_data.to_sql(\"Transformed_data\",primary_index = \"CustomerID\", if_exists = \"replace\")" + ] + }, + { + "cell_type": "markdown", + "id": "37551d5e-2366-42cf-83f9-1a48ee438c6c", + "metadata": {}, + "source": [ + "

    Create train and test data

    Now we have transformed our data and it is fit to be used in machine learning models, let us split the whole dataset into train and test sets for model training and scoring. We will use TrainTestSplit function for this task.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b22bf0d0-255e-4ea9-8bdc-ffac9ea02f34", + "metadata": {}, + "outputs": [], + "source": [ + "TrainTestSplit_out = TrainTestSplit(\n", + " data = DataFrame('Transformed_data'),\n", + " id_column = \"CustomerID\",\n", + " train_size = 0.75,\n", + " test_size = 0.25,\n", + " seed = 21\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19d382b8-7f46-43f8-aee9-5598d6f24ebf", + "metadata": {}, + "outputs": [], + "source": [ + "# Split into 2 virtual dataframes\n", + "df_train = TrainTestSplit_out.result[TrainTestSplit_out.result['TD_IsTrainRow'] == 1].drop(['TD_IsTrainRow'], axis = 1)\n", + "df_test = TrainTestSplit_out.result[TrainTestSplit_out.result['TD_IsTrainRow'] == 0].drop(['TD_IsTrainRow'], axis = 1)" + ] + }, + { + "cell_type": "markdown", + "id": "88e87734-1eb6-47a6-87d4-3b2d0585088a", + "metadata": {}, + "source": [ + "

    We have done our preprocessing of data and we created our training and test datasets, let's now create some predictive models." + ] + }, + { + "cell_type": "markdown", + "id": "3624ff0f-2e80-450e-a76e-85398a8c73da", + "metadata": {}, + "source": [ + "


    \n", + "\n", + "

    6. InDb Model Training and Scoring

    " + ] + }, + { + "cell_type": "markdown", + "id": "2893fa15-2812-473d-b91c-5949ba436461", + "metadata": {}, + "source": [ + "
    \n", + "

    6.1 Logistic Regression

    " + ] + }, + { + "cell_type": "markdown", + "id": "f4561645-5edd-4e2b-9983-59e77ab4745a", + "metadata": {}, + "source": [ + "

    For our model we will use logistic regression.
    \n", + " Logistic regression is a statistical algorithm used for binary classification problems. It is a type of supervised learning algorithm that predicts the probability of an input belonging to a certain class (e.g., positive or negative) based on its features.
    Logistic regression works by modeling the relationship between the input features and the probability of belonging to a certain class using a logistic function. The logistic function takes the input feature values and maps them onto a probability scale between 0 and 1, which represents the probability of belonging to the positive class.
    \n", + " The GLM function is a generalized linear model (GLM) that performs regression and classification analysis on data sets.\n", + "
    Please refer GLM for function elements and output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc6639e3-2427-42d2-b302-08e18196b2b1", + "metadata": {}, + "outputs": [], + "source": [ + "df_train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e92723cc-1aec-4fee-97fe-96b3e86e7802", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import GLM, TDGLMPredict\n", + "\n", + "glm_model = GLM(data = df_train,\n", + " #input_columns = train_col,\n", + " input_columns = ['1:8','10:33'], \n", + " response_column = 'Churn',\n", + " family = 'Binomial')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e3b4239-40a5-4b2c-9589-7d974c574641", + "metadata": {}, + "outputs": [], + "source": [ + "glm_model.result" + ] + }, + { + "cell_type": "markdown", + "id": "a24bb51b-7115-486e-957a-848bad5bc4d9", + "metadata": {}, + "source": [ + "

    We have created our model, let's do the predictions on the test dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd7ac897-3b98-47f6-a9c8-33a75f6dac4f", + "metadata": {}, + "outputs": [], + "source": [ + "glm_prediction = TDGLMPredict(newdata = df_test, #test_dataset,\n", + " id_column = 'CustomerID',\n", + " object = glm_model.result,\n", + " accumulate = 'Churn',\n", + " family = 'Binomial',\n", + " output_prob=True,\n", + " output_responses = ['0', '1'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5280b57-9e01-42c4-9b89-5734dc0968bf", + "metadata": {}, + "outputs": [], + "source": [ + "glm_prediction.result\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "498799ce-2607-4cba-94e6-26a40abd7d0e", + "metadata": {}, + "outputs": [], + "source": [ + "out_glm = glm_prediction.result.assign(prediction = glm_prediction.result.prediction.cast(type_ = BYTEINT))\n", + "out_glm = out_glm.assign(prediction = out_glm.prediction.cast(type_ = VARCHAR(2)))\n", + "out_glm = out_glm.assign(Churn = out_glm.Churn.cast(type_ = VARCHAR(2)))\n", + "out_glm" + ] + }, + { + "cell_type": "markdown", + "id": "121887bb-4fb2-429f-99d7-120c6dd47e51", + "metadata": {}, + "source": [ + "

    The output above shows prob_1, i.e. the probability that the customer will churn, and prob_0, i.e. the probability that the customer will not churn. The prediction column uses these probabilities to assign a class label.

    " + ] + }, + { + "cell_type": "markdown", + "id": "7fa0a4c0-c4f8-499b-9e60-8bc87684a5c8", + "metadata": { + "tags": [] + }, + "source": [ + "
    \n", + "

    6.2 Evaluation of Logistic Regression Model

    \n", + "

    We will use the ClassificationEvaluator function to evaluate the trained glm model on test data. This will let us know how well our model has performed on unseen data.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "683015ad-d54b-4a33-a71f-345a348ee912", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_glm = ClassificationEvaluator(\n", + " data = out_glm,\n", + " observation_column = 'Churn',\n", + " prediction_column = 'prediction',\n", + " labels = ['0', '1']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "679b7a22-e08e-414a-aec3-e8ae6b1e0701", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_glm.output_data.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "d281b3fa-0793-4e86-986b-26bda09833ec", + "metadata": {}, + "source": [ + "

    The above output shows the precision, recall, and F1-score values computed from the confusion matrix.

    \n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
    ColumnDescription
    PrecisionThe positive predictive value. Refers to the fraction of relevant instances among\n", + "the total retrieved instances.\n", + " Precision answers the following question: what proportion of predicted Positives is truly Positive? \n", + " Precision = (TP)/(TP+FP)
    RecallRefers to the fraction of relevant instances retrieved over the total amount of\n", + "relevant instances. Recall answers a different question: what proportion of actual Positives is correctly classified?\n", + "Recall = (TP)/(TP+FN)
    F1F1 score, defined as the harmonic mean of the precision and recall and is a number between 0 and 1. F1 score maintains a balance between the precision and recall for your classifier. \n", + " F1 = 2*(precision*recall)/(precision+recall)
    SupportThe number of times a label displays in the Observation Column.
    \n", + "

    **TP: True Positive, FP: False Positive, TN: True Negative, FN: False Negative

    " + ] + }, + { + "cell_type": "markdown", + "id": "6aaf3776-22c7-4697-a7f8-e5e334067b18", + "metadata": { + "tags": [] + }, + "source": [ + "

    We can also calculate the mean absolute error and the AUC (Area Under the Curve) of the Receiver Operating Characteristic (ROC) curve.
    Mean Absolute Error is the sum of the absolute differences between actual and predicted values, averaged over the number of observations.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "750ff62e-db60-44cd-b968-38be7a6fcc0d", + "metadata": {}, + "outputs": [], + "source": [ + "glm_pred = glm_prediction.result.to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b108741e-a4a9-4c29-a521-e9cc8146cf53", + "metadata": {}, + "outputs": [], + "source": [ + "print(mean_absolute_error(glm_pred['Churn'], glm_pred['prob_1']))" + ] + }, + { + "cell_type": "markdown", + "id": "35733201-e225-4ba5-b712-105992bf177e", + "metadata": {}, + "source": [ + "

    The ROC curve is a plot of the TPR (True Positive Rate) against the FPR (False Positive Rate). The area under the ROC curve is a metric of how well the model can distinguish between positive and negative classes. The higher the AUC, the better the model's performance in distinguishing between the positive and negative classes.

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5962c501-8e8e-4594-ab63-d6e0df2e07a8", + "metadata": {}, + "outputs": [], + "source": [ + "AUC = roc_auc_score(glm_pred['Churn'], glm_pred['prob_1'])\n", + "AUC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "483506fb-d59d-4df0-a2fe-9d1e86ffdf72", + "metadata": {}, + "outputs": [], + "source": [ + "fpr, tpr, thresholds = roc_curve(glm_pred['Churn'], glm_pred['prob_1'])\n", + "plt.plot(fpr, tpr, color='orange', label='ROC. AUC = {}'.format(str(AUC)))\n", + "plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')\n", + "plt.xlabel('False Positive Rate')\n", + "plt.ylabel('True Positive Rate')\n", + "plt.title('Receiver Operating Characteristic (ROC) Curve')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ae6647ac-3dd6-406a-8d73-fb793b54d28f", + "metadata": {}, + "source": [ + "
    \n", + "

    6.3 XGB

    \n", + "

    \n", + " XGBoost (eXtreme Gradient Boosting) is based on the gradient boosting framework, which is an ensemble learning method that combines multiple weak or base models (typically decision trees) to create a more accurate and robust predictive model. XGBoost improves upon traditional gradient boosting by using a number of optimization techniques, including parallelization, regularization, and efficient handling of missing values, to achieve faster training times and better model performance.
    \n", + " Teradata's XGBoost function is an implementation of the gradient boosted decision tree designed for speed and performance. In gradient boosting, each iteration fits a model to the residuals (errors) of the previous iteration to correct the errors made by existing models. The predicted residual is multiplied by a learning rate and then added to the previous prediction. Models are added sequentially until no further improvements can be made. It is called gradient boosting because it uses a gradient descent algorithm to minimize the loss when adding new models.\n", + "
    Please refer XGBoost for function elements and output." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aff1ebc7-ecb6-44e7-8c8d-b1da9505f9bb", + "metadata": {}, + "outputs": [], + "source": [ + "XGBoost_model = XGBoost(\n", + " data = df_train,\n", + " input_columns = ['1:8','10:33'],\n", + " response_column = 'Churn',\n", + " model_type = 'CLASSIFICATION',\n", + " \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "062f024d-5e3d-4a59-93d3-763812d70053", + "metadata": {}, + "outputs": [], + "source": [ + "XGBoostPredict_out = XGBoostPredict(\n", + " newdata = df_test,\n", + " object = XGBoost_model.result,\n", + " id_column = 'CustomerID',\n", + " accumulate = 'Churn',\n", + " model_type = 'CLASSIFICATION',\n", + " object_order_column = ['task_index', 'tree_num', 'iter', 'class_num', 'tree_order'],\n", + " output_responses = ['0', '1'],\n", + " output_prob = True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c70378d-9b30-41e9-bfc3-ae901884f9c4", + "metadata": {}, + "outputs": [], + "source": [ + "out_xgb = XGBoostPredict_out.result.assign(Prediction = XGBoostPredict_out.result.Prediction.cast(type_ = BYTEINT))\n", + "out_xgb = out_xgb.assign(Prediction = out_xgb.Prediction.cast(type_ = VARCHAR(2)))\n", + "out_xgb = out_xgb.assign(Churn = out_xgb.Churn.cast(type_ = VARCHAR(2)))\n", + "out_xgb" + ] + }, + { + "cell_type": "markdown", + "id": "48a42d5b-5fba-4036-b094-19858e2c560f", + "metadata": {}, + "source": [ + "

    We have created our model, let's do the predictions on the test dataset." + ] + }, + { + "cell_type": "markdown", + "id": "0bd173a4-10ef-4528-a6e2-b67d5f4df07e", + "metadata": {}, + "source": [ + "


    \n", + "

    6.4 Evaluation of XGB Model

    " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23fd8827-dc00-4604-9c9e-4ef7352cf834", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_xgb = ClassificationEvaluator(\n", + " data = out_xgb,\n", + " observation_column = 'Churn',\n", + " prediction_column = 'Prediction',\n", + " labels = ['0', '1']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aefe83b-aedc-4173-9396-68572258fcd4", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_xgb.output_data.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e7466ae-b42c-4005-a094-729bb1230b33", + "metadata": {}, + "outputs": [], + "source": [ + "xgb_pred = XGBoostPredict_out.result.to_pandas().reset_index().sort_values(\"CustomerID\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26c69f90-824e-4f84-a5d1-46e771a0ee00", + "metadata": {}, + "outputs": [], + "source": [ + "print(mean_absolute_error(xgb_pred['Churn'], xgb_pred['Prob_1']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c71ab9ce-e83c-4f1b-82fb-550119a98704", + "metadata": {}, + "outputs": [], + "source": [ + "AUC = roc_auc_score(xgb_pred['Churn'], xgb_pred['Prob_1'])\n", + "AUC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78aa9c8e-e5e0-4a89-af63-27e1e638023f", + "metadata": {}, + "outputs": [], + "source": [ + "fpr, tpr, thresholds = roc_curve(xgb_pred['Churn'], xgb_pred['Prob_1'])\n", + "plt.plot(fpr, tpr, color='orange', label='ROC. AUC = {}'.format(str(AUC)))\n", + "plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')\n", + "plt.xlabel('False Positive Rate')\n", + "plt.ylabel('True Positive Rate')\n", + "plt.title('Receiver Operating Characteristic (ROC) Curve')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "e91ddacb-4ca3-405c-93c8-c587788191c4", + "metadata": {}, + "source": [ + "

    Conclusion

    " + ] + }, + { + "cell_type": "markdown", + "id": "516e0588-5e98-4373-8f30-8e8f40898835", + "metadata": {}, + "source": [ + "

    In this demo we have seen how we can do analysis and pre-processing of the data in Vantage using InDb functions. We have also created two commonly used predictive models for classification and predicted the customers that are likely to churn. " + ] + }, + { + "cell_type": "markdown", + "id": "35ebb886-8da9-479a-8995-c6dd7ccebffd", + "metadata": {}, + "source": [ + "


    \n", + "

    7. Cleanup

    " + ] + }, + { + "cell_type": "markdown", + "id": "f0b01f3e-03fa-4a14-b388-02eeb210b8c1", + "metadata": {}, + "source": [ + "

    Work Tables

    \n", + "

    \n", + "We need to clean up our work tables to prevent errors next time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "914cfbcf-f229-496c-be13-b63c62729291", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['Transformed_data']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name = table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ad94d1e-d82d-4611-b5c7-4180397f6c94", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "10724002-0091-4ef1-b091-71c0a2fdda5a", + "metadata": {}, + "source": [ + "


    " + ] + }, + { + "cell_type": "markdown", + "id": "3eee6695-147e-4b5b-a0e0-ae6a1d9629db", + "metadata": {}, + "source": [ + "Required Materials\n", + "

    Let’s look at the elements we have available for reference for this notebook:

    " + ] + }, + { + "cell_type": "markdown", + "id": "90d6c2a3-92e9-4121-a46a-2beaba63cac2", + "metadata": {}, + "source": [ + "

    Filters:

    \n", + "
      \n", + "
    • Industry: Telco
    • \n", + "
    • Functionality: Machine Learning
    • \n", + "
    • Use Case: Customer Retention
    • \n", + "
    \n", + "

    Related Resources:

    \n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "9094fe39-0a98-43d6-a62b-7c2d36acf654", + "metadata": {}, + "source": [ + "

    Reference Links:

    \n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "a8f9f644", + "metadata": {}, + "source": [ + "Dataset:\n", + "\n", + "- `CustomerID`: unique id of customer\n", + "- `Gender`: Whether the customer is a male or a female\n", + "- `SeniorCitizen`:Whether the customer is a senior citizen or not (1, 0)\n", + "- `Partner`:Whether the customer has a partner or not (Yes, No)\n", + "- `Dependents`:Whether the customer has dependents or not (Yes, No)\n", + "- `Tenure`:Number of months the customer has stayed with the company\n", + "- `PhoneService`:Whether the customer has a phone service or not (Yes, No)\n", + "- `MultipleLines`:Whether the customer has multiple lines or not (Yes, No, No phone service)\n", + "- `InternetService`:Customer’s internet service provider (DSL, Fiber optic, No)\n", + "- `OnlineSecurity`:Whether the customer has online security or not (Yes, No, No internet service)\n", + "- `OnlineBackup`:Whether the customer has online backup or not (Yes, No, No internet service)\n", + "- `DeviceProtection`:Whether the customer has device protection or not (Yes, No, No internet service)\n", + "- `TechSupport`:Whether the customer has tech support or not (Yes, No, No internet service)\n", + "- `StreamingTV`:Whether the customer has streaming TV or not (Yes, No, No internet service)\n", + "- `StreamingMovies`:Whether the customer has streaming movies or not (Yes, No, No internet service)\n", + "- `Contract`:The contract term of the customer (Month-to-month, One year, Two year)\n", + "- `PaperlessBilling`:Whether the customer has paperless billing or not (Yes, No)\n", + "- `PaymentMethod`:The customer’s payment method (Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic))\n", + "- `MonthlyCharges`:The amount charged to the customer monthly\n", + "- `TotalCharges`:The total amount charged to the customer\n", + "- `Churn`:Whether the customer churned or not (Yes or No)" + ] + }, + { + "cell_type": "markdown", + "id": 
"c30802a4-8141-47f6-971d-0bb79be6f5bf", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "toc-autonumbering": true + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Telco_Customer_Churn/__pycache__/oaf_utils.cpython-39.pyc b/VantageCloud_Lake/UseCases/Telco_Customer_Churn/__pycache__/oaf_utils.cpython-39.pyc new file mode 100644 index 00000000..f1d97cbf Binary files /dev/null and b/VantageCloud_Lake/UseCases/Telco_Customer_Churn/__pycache__/oaf_utils.cpython-39.pyc differ