diff --git a/VantageCloud_Lake/Getting_Started/Demo_XGB_Scoring.py b/VantageCloud_Lake/Getting_Started/Demo_XGB_Scoring.py
new file mode 100644
index 00000000..3760fd5e
--- /dev/null
+++ b/VantageCloud_Lake/Getting_Started/Demo_XGB_Scoring.py
@@ -0,0 +1,71 @@
+# Load dependency packages
+import sys
+import csv
+import numpy as np
+import pandas as pd
+from xgboost import XGBClassifier, Booster
+import warnings
+
+# pickle will issue a caution warning, if model pickling was done with
+# different library version than used here. The following disables any warnings
+# that might otherwise show in the scriptlog files on the Advanced SQL Engine
+# nodes in this case. Yet, do keep an eye for incompatible pickle versions.
+warnings.filterwarnings('ignore')
+
+# Know your data: You must know in advance the number and data types of the
+# incoming columns from the SQL Engine database!
+# For this script, the input expected format is:
+colNames = ['txn_id',
+ 'txn_type_CASH_OUT',
+ 'txn_type_CASH_IN',
+ 'txn_type_TRANSFER',
+ 'txn_type_DEBIT',
+ 'txn_type_PAYMENT',
+ 'txn_type_other',
+ 'amount',
+ 'oldbalanceOrig',
+ 'newbalanceOrig',
+ 'oldbalanceDest',
+ 'newbalanceDest',
+ 'isFraud']
+
+
+
+model = XGBClassifier()
+booster = Booster()
+booster.load_model('xgb_model')
+model._Booster = booster
+
+
+d = csv.DictReader(sys.stdin.readlines(), fieldnames = colNames)
+
+df = pd.DataFrame(d, columns = colNames)
+
+# Use try...except to produce an error if something goes wrong in the try block
+try:
+ # Exit gracefully if DataFrame is empty
+ if df.empty:
+ sys.exit()
+
+ # Specify the rows to be scored by the model and call the predictor.
+ X_test = df[['txn_type_CASH_OUT', 'txn_type_CASH_IN','txn_type_TRANSFER', 'txn_type_DEBIT','txn_type_PAYMENT', 'txn_type_other','amount','oldbalanceOrig', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest']].astype(float)
+
+ y_prob = model.predict_proba(X_test)
+ df[['prob_0', 'prob_1']] = y_prob
+
+ y_pred = model.predict(X_test)
+ df['prediction'] = y_pred
+
+ # Export results to the Database through standard output.
+ for index, value in df.iterrows():
+ my_str = str(value['txn_id']) + ',' + str(value['prob_0']) + ',' + str(value['prob_1']) + ',' + str(value['prediction']) + ',' + str(value['isFraud'])
+ print(my_str)
+
+
+except (SystemExit):
+ # Skip exception if system exit requested in try block
+ pass
+except: # Specify in standard error any other error encountered
+ print("Script Failure :", sys.exc_info()[0], file=sys.stderr)
+ raise
+ sys.exit()
diff --git a/VantageCloud_Lake/Getting_Started/Opensource_Data_Science_OAF.ipynb b/VantageCloud_Lake/Getting_Started/Opensource_Data_Science_OAF.ipynb
new file mode 100644
index 00000000..57ef699d
--- /dev/null
+++ b/VantageCloud_Lake/Getting_Started/Opensource_Data_Science_OAF.ipynb
@@ -0,0 +1,1037 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "hawaiian-daniel",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ " \n",
+ " Leveraging Open Source Machine Learning with ClearScape Analytics and Open Analytics Framework\n",
+ "
\n",
+ " \n",
+ "
Introduction:
\n", + "\n", + "Open-source Machine Learning, AI, and Advanced Analytics tools, techniques, and resources offer enterprises limitless opportunities to drive new insights and business value from their internal and external data landscape. Unfortunately, with these opportunities come significant challenges to realizing success. Some of these challenges include:
\n", + "VantageCloud Lake Edition Open Analytics Framework is the only enterprise-class platform that addresses these challenges with a simple, powerful architecture. The following demonstration will illustrate how users can use any open-source tool or package of choice, deploy it to a custom, isolated environment; and then execute in parallel and at massive scale.
\n", + "\n", + "This demonstration utilizes a VantageCloud Lake Analytic Cluster architecture, using the shared data sets created in the previous demonstration. Specifically the \"Txn_History\" data that represents \"CashApp\" style transaction history stored in the Vantage Object File System (OFS).
\n", + "\n", + "The high level process is as follows:
\n", + "\n", + "\n",
+ "
\n", + " \n", + " | ![]() |
This notebook consists of three primary demonstrations
\n", + "2. Connect to VantageCloud Lake
\n", + "Connect to VantageCloud using `create_context` from the teradataml Python library. Input your connection details, including the host, username, password and Analytic Compute Group name.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "700c32b0-bd3d-4cee-85db-788889f0c7a7", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=Opensource_Data_Science_OAF.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "offshore-watch", + "metadata": {}, + "source": [ + "3. Demo 1 - Custom Container Management
\n", + "\n", + "\n", + "\n", + "The Teradata Vantage Python Client Library provides simple, powerful methods for the creation and maintenance of custom Python runtime environments in the VantageCloud environment. This gives practitioners complete control over the behavior and quality of their model performance and analytic accuracy running on the Analytic Cluster. The following demonstration will show how easy it is to create a custom xgboost-based scoring environment.
\n", + "\n", + "Custom environments are persistent. Users only need to create them once; afterwards they can be saved, updated, or modified as needed.
\n", + "\n", + "Container Management Process
\n", + "\n",
+ "
\n", + " \n", + " \n", + " | \n",
+ " ![]() | \n",
+ "
3.1 Connect to the Environment Service
\n", + "\n", + "To better support integration with Cloud Services and common automation tools, the User Environment Service is accessed via RESTful APIs. These APIs can be called directly or, as in the examples shown below, through the Python Package for Teradata (teradataml) methods.
\n", + "\n", + "In order to properly authenticate to the UES infrastructure, the user must log in with the same credentials that are used to connect to the database. When the following cell executes, follow the instructions to open a browser window, and log in with that user.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "734d8327-f92c-4843-84be-b89b8fdf690f", + "metadata": {}, + "outputs": [], + "source": [ + "# We've already loaded all the values into our environment variables and into a dictionary, env_vars.\n", + "# username=env_vars.get(\"username\") isn't required when using base_url, pat and pem.\n", + "\n", + "if set_auth_token(base_url=env_vars.get(\"ues_uri\"),\n", + " pat_token=env_vars.get(\"access_token\"), \n", + " pem_file=env_vars.get(\"pem_file\"),\n", + " valid_from=int(time.time())\n", + " ):\n", + " print(\"UES Authentication successful\")\n", + "else:\n", + " print(\"UES Authentication failed. Check credentials.\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "869bec81-31cd-4ec3-97e7-1802ae2cfd7b", + "metadata": {}, + "outputs": [], + "source": [ + "gpu_compute_group = env_vars.get(\"gpu_compute_group\")\n", + "execute_sql(f\"SET SESSION COMPUTE GROUP {gpu_compute_group};\")\n", + "print(f\"Compute group set to {gpu_compute_group}\") " + ] + }, + { + "cell_type": "markdown", + "id": "eligible-newfoundland", + "metadata": {}, + "source": [ + "3.2 Create a Custom Container in Vantage
\n", + "\n", + "If desired, the user can create a new custom environment by starting with a \"base\" image and customizing it. The steps are:
\n", + "3.3 Install Dependencies
\n", + "\n", + "The second step in the customization process is to install Python package dependencies. This set of code:\n", + "
\n", + "\n", + "4. Demo 2 - Install Custom Models and Scripts
\n", + "\n", + "Once the custom runtime environment has been created, the user can then load custom user-created assets. For the purposes of this Demonstration, we will load two files;
\n", + "\n", + "Once again, the Vantage Python Library makes this process straightforward by calling two simple methods:
\n", + "\n", + "\n",
+ "
\n", + " | \n",
+ " ![]() | \n",
+ "
4.1 Install User Files in the Cluster Container
\n", + "\n", + "Users can load any asset to the environment using the install_file method. This ensures that only authenticated users can install specific files into a dedicated filesystem, and helps prevent malicious code injection. Users pass the file name, and whether to replace an existing file.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "large-luther", + "metadata": {}, + "outputs": [], + "source": [ + "# Install xgboost model file.\n", + "demo_env.install_file('xgb_model', replace = True)\n", + "\n", + "# Install the desired Python script into the environment.\n", + "demo_env.install_file('Demo_XGB_Scoring.py', replace = True)" + ] + }, + { + "cell_type": "markdown", + "id": "minimal-transport", + "metadata": {}, + "source": [ + "4.2 List all installed files
\n", + "\n", + "files property lists the asset, size, and last updated timestamp. As above, these methods are available to manage the container remotely, since these containers live in the Vantage environment.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "running-tribute", + "metadata": {}, + "outputs": [], + "source": [ + "# Verify the files have been installed correctly.\n", + "demo_env.files" + ] + }, + { + "cell_type": "markdown", + "id": "responsible-switzerland", + "metadata": {}, + "source": [ + "5. Demo 3 - Model Scoring at Scale
\n", + "\n", + "VantageCloud Lake Edition Analytic Clusters combine the power and scale of native ClearScape Analytics Functions with the open and flexible runtime environments; offering users the flexibility to balance built-in data prep, transformation and feature engineering functions with custom code and models at massive scale.
\n", + "\n", + "Enterprise Class customers report the ability to reduce data prep and model scoring times from several hours per run to seconds; effectively allowing model scoring in near-real-time.
\n", + "\n", + "This demonstration will illustrate these key concepts:
\n", + "\n", + "\n",
+ "
\n", + " \n", + " | \n",
+ " ![]() | \n",
+ "
5.1 Data Transformation/Feature Engineering
\n", + "\n", + "Create a reference to the data set in Vantage, and apply powerful transformation functions directly on the data. ClearScape Analytics is a suite of in-database massively-parallel-processing functions for statistical analysis, data cleaning and transformation, machine learning, text analytics, and model scoring. Practitioners can leverage these functions together with open-source modeling as illustrated here, or create powerful, native end-to-end pipelines using just these functions.
\n", + "\n", + "5.2 Engineer Features
\n", + "\n", + "Call the ClearScape One Hot Encoding function to transform the categorical column into numeric features.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "imposed-match", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Perform native one-hot encoding on the data\n", + "# These functions use a \"fit-and-transform\" pattern\n", + "# that supports reuse and easier operationalization of the transformation process\n", + "\n", + "from teradataml import OneHotEncodingFit, OneHotEncodingTransform\n", + "\n", + "res_ohe = OneHotEncodingFit(data = tdf_test, \n", + " target_column = 'type', \n", + " categorical_values = ['CASH_OUT', 'CASH_IN', 'TRANSFER', 'DEBIT', 'PAYMENT'], \n", + " other_column = 'other',\n", + " is_input_dense = True)\n", + "\n", + "res_transformed = OneHotEncodingTransform(data = tdf_test, object = res_ohe.result, is_input_dense = True)\n", + "res_transformed.result.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "collectible-gather", + "metadata": {}, + "source": [ + "5.3 Execute the Scoring function
\n", + "\n", + "Now that the categorical column has been encoded, the XGBoost model can be called. This is executed via the Apply method, where we pass;
\n", + "\n", + "Finally, the script is executed by calling the \"execute_script\" method; this \"lazy\" evaluation allows for more modular and performant architecture.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4af9a813-110a-4b7a-b46f-3b4a12aaa585", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(res_transformed.result, table_name = 'Transformed_Tbl', if_exists = 'replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36b7284c-9522-453b-9a8b-59adebeb335e", + "metadata": {}, + "outputs": [], + "source": [ + "res_transformed = DataFrame.from_query(\"SELECT TOP 1000 * FROM Transformed_Tbl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "unlimited-liver", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "apply_obj = Apply(data = res_transformed.drop(['step', 'nameOrig', 'nameDest', 'isFlaggedFraud'], axis = 1),\n", + " apply_command = 'python3 Demo_XGB_Scoring.py',\n", + " returns = {'Actual': VARCHAR(2) , 'Prob_0': VARCHAR(30), 'Prob_1': VARCHAR(30), 'Prediction':VARCHAR(2), 'txn_id': VARCHAR(20)},\n", + " env_name = demo_env,\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "opening-manner", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Execute the Python script inside the remote user environment.\n", + "# The result is a teradataml DataFrame. \n", + "scored_data = apply_obj.execute_script()\n", + "\n", + "# Only return five rows - minimize network overhead\n", + "scored_data.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "chief-falls", + "metadata": {}, + "source": [ + "5.4 Analyze the Results
\n", + "\n", + "It is common practice to measure the efficacy of a model. For this demonstration, a \"Confusion Matrix\" is generated that shows the quantity of true vs. false positives and negatives for the model.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "distinguished-motor", + "metadata": {}, + "outputs": [], + "source": [ + "# Copy the predictions to the client\n", + "# to generate the simple Confusion Matrix\n", + "# and print the AUC (Area Under Curve)\n", + "\n", + "df_test = scored_data.to_pandas(all_rows = True)\n", + "cm = confusion_matrix(df_test['Actual'].astype(int), df_test['Prediction'].astype(int))\n", + "disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = ['0', '1'])\n", + "fig, ax = plt.subplots(figsize=(10,10))\n", + "disp.plot(ax=ax)\n", + "\n", + "plt.show()\n", + "\n", + "#Get AUC score - anything over .75 is decent\n", + "AUC = roc_auc_score(df_test['Actual'].astype(int), df_test['Prediction'].astype(int))\n", + "print(f'AUC: {AUC}')" + ] + }, + { + "cell_type": "markdown", + "id": "conceptual-crash", + "metadata": {}, + "source": [ + "5.5 Disconnect from Vantage
\n", + "\n", + "Once complete, one can remove the custom environment (if desired) and close the \"context\" to the Vantage system.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "tired-purple", + "metadata": {}, + "outputs": [], + "source": [ + "# uninstall the libraries from the environment first before removing it\n", + "demo_env.uninstall_lib(libs = demo_env.libs['name'].to_list())\n", + "remove_env(demo_env.env_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fiscal-animal", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "material-groove", + "metadata": {}, + "source": [ + "6. Appendix - Model Training and Evaluation
\n", + "\n", + "VantageCloud Lake Edition Analytic Clusters and ClearScape Analytics functions can also be leveraged for model training. This brief addendum shows an abbreviated process for developing and testing an open-source fraud detection model with Vantage and XGBoost.
" + ] + }, + { + "cell_type": "markdown", + "id": "abroad-underground", + "metadata": {}, + "source": [ + "6.1 Connect to Vantage
\n", + "\n", + "If necessary, connect to Vantage. If the context is still valid from above, this doesn't need to be run. The code below will read in an environment file (.env - this has been used in prior environment setup and data engineering examples) and will connect to Vantage with this information. The Vantage connection is referred to as a \"Context\" - a common python-rdbms connection architecture.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "contemporary-rouge", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=Opensource_Data_Science_OAF.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "modified-services", + "metadata": {}, + "source": [ + "6.2 Get a reference to the data
\n", + "\n", + "Create a Teradataml DataFrame which references the data set in Vantage. This could be a table stored in direct-attach block storage, Performance-Optimized Object Storage (OFS), or stored in an open format in any Object Store.
\n", + "\n", + "Teradataml DataFrames do not copy data into local memory, so complex analytic and transformation operations can run against data at any scale, while leveraging the parallel processing and workload isolation of Vantage Analytic Clusters.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "american-centre", + "metadata": {}, + "outputs": [], + "source": [ + "# Updated variables to insure they are the same\n", + "tdf_test = DataFrame(in_schema(\"DEMO_GLM_Fraud\", \"transaction_data\"))\n", + "tdf_test.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "terminal-network", + "metadata": {}, + "source": [ + "6.3 Engineer Features
\n", + "\n", + "Call the ClearScape One Hot Encoding function to transform the categorical column into numeric features.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "higher-courage", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import OneHotEncodingFit, OneHotEncodingTransform\n", + "\n", + "res_ohe = OneHotEncodingFit(data = tdf_test, \n", + " target_column = 'type', \n", + " categorical_values = ['CASH_OUT', 'CASH_IN', 'TRANSFER', 'DEBIT', 'PAYMENT'], \n", + " other_column = 'other',\n", + " is_input_dense = True)\n", + "\n", + "res_transformed = OneHotEncodingTransform(data = tdf_test, object = res_ohe.result, is_input_dense = True)\n", + "res_transformed.result.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "billion-drawing", + "metadata": {}, + "source": [ + "Design for Operations
\n", + "\n", + "Persist the \"Fit\" table to reuse it for the Operational transformation of new data
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "meaning-trading", + "metadata": {}, + "outputs": [], + "source": [ + "# copy the fit table to a permanent table for use later\n", + "res = copy_to_sql(res_ohe.result, table_name = 'OHE_FIT_TABLE', if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "id": "cognitive-dream", + "metadata": {}, + "source": [ + "6.4 Test/Train Split
\n", + "\n", + "Extraordinarily fast \"Sample\" function can split the data into multiple data sets in seconds.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ignored-scholar", + "metadata": {}, + "outputs": [], + "source": [ + "tdf_samples = res_transformed.result.sample(frac = [0.2, 0.8])\n", + "copy_to_sql(tdf_samples[tdf_samples['sampleid'] == 2], table_name = 'txns_train', if_exists = 'replace')\n", + "copy_to_sql(tdf_samples[tdf_samples['sampleid'] == 1], table_name = 'txns_test' , if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "id": "major-nudist", + "metadata": {}, + "source": [ + "6.5 Train the Model
\n", + "\n", + "Use open-source XGBoost Classifier to train the model using the \"training\" data split above.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "demanding-bouquet", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a Pandas DataFrame\n", + "df_train = DataFrame(\"txns_train\").to_pandas(all_rows = True)\n", + "\n", + "# define the input columns and target variable:\n", + "X_train = df_train[['type_CASH_OUT', 'type_CASH_IN', 'type_TRANSFER',\n", + " 'type_DEBIT', 'type_PAYMENT', 'type_other', 'amount','oldbalanceOrig', 'newbalanceOrig',\n", + " 'oldbalanceDest', 'newbalanceDest']]\n", + "y_train = df_train[['isFraud']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "strong-lottery", + "metadata": {}, + "outputs": [], + "source": [ + "# Fit the Model\n", + "warnings.filterwarnings('ignore')\n", + "from xgboost import XGBClassifier\n", + "\n", + "model = XGBClassifier()\n", + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "id": "atmospheric-occasions", + "metadata": {}, + "source": [ + "6.6 Test the Model
\n", + "\n", + "It is common practice to measure the efficacy of a model. For this demonstration, a \"Confusion Matrix\" is generated that shows the quantity of true vs. false positives and negatives for the model.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "australian-religion", + "metadata": {}, + "outputs": [], + "source": [ + "# Return a Pandas DataFrame from the split data above\n", + "\n", + "df_test = DataFrame(\"txns_test\").to_pandas(all_rows = True)\n", + "\n", + "# Define the input columns and target\n", + "X_test = df_test[['type_CASH_OUT', 'type_CASH_IN', 'type_TRANSFER',\n", + " 'type_DEBIT', 'type_PAYMENT', 'type_other', 'amount','oldbalanceOrig', 'newbalanceOrig',\n", + " 'oldbalanceDest', 'newbalanceDest']]\n", + "y_test = df_test[['isFraud']]\n", + "\n", + "\n", + "# Predict the class and the probability of Fraud\n", + "y_pred = model.predict(X_test)\n", + "y_prob = model.predict_proba(X_test)\n", + "\n", + "\n", + "# Generate the Confusion Matrix\n", + "df_test[['prob_0', 'prob_1']] = y_prob\n", + "df_test['prediction'] = y_pred\n", + "\n", + "cm = confusion_matrix(df_test['isFraud'], df_test['prediction'])\n", + "disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = ['0', '1'])\n", + "fig, ax = plt.subplots(figsize=(10,10))\n", + "disp.plot(ax=ax)\n", + "\n", + "plt.show()\n", + "\n", + "#Get AUC score - anything over .75 is decent\n", + "AUC = roc_auc_score(df_test['isFraud'], df_test['prediction'])\n", + "print(f'AUC: {AUC}')" + ] + }, + { + "cell_type": "markdown", + "id": "proper-friendship", + "metadata": {}, + "source": [ + "6.7 Save the Model
\n", + "\n", + "Save the model file in native xgboost format. This is used above in the main demonstration.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "assured-progressive", + "metadata": {}, + "outputs": [], + "source": [ + "model.save_model('xgb_model')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "formed-sheet", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "1a4db3dc-2241-4735-9a1e-a489c8986bdb", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + }, + "toc-autonumbering": false, + "toc-showmarkdowntxt": true + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/Getting_Started/images/Container_Layout.png b/VantageCloud_Lake/Getting_Started/images/Container_Layout.png new file mode 100644 index 00000000..79fac5d8 Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/Container_Layout.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/In_DB_Functions.png b/VantageCloud_Lake/Getting_Started/images/In_DB_Functions.png new file mode 100644 index 00000000..7445ea5f Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/In_DB_Functions.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/Model.png b/VantageCloud_Lake/Getting_Started/images/Model.png new file mode 100644 index 00000000..228bf77b Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/Model.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/OAF_Env.png b/VantageCloud_Lake/Getting_Started/images/OAF_Env.png new file mode 100644 index 00000000..1be627c3 Binary files /dev/null and 
b/VantageCloud_Lake/Getting_Started/images/OAF_Env.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/OAF_Overview.png b/VantageCloud_Lake/Getting_Started/images/OAF_Overview.png new file mode 100644 index 00000000..73b29048 Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/OAF_Overview.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/OAF_Scoring.png b/VantageCloud_Lake/Getting_Started/images/OAF_Scoring.png new file mode 100644 index 00000000..239be028 Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/OAF_Scoring.png differ diff --git a/VantageCloud_Lake/Getting_Started/images/TeradataLogo.png b/VantageCloud_Lake/Getting_Started/images/TeradataLogo.png new file mode 100644 index 00000000..a6811164 Binary files /dev/null and b/VantageCloud_Lake/Getting_Started/images/TeradataLogo.png differ diff --git a/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python.ipynb b/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python.ipynb new file mode 100644 index 00000000..fa2b09ac --- /dev/null +++ b/VantageCloud_Lake/UseCases/Anomaly_Detection/VCL_Anomaly_Detection_Python.ipynb @@ -0,0 +1,1900 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b1378a69-ac58-4d0c-af22-7ef881abac45", + "metadata": {}, + "source": [ + "\n",
+ " Anomaly Detection in Robot Welding Process\n",
+ "
\n",
+ " \n",
+ "
Introduction
\n", + "\n", + "Detecting anomalies reduces issues and delays in many industries, especially in the manufacturing field. There have been approaches to detect anomalies in the past, such as engineering rules and graph and deep learning. However, it still proves difficult to detect all the existing anomalies. Plus, companies are striving to minimize false positives, cope with the diversity of sensors and metrology issues, and deliver actionable insights at a business pace. Fortunately, Teradata and ClearScape Analytics have the solution. In ClearScape Analytics, users can execute all steps of anomaly detection from data preparation and exploration to model training and evaluations and adjustments. These analyses can improve the process and ensure accuracy in anomaly detection.
\n", + "\n", + "Spot Welding Quality Assessment
\n", + "Spot welding is a common technique used for welding car body panels, particularly in the assembly of smaller parts and components. Spot welding involves using a pair of copper electrodes to apply a series of short, high-current welding pulses to the metal, fusing the parts together at specific points or “spots”.
\n", + "\n", + "The automotive industry is known for its high level of automation, and spot welding is one of the most automated processes, heavily reliant on robots to improve efficiency, reduce labor costs, and improve the consistency and quality of the finished product. Poor welding quality is rare, but even so, the consequences of poor quality may not be negligible in terms of rework costs and customer satisfaction, especially when quality issues are detected too late.
\n", + "\n", + "Spot welding is a resistance welding process that uses large electrical current. There are many ways to assess the quality of a spot, like tensile or ultrasonic testing to assess the weld strength or the analysis of the welding current measured and recorded during the welding process. In this demo, we focus on the analysis of the anomalies in the welding spot due to welding current, and more specifically the resistance, i.e. the voltage-current ratio which impacts the quality of the welding. The shape of the resistance curve depends on many factors like the nature of the materials, the geometry, and the quality of the electrodes etc.
\n", + "\n", + "\n", + "Business Values
\n", + "Why Vantage?
\n", + "Many organizations fail to realize value from their ML and AI investments due to a lack of scale. It is estimated that for broad adoption across many industries, the number of models and model deployments needs to scale 100-1000x larger than their organizations currently support.
\n", + "The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.
\n", + "In this particular use case, the volume of machine sensor data was so great that millions of ML models were created to derive analytic features that ultimately deployed tens of thousands of models for real-time scoring. This extent of scale is only possible by combining the power of Vantage with native ClearScape Analytic functions.
\n", + "\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "b33aebf1-80cf-4043-99de-b2ac0356ea64", + "metadata": { + "tags": [] + }, + "source": [ + "1. Configure the environment
\n", + "\n",
+ "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n",
+ "
\n",
+ "Here's how we can do this:
2. Connect to VantageCloud Lake
\n", + "Connect to VantageCloud using create_context
from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.
3. Load the data
\n" + ] + }, + { + "cell_type": "markdown", + "id": "9476f53a-7115-4018-a58f-dd09f7fc8b88", + "metadata": {}, + "source": [ + "We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_AnomalyDetection\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").
\n", + "\n", + "**Note:** The tables are stored in the DEMO_AnomalyDetection_DB database; the cells below use views created on them in the DEMO_AnomalyDetection database.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99598e0a-8a6c-4539-a06d-f6723f67134f", + "metadata": {}, + "outputs": [], + "source": [ + "Sensor_Data = DataFrame(in_schema('DEMO_AnomalyDetection', 'Sensor_Data'))\n", + "Sensor_Data" + ] + }, + { + "cell_type": "markdown", + "id": "d4b9b958-737d-41a0-adec-91614fa0fe2e", + "metadata": {}, + "source": [ + "We get the above data from sensors. We focus on one plant (PLANT=1) and one robot (ROBOT_ID=41). The Partition_ID is the type of welding, ID is the WELDING_ID, X is time required for welding in ms and Y is the RESISTANCE. We create a view with the columns required to get data with proper column names.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88cde234-6107-487e-92f2-7f045576cc1d", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "query = f\"\"\"\n", + "REPLACE VIEW DEMO_AnomalyDetection.V_dataset_01 AS\n", + "SELECT\n", + " 1 AS PLANT\n", + ", {41} AS ROBOT_ID\n", + ", CAST(A.PARTITION_ID AS BIGINT) AS WELDING_TYPE\n", + ", CAST((DATE '{str(datetime.datetime.now()).split(' ')[0]}' + FLOOR((WELDING_ID-700*WELDING_TYPE)/100)) AS DATE FORMAT 'YYYY-MM-DD') AS WELDING_DAY\n", + ", CAST(A.ID AS BIGINT) AS WELDING_ID\n", + ", CAST(A.X AS INTEGER) AS TIME_MS\n", + ", A.Y AS RESISTANCE\n", + "FROM DEMO_AnomalyDetection.Sensor_Data A\n", + "\"\"\"\n", + "execute_sql(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ec3a959-c5e0-4039-88f8-846adca6f113", + "metadata": {}, + "outputs": [], + "source": [ + "welding_dataset_new = DataFrame(in_schema('DEMO_AnomalyDetection', 'V_dataset_01'))\n", + "welding_dataset_new" + ] + }, + { + "cell_type": "markdown", + "id": "09198aa2-6ab7-4339-a01a-365cba02c772", + "metadata": {}, + "source": [ + "3.1 - Some aggregations and visualization.
\n" + ] + }, + { + "cell_type": "markdown", + "id": "f83b1b1a-eece-487a-97d7-b4759ea624ce", + "metadata": {}, + "source": [ + "We will check the histogram based on the minimum and maximum Time for welding.
\n", + "A histogram is a good way to assess a distribution. To cope with large data volumes, it is recommended to compute the histogram bins in-database to leverage the Massively Parallel Architecture of Teradata Vantage. For that, we use the Histogram function of teradataml, which pushes the computations down to Vantage.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a5d38c3-ebb9-47a2-b8ad-f00acd9d769b", + "metadata": {}, + "outputs": [], + "source": [ + "welding_duration_ms = welding_dataset_new. \\\n", + " groupby(['PLANT','ROBOT_ID','WELDING_TYPE', 'WELDING_ID']). \\\n", + " agg({'TIME_MS':['min','max','count']})\n", + "welding_duration_ms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "642bf739-a421-4ffd-8fc1-53f273db9bd9", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Histogram\n", + "obj = Histogram(data=welding_duration_ms,\n", + " target_columns=\"count_TIME_MS\",\n", + " method_type=\"Scott\")\n", + "res = obj.result.sort('MinValue')\n", + "res" + ] + }, + { + "cell_type": "markdown", + "id": "62b099f0-eb76-45a2-9c0e-983399c59570", + "metadata": {}, + "source": [ + "We can see that we have calculated the histogram values using the teradataml functions. Clearscape Analytics can easily integrate with 3rd party visualization tools like Tableau, PowerBI or many python modules available like plotly, seaborn etc. We can do all the calculations and pre-processing on Vantage and pass only the necessary information to visualization tools, this will not only make the calculation faster but also reduce the time due to less data movement between tools. We do the data transfer for this and the subsequent visualizations wherever necessary.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c9b72ab-7d3c-4964-9199-ee1dcc17c928", + "metadata": {}, + "outputs": [], + "source": [ + "res = obj.result.sort('MinValue').to_pandas()\n", + "res['duration_ms'] = [str(row['MinValue'])+'-'+str(row['MaxValue']) for i,row in res.iterrows()]\n", + "res.plot(x='duration_ms',y='CountOfValues',kind='bar', figsize=(15,10), legend=False,xlabel='Duration(ms)', ylabel='Welding Counts')" + ] + }, + { + "cell_type": "markdown", + "id": "88429a10-aa8b-459f-976a-6276ab121bbc", + "metadata": {}, + "source": [ + "In the above histogram we can see the bins between the Min and the Max value of the durations and the welding counts.
\n", + "3.2 - More advanced processing using window functions and delta_t
\n", + "Resistance is an important parameter in resistance welding. The resistance should not vary too much. If there are any significant changes in resistance over time, it could indicate an issue with the weld quality. For example, an unusually high resistance could indicate poor contact between the parts being welded or a problem with the welding equipment.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b5615026-52eb-4aae-8bb2-146e88ef4502", + "metadata": {}, + "outputs": [], + "source": [ + "welding_dataset_new.loc[welding_dataset_new.WELDING_ID == 854]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50c72091-f7f3-4ed3-a436-ee5c44335f4e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from tdnpathviz.visualizations import plotcurves\n", + "plotcurves(welding_dataset_new.loc[welding_dataset_new.WELDING_ID == 854],field='RESISTANCE',row_axis='TIME_MS', series_id='WELDING_ID',select_id=None)" + ] + }, + { + "cell_type": "markdown", + "id": "ae924828-6e92-4003-93c9-b66aeec1821f", + "metadata": {}, + "source": [ + "The above graph shows the variation of the resistance of the welding with respect to time. We see that the most interesting part lies between 40 and 400ms from the start of the curve.
\n", + "\n", + "Next, we smooth the resistance curve by applying a window function and taking the mean value over each window.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "516d8fd4-ab2c-44cd-89d2-d8075e40cf82", + "metadata": {}, + "outputs": [], + "source": [ + "# curve smoothing\n", + "window_for_smoothing = welding_dataset_new.RESISTANCE.window(\n", + " partition_columns = \"WELDING_ID\",\n", + " order_columns = 'TIME_MS',\n", + " window_start_point = -15,\n", + " window_end_point = 15\n", + ")\n", + "welding_dataset_smooth = welding_dataset_new.assign(RESISTANCE_SMOOTHED = window_for_smoothing.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c351bab-cd80-452c-b600-79efaec9f769", + "metadata": {}, + "outputs": [], + "source": [ + "id_curve = 854\n", + "single_welding = welding_dataset_smooth[welding_dataset_smooth.WELDING_ID == id_curve].sort('TIME_MS')\n", + "single_welding" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44d1ffb7-1bf2-4770-8b0d-f21ed5a589e4", + "metadata": {}, + "outputs": [], + "source": [ + "figure = Figure(width=1000, height=400, image_type=\"jpg\",\n", + " heading=\"RESISTANCE and RESISTANCE SMOOTHED\")\n", + "plot = single_welding.plot(x=single_welding.TIME_MS, y=[single_welding.RESISTANCE, single_welding.RESISTANCE_SMOOTHED],\n", + " style=['blue', 'red'],xlabel='time in ms', ylabel='resistance ',figure=figure)\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "299bf795-653e-45a4-8f39-5143d81173cf", + "metadata": {}, + "source": [ + "The above graph shows the variation of the resistance of the welding with respect to time and the smoothed resistance, as shown by the Red line, after applying the window function.
\n", + "\n", + "The window function generates a Window object on a teradataml DataFrame Column to run window aggregate functions.\n", + "
The function allows the user to specify the window for different types of computations:\n", + "
By default, a window from Unbounded Preceding to Unbounded Following is considered for the calculation.
\n", + "\n", + "Next we calculate the derivative by using the lead function and taking the difference of the lead value and the mean value of the resistance. Applying a window function to smooth the resistance curve helps to eliminate noise and makes it easier to see the overall trend. The derivative of the resistance gives an indication of how quickly the resistance is changing, which can be a useful measure for detecting anomalies and predicting potential issues.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e489b230-97b7-4f91-a001-3355da9b20bd", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(welding_dataset_smooth,table_name='welding_dataset_smooth', if_exists='replace')\n", + "welding_dataset_smooth = DataFrame('welding_dataset_smooth')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fccb6149-ce72-4601-983b-a87f2bc52417", + "metadata": {}, + "outputs": [], + "source": [ + "# let's compute the lead\n", + "window_for_lead = welding_dataset_smooth.RESISTANCE_SMOOTHED.window(\n", + " partition_columns = \"WELDING_ID\",\n", + " order_columns = 'TIME_MS')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6a9bc90-f330-467f-8765-5a00578c6c6e", + "metadata": {}, + "outputs": [], + "source": [ + "welding_dataset_smooth = welding_dataset_smooth.assign(RESISTANCE_SMOOTHED_AFTER = window_for_lead.lead())\n", + "welding_dataset_smooth = welding_dataset_smooth.assign(DERIVATIVE = (welding_dataset_smooth.RESISTANCE_SMOOTHED_AFTER - welding_dataset_smooth.RESISTANCE_SMOOTHED).zeroifnull())\n", + "welding_dataset_smooth.sort(['WELDING_ID','TIME_MS'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d019941f-4422-4012-8984-0dce20d10e48", + "metadata": {}, + "outputs": [], + "source": [ + "id_curve = 854\n", + "single_welding_subplot = welding_dataset_smooth[welding_dataset_smooth.WELDING_ID == id_curve].sort('TIME_MS')\n", + "single_welding_subplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bd9b71a-b668-44f9-a0bd-e74b2c82462e", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import subplots\n", + "# fig, axes = subplots(grid = {(1, 1): (1, 1),(2, 1): (1, 2)})\n", + "# Plot 1980 data at first Axis.\n", + "fig, axes = subplots(nrows=2, ncols=1)\n", + "plot = single_welding_subplot.plot(x=single_welding_subplot.TIME_MS, \n", + " y=[single_welding_subplot.RESISTANCE, 
single_welding_subplot.RESISTANCE_SMOOTHED],\n", + " legend=[\"RESISTANCE\", \"RESISTANCE SMOOTHED\"],\n", + " figure=fig,\n", + " style=['blue', 'red'],xlabel='time in ms', ylabel='resistance ', \n", + " ax=axes[0])\n", + "\n", + "# Plot 1981 data at second Axis.\n", + "plot = single_welding_subplot.plot(x=single_welding_subplot.TIME_MS, \n", + " y=single_welding_subplot.DERIVATIVE,\n", + " legend=[\"DERIVATIVE\"],\n", + " figure=fig,\n", + " style=\"red\",xlabel='time in ms', ylabel='derivative ' , \n", + " ax=axes[1])\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "id": "92c9f6e0-7b26-4fed-9b43-1d35989affad", + "metadata": {}, + "source": [ + "We see that the most interesting part lies between 40 and 400ms from the start of the curve, so we plot only that subset.
" + ] + }, + { + "cell_type": "markdown", + "id": "4615d965-6892-4729-81b0-9dd39f7d9411", + "metadata": { + "tags": [] + }, + "source": [ + "It is hard to assess the diversity of curve shapes in this plot since many of them are superimposed. However, we see in the middle of the picture a sharp drop that looks unusual. Moreover, we guess that there are shifts in time and height.
\n", + "\n", + "4. Feature Engineering
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da82ee40-3e38-49af-a6ca-a678ba240ca2", + "metadata": {}, + "outputs": [], + "source": [ + "welding_dataset_new.columns" + ] + }, + { + "cell_type": "markdown", + "id": "539a4c25-f868-44af-bca3-13b4ca477445", + "metadata": {}, + "source": [ + "We will create a feature table by using different functions on the Resistance column. Valid values for functions are: 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique','median', 'var', 'skew', 'kurtosis'.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa37d2af-c185-4a84-9ca5-8628a216aa27", + "metadata": {}, + "outputs": [], + "source": [ + "features = welding_dataset_new.loc[welding_dataset_new.TIME_MS > 20,:]. \\\n", + " groupby(welding_dataset_new.columns[0:5]). \\\n", + " agg({\n", + " 'TIME_MS':['min','max'],\n", + " 'RESISTANCE':['count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique','median', 'var','skew','kurtosis']\n", + " })\n", + "features" + ] + }, + { + "cell_type": "markdown", + "id": "0196e16a-9d9d-4d44-a0ed-e5220c3314e2", + "metadata": {}, + "source": [ + "5. Anomaly Detection on Sensor Data
\n", + " \n", + "Let's start by getting the feature columns from the features tables
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27cdf0f8-e0b3-41b5-b18d-b77cdbc5652b", + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = features.columns[7::]\n", + "feature_names" + ] + }, + { + "cell_type": "markdown", + "id": "9655f048-ffbd-4785-9e8b-39d192ff7808", + "metadata": {}, + "source": [ + "5.1 Clustering by curve shape
\n", + "To cluster time series by shapes, we will use the Dynamic Time Warping (DTW) distance that measures the similarity between two time series. This distance is well adapted to this kind of problem since it provides robustness to shifts in time and height.
\n", + "\n", + "Distance Matrix in-database Computations
\n", + "\n", + "ClearScape Analytics offers an in-database Dynamic Time Warping (DTW) function, callable in SQL as TD_DTW. TD_DTW measures the similarity of two time series. Its DTW implementation is optimized for space and time and uses the FastDTW algorithm. The function computes, at scale, the DTW distances between one reference curve and a set of curves, i.e. a many-to-one approach. We want to compute the distance matrix of our subset, i.e. the DTW distance between each pair of curves. Since the DTW distance is symmetric, so is the distance matrix, and we only need to compute its triangular part. We wrapped this computation in the tdsense package, which calls the TD_DTW function and iterates over the matrix rows to compute and store the whole triangular distance matrix in a table.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "207e72c8-41e3-481a-9727-a4c7510f4206", + "metadata": {}, + "outputs": [], + "source": [ + "overview = welding_dataset_new.groupby('WELDING_DAY').count(distinct=True)\n", + "dates = list(overview.to_pandas().reset_index()['WELDING_DAY'].values.astype('str'))\n", + "dates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c7180b4-a8b5-450a-96be-8aed93d1199a", + "metadata": {}, + "outputs": [], + "source": [ + "subset = welding_dataset_new[ \\\n", + " (welding_dataset_new['PLANT'] == 1) & \\\n", + " (welding_dataset_new['ROBOT_ID'] == 41) & \\\n", + " (welding_dataset_new['WELDING_TYPE'] in (8,9)) & \\\n", + " (welding_dataset_new['WELDING_DAY'].isin(dates)) \\\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dda2eca-af26-4741-abeb-b63758f8c996", + "metadata": {}, + "outputs": [], + "source": [ + "subset_zoom = subset[(subset.TIME_MS < 400) & (subset.TIME_MS > 40)]\n", + "subset_zoom.shape" + ] + }, + { + "cell_type": "markdown", + "id": "9d40f422-886d-48e5-a4ce-03b259523917", + "metadata": {}, + "source": [ + "The subset of data we have taken contains 7 columns and 344,622 rows.
\n", + "\n", + "Since this is a 2-CPU system, the computation below takes more than 2 hours for roughly 350k rows, so we have precomputed the result and stored it in a table in the database.
\n", + "\n", + "**Note:** If you still want to compute the matrix yourself, change the condition of the `if` statement in the cell below from False to True.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "276fd1b7-e057-4c0c-b8b0-4e063d70eb7a", + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " dtw_matrix = dtw_distance_matrix_computation2(subset_zoom,field='RESISTANCE',\n", + " table_name=dtw_result_table,\n", + " schema_name = Param['database'],\n", + " row_axis='TIME_MS',\n", + " series_id = 'WELDING_ID')\n", + "else:\n", + " dtw_matrix = DataFrame(in_schema('DEMO_AnomalyDetection','DTW_Matrix'))" + ] + }, + { + "cell_type": "markdown", + "id": "42f770a5-f3b2-4862-8256-b1cc1f969750", + "metadata": {}, + "source": [ + "5.2 Hierarchical clustering with Scipy
\n", + "\n", + "Now that the distance matrix is available, we can perform the clustering. Here, we will use the open-source package SciPy and its cluster.hierarchy module, which is wrapped in the tdsense package for convenience.
\n", + "\n", + "Hierarchical clustering is an alternative class of clustering algorithms that produce 1 to n clusters, where n is the number of observations in the data set. As you go down the hierarchy from 1 cluster (contains all the data) to n clusters (each observation is its own cluster), the clusters become more and more similar (almost always).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0b87b35b-c283-42d8-845b-5c9c7851c822", + "metadata": {}, + "outputs": [], + "source": [ + "dtw_matrix_loc = dtw_matrix.sort(columns=['WELDING_ID_2','WELDING_ID_1']).to_pandas(all_rows=True)\n", + "dtw_matrix_loc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49f64fd3-1f33-4b7c-9d8f-b0636bffc2f4", + "metadata": {}, + "outputs": [], + "source": [ + "from tdsense.clustering import hierarchy_dendrogram, hierarchy_clustering\n", + "linked, labelList = hierarchy_dendrogram(dtw_matrix_loc, cluster_distance = 'ward')" + ] + }, + { + "cell_type": "markdown", + "id": "df0a3961-8cd1-43b8-9c11-9e229648d1eb", + "metadata": {}, + "source": [ + "The dendrogram is useful for visualizing the structure of the hierarchical clustering and identifying the optimal number of clusters to use for further analysis. The optimal number of clusters can be determined by examining the dendrogram to identify a level at which the clusters start to merge more slowly or by using a threshold for the maximum distance between clusters.
\n", + "\n", + "The dendrogram above shows how the hierarchical clustering algorithm has merged the data points into clusters based on their pairwise distances using the Ward linkage criterion. The dendrogram is a summary of the distance matrix. The X axis shows the WELDING_ID, but the labels are not legible because we have more than 450k rows. Looking at the dendrogram, we see that we have about 6 clusters. Selecting 6 clusters gives the following result.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2e168ff-626b-47b8-bc2b-ecfaac22a8f4", + "metadata": {}, + "outputs": [], + "source": [ + "cluster = hierarchy_clustering(linked, labelList, n_clusters=6)\n", + "cluster.head()" + ] + }, + { + "cell_type": "markdown", + "id": "48b62135-409c-45a9-b604-6e98ccf059fd", + "metadata": {}, + "source": [ + "The above dendogram is for only 6 clusters with the colors representing the different clusters. Now, we plot the Resistance curves for each cluster.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d31bafdc-9f43-4083-9677-ef7d94c18eb1", + "metadata": {}, + "outputs": [], + "source": [ + "fig, ax = plt.subplots(2,3,figsize=(20,10))\n", + "colors = cluster[['cluster','leaves_color_list']].copy().drop_duplicates()\n", + "for k in range(6):\n", + " plt.subplot(2,3,k+1)\n", + " img = plotcurves( subset_zoom,\n", + " field='RESISTANCE',\n", + " row_axis='TIME_MS',\n", + " series_id='WELDING_ID',\n", + " select_id=list(cluster[cluster.cluster ==k].CURVE_ID.values),\n", + " noplot=True)\n", + " plt.imshow(img)\n", + " plt.title('cluster : ' +str(k) + '\\n' + str(cluster.groupby('cluster').count()['CURVE_ID'][k]) + ' obs.',fontdict = {'fontsize' : 10, 'color':colors.leaves_color_list.values[k]})\n", + " plt.axis('off')" + ] + }, + { + "cell_type": "markdown", + "id": "f50fab99-9231-410d-bdd3-1132fc98575f", + "metadata": {}, + "source": [ + "And if we plot the curves per cluster, we spot the curves with a sharp drop(cluster 4) and these are the curves we are interested in, i.e. the curve exhibiting the anomaly we are looking for. We note also the other clusters are looking more or less similar. By monitoring the resistance over time and calculating its derivative, you can detect any sudden changes or anomalies. Anomalies might indicate a problem with the welding process, such as a sudden drop in current or a sudden increase in resistance.
" + ] + }, + { + "cell_type": "markdown", + "id": "9b99a7ac-6a99-4c9e-9ead-0f6d6e5c4759", + "metadata": {}, + "source": [ + "5.3 Create the anomaly dataset
\n", + "Now we create a table containing the anomaly flag that will be the target of a supervised machine learning model or a relevant KPI to monitor in production dashboards.
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cec5b577-b0dd-45c8-8fad-fee1fb1f952a", + "metadata": {}, + "outputs": [], + "source": [ + "target = cluster.copy().drop('leaves_color_list',axis=1)\n", + "target = target[target.cluster.isin([1,2])]\n", + "target['WELDING_ID'] = target['CURVE_ID']\n", + "target['anomaly'] = 0\n", + "target.loc[target.cluster==2,'anomaly'] = 1\n", + "target.drop(['cluster','CURVE_ID'],axis=1, inplace=True)\n", + "target.groupby('anomaly').count().plot(y='WELDING_ID',kind='bar',figsize=(10,10))\n", + "copy_to_sql( target,\n", + " table_name = 'Anomaly_Target',\n", + " if_exists='replace',\n", + " primary_index='WELDING_ID')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ac7c451-2fb3-45fa-895d-e881cc88a9ba", + "metadata": {}, + "outputs": [], + "source": [ + "anomalies = DataFrame('Anomaly_Target')\n", + "anomalies" + ] + }, + { + "cell_type": "markdown", + "id": "da6297fd-6f49-4619-af30-791db2af90da", + "metadata": {}, + "source": [ + "The above anomaly data has the welding ID and the anomaly flag.
\n", + "5.4 Build the analytical dataset
\n", + "\n", + "We prepare the analytical dataset by joining the feature table with the anomaly table using the Welding ID so that we get the anomalies for the weldings.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe4cfcfb-7d91-47e5-a4cc-e44428e51cfe", + "metadata": {}, + "outputs": [], + "source": [ + "ADS = features[['WELDING_ID']+feature_names].join(other=anomalies, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "ADS = ADS.assign(WELDING_ID=ADS.WELDING_ID_l).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1).select(['WELDING_ID']+feature_names+['anomaly'])\n", + "ADS" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "20a2163c-9fea-4f3d-ab0b-696b3cccaad9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ADS.shape" + ] + }, + { + "cell_type": "markdown", + "id": "c03b26f4-0fa4-4478-922e-9cb850acbe34", + "metadata": {}, + "source": [ + "The analytical dataset we created has 14 columns and 391 rows which will be used to build the model below.
" + ] + }, + { + "cell_type": "markdown", + "id": "09b3168b-8c53-4ffd-ba75-b26f40608654", + "metadata": {}, + "source": [ + "6. Build the model
\n", + "We have datasets in which different columns have different units – like one column can be in kilograms, while another column can be in centimetres. If we feed these features to the model as is, there is every chance that one feature will influence the result more due to its value than the others. But this doesn’t necessarily mean it is more important as a predictor. So, to give importance to all the features we need feature scaling.
\n", + " \n", + "Here, we apply standard scaling using the ScaleFit and ScaleTransform functions in Vantage. The ScaleFit() function outputs statistics that are passed as input to the ScaleTransform() function, which scales the specified input DataFrame columns.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5d0898e-53a7-4aca-9f24-2e2f06ac73dc", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ScaleFit , ScaleTransform\n", + "scaler = ScaleFit(\n", + " data=ADS,\n", + " target_columns=feature_names,\n", + " scale_method=\"STD\",\n", + " global_scale=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "76af7c0a-b1cf-4914-a099-aeaeeb0c4977", + "metadata": {}, + "outputs": [], + "source": [ + "ADS_scaled = ScaleTransform(data=ADS,\n", + " object=scaler.output,\n", + " accumulate=\"anomaly\").result\n", + "ADS_scaled" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cc1ed77-bd6e-4476-9b76-abb448c7199b", + "metadata": {}, + "outputs": [], + "source": [ + "df = ADS_scaled.to_pandas()" + ] + }, + { + "cell_type": "markdown", + "id": "3b3a8548-555a-48fd-88e4-795abaff2cc5", + "metadata": {}, + "source": [ + "6.1 Create a model file using the python libraries.
\n", + "\n", + "The Vantage Bring Your Own Model (BYOM) package gives data scientists and analysts the ability to operationalize predictive models in Vantage. Predictive models trained in external tools with sample data can be used to score data stored in Vantage using the BYOM Predict. Create or convert your predictive model using a supported model interchange format (PMML, MOJO, ONNX, Dataiku, and DataRobot are currently available), store it in a Vantage table, and use the BYOM PMMLPredict, H2OPredict, ONNXPredict, DataikuPredict, or DataRobotPredict to score your data with the model.
\n", + "\n", + "A problem with imbalanced classification is that there are too few examples of the minority class for a model to effectively learn the decision boundary. One way to solve this problem is to oversample the examples in the minority class. The most widely used approach to synthesizing new examples is called the Synthetic Minority Oversampling Technique, or SMOTE for short. SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.
\n", + "\n", + "Then we use the RandomForestClassifier to create the model. A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. In other words, the random forest classifier builds a set of decision trees (DT), each from a randomly selected subset of the training set, and then collects the votes from the different decision trees to decide the final prediction.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d847d16a-9735-4482-953d-66c80faf0bdc", + "metadata": {}, + "outputs": [], + "source": [ + "X_train = df[feature_names]\n", + "y_train = df['anomaly']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4350a66c-2ff9-483c-ae30-8f17c5d375b3", + "metadata": {}, + "outputs": [], + "source": [ + "# Balance the training set using SMOTE\n", + "smote = SMOTE(random_state=42)\n", + "X_train, y_train = smote.fit_resample(X_train, y_train)\n", + "\n", + "\n", + "# Create a random forest classifier\n", + "model = RandomForestClassifier(n_estimators=10,max_depth= 3, random_state=42)\n", + "\n", + "# Create a pipeline that includes the SMOTE transformer and the model\n", + "pipeline = PMMLPipeline([ ('model', model)])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "455a3ff5-e8ee-4c9b-909e-3e1a79fa6612", + "metadata": {}, + "outputs": [], + "source": [ + "# Train the pipeline\n", + "start = time.time()\n", + "pipeline.fit(X_train, y_train)\n", + "end = time.time()\n", + "print('duration : ', end-start, 's')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61ff634a-aea7-4966-bf38-30b77547f0a3", + "metadata": {}, + "outputs": [], + "source": [ + "# make predictions on the training set\n", + "y_train_pred = pipeline.predict(X_train)\n", + "\n", + "# calculate and print the accuracy score\n", + "acc = accuracy_score(y_train, y_train_pred)\n", + "print(\"Accuracy: {:.2f}%\".format(acc * 100))\n", + "\n", + "# calculate and print precision, AUC and F1-score\n", + "prec = precision_score(y_train, y_train_pred)\n", + "print(\"Precision: {:.2f}%\".format(prec * 100))\n", + "\n", + "# calculate AUC, AUC requires probability for positive class\n", + "prob = pipeline.predict_proba(X_train)[:, 1]\n", + "auc = roc_auc_score(y_train, prob)\n", + "print(\"AUC: {:.2f}%\".format(auc * 100))\n", + "\n", + "f1 = f1_score(y_train, y_train_pred)\n", + 
"print(\"F1-Score: {:.2f}%\".format(f1 * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60a0b3c9-4a3f-478c-a9f9-2ddd786aa332", + "metadata": {}, + "outputs": [], + "source": [ + "pmml_metrics=pd.DataFrame([{'Model':'PMML using BYOM','Accuracy':acc, 'Precision':prec, 'F1-Score':f1}])\n", + "pmml_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da084cfa-5c7b-4899-9c9b-41b065546bf6", + "metadata": {}, + "outputs": [], + "source": [ + "sklearn2pmml(pipeline, \"my_model.pmml\", with_repr = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c35b23c2-c4c4-4601-b374-9d021a4845b0", + "metadata": {}, + "outputs": [], + "source": [ + "additional_columns = {\"Description\": type(\"RandomForestClassifier model\"),\n", + " \"UserId\": type('demo_user'),\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": prec,\n", + " \"ModelAUC\": auc,\n", + " \"Modelf1Score\": f1,\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": end-start,\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + "for k in additional_columns.keys():\n", + " print(type(additional_columns[k]))" + ] + }, + { + "cell_type": "markdown", + "id": "8351d68c-fed5-4034-b00f-fe0379625090", + "metadata": {}, + "source": [ + "6.2 Save the model file
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ffc1be2-d980-4468-9fc9-58ef30e5cb27", + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " save_byom(model_id = 'model_anomaly1',\n", + " model_file = 'my_model.pmml',\n", + " table_name = 'BYOM_PMMLMODELS_REPOSITORY',\n", + " additional_columns={\"Description\": \"RandomForestClassifier model\",\n", + " \"UserId\": 'demo_user',\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": float(prec),\n", + " \"ModelAUC\": float(auc),\n", + " \"Modelf1Score\": float(f1),\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": float(end-start),\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + " )\n", + "except Exception as e: \n", + " # if our model exists, delete and rewrite \n", + " if str(e.args).find('TDML_2200') >= 1: \n", + " delete_byom(model_id = 'model_anomaly1', table_name = 'BYOM_PMMLMODELS_REPOSITORY') \n", + " save_byom(model_id = 'model_anomaly1',\n", + " model_file = 'my_model.pmml',\n", + " table_name = 'BYOM_PMMLMODELS_REPOSITORY',\n", + " additional_columns={\"Description\": \"RandomForestClassifier model\",\n", + " \"UserId\": 'demo_user',\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": float(prec),\n", + " \"ModelAUC\": float(auc),\n", + " \"Modelf1Score\": float(f1),\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": float(end-start),\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + " )\n", + " else: \n", + " raise ValueError(f\"Unable to save the model due to the following error: {e}\")\n", + "# pass \n", + "# else: \n", + "# raise \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "60c0f97c-52b2-407e-921c-75a61ca2d3fa", + "metadata": {}, + "source": [ + "The model file is saved as can be found in the left navigation pane in 
/UseCases/Anomaly_Detection.
\n", + "\n", + "We create new scaled data to apply this model and predict data. New dataset is created by joining the features and the anomalies.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60fe7dff-a0fa-43a6-aa03-d11aeed2904e", + "metadata": {}, + "outputs": [], + "source": [ + "newdata = features[['WELDING_ID']+feature_names].join(other=anomalies, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "newdata = newdata.assign(WELDING_ID=newdata.WELDING_ID_l).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1).select(['WELDING_ID']+feature_names+['anomaly'])\n", + "newdata" + ] + }, + { + "cell_type": "markdown", + "id": "bd7108ab-49b6-411a-a919-4ab7f859252e", + "metadata": {}, + "source": [ + "We create new transformed data by using the same Scalefit object we used earlier and get the transformed data for this new data.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "099b4d80-3bb8-4e96-ba57-c85c84ae990a", + "metadata": {}, + "outputs": [], + "source": [ + "newdata_scaled = ScaleTransform(data=newdata,\n", + " object=scaler.output,\n", + " # DataFrame(in_schema('demo_user','scaler_anomaly')),\n", + " accumulate=[\"WELDING_ID\",\"anomaly\"]).result\n", + "newdata_scaled" + ] + }, + { + "cell_type": "markdown", + "id": "46bb63a9-35eb-40e9-a4d4-d1aa558b19d1", + "metadata": {}, + "source": [ + "6.3 Retrieve the model file and use it to predict
\n", + "We use the PMMLPredict function from the teradataml library to predict the anomalies.
\n", + "Predictive Model Markup Language (PMML) is an XML-based standard established by the Data Mining Group (DMG) for defining statistical and data-mining models. PMML models can be shared between PMML-compliant platforms and across organizations so that business analysts and developers are unified in designing, analyzing, and implementing PMML-based assets and services.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "16f0c6bb-3551-4337-a4e3-8c2a79fd55cc", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import PMMLPredict\n", + "modeldata_anomaly = retrieve_byom(\"model_anomaly1\", table_name=\"BYOM_PMMLMODELS_REPOSITORY\")\n", + "result=PMMLPredict(\n", + " modeldata = modeldata_anomaly,\n", + " newdata = newdata_scaled,\n", + " accumulate = ['WELDING_ID'],\n", + " model_output_fields=['probability(0)','probability(1)'],\n", + " overwrite_cached_models = '*'\n", + " )\n", + "pmml_predict=result.result\n", + "pmml_predict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f03ec30-32a9-4b13-af64-78eaa88b79e1", + "metadata": {}, + "outputs": [], + "source": [ + "pmml_predict_result = pmml_predict.join(other=newdata_scaled, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "pmml_predict_result = pmml_predict_result.assign(prob_0=pmml_predict_result['probability(0)'])\n", + "pmml_predict_result = pmml_predict_result.assign(prob_1=pmml_predict_result['probability(1)'])\n", + "pmml_predict_result = pmml_predict_result.assign(WELDING_ID=pmml_predict_result.WELDING_ID_l)\n", + "pmml_predict_result = pmml_predict_result.assign(prediction=case([(pmml_predict_result.prob_1>pmml_predict_result.prob_0, 1 )],else_ = 0))\n", + "pmml_predict_result = pmml_predict_result.select(['WELDING_ID']+['anomaly']+['prob_0']+['prob_1']+['prediction'])\n", + "pmml_predict_result" + ] + }, + { + "cell_type": "markdown", + "id": "220bb477-2d63-4672-98a1-cb50d40f960f", + "metadata": {}, + "source": [ + "7. Decision Forest
\n", + " \n", + "We will now use the DecisionForest model to predict the anomalies. A decision forest is a generic term to describe models made of multiple decision trees. The prediction of a decision forest is the aggregation of the predictions of its decision trees. The implementation of this aggregation depends on the algorithm used to train the decision forest. The goal of using a Decision Tree is to create a training model that can be used to predict the class or value of the target variable by learning simple decision rules inferred from prior data (training data).
\n", + "\n", + "We start by creating a subset of the data, because the most interesting part of the curve lies between 40 and 400 ms from its start.
\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf8a84c6-2c67-43c7-86e2-1f31c6bd1c18", + "metadata": {}, + "outputs": [], + "source": [ + "DF_curves_zoom = welding_dataset_new[(welding_dataset_new.TIME_MS > 40) & (welding_dataset_new.TIME_MS < 400) ]\n", + "DF_curves_zoom" + ] + }, + { + "cell_type": "markdown", + "id": "58c9f479-f2ff-4863-b969-b9b8a873e6d4", + "metadata": {}, + "source": [ + "We create various features by using the window function on the Resistance and taking the difference between the previous and current resistance based on time. We will create these features by using the aggregation function on this resistance and the difference of the resistance.
\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a227337c-3b57-443c-a256-dd5230ed98dd", + "metadata": {}, + "outputs": [], + "source": [ + "DF_curves_zoom = DF_curves_zoom.assign(\n", + " resistance_diff = DF_curves_zoom.RESISTANCE \n", + " - DF_curves_zoom.RESISTANCE.window(\n", + " partition_columns=['WELDING_ID'],\n", + " order_columns=[\"TIME_MS\"]\n", + " ).lag(1)\n", + ")\n", + "# DF_curves_zoom[DF_curves_zoom.WELDING_ID==138].sort(\"TIME_MS\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb8c00e7-c465-46ba-99ae-c094969a2eed", + "metadata": {}, + "outputs": [], + "source": [ + "DF_features = DF_curves_zoom.groupby(\"WELDING_ID\").agg({\n", + " 'RESISTANCE':['sum', 'min', 'max', 'mean', 'std', 'var','skew','kurtosis'],\n", + " 'resistance_diff':['min']\n", + "})\n", + "DF_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f6498373-8b50-49fb-ac0b-b0db7b0cb522", + "metadata": {}, + "outputs": [], + "source": [ + "feature_names = DF_features.columns[1:]\n", + "feature_names" + ] + }, + { + "cell_type": "markdown", + "id": "57712977-e195-4ce9-9867-a7cdbc772279", + "metadata": {}, + "source": [ + "7.1 Build the analytical dataset.
\n", + "We create the analytical dataset joining the anomaly table created above and the dataset with the features created.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55686241-b413-45eb-a495-9888c946c634", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DF_target = DataFrame('Anomaly_Target')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f0b595e-d794-4797-9125-b0bd2e9b046a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DF_ADS_train = DF_features[['WELDING_ID']+feature_names].join(\n", + " other=DF_target, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')\n", + "DF_ADS_train = DF_ADS_train.assign(WELDING_ID=DF_ADS_train.WELDING_ID_l\n", + " ).drop(['WELDING_ID_l','WELDING_ID_r'],axis=1\n", + " ).select(['WELDING_ID']+feature_names+['anomaly']\n", + " ).assign(anomaly_int = DF_ADS_train.anomaly.cast(INTEGER()))\n", + "DF_ADS_train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0199e5db-a881-4a2e-92df-0fcc0a54158b", + "metadata": {}, + "outputs": [], + "source": [ + "DF_ADS_score = DF_features[['WELDING_ID']+feature_names]\\\n", + " [DF_features.WELDING_ID>800]\n", + "DF_ADS_score" + ] + }, + { + "cell_type": "markdown", + "id": "d3865607-6205-43e4-a3be-2142af2dd340", + "metadata": {}, + "source": [ + "We store these training and scoring datasets into Vantage to be used by the In-DB functions.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30d0d263-3183-4e37-aa05-6f5ccd61ac49", + "metadata": {}, + "outputs": [], + "source": [ + "DF_ADS_train.to_sql(\n", + " table_name = 'ADS_train_data',\n", + " primary_index= 'WELDING_ID',\n", + " if_exists = 'replace'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db8815e7-cdc4-40fb-9160-bfd466d7535f", + "metadata": {}, + "outputs": [], + "source": [ + "DF_ADS_score.to_sql(\n", + " table_name = 'ADS_test_data',\n", + " primary_index= 'WELDING_ID',\n", + " if_exists = 'replace'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "f38cc3c9-6828-4c65-9b72-53ea02a172cd", + "metadata": {}, + "source": [ + "7.2 Train Decision Forest
\n", + "The DecisionForest is an ensemble algorithm used for classification and regression predictive modelling problems. It is an extension of bootstrap aggregation (bagging) of decision trees.
\n", + "\n", + "This function takes the training data as input, as well as the following function parameters
\n", + "7.3 Predict and Evaluate Decision Forest model
\n", + "Execute a testing prediction using the split data above. Evaluate the model by creating a confusion matrix with the ClassificationEvaluator SQL Function.
\n", + "\n", + "\n", + "7.4 Score new Data
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5fde359-12da-4038-963c-c34ce410de04", + "metadata": {}, + "outputs": [], + "source": [ + "decision_forest_predict_test_out = TDDecisionForestPredict(object = DecisionForest_out.result,\n", + " newdata = DF_ADS_score,\n", + " id_column = \"WELDING_ID\",\n", + " detailed = False,\n", + " output_prob = True,\n", + " output_responses = ['0','1'])\n", + "decision_forest_predict_test_out.result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58448648-772a-445e-9989-8b174ac9db2f", + "metadata": {}, + "outputs": [], + "source": [ + "# df_predict_test= DataFrame('DF_Predict_test')\n", + "df_predict_test=decision_forest_predict_test_out.result\n", + "df_predict_test" + ] + }, + { + "cell_type": "markdown", + "id": "cda02bba-235d-4f1a-b2a7-3e2ea619cce2", + "metadata": {}, + "source": [ + "8. Compare PMML and DecisionForest
\n", + "8.1 Show AUC-ROC Curve
\n", + "\n", + "The ROC curve shows the performance of a binary classification model as its discrimination threshold varies. For a range of thresholds, the curve plots the true positive rate against false-positive rate.
\n", + "\n", + "This function accepts a set of prediction-actual pairs as input and calculates the following values for a range of discrimination thresholds.
\n", + "ROC for PMML
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c4b179b-a334-4dc0-b3f8-71c35f87283e", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ROC \n", + "roc_pmml = ROC(data = pmml_predict_result, \n", + " probability_column = \"prob_1\",\n", + " observation_column = \"anomaly\",\n", + " positive_class=\"1\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32b946fb-e09e-4e62-b78a-c5325d84c175", + "metadata": {}, + "outputs": [], + "source": [ + "roc_data_pmml = roc_pmml.output_data.to_pandas().sort_values(\"fpr\", ascending=True)\n", + "roc_data_pmml.tail(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de67ebb4-b0f9-4a8c-9559-e6a44f1c9a21", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "auc_pmml = roc_pmml.result.to_pandas().iloc[0,0]\n", + "auc_pmml" + ] + }, + { + "cell_type": "markdown", + "id": "baf0989e-387a-4ee9-b99e-0687d5a97799", + "metadata": {}, + "source": [ + "ROC for DecisionForest
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02a1c9e2-be8c-44da-9e0a-9056a2ec8243", + "metadata": {}, + "outputs": [], + "source": [ + "roc_obj = ROC(data = df_predict, \n", + " probability_column = \"prob_1\",\n", + " observation_column = \"anomaly_int\",\n", + " positive_class=\"1\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "27834036-13cc-49e9-a34e-b2bcb2c192b1", + "metadata": {}, + "outputs": [], + "source": [ + "roc_data = roc_obj.output_data.to_pandas().sort_values(\"fpr\", ascending=True)\n", + "roc_data.tail(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab90afd6-b0c1-4edd-9492-c97b16c8d4e0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "auc = roc_obj.result.to_pandas().iloc[0,0]\n", + "auc" + ] + }, + { + "cell_type": "markdown", + "id": "abb98428-872c-41d5-b8b1-79804c772a8a", + "metadata": {}, + "source": [ + "Plot ROC Curves
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93ab97d1-cbd3-4044-8546-0f170a5ca9ce", + "metadata": {}, + "outputs": [], + "source": [ + "# Plot 1\n", + "plt.plot(roc_data_pmml['fpr'], roc_data_pmml['tpr'], color='orange', label='PMML ROC. AUC = {}'.format(str(auc_pmml)), drawstyle='steps') \n", + "# Plot 2\n", + "plt.plot(roc_data['fpr'], roc_data['tpr'], color='green', label='DecisionForest ROC. AUC = {}'.format(str(auc)), drawstyle='steps') \n", + "# Plot the diagonal dashed line\n", + "plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--') \n", + "# Set labels and title\n", + "plt.xlabel('False Positive Rate',fontsize=12) \n", + "plt.ylabel('True Positive Rate',fontsize=12) \n", + "plt.title('Receiver Operating Characteristic (ROC) Curve',fontsize=16) \n", + "# Add legend\n", + "plt.legend(loc=\"lower right\",fontsize=10) \n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c721c745-be69-4eee-a8e2-9faa4ecff46e", + "metadata": {}, + "source": [ + "The closer the ROC curve is to the upper left corner of the graph, the higher the accuracy of the test because in the upper left corner, the sensitivity = 1 and the false positive rate = 0 (specificity = 1). The ideal ROC curve thus has an AUC = 1.0. As seen in the above graph the AUC for both the models is close to 1 so the accuracy of both models is very good.
\n", + "\n", + "8.2 Show Confusion Matrix
\n", + "\n", + "Confusion Matrix is a performance measurement for machine learning classification problem where output can be two or more classes. It is a table with 4 different combinations of predicted and actual values.
\n", + "\n", + "Confusion matrices represent counts from predicted and actual values. The output “TN” stands for True Negative which shows the number of negative examples classified accurately. Similarly, “TP” stands for True Positive which indicates the number of positive examples classified accurately. The term “FP” shows False Positive value, i.e., the number of actual negative examples classified as positive; and “FN” means a False Negative value which is the number of actual positive examples classified as negative.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cac3275-2854-464a-b240-03e7b836b96d", + "metadata": {}, + "outputs": [], + "source": [ + "# Calculate confusion matrix for PMML\n", + "DF_result=df_predict.to_pandas().reset_index()\n", + "pmml_result=pmml_predict_result.to_pandas()\n", + "cm_pmml = confusion_matrix(pmml_result['anomaly'], pmml_result['prediction']) \n", + "# Calculate confusion matrix for DecisionForest\n", + "cm_df = confusion_matrix(DF_result['anomaly_int'], DF_result['prediction']) \n", + "# Create figure and axes objects\n", + "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8)) \n", + "# Plot PMML confusion matrix\n", + "disp_pmml = ConfusionMatrixDisplay(confusion_matrix=cm_pmml, display_labels=['No Anomaly', 'Anomaly']) \n", + "disp_pmml.plot(ax=ax1, cmap='Blues', colorbar=False) \n", + "ax1.set_title('PMML Confusion Matrix') \n", + "ax1.set_xlabel('Predicted Label') \n", + "ax1.set_ylabel('True Label') \n", + "ax1.set_xticks([0, 1]) \n", + "ax1.set_yticks([0, 1]) \n", + "ax1.set_xticklabels(['No Anomaly', 'Anomaly']) \n", + "ax1.set_yticklabels(['No Anomaly', 'Anomaly'])\n", + "\n", + "# Add text to the plot to show the actual values of the confusion matrix\n", + "for i in range(cm_pmml.shape[0]): \n", + " for j in range(cm_pmml.shape[1]): \n", + " ax1.text(j, i, f'{cm_pmml[i, j]}', ha='center', va='center', color='white' if cm_pmml[i, j] > cm_pmml.max() / 2 else 'black') \n", + "\n", + "# Plot DecisionForest confusion matrix\n", + "disp_df = ConfusionMatrixDisplay(confusion_matrix=cm_df, display_labels=['No Anomaly', 'Anomaly']) \n", + "disp_df.plot(ax=ax2, cmap='Blues', colorbar=False) \n", + "ax2.set_title('DecisionForest Confusion Matrix') \n", + "ax2.set_xlabel('Predicted Label') \n", + "ax2.set_ylabel('True Label') \n", + "ax2.set_xticks([0, 1]) \n", + "ax2.set_yticks([0, 1]) \n", + "ax2.set_xticklabels(['No Anomaly', 'Anomaly']) \n", + "ax2.set_yticklabels(['No Anomaly', 'Anomaly'])\n", + "\n", + "# Add 
text to the plot to show the actual values of the confusion matrix\n", + "for i in range(cm_df.shape[0]): \n", + " for j in range(cm_df.shape[1]): \n", + " ax2.text(j, i, f'{cm_df[i, j]}', ha='center', va='center', color='white' if cm_df[i, j] > cm_df.max() / 2 else 'black') \n", + "\n", + "# Adjust layout and spacing\n", + "plt.tight_layout() \n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "6f7bd547-6020-42c0-b2a7-d1938a9bdb30", + "metadata": {}, + "source": [ + "The confusion matrix for this binary class classification problem has the below 4 quadrants:
\n", + "\n", + "8.3 Show Metrics
\n", + "\n", + "Below is the comparison for Accuracy, Precision and F1-Score of the 2 models.
\n", + "Column | \n", + "Description | \n", + "
---|---|
Precision | \n", + "The positive predictive value. Refers to the fraction of relevant instances among\n", + "the total retrieved instances.\n", + " Precision answers the following question: what proportion of predicted Positives is truly Positive? \n", + " Precision = (TP)/(TP+FP) | \n", + "
Accuracy | \n", + "Accuracy simply measures how often the classifier correctly predicts. We can define accuracy as the ratio of the number of correct predictions and the total number of predictions. | \n", + "
F1 | \n", + "The F1 score is defined as the harmonic mean of precision and recall and is a number between 0 and 1. The F1 score maintains a balance between the precision and recall for your classifier. \n", + " F1 = 2*(precision*recall)/(precision+recall) | \n", + "
From the above metrics we can conclude that both models perform almost identically, with similar Accuracy and Precision.
" + ] + }, + { + "cell_type": "markdown", + "id": "43be6263-22d8-43d2-94e2-1f58d730f567", + "metadata": {}, + "source": [ + "Conclusion
\n", + "We have seen an end-to-end exploration process for labelling anomalous time series using ClearScape Analytics on Teradata Vantage. Thanks to the in-database capabilities offered by Teradata Vantage with ClearScape Analytics, we were able to run this exploration with the smallest notebook instance. The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.
\n", + "In this particular use case, we have observed that, with a large volume of machine sensor data, millions of ML models were created to derive analytic features that ultimately deployed tens of thousands of models for real-time scoring. This extent of scale is only possible by combining the power of Vantage with native ClearScape Analytic functions.
" + ] + }, + { + "cell_type": "markdown", + "id": "29e90d19-1b71-44e8-b6d5-aa53e3b673c1", + "metadata": {}, + "source": [ + "8. Cleanup
\n", + "Work Tables
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48a959e6-319f-4592-93af-482d391224b4", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['ADS_train_data', 'ADS_test_data','DF_train', 'DF_Predict', 'DF_Predict_test','additional_metrics_test']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbf8f9bc-9f3a-47e9-b2d4-81fd00291bc8", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "d51fd98f-b9b2-48b9-b639-16cc51f9116f", + "metadata": {}, + "source": [ + "9. Exploring the Versatility of this Analytical Approach in Alternative Use Case Settings
\n", + "How this analytic approach can be leveraged in other use case settings
\n", + "\n", + "The analytical approach of leveraging clustering followed by classification for anomaly detection in short time series data is highly adaptable and can be broadly applied across various industries, especially in settings where operations or processes are characterized by short, continuous time series with a defined start and end and where ground truth labels are not initially available.
\n", + "This method begins with unsupervised learning to explore and understand the data, identifying patterns, similarities, and potential outliers through techniques like Dynamic Time Warping (DTW). Such exploration is crucial in settings where anomalies are not predefined or where the data’s inherent complexity requires initial unsupervised insight to develop an understanding of what constitutes normal behavior versus an anomaly. Following the clustering phase, supervised classification models are trained on the newly identified labels to predict anomalies. This generic approach is particularly effective for short time series data, where each sequence represents a process or event whose normal operational parameters need to be defined through exploratory analysis before precise anomaly detection can occur.
\n", + "Potential Use Cases Across Industries:
\n", + "- Power Grid Load Monitoring:
Analyzing short time series of electricity load during peak usage times to identify anomalies that could indicate equipment failure, energy theft, or inefficiencies in power distribution. Each series could represent the load profile for a brief, high-demand period.- ECG or EEG Analysis:
Short segments of electrocardiogram (ECG) or electroencephalogram (EEG) readings can be analyzed to detect anomalies indicating cardiac arrhythmias or neurological issues, respectively. Each segment represents a complete heartbeat or a brief brain activity pattern.- CNC Machine Operations:
Monitoring the torque and force profiles of a CNC (Computer Numerical Control) machine during a single machining operation. Anomalies could indicate tool wear, material inconsistency, or operational errors.- Aircraft Engine Test Runs:
Analyzing the time series data of engine parameters (e.g., temperature, pressure, vibration) during short test runs to identify deviations from normal operational profiles, suggesting maintenance or safety issues.- Theme Park Ride Operations:
Analyzing sensor data from individual rides, where each ride cycle produces a time series of mechanical or operational parameters. Anomalies in these series could indicate safety concerns or maintenance needs.Conclusion
\n", + "In each of these scenarios, the focus is on analyzing the shape or behavior of a curve within a short time frame, similar to observing a spot welding curve. These curves are shaped by the specific activity taking place, whether it’s a machine at work, a health test running, financial trades happening, or people interacting with a service. The method begins by sorting these curves into groups based on their patterns, without needing to know ahead of time which ones are out of the ordinary. Then, it moves on to use a more detailed approach to pinpoint which curves don’t fit the expected pattern, labeling them as either normal or not normal. This way of doing things is great for quickly finding and addressing issues, and it also helps in getting a better grasp of how these processes work. This can lead to making things run more smoothly and keeping equipment in good shape before problems even start.
" + ] + }, + { + "cell_type": "markdown", + "id": "91bd8857-19e0-4200-b3ae-b2efdbca73d3", + "metadata": {}, + "source": [ + "Let’s look at the elements we have available for reference for this notebook:
\n", + "Filters: \n", + "\n",
+ " Anomaly Detection in Robot Welding Process
Trusted AI\n",
+ "
\n",
+ " \n",
+ "
Introduction
\n", + "\n", + "Detecting anomalies reduces issues and delays in many industries, especially in the manufacturing field. There have been approaches to detect anomalies in the past, such as engineering rules and graph and deep learning. However, it still proves difficult to detect all the existing anomalies. Plus, companies are striving to minimize false positives, cope with the diversity of sensors and metrology issues, and deliver actionable insights at a business pace. Fortunately, Teradata and ClearScape Analytics have the solution. In ClearScape Analytics, users can execute all steps of anomaly detection from data preparation and exploration to model training and evaluations and adjustments. These analyses can improve the process and ensure accuracy in anomaly detection.
\n", + "\n", + "Spot Welding Quality Assessment
\n", + "Spot welding is a common technique used for welding car body panels, particularly in the assembly of smaller parts and components. Spot welding involves using a pair of copper electrodes to apply a series of short, high-current welding pulses to the metal, fusing the parts together at specific points or “spots”.
\n", + "\n", + "The automotive industry is known for its high level of automation, and spot welding is one of the most automated processes, heavily reliant on robots to improve efficiency, reduce labor costs, and improve the consistency and quality of the finished product. Poor welding quality is rare, but even so, the consequences of poor quality may not be negligible in terms of rework costs and customer satisfaction, especially when quality issues are detected too late.
\n", + "\n", + "Spot welding is a resistance welding process that uses large electrical current. There are many ways to assess the quality of a spot, like tensile or ultrasonic testing to assess the weld strength or the analysis of the welding current measured and recorded during the welding process. In this demo, we focus on the analysis of the anomalies in the welding spot due to welding current, and more specifically the resistance, i.e. the voltage-current ratio which impacts the quality of the welding. The shape of the resistance curve depends on many factors like the nature of the materials, the geometry, and the quality of the electrodes etc.
\n", + "\n", + "\n", + "Business Values
\n", + "Why Vantage?
\n", + "Many organizations fail to realize value from their ML and AI investments due to a lack of scale. It is estimated that for broad adoption across many industries, the number of models and model deployments needs to scale 100-1000x larger than their organizations currently support.
\n", + "The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.
\n", + "In this particular use case, the volume of machine sensor data was so great that millions of ML models were created to derive analytic features that ultimately deployed tens of thousands of models for real-time scoring. This extent of scale is only possible by combining the power of Vantage with native ClearScape Analytic functions.
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "7f41da40-f1e9-4979-9e0f-bd5ba6460443", + "metadata": {}, + "source": [ + "1. Configure the environment
\n", + "\n",
+ "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n",
+ "
\n",
+ "Here's how we can do this:
Note: After installing the above libraries, please restart the kernel. The simplest way is by typing zero zero: 0 0
\n", + "2. Connect to VantageCloud Lake
\n", + "Connect to VantageCloud using create_context
from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.
3. Load the data
\n", + "\n", + "We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_AnomalyDetection\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").
\n", + " \n", + "**Note: The tables are available in DEMO_AnomalyDetection_DB database and we have created views in DEMO_AnomalyDetection database which are used in the cells below
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "99598e0a-8a6c-4539-a06d-f6723f67134f", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "add51f496db2440e9195e9a4369d27cc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "PARTITION_ID | ID | X | Y | \n", + "
---|---|---|---|
10 | \n", + "\t\t905 | \n", + "\t\t105 | \n", + "\t\t233.91074826423207 | \n", + "\t
10 | \n", + "\t\t905 | \n", + "\t\t208 | \n", + "\t\t293.063489325248 | \n", + "\t
10 | \n", + "\t\t905 | \n", + "\t\t64 | \n", + "\t\t221.8570197466893 | \n", + "\t
10 | \n", + "\t\t905 | \n", + "\t\t194 | \n", + "\t\t299.3853606177206 | \n", + "\t
10 | \n", + "\t\t905 | \n", + "\t\t54 | \n", + "\t\t219.24825515502602 | \n", + "\t
We get the above data from sensors. We focus on one plant (PLANT=1) and one robot (ROBOT_ID=41). The Partition_ID is the type of welding, ID is the WELDING_ID, X is time required for welding in ms and Y is the RESISTANCE. We create a view with the columns required to get data with proper column names.
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "88cde234-6107-487e-92f2-7f045576cc1d", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "query = f\"\"\"\n", + "REPLACE VIEW DEMO_AnomalyDetection.V_dataset_01 AS\n", + "SELECT\n", + " 1 AS PLANT\n", + ", {41} AS ROBOT_ID\n", + ", CAST(A.PARTITION_ID AS BIGINT) AS WELDING_TYPE\n", + ", CAST((DATE '{str(datetime.datetime.now()).split(' ')[0]}' + FLOOR((WELDING_ID-700*WELDING_TYPE)/100)) AS DATE FORMAT 'YYYY-MM-DD') AS WELDING_DAY\n", + ", CAST(A.ID AS BIGINT) AS WELDING_ID\n", + ", CAST(A.X AS INTEGER) AS TIME_MS\n", + ", A.Y AS RESISTANCE\n", + "FROM DEMO_AnomalyDetection.Sensor_Data A\n", + "\"\"\"\n", + "execute_sql(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8ec3a959-c5e0-4039-88f8-846adca6f113", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dc7b3e698fe042d3847a1767b5cd7fc8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "PLANT | ROBOT_ID | WELDING_TYPE | WELDING_DAY | WELDING_ID | TIME_MS | RESISTANCE | \n", + "
---|---|---|---|---|---|---|
1 | \n", + "\t\t41 | \n", + "\t\t10 | \n", + "\t\t2025-06-12 | \n", + "\t\t905 | \n", + "\t\t105 | \n", + "\t\t233.91074826423207 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t10 | \n", + "\t\t2025-06-12 | \n", + "\t\t905 | \n", + "\t\t208 | \n", + "\t\t293.063489325248 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t10 | \n", + "\t\t2025-06-12 | \n", + "\t\t905 | \n", + "\t\t64 | \n", + "\t\t221.8570197466893 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t10 | \n", + "\t\t2025-06-12 | \n", + "\t\t905 | \n", + "\t\t194 | \n", + "\t\t299.3853606177206 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t10 | \n", + "\t\t2025-06-12 | \n", + "\t\t905 | \n", + "\t\t54 | \n", + "\t\t219.24825515502602 | \n", + "\t
3.1 - Some aggregations and visualization.
\n" + ] + }, + { + "cell_type": "markdown", + "id": "f83b1b1a-eece-487a-97d7-b4759ea624ce", + "metadata": {}, + "source": [ + "We will check the histogram based on the minimum and maximum Time for welding.
\n", + "A histogram is a better way to assess distribution. To cope with scalability, it is recommended to compute the histogram bins in-database to leverage the Massively Parallel Architecture of Teradata Vantage. For that, we use the Histogram function of teradataml, which pushes down the computations to Vantage.
" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1a5d38c3-ebb9-47a2-b8ad-f00acd9d769b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3f8e6f0f137346148636bb3c10aca42b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "PLANT | ROBOT_ID | WELDING_TYPE | WELDING_ID | min_TIME_MS | max_TIME_MS | count_TIME_MS | \n", + "
---|---|---|---|---|---|---|
1 | \n", + "\t\t41 | \n", + "\t\t4 | \n", + "\t\t320 | \n", + "\t\t1 | \n", + "\t\t806 | \n", + "\t\t806 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t5 | \n", + "\t\t418 | \n", + "\t\t1 | \n", + "\t\t642 | \n", + "\t\t642 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t801 | \n", + "\t\t1 | \n", + "\t\t1478 | \n", + "\t\t1478 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t884 | \n", + "\t\t1 | \n", + "\t\t1085 | \n", + "\t\t1085 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t10 | \n", + "\t\t938 | \n", + "\t\t1 | \n", + "\t\t928 | \n", + "\t\t928 | \n", + "\t
ColumnName | Label | MinValue | MaxValue | CountOfValues | Bin_Percent | \n", + "
---|---|---|---|---|---|
count_TIME_MS | \n", + "\t\t0 | \n", + "\t\t0.0 | \n", + "\t\t100.0 | \n", + "\t\t11 | \n", + "\t\t1.0999999999999999 | \n", + "\t
count_TIME_MS | \n", + "\t\t1 | \n", + "\t\t100.0 | \n", + "\t\t200.0 | \n", + "\t\t14 | \n", + "\t\t1.4000000000000001 | \n", + "\t
count_TIME_MS | \n", + "\t\t2 | \n", + "\t\t200.0 | \n", + "\t\t300.0 | \n", + "\t\t34 | \n", + "\t\t3.4000000000000004 | \n", + "\t
count_TIME_MS | \n", + "\t\t3 | \n", + "\t\t300.0 | \n", + "\t\t400.0 | \n", + "\t\t39 | \n", + "\t\t3.9 | \n", + "\t
count_TIME_MS | \n", + "\t\t4 | \n", + "\t\t400.0 | \n", + "\t\t500.0 | \n", + "\t\t47 | \n", + "\t\t4.7 | \n", + "\t
We can see that we have calculated the histogram values using the teradataml functions. ClearScape Analytics can easily integrate with 3rd party visualization tools like Tableau, PowerBI or many Python modules available like plotly, seaborn etc. We can do all the calculations and pre-processing on Vantage and pass only the necessary information to visualization tools; this will not only make the calculation faster but also reduce the time due to less data movement between tools. We do the data transfer for this and the subsequent visualizations wherever necessary.
" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7c9b72ab-7d3c-4964-9199-ee1dcc17c928", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "In the above histogram we can see the bins between the Min and the Max value of the durations and the welding counts.
\n", + "3.2 - More advanced processing using window functions and delta_t
\n", + "Resistance is an important parameter in resistance welding. The resistance should not vary too much. If there are any significant changes in resistance over time, it could indicate an issue with the weld quality. For example, an unusually high resistance could indicate poor contact between the parts being welded or a problem with the welding equipment.
" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b5615026-52eb-4aae-8bb2-146e88ef4502", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d628ce44d1b549acb4bbd0b861240b89", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "PLANT | ROBOT_ID | WELDING_TYPE | WELDING_DAY | WELDING_ID | TIME_MS | RESISTANCE | \n", + "
---|---|---|---|---|---|---|
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t714 | \n", + "\t\t103.14563314224813 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t130 | \n", + "\t\t171.2900655657514 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t352 | \n", + "\t\t161.46970909346348 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t353 | \n", + "\t\t154.4865896266523 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t723 | \n", + "\t\t87.61515040184788 | \n", + "\t
The above graph shows the variation of the resistance of the welding with respect to time. We see that the most interesting part lies between 40 and 400ms from the start of the curve.
\n", + "\n", + "Next, we apply a window function to the resistance and take the mean value over the window to smooth the curve.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "516d8fd4-ab2c-44cd-89d2-d8075e40cf82", + "metadata": {}, + "outputs": [], + "source": [ + "# curve smoothing\n", + "window_for_smoothing = welding_dataset_new.RESISTANCE.window(\n", + " partition_columns = \"WELDING_ID\",\n", + " order_columns = 'TIME_MS',\n", + " window_start_point = -15,\n", + " window_end_point = 15\n", + ")\n", + "welding_dataset_smooth = welding_dataset_new.assign(RESISTANCE_SMOOTHED = window_for_smoothing.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "6c351bab-cd80-452c-b600-79efaec9f769", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "28f1b58f8da242acbd3aae6f80816278", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "PLANT | ROBOT_ID | WELDING_TYPE | WELDING_DAY | WELDING_ID | TIME_MS | RESISTANCE | RESISTANCE_SMOOTHED | \n", + "
---|---|---|---|---|---|---|---|
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t1 | \n", + "\t\t353.0226900449998 | \n", + "\t\t253.14252688841657 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t2 | \n", + "\t\t335.5825378948886 | \n", + "\t\t248.69892486462675 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t3 | \n", + "\t\t324.9522665624502 | \n", + "\t\t244.90621844326972 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t4 | \n", + "\t\t297.6567809200622 | \n", + "\t\t241.13418644931798 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t854 | \n", + "\t\t5 | \n", + "\t\t288.1532672344735 | \n", + "\t\t237.60587172607387 | \n", + "\t
The above graph shows the variation of the resistance of the welding with respect to time and the smoothed resistance, as shown by the Red line, after applying the window function.
\n", + "\n", + "The window function generates a Window object on a teradataml DataFrame Column to run window aggregate functions.\n", + "
Function allows user to specify window for different types of computations:\n", + "
By default, window with Unbounded Preceding and Unbounded following is considered for calculation.
\n", + "\n", + "Next we calculate the derivative by using the lead function and taking the difference of the lead value and the mean value of the resistance. Applying a window function to smooth the resistance curve helps to eliminate noise and makes it easier to see the overall trend. The derivative of the resistance gives an indication of how quickly the resistance is changing, which can be a useful measure for detecting anomalies and predicting potential issues.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c3719c73-495e-4651-b764-c01678f07417", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(welding_dataset_smooth,table_name='welding_dataset_smooth', if_exists='replace')\n", + "welding_dataset_smooth = DataFrame('welding_dataset_smooth')" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fccb6149-ce72-4601-983b-a87f2bc52417", + "metadata": {}, + "outputs": [], + "source": [ + "# let's compute the lead\n", + "window_for_lead = welding_dataset_smooth.RESISTANCE_SMOOTHED.window(\n", + " partition_columns = \"WELDING_ID\",\n", + " order_columns = 'TIME_MS')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "e6a9bc90-f330-467f-8765-5a00578c6c6e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "cce13b3b0c7b4f79878530fbd803efca", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "PLANT | ROBOT_ID | WELDING_TYPE | WELDING_DAY | WELDING_ID | TIME_MS | RESISTANCE | RESISTANCE_SMOOTHED | RESISTANCE_SMOOTHED_AFTER | DERIVATIVE | \n", + "
---|---|---|---|---|---|---|---|---|---|
1 | \n", + "\t\t41 | \n", + "\t\t1 | \n", + "\t\t25/08/05 | \n", + "\t\t1 | \n", + "\t\t1 | \n", + "\t\t474.381162422724 | \n", + "\t\t316.7743268301616 | \n", + "\t\t311.4723766637594 | \n", + "\t\t-5.301950166402207 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t1 | \n", + "\t\t25/08/05 | \n", + "\t\t1 | \n", + "\t\t2 | \n", + "\t\t428.56930331230063 | \n", + "\t\t311.4723766637594 | \n", + "\t\t306.6978645868533 | \n", + "\t\t-4.774512076906092 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t1 | \n", + "\t\t25/08/05 | \n", + "\t\t1 | \n", + "\t\t3 | \n", + "\t\t398.452732368239 | \n", + "\t\t306.6978645868533 | \n", + "\t\t302.25553180871196 | \n", + "\t\t-4.442332778141349 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t1 | \n", + "\t\t25/08/05 | \n", + "\t\t1 | \n", + "\t\t4 | \n", + "\t\t372.90267653201755 | \n", + "\t\t302.25553180871196 | \n", + "\t\t298.4338994297387 | \n", + "\t\t-3.821632378973277 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t1 | \n", + "\t\t25/08/05 | \n", + "\t\t1 | \n", + "\t\t5 | \n", + "\t\t350.6458154372735 | \n", + "\t\t298.4338994297387 | \n", + "\t\t294.38610093105206 | \n", + "\t\t-4.047798498686632 | \n", + "\t
PLANT | ROBOT_ID | WELDING_TYPE | WELDING_DAY | WELDING_ID | TIME_MS | RESISTANCE | RESISTANCE_SMOOTHED | RESISTANCE_SMOOTHED_AFTER | DERIVATIVE | \n", + "
---|---|---|---|---|---|---|---|---|---|
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t25/06/18 | \n", + "\t\t854 | \n", + "\t\t1 | \n", + "\t\t353.0226900449998 | \n", + "\t\t253.14252688841657 | \n", + "\t\t248.69892486462675 | \n", + "\t\t-4.443602023789822 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t25/06/18 | \n", + "\t\t854 | \n", + "\t\t2 | \n", + "\t\t335.5825378948886 | \n", + "\t\t248.69892486462675 | \n", + "\t\t244.90621844326972 | \n", + "\t\t-3.7927064213570247 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t25/06/18 | \n", + "\t\t854 | \n", + "\t\t3 | \n", + "\t\t324.9522665624502 | \n", + "\t\t244.90621844326972 | \n", + "\t\t241.13418644931798 | \n", + "\t\t-3.7720319939517424 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t25/06/18 | \n", + "\t\t854 | \n", + "\t\t4 | \n", + "\t\t297.6567809200622 | \n", + "\t\t241.13418644931798 | \n", + "\t\t237.60587172607387 | \n", + "\t\t-3.528314723244108 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t25/06/18 | \n", + "\t\t854 | \n", + "\t\t5 | \n", + "\t\t288.1532672344735 | \n", + "\t\t237.60587172607387 | \n", + "\t\t234.07488665282776 | \n", + "\t\t-3.5309850732461143 | \n", + "\t
We see that the most interesting part lies between 40 and 400ms from the start of the curve, so we plot only that subset.
" + ] + }, + { + "cell_type": "markdown", + "id": "4615d965-6892-4729-81b0-9dd39f7d9411", + "metadata": { + "tags": [] + }, + "source": [ + "It is hard to assess the diversity of curve shapes in this plot since many of them are superimposed. However, we see in the middle of the picture a sharp drop that looks unusual. Moreover, we guess that there are shifts in time and height.
\n", + "\n", + "4. Feature Engineering
" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "da82ee40-3e38-49af-a6ca-a678ba240ca2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['PLANT',\n", + " 'ROBOT_ID',\n", + " 'WELDING_TYPE',\n", + " 'WELDING_DAY',\n", + " 'WELDING_ID',\n", + " 'TIME_MS',\n", + " 'RESISTANCE']" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "welding_dataset_new.columns" + ] + }, + { + "cell_type": "markdown", + "id": "539a4c25-f868-44af-bca3-13b4ca477445", + "metadata": {}, + "source": [ + "We will create a feature table by using different functions on the Resistance column. Valid values for functions are: 'count', 'sum', 'min', 'max', 'mean', 'std', 'percentile', 'unique','median', 'var', 'skew', 'kurtosis'.
" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "aa37d2af-c185-4a84-9ca5-8628a216aa27", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3d71e7bf77b44a3994113f0fe2dc8068", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "PLANT | ROBOT_ID | WELDING_TYPE | WELDING_DAY | WELDING_ID | min_TIME_MS | max_TIME_MS | count_RESISTANCE | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | percentile_RESISTANCE | unique_RESISTANCE | median_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", + "\t\t41 | \n", + "\t\t5 | \n", + "\t\t2025-07-12 | \n", + "\t\t471 | \n", + "\t\t21 | \n", + "\t\t646 | \n", + "\t\t626 | \n", + "\t\t108176.8054879289 | \n", + "\t\t88.4331503589787 | \n", + "\t\t308.8898639352117 | \n", + "\t\t172.8063985430174 | \n", + "\t\t67.43470974015446 | \n", + "\t\t155.80095377664534 | \n", + "\t\t626 | \n", + "\t\t155.80095377664534 | \n", + "\t\t4547.440077738883 | \n", + "\t\t0.49796745320216573 | \n", + "\t\t-1.1467845713561156 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t8 | \n", + "\t\t2025-06-24 | \n", + "\t\t705 | \n", + "\t\t21 | \n", + "\t\t633 | \n", + "\t\t613 | \n", + "\t\t117433.38478654668 | \n", + "\t\t82.76152873653831 | \n", + "\t\t316.66867750259905 | \n", + "\t\t191.57159019012508 | \n", + "\t\t78.08923446856586 | \n", + "\t\t206.63668033820593 | \n", + "\t\t613 | \n", + "\t\t206.63668033820593 | \n", + "\t\t6097.928539886655 | \n", + "\t\t0.04364970537902622 | \n", + "\t\t-1.5898532421335452 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t5 | \n", + "\t\t2025-07-12 | \n", + "\t\t446 | \n", + "\t\t21 | \n", + "\t\t1106 | \n", + "\t\t1086 | \n", + "\t\t166518.9453555073 | \n", + "\t\t70.0355276366476 | \n", + "\t\t315.4566895327382 | \n", + "\t\t153.33236220580784 | \n", + "\t\t72.20479609391711 | \n", + "\t\t108.80434505003922 | \n", + "\t\t1086 | \n", + "\t\t108.80434505003922 | \n", + "\t\t5213.532578964148 | \n", + "\t\t0.9529411796445812 | \n", + "\t\t-0.6269191898462032 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t8 | \n", + "\t\t2025-06-24 | \n", + "\t\t748 | \n", + "\t\t21 | \n", + "\t\t764 | \n", + "\t\t744 | \n", + "\t\t109921.27260669864 | \n", + "\t\t84.02047298240994 | \n", + "\t\t282.98968717382587 | \n", + "\t\t147.74364597674548 | \n", + "\t\t59.15325216552249 | \n", + "\t\t111.91280735495172 | \n", + "\t\t744 | \n", + "\t\t111.91280735495172 | \n", + "\t\t3499.107241757891 | \n", + "\t\t0.8606279445787202 | \n", + "\t\t-0.7317898244111711 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t7 | \n", + "\t\t2025-06-30 | \n", + "\t\t605 | \n", + "\t\t21 | \n", + "\t\t590 | \n", + "\t\t570 | \n", + "\t\t108569.63225682522 | \n", + "\t\t81.47252264867211 | \n", + "\t\t299.90656597268855 | \n", + "\t\t190.4730390470618 | \n", + "\t\t67.5249319900682 | \n", + "\t\t195.99852023934073 | \n", + "\t\t570 | \n", + "\t\t195.99852023934073 | \n", + "\t\t4559.616440263336 | \n", + "\t\t-0.07402858894022789 | \n", + "\t\t-1.4208348503506993 | \n", + "\t
5. Anomaly Detection on Sensor Data
\n", + " \n", + "Let's start by getting the feature columns from the features tables
" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "27cdf0f8-e0b3-41b5-b18d-b77cdbc5652b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['count_RESISTANCE',\n", + " 'sum_RESISTANCE',\n", + " 'min_RESISTANCE',\n", + " 'max_RESISTANCE',\n", + " 'mean_RESISTANCE',\n", + " 'std_RESISTANCE',\n", + " 'percentile_RESISTANCE',\n", + " 'unique_RESISTANCE',\n", + " 'median_RESISTANCE',\n", + " 'var_RESISTANCE',\n", + " 'skew_RESISTANCE',\n", + " 'kurtosis_RESISTANCE']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "feature_names = features.columns[7::]\n", + "feature_names" + ] + }, + { + "cell_type": "markdown", + "id": "9655f048-ffbd-4785-9e8b-39d192ff7808", + "metadata": {}, + "source": [ + "5.1 Clustering by curve shape
\n", + "To cluster time series by shapes, we will use the Dynamic Time Warping (DTW) distance that measures the similarity between two time series. This distance is well adapted to this kind of problem since it provides robustness to shifts in time and height.
\n", + "\n", + "Distance Matrix in-database Computations
\n", + "\n", + "ClearScape Analytics offers an in-database dynamic time warping function, callable in SQL as TD_DTW. TD_DTW measures the similarity of two time series. The Dynamic Time Warping (DTW) algorithm is used for space and time. The algorithm uses the FastDTW algorithm. This function computes at scale the DTW distances between one reference curve and a set of curves, a many-to-one approach. We want to compute the distance matrix of our subset, i.e. the DTW distance between each pair of curves. The distance matrix is symmetric, since the DTW distance is, hence we only need to compute the triangular matrix. We wrapped this computation in the tdsense package that calls the TD_DTW function and iterates on the matrix rows to compute and store the whole triangular distance matrix in a table.
" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "207e72c8-41e3-481a-9727-a4c7510f4206", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['2025-07-12',\n", + " '2025-07-06',\n", + " '2025-06-12',\n", + " '2025-07-30',\n", + " '2025-06-24',\n", + " '2025-07-18',\n", + " '2025-06-30',\n", + " '2025-08-05',\n", + " '2025-07-24',\n", + " '2025-06-18']" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "overview = welding_dataset_new.groupby('WELDING_DAY').count(distinct=True)\n", + "dates = list(overview.to_pandas().reset_index()['WELDING_DAY'].values.astype('str'))\n", + "dates" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "7c7180b4-a8b5-450a-96be-8aed93d1199a", + "metadata": {}, + "outputs": [], + "source": [ + "subset = welding_dataset_new[ \\\n", + " (welding_dataset_new['PLANT'] == 1) & \\\n", + " (welding_dataset_new['ROBOT_ID'] == 41) & \\\n", + " (welding_dataset_new['WELDING_TYPE'] in (8,9)) & \\\n", + " (welding_dataset_new['WELDING_DAY'].isin(dates)) \\\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "0dda2eca-af26-4741-abeb-b63758f8c996", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(344622, 7)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subset_zoom = subset[(subset.TIME_MS < 400) & (subset.TIME_MS > 40)]\n", + "subset_zoom.shape" + ] + }, + { + "cell_type": "markdown", + "id": "9d40f422-886d-48e5-a4ce-03b259523917", + "metadata": {}, + "source": [ + "The subset of data we have taken contains 7 columns and 344,622 rows.
\n", + "\n", + "Since this is a 2CPU system, the below computation takes more than 2 hours for 350k rows, so we have precalculated it and stored it in a table in the database.
\n", + "\n", + "**In case we still want to compute the matrix please set the If part of the below code to True instead of False
" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "276fd1b7-e057-4c0c-b8b0-4e063d70eb7a", + "metadata": {}, + "outputs": [], + "source": [ + "if False:\n", + " dtw_matrix = dtw_distance_matrix_computation2(subset_zoom,field='RESISTANCE',\n", + " table_name=dtw_result_table,\n", + " schema_name = Param['database'],\n", + " row_axis='TIME_MS',\n", + " series_id = 'WELDING_ID')\n", + "else:\n", + " dtw_matrix = DataFrame(in_schema('DEMO_AnomalyDetection','DTW_Matrix'))" + ] + }, + { + "cell_type": "markdown", + "id": "42f770a5-f3b2-4862-8256-b1cc1f969750", + "metadata": {}, + "source": [ + "5.2 Hierarchical clustering with Scipy
\n", + "\n", + "Now that the distance matrix is available, we can perform the clustering. Here, we will use the open-source package SciPy and its cluster.hierarchy module, which is used in the tdsense package for convenience.
\n", + "\n", + "Hierarchical clustering is an alternative class of clustering algorithms that produce 1 to n clusters, where n is the number of observations in the data set. As you go down the hierarchy from 1 cluster (contains all the data) to n clusters (each observation is its own cluster), the clusters become more and more similar (almost always).
" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "0b87b35b-c283-42d8-845b-5c9c7851c822", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | MATRIX_ROW | \n", + "WELDING_ID_1 | \n", + "WELDING_ID_2 | \n", + "ROW_ID | \n", + "DISTANCE | \n", + "
---|---|---|---|---|---|
0 | \n", + "1 | \n", + "2 | \n", + "1 | \n", + "0 | \n", + "3404.181076 | \n", + "
1 | \n", + "2 | \n", + "3 | \n", + "1 | \n", + "0 | \n", + "2879.840770 | \n", + "
2 | \n", + "3 | \n", + "4 | \n", + "1 | \n", + "0 | \n", + "2035.367486 | \n", + "
3 | \n", + "4 | \n", + "5 | \n", + "1 | \n", + "0 | \n", + "2641.395023 | \n", + "
4 | \n", + "5 | \n", + "6 | \n", + "1 | \n", + "0 | \n", + "10501.828495 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
497498 | \n", + "996 | \n", + "999 | \n", + "997 | \n", + "0 | \n", + "4333.889951 | \n", + "
497499 | \n", + "997 | \n", + "1000 | \n", + "997 | \n", + "0 | \n", + "4851.558731 | \n", + "
497500 | \n", + "996 | \n", + "999 | \n", + "998 | \n", + "0 | \n", + "2143.966383 | \n", + "
497501 | \n", + "997 | \n", + "1000 | \n", + "998 | \n", + "0 | \n", + "2132.062450 | \n", + "
497502 | \n", + "997 | \n", + "1000 | \n", + "999 | \n", + "0 | \n", + "2119.041857 | \n", + "
497503 rows × 5 columns
\n", + "The dendrogram is useful for visualizing the structure of the hierarchical clustering and identifying the optimal number of clusters to use for further analysis. The optimal number of clusters can be determined by examining the dendrogram to identify a level at which the clusters start to merge more slowly or by using a threshold for the maximum distance between clusters.
\n", + "\n", + "The resulting dendrogram above shows how the hierarchical clustering algorithm has merged the data points into clusters based on their pairwise distances using the Ward linkage criterion. The dendrogram is a summary of the distance matrix. The X axis shows the WELDING_ID, but the labels are not visible as we have more than 450k rows. Looking at the dendrogram, we see that we have about 6 clusters. When we select 6 clusters, here is what we get.
" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "d2e168ff-626b-47b8-bc2b-ecfaac22a8f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | CURVE_ID | \n", + "cluster | \n", + "leaves_color_list | \n", + "
---|---|---|---|
675 | \n", + "1 | \n", + "0 | \n", + "#ff964f | \n", + "
805 | \n", + "2 | \n", + "0 | \n", + "#ff964f | \n", + "
471 | \n", + "3 | \n", + "1 | \n", + "#b2f396 | \n", + "
670 | \n", + "4 | \n", + "0 | \n", + "#ff964f | \n", + "
839 | \n", + "5 | \n", + "0 | \n", + "#ff964f | \n", + "
The above dendrogram is for only 6 clusters with the colors representing the different clusters. Now, we plot the Resistance curves for each cluster.
" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "d31bafdc-9f43-4083-9677-ef7d94c18eb1", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "And if we plot the curves per cluster, we spot the curves with a sharp drop(cluster 4) and these are the curves we are interested in, i.e. the curve exhibiting the anomaly we are looking for. We note also the other clusters are looking more or less similar. By monitoring the resistance over time and calculating its derivative, you can detect any sudden changes or anomalies. Anomalies might indicate a problem with the welding process, such as a sudden drop in current or a sudden increase in resistance.
" + ] + }, + { + "cell_type": "markdown", + "id": "9b99a7ac-6a99-4c9e-9ead-0f6d6e5c4759", + "metadata": {}, + "source": [ + "5.3 Create the anomaly dataset
\n", + "Now we create a table containing the anomaly flag that will be the target of a supervised machine learning model or a relevant KPI to monitor in production dashboards.
\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "cec5b577-b0dd-45c8-8fad-fee1fb1f952a", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "WELDING_ID | anomaly | \n", + "
---|---|
326 | \n", + "\t\t1 | \n", + "\t
183 | \n", + "\t\t0 | \n", + "\t
530 | \n", + "\t\t0 | \n", + "\t
999 | \n", + "\t\t1 | \n", + "\t
387 | \n", + "\t\t1 | \n", + "\t
The above anomaly data has the welding ID and the anomaly flag.
\n", + "5.4 Build the analytical dataset
\n", + "\n", + "We prepare the analytical dataset by joining the feature table with the anomaly table using the Welding ID so that we get the anomalies for the weldings.
" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "fe4cfcfb-7d91-47e5-a4cc-e44428e51cfe", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "62175dfc79d94e4bbd5646fd055c4db3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID | count_RESISTANCE | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | percentile_RESISTANCE | unique_RESISTANCE | median_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | anomaly | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
570 | \n", + "\t\t637 | \n", + "\t\t108597.77124086616 | \n", + "\t\t76.02029807955392 | \n", + "\t\t352.4795596378072 | \n", + "\t\t170.48315736399712 | \n", + "\t\t86.51173633708574 | \n", + "\t\t115.06655015743611 | \n", + "\t\t637 | \n", + "\t\t115.06655015743611 | \n", + "\t\t7484.28052405744 | \n", + "\t\t0.7249096707674543 | \n", + "\t\t-1.0064015341549895 | \n", + "\t\t0 | \n", + "\t
183 | \n", + "\t\t1280 | \n", + "\t\t180122.758205533 | \n", + "\t\t69.41522063694657 | \n", + "\t\t414.25927750342277 | \n", + "\t\t140.72090484807265 | \n", + "\t\t86.13303343939174 | \n", + "\t\t104.7790581693169 | \n", + "\t\t1280 | \n", + "\t\t104.7790581693169 | \n", + "\t\t7418.899449471376 | \n", + "\t\t1.9063589843778255 | \n", + "\t\t2.202091867073732 | \n", + "\t\t0 | \n", + "\t
530 | \n", + "\t\t888 | \n", + "\t\t142793.80259248955 | \n", + "\t\t80.33864364172554 | \n", + "\t\t352.84994686956196 | \n", + "\t\t160.80383174829905 | \n", + "\t\t81.96568876456104 | \n", + "\t\t110.63194606046804 | \n", + "\t\t888 | \n", + "\t\t110.63194606046804 | \n", + "\t\t6718.374134648887 | \n", + "\t\t0.9804755013182304 | \n", + "\t\t-0.4865935073406828 | \n", + "\t\t0 | \n", + "\t
999 | \n", + "\t\t1009 | \n", + "\t\t167017.72523876745 | \n", + "\t\t78.48183376623636 | \n", + "\t\t339.3274318454616 | \n", + "\t\t165.52797347747023 | \n", + "\t\t83.15916845787675 | \n", + "\t\t111.43083661985594 | \n", + "\t\t1009 | \n", + "\t\t111.43083661985594 | \n", + "\t\t6915.4472986055225 | \n", + "\t\t0.7309353059569027 | \n", + "\t\t-1.1177279822102073 | \n", + "\t\t1 | \n", + "\t
387 | \n", + "\t\t1629 | \n", + "\t\t252327.66954286018 | \n", + "\t\t67.734531657172 | \n", + "\t\t385.3317739232777 | \n", + "\t\t154.8972802595827 | \n", + "\t\t89.27281066015279 | \n", + "\t\t107.3343318461327 | \n", + "\t\t1629 | \n", + "\t\t107.3343318461327 | \n", + "\t\t7969.634723163489 | \n", + "\t\t1.3145840678553307 | \n", + "\t\t0.23440643306334338 | \n", + "\t\t1 | \n", + "\t
The analytical dataset we created has 14 columns and 391 rows which will be used to build the model below.
" + ] + }, + { + "cell_type": "markdown", + "id": "09b3168b-8c53-4ffd-ba75-b26f40608654", + "metadata": {}, + "source": [ + "6. Build the model
\n", + "We have datasets in which different columns have different units – like one column can be in kilograms, while another column can be in centimetres. If we feed these features to the model as is, there is every chance that one feature will influence the result more than the others simply because its values are larger. But this doesn’t necessarily mean it is more important as a predictor. So, to give equal importance to all the features, we need feature scaling.
\n", + " \n", + "Here, we apply standard scaling using the ScaleFit and ScaleTransform functions in Vantage. The ScaleFit() function outputs statistics that are passed as input to the ScaleTransform() function, which scales the specified input DataFrame columns.
" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "d5d0898e-53a7-4aca-9f24-2e2f06ac73dc", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ScaleFit , ScaleTransform\n", + "scaler = ScaleFit(\n", + " data=ADS,\n", + " target_columns=feature_names,\n", + " scale_method=\"STD\",\n", + " global_scale=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "76af7c0a-b1cf-4914-a099-aeaeeb0c4977", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c3445e103d204f1ea97a9885e3fab53a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "anomaly | count_RESISTANCE | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | percentile_RESISTANCE | unique_RESISTANCE | median_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", + "\t\t-0.4454958013788012 | \n", + "\t\t0.49175516324283447 | \n", + "\t\t0.058266501337359396 | \n", + "\t\t1.1150200856051362 | \n", + "\t\t1.3539755632281287 | \n", + "\t\t1.8200619070354604 | \n", + "\t\t0.9194382247575821 | \n", + "\t\t-0.4454958013788012 | \n", + "\t\t0.9194382247575821 | \n", + "\t\t2.0817677950852636 | \n", + "\t\t-1.0994434098745625 | \n", + "\t\t-1.0566279679947848 | \n", + "\t
0 | \n", + "\t\t0.793525182974448 | \n", + "\t\t0.523797308954638 | \n", + "\t\t-0.47123828167733633 | \n", + "\t\t1.9938112179994238 | \n", + "\t\t-0.806783408272433 | \n", + "\t\t0.4409365865810209 | \n", + "\t\t-0.5777180923358187 | \n", + "\t\t0.793525182974448 | \n", + "\t\t-0.5777180923358187 | \n", + "\t\t0.38847470211704904 | \n", + "\t\t1.3988023115204253 | \n", + "\t\t1.5933929191283127 | \n", + "\t
1 | \n", + "\t\t0.0034670964809643983 | \n", + "\t\t0.19150685538374623 | \n", + "\t\t0.007189680804109719 | \n", + "\t\t-0.572992840326859 | \n", + "\t\t-0.023939443562901425 | \n", + "\t\t0.21488164140523353 | \n", + "\t\t-0.42987602059741353 | \n", + "\t\t0.0034670964809643983 | \n", + "\t\t-0.42987602059741353 | \n", + "\t\t0.14170515277163295 | \n", + "\t\t-0.3998375077234471 | \n", + "\t\t-0.7211087815327962 | \n", + "\t
1 | \n", + "\t\t1.8109800618904102 | \n", + "\t\t2.354620840583509 | \n", + "\t\t-0.5599250484658025 | \n", + "\t\t1.0028942996347474 | \n", + "\t\t-0.3594153577319633 | \n", + "\t\t0.679603160858965 | \n", + "\t\t-0.5209247163916084 | \n", + "\t\t1.8109800618904102 | \n", + "\t\t-0.5209247163916084 | \n", + "\t\t0.65842030680638 | \n", + "\t\t0.49326505850456726 | \n", + "\t\t0.22156809424649065 | \n", + "\t
0 | \n", + "\t\t-0.3492894661231371 | \n", + "\t\t-0.42271351810262053 | \n", + "\t\t0.10517001290583916 | \n", + "\t\t-0.10977661846737265 | \n", + "\t\t-0.1730205742826949 | \n", + "\t\t0.12416064757468764 | \n", + "\t\t-0.4476321194017249 | \n", + "\t\t-0.3492894661231371 | \n", + "\t\t-0.4476321194017249 | \n", + "\t\t0.04510877161800008 | \n", + "\t\t-0.017989719458924348 | \n", + "\t\t-0.2810964068413063 | \n", + "\t
6.1 Create a model file using the python libraries.
\n", + "\n", + "The Vantage Bring Your Own Model (BYOM) package gives data scientists and analysts the ability to operationalize predictive models in Vantage. Predictive models trained in external tools with sample data can be used to score data stored in Vantage using the BYOM Predict. Create or convert your predictive model using a supported model interchange format (PMML, MOJO, ONNX, Dataiku, and DataRobot are currently available), store it in a Vantage table, and use the BYOM PMMLPredict, H2OPredict, ONNXPredict, DataikuPredict, or DataRobotPredict to score your data with the model.
\n", + "\n", + "A problem with imbalanced classification is that there are too few examples of the minority class for a model to effectively learn the decision boundary. One way to solve this problem is to oversample the examples in the minority class. The most widely used approach to synthesizing new examples is called the Synthetic Minority Oversampling Technique, or SMOTE for short. SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.
\n", + "\n", + "Then we use the RandomForestClassifier to create the model. A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The random forest classifier creates a set of decision trees (DT), each from a randomly selected subset of the training set, and then collects the votes from the different decision trees to decide the final prediction.
" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "d847d16a-9735-4482-953d-66c80faf0bdc", + "metadata": {}, + "outputs": [], + "source": [ + "X_train = df[feature_names]\n", + "y_train = df['anomaly']" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "4350a66c-2ff9-483c-ae30-8f17c5d375b3", + "metadata": {}, + "outputs": [], + "source": [ + "# Balance the training set using SMOTE\n", + "smote = imblearn.over_sampling.SMOTE(random_state=42)\n", + "X_train, y_train = smote.fit_resample(X_train, y_train)\n", + "\n", + "\n", + "# Create a random forest classifier\n", + "model = RandomForestClassifier(n_estimators=10,max_depth= 3, random_state=42)\n", + "\n", + "# Create a pipeline that includes the SMOTE transformer and the model\n", + "pipeline = PMMLPipeline([ ('model', model)])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "455a3ff5-e8ee-4c9b-909e-3e1a79fa6612", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "duration : 0.02437281608581543 s\n" + ] + } + ], + "source": [ + "# Train the pipeline\n", + "start = time.time()\n", + "pipeline.fit(X_train, y_train)\n", + "end = time.time()\n", + "print('duration : ', end-start, 's')" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "61ff634a-aea7-4966-bf38-30b77547f0a3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 94.02%\n", + "Precision: 93.85%\n", + "AUC: 98.74%\n", + "F1-Score: 94.03%\n" + ] + } + ], + "source": [ + "# make predictions on the training set\n", + "y_train_pred = pipeline.predict(X_train)\n", + "\n", + "# calculate and print the accuracy score\n", + "acc = accuracy_score(y_train, y_train_pred)\n", + "print(\"Accuracy: {:.2f}%\".format(acc * 100))\n", + "\n", + "# calculate and print precision, AUC and F1-score\n", + "prec = precision_score(y_train, y_train_pred)\n", + "print(\"Precision: 
{:.2f}%\".format(prec * 100))\n", + "\n", + "# calculate AUC, AUC requires probability for positive class\n", + "prob = pipeline.predict_proba(X_train)[:, 1]\n", + "auc = roc_auc_score(y_train, prob)\n", + "print(\"AUC: {:.2f}%\".format(auc * 100))\n", + "\n", + "f1 = f1_score(y_train, y_train_pred)\n", + "print(\"F1-Score: {:.2f}%\".format(f1 * 100))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "60a0b3c9-4a3f-478c-a9f9-2ddd786aa332", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | Model | \n", + "Accuracy | \n", + "Precision | \n", + "F1-Score | \n", + "
---|---|---|---|---|
0 | \n", + "PMML using BYOM | \n", + "0.940154 | \n", + "0.938462 | \n", + "0.94027 | \n", + "
6.2 Save the model file
" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "7ffc1be2-d980-4468-9fc9-58ef30e5cb27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model is deleted.\n", + "Model is saved.\n" + ] + } + ], + "source": [ + "try:\n", + " save_byom(model_id = 'model_anomaly1',\n", + " model_file = 'my_model.pmml',\n", + " table_name = 'BYOM_PMMLMODELS_REPOSITORY',\n", + " additional_columns={\"Description\": \"RandomForestClassifier model\",\n", + " \"UserId\": 'demo_user',\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": float(prec),\n", + " \"ModelAUC\": float(auc),\n", + " \"Modelf1Score\": float(f1),\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": float(end-start),\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + " )\n", + "except Exception as e: \n", + " # if our model exists, delete and rewrite \n", + " if str(e.args).find('TDML_2200') >= 1: \n", + " delete_byom(model_id = 'model_anomaly1', table_name = 'BYOM_PMMLMODELS_REPOSITORY') \n", + " save_byom(model_id = 'model_anomaly1',\n", + " model_file = 'my_model.pmml',\n", + " table_name = 'BYOM_PMMLMODELS_REPOSITORY',\n", + " additional_columns={\"Description\": \"RandomForestClassifier model\",\n", + " \"UserId\": 'demo_user',\n", + " \"ProductionReady\": False,\n", + " \"ModelAccuracy\": float(acc),\n", + " \"ModelPrecision\": float(prec),\n", + " \"ModelAUC\": float(auc),\n", + " \"Modelf1Score\": float(f1),\n", + " \"ModelSavedTime\": str(datetime.datetime.now(tz=pytz.UTC)),\n", + " \"ModelGeneratedTime\": float(end-start),\n", + " \"sklearnVersion\": sklearn.__version__\n", + " }\n", + " )\n", + " else: \n", + " raise ValueError(f\"Unable to save the model due to the following error: {e}\")\n", + "# pass \n", + "# else: \n", + "# raise \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": 
"60c0f97c-52b2-407e-921c-75a61ca2d3fa", + "metadata": {}, + "source": [ + "The model file is saved and can be found in the left navigation pane in /UseCases/Anomaly_Detection.
\n", + "\n", + "We create new scaled data on which to apply this model and make predictions. The new dataset is created by joining the features and the anomalies.
" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "60fe7dff-a0fa-43a6-aa03-d11aeed2904e", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d07a6981a51e4db19c7fdae5e2a71f36", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID | count_RESISTANCE | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | percentile_RESISTANCE | unique_RESISTANCE | median_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | anomaly | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
387 | \n", + "\t\t1629 | \n", + "\t\t252327.66954286018 | \n", + "\t\t67.734531657172 | \n", + "\t\t385.3317739232777 | \n", + "\t\t154.8972802595827 | \n", + "\t\t89.27281066015279 | \n", + "\t\t107.3343318461327 | \n", + "\t\t1629 | \n", + "\t\t107.3343318461327 | \n", + "\t\t7969.634723163489 | \n", + "\t\t1.3145840678553307 | \n", + "\t\t0.23440643306334338 | \n", + "\t\t1 | \n", + "\t
345 | \n", + "\t\t1572 | \n", + "\t\t225618.05330320456 | \n", + "\t\t76.78441855749419 | \n", + "\t\t340.6338108095452 | \n", + "\t\t143.52293467124971 | \n", + "\t\t72.66592162969769 | \n", + "\t\t106.39312461277355 | \n", + "\t\t1572 | \n", + "\t\t106.39312461277355 | \n", + "\t\t5280.336166293367 | \n", + "\t\t1.4037474965495578 | \n", + "\t\t0.4397063604318919 | \n", + "\t\t1 | \n", + "\t
326 | \n", + "\t\t855 | \n", + "\t\t178859.06443592097 | \n", + "\t\t79.44978256514692 | \n", + "\t\t388.6050241978192 | \n", + "\t\t209.19188822914734 | \n", + "\t\t104.27611140847034 | \n", + "\t\t172.13980338906399 | \n", + "\t\t855 | \n", + "\t\t172.13980338906399 | \n", + "\t\t10873.507410471719 | \n", + "\t\t0.27373796485881025 | \n", + "\t\t-1.5989820053608266 | \n", + "\t\t1 | \n", + "\t
141 | \n", + "\t\t1707 | \n", + "\t\t217554.70851152326 | \n", + "\t\t69.20055931911273 | \n", + "\t\t385.9221118724905 | \n", + "\t\t127.44856971969729 | \n", + "\t\t67.96165098216868 | \n", + "\t\t102.93497385409218 | \n", + "\t\t1707 | \n", + "\t\t102.93497385409218 | \n", + "\t\t4618.786004222108 | \n", + "\t\t2.3869857481164622 | \n", + "\t\t4.524407928377094 | \n", + "\t\t0 | \n", + "\t
570 | \n", + "\t\t637 | \n", + "\t\t108597.77124086616 | \n", + "\t\t76.02029807955392 | \n", + "\t\t352.4795596378072 | \n", + "\t\t170.48315736399712 | \n", + "\t\t86.51173633708574 | \n", + "\t\t115.06655015743611 | \n", + "\t\t637 | \n", + "\t\t115.06655015743611 | \n", + "\t\t7484.28052405744 | \n", + "\t\t0.7249096707674543 | \n", + "\t\t-1.0064015341549895 | \n", + "\t\t0 | \n", + "\t
We transform this new data by reusing the same ScaleFit object we created earlier, which gives us the scaled version of the new dataset.
" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "099b4d80-3bb8-4e96-ba57-c85c84ae990a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dd54b8613eb14b16835e3bd03d9495bf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID | anomaly | count_RESISTANCE | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | percentile_RESISTANCE | unique_RESISTANCE | median_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
326 | \n", + "\t\t1 | \n", + "\t\t-0.4454958013788012 | \n", + "\t\t0.49175516324283447 | \n", + "\t\t0.058266501337359396 | \n", + "\t\t1.1150200856051362 | \n", + "\t\t1.3539755632281287 | \n", + "\t\t1.8200619070354604 | \n", + "\t\t0.9194382247575821 | \n", + "\t\t-0.4454958013788012 | \n", + "\t\t0.9194382247575821 | \n", + "\t\t2.0817677950852636 | \n", + "\t\t-1.0994434098745625 | \n", + "\t\t-1.0566279679947848 | \n", + "\t
999 | \n", + "\t\t1 | \n", + "\t\t0.0034670964809643983 | \n", + "\t\t0.19150685538374623 | \n", + "\t\t0.007189680804109719 | \n", + "\t\t-0.572992840326859 | \n", + "\t\t-0.023939443562901425 | \n", + "\t\t0.21488164140523353 | \n", + "\t\t-0.42987602059741353 | \n", + "\t\t0.0034670964809643983 | \n", + "\t\t-0.42987602059741353 | \n", + "\t\t0.14170515277163295 | \n", + "\t\t-0.3998375077234471 | \n", + "\t\t-0.7211087815327962 | \n", + "\t
183 | \n", + "\t\t0 | \n", + "\t\t0.793525182974448 | \n", + "\t\t0.523797308954638 | \n", + "\t\t-0.47123828167733633 | \n", + "\t\t1.9938112179994238 | \n", + "\t\t-0.806783408272433 | \n", + "\t\t0.4409365865810209 | \n", + "\t\t-0.5777180923358187 | \n", + "\t\t0.793525182974448 | \n", + "\t\t-0.5777180923358187 | \n", + "\t\t0.38847470211704904 | \n", + "\t\t1.3988023115204253 | \n", + "\t\t1.5933929191283127 | \n", + "\t
530 | \n", + "\t\t0 | \n", + "\t\t-0.3492894661231371 | \n", + "\t\t-0.42271351810262053 | \n", + "\t\t0.10517001290583916 | \n", + "\t\t-0.10977661846737265 | \n", + "\t\t-0.1730205742826949 | \n", + "\t\t0.12416064757468764 | \n", + "\t\t-0.4476321194017249 | \n", + "\t\t-0.3492894661231371 | \n", + "\t\t-0.4476321194017249 | \n", + "\t\t0.04510877161800008 | \n", + "\t\t-0.017989719458924348 | \n", + "\t\t-0.2810964068413063 | \n", + "\t
387 | \n", + "\t\t1 | \n", + "\t\t1.8109800618904102 | \n", + "\t\t2.354620840583509 | \n", + "\t\t-0.5599250484658025 | \n", + "\t\t1.0028942996347474 | \n", + "\t\t-0.3594153577319633 | \n", + "\t\t0.679603160858965 | \n", + "\t\t-0.5209247163916084 | \n", + "\t\t1.8109800618904102 | \n", + "\t\t-0.5209247163916084 | \n", + "\t\t0.65842030680638 | \n", + "\t\t0.49326505850456726 | \n", + "\t\t0.22156809424649065 | \n", + "\t
6.3 Retrieve the model file and use it to predict
\n", + "We use the PMMLPredict function from the teradataml library to predict the anomalies.
\n", + "Predictive Model Markup Language (PMML) is an XML-based standard established by the Data Mining Group (DMG) for defining statistical and data-mining models. PMML models can be shared between PMML-compliant platforms and across organizations so that business analysts and developers are unified in designing, analyzing, and implementing PMML-based assets and services.
" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "16f0c6bb-3551-4337-a4e3-8c2a79fd55cc", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "29a988d766e44b49b4522e5cb63ea9aa", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID | prediction | probability(0) | probability(1) | \n", + "
---|---|---|---|
141 | \n", + "\t\t\n", + "\t\t | 0.9552904297256605 | \n", + "\t\t0.04470957027433943 | \n", + "\t
183 | \n", + "\t\t\n", + "\t\t | 0.7217187249019571 | \n", + "\t\t0.27828127509804285 | \n", + "\t
530 | \n", + "\t\t\n", + "\t\t | 0.8436407811560411 | \n", + "\t\t0.15635921884395887 | \n", + "\t
387 | \n", + "\t\t\n", + "\t\t | 0.0871778334911983 | \n", + "\t\t0.9128221665088017 | \n", + "\t
999 | \n", + "\t\t\n", + "\t\t | 0.19836118525209878 | \n", + "\t\t0.8016388147479013 | \n", + "\t
WELDING_ID | anomaly | prob_0 | prob_1 | prediction | \n", + "
---|---|---|---|---|
116 | \n", + "\t\t0 | \n", + "\t\t0.9552904297256605 | \n", + "\t\t0.04470957027433943 | \n", + "\t\t0 | \n", + "\t
581 | \n", + "\t\t0 | \n", + "\t\t0.7435115241296084 | \n", + "\t\t0.2564884758703916 | \n", + "\t\t0 | \n", + "\t
281 | \n", + "\t\t0 | \n", + "\t\t0.7522548100598302 | \n", + "\t\t0.24774518994016978 | \n", + "\t\t0 | \n", + "\t
999 | \n", + "\t\t1 | \n", + "\t\t0.19836118525209878 | \n", + "\t\t0.8016388147479013 | \n", + "\t\t1 | \n", + "\t
856 | \n", + "\t\t0 | \n", + "\t\t0.7919828100855837 | \n", + "\t\t0.20801718991441617 | \n", + "\t\t0 | \n", + "\t
7. Random Forest using Teradata OpenSource ML functions
\n", + " \n", + "We start by creating a subset, because the most interesting part lies between 40 and 400 ms from the start of the curve.
\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "cf8a84c6-2c67-43c7-86e2-1f31c6bd1c18", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dc6d18df9e254ce8882ec7453be1ab40", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "PLANT | ROBOT_ID | WELDING_TYPE | WELDING_DAY | WELDING_ID | TIME_MS | RESISTANCE | \n", + "
---|---|---|---|---|---|---|
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t833 | \n", + "\t\t184 | \n", + "\t\t225.50114877088427 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t5 | \n", + "\t\t2025-07-12 | \n", + "\t\t489 | \n", + "\t\t287 | \n", + "\t\t230.52740760760418 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t9 | \n", + "\t\t2025-06-18 | \n", + "\t\t812 | \n", + "\t\t76 | \n", + "\t\t178.16102130409436 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t2 | \n", + "\t\t2025-07-30 | \n", + "\t\t131 | \n", + "\t\t122 | \n", + "\t\t314.408712136953 | \n", + "\t
1 | \n", + "\t\t41 | \n", + "\t\t7 | \n", + "\t\t2025-06-30 | \n", + "\t\t601 | \n", + "\t\t69 | \n", + "\t\t198.1165335758247 | \n", + "\t
We first use a window function on the resistance to compute, for each welding, the difference between the current and the previous resistance value ordered by time. We then create the features by applying aggregation functions to the resistance and to this resistance difference.
\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "a227337c-3b57-443c-a256-dd5230ed98dd", + "metadata": {}, + "outputs": [], + "source": [ + "DF_curves_zoom = DF_curves_zoom.assign(\n", + " resistance_diff = DF_curves_zoom.RESISTANCE \n", + " - DF_curves_zoom.RESISTANCE.window(\n", + " partition_columns=['WELDING_ID'],\n", + " order_columns=[\"TIME_MS\"]\n", + " ).lag(1)\n", + ")\n", + "# DF_curves_zoom[DF_curves_zoom.WELDING_ID==138].sort(\"TIME_MS\")" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "cb8c00e7-c465-46ba-99ae-c094969a2eed", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2e2da15e86ab4b1385219f642e37e5dc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | \n", + "
---|---|---|---|---|---|---|---|---|---|
620 | \n", + "\t\t88342.64044827396 | \n", + "\t\t137.30900299564482 | \n", + "\t\t314.06522413849495 | \n", + "\t\t246.07977840744834 | \n", + "\t\t45.09106156197229 | \n", + "\t\t2033.203832785575 | \n", + "\t\t-0.5593553778837088 | \n", + "\t\t-0.6737088126622688 | \n", + "\t\t-29.856946521200086 | \n", + "\t
72 | \n", + "\t\t82132.21433826616 | \n", + "\t\t104.70467014938446 | \n", + "\t\t312.7132264322771 | \n", + "\t\t228.78054133221772 | \n", + "\t\t57.099248554428925 | \n", + "\t\t3260.324185480454 | \n", + "\t\t-0.4527050989025892 | \n", + "\t\t-0.9189959880168315 | \n", + "\t\t-27.171428796617818 | \n", + "\t
759 | \n", + "\t\t85850.3948821828 | \n", + "\t\t95.93586249651608 | \n", + "\t\t349.2521127143311 | \n", + "\t\t239.13759020106627 | \n", + "\t\t71.37967029255474 | \n", + "\t\t5095.057331073822 | \n", + "\t\t-0.40798080630716765 | \n", + "\t\t-1.042277705763781 | \n", + "\t\t-34.2552297092069 | \n", + "\t
711 | \n", + "\t\t77910.29397222718 | \n", + "\t\t99.59721397994052 | \n", + "\t\t308.17260229102055 | \n", + "\t\t217.0203174713849 | \n", + "\t\t58.580333021597795 | \n", + "\t\t3431.6554169213014 | \n", + "\t\t-0.28157863829571916 | \n", + "\t\t-1.0842604382366274 | \n", + "\t\t-25.294217256768533 | \n", + "\t
735 | \n", + "\t\t86983.36964263133 | \n", + "\t\t105.16013699875654 | \n", + "\t\t328.7554675173849 | \n", + "\t\t242.2935087538477 | \n", + "\t\t56.5150805707015 | \n", + "\t\t3193.954331912882 | \n", + "\t\t-0.5469935212726376 | \n", + "\t\t-0.7309877203258219 | \n", + "\t\t-31.729098532553564 | \n", + "\t
7.1 Build the analytical dataset.
\n", + "We create the analytical dataset by joining the anomaly table created above with the dataset containing the features we created.
" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "55686241-b413-45eb-a495-9888c946c634", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DF_target = DataFrame('Anomaly_Target')" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "d4f6e7dc-7a1e-447f-918c-fe675f5d597f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "DF_ADS = DF_features[['WELDING_ID']+feature_names].join(\n", + " other=DF_target, how='inner', on='WELDING_ID=WELDING_ID',rsuffix='r',lsuffix='l')" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "41bca2b7-9260-46f2-afed-f0d611fd232a", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5f9efcfa58094da8a6ed2b379fdabc9a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID_l | WELDING_ID_r | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | anomaly | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
759 | \n", + "\t\t759 | \n", + "\t\t85850.3948821828 | \n", + "\t\t95.93586249651608 | \n", + "\t\t349.2521127143311 | \n", + "\t\t239.13759020106627 | \n", + "\t\t71.37967029255474 | \n", + "\t\t5095.057331073822 | \n", + "\t\t-0.40798080630716765 | \n", + "\t\t-1.042277705763781 | \n", + "\t\t-34.2552297092069 | \n", + "\t\t0 | \n", + "\t
521 | \n", + "\t\t521 | \n", + "\t\t86311.16209557129 | \n", + "\t\t83.63565446899322 | \n", + "\t\t371.03290669363076 | \n", + "\t\t240.42106433306765 | \n", + "\t\t86.59469017742082 | \n", + "\t\t7498.6403669235015 | \n", + "\t\t-0.22559943677069424 | \n", + "\t\t-1.2315781754798603 | \n", + "\t\t-48.000249911263666 | \n", + "\t\t0 | \n", + "\t
123 | \n", + "\t\t123 | \n", + "\t\t83759.94919396425 | \n", + "\t\t82.6451402512838 | \n", + "\t\t368.3012871516082 | \n", + "\t\t233.3146217102068 | \n", + "\t\t82.76885574461353 | \n", + "\t\t6850.683481272646 | \n", + "\t\t-0.2867184557621234 | \n", + "\t\t-1.3075810926072349 | \n", + "\t\t-39.541343850154476 | \n", + "\t\t0 | \n", + "\t
342 | \n", + "\t\t342 | \n", + "\t\t106500.14073297645 | \n", + "\t\t178.45785277776122 | \n", + "\t\t367.5445073395069 | \n", + "\t\t296.65777362946085 | \n", + "\t\t45.55150288848985 | \n", + "\t\t2074.9394154000993 | \n", + "\t\t-0.41900879613466196 | \n", + "\t\t-0.8516234489970328 | \n", + "\t\t-31.266292156182544 | \n", + "\t\t1 | \n", + "\t
144 | \n", + "\t\t144 | \n", + "\t\t83121.01622703334 | \n", + "\t\t78.52177145197345 | \n", + "\t\t374.24517113960087 | \n", + "\t\t231.53486414215413 | \n", + "\t\t95.91404929726042 | \n", + "\t\t9199.504852597303 | \n", + "\t\t-0.17056032020583595 | \n", + "\t\t-1.5247519859543917 | \n", + "\t\t-35.818755185093664 | \n", + "\t\t0 | \n", + "\t
WELDING_ID | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | anomaly | anomaly_int | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
168 | \n", + "\t\t81231.31993153358 | \n", + "\t\t78.20584184189954 | \n", + "\t\t383.797723458992 | \n", + "\t\t226.27108616026067 | \n", + "\t\t92.61259834947403 | \n", + "\t\t8577.093373041 | \n", + "\t\t0.05391970417306466 | \n", + "\t\t-1.3925390208869324 | \n", + "\t\t-36.405558520862314 | \n", + "\t\t0 | \n", + "\t\t0 | \n", + "\t
521 | \n", + "\t\t86311.16209557129 | \n", + "\t\t83.63565446899322 | \n", + "\t\t371.03290669363076 | \n", + "\t\t240.42106433306765 | \n", + "\t\t86.59469017742082 | \n", + "\t\t7498.6403669235015 | \n", + "\t\t-0.22559943677069424 | \n", + "\t\t-1.2315781754798603 | \n", + "\t\t-48.000249911263666 | \n", + "\t\t0 | \n", + "\t\t0 | \n", + "\t
342 | \n", + "\t\t106500.14073297645 | \n", + "\t\t178.45785277776122 | \n", + "\t\t367.5445073395069 | \n", + "\t\t296.65777362946085 | \n", + "\t\t45.55150288848985 | \n", + "\t\t2074.9394154000993 | \n", + "\t\t-0.41900879613466196 | \n", + "\t\t-0.8516234489970328 | \n", + "\t\t-31.266292156182544 | \n", + "\t\t1 | \n", + "\t\t1 | \n", + "\t
144 | \n", + "\t\t83121.01622703334 | \n", + "\t\t78.52177145197345 | \n", + "\t\t374.24517113960087 | \n", + "\t\t231.53486414215413 | \n", + "\t\t95.91404929726042 | \n", + "\t\t9199.504852597303 | \n", + "\t\t-0.17056032020583595 | \n", + "\t\t-1.5247519859543917 | \n", + "\t\t-35.818755185093664 | \n", + "\t\t0 | \n", + "\t\t0 | \n", + "\t
123 | \n", + "\t\t83759.94919396425 | \n", + "\t\t82.6451402512838 | \n", + "\t\t368.3012871516082 | \n", + "\t\t233.3146217102068 | \n", + "\t\t82.76885574461353 | \n", + "\t\t6850.683481272646 | \n", + "\t\t-0.2867184557621234 | \n", + "\t\t-1.3075810926072349 | \n", + "\t\t-39.541343850154476 | \n", + "\t\t0 | \n", + "\t\t0 | \n", + "\t
TD_IsTrainRow | WELDING_ID | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | anomaly_int | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
1 | \n", + "\t\t10 | \n", + "\t\t95127.1795645449 | \n", + "\t\t144.88744123637863 | \n", + "\t\t356.36420112865824 | \n", + "\t\t264.97821605722817 | \n", + "\t\t54.9337378770633 | \n", + "\t\t3017.7155571458993 | \n", + "\t\t-0.2307254513118324 | \n", + "\t\t-1.1858367926106634 | \n", + "\t\t-32.0320370169095 | \n", + "\t\t1 | \n", + "\t
1 | \n", + "\t\t8 | \n", + "\t\t100195.44141325781 | \n", + "\t\t153.1729060302975 | \n", + "\t\t369.01676378573256 | \n", + "\t\t279.0959370842836 | \n", + "\t\t53.34160790042546 | \n", + "\t\t2845.3271334027318 | \n", + "\t\t-0.4776006642407464 | \n", + "\t\t-0.7252063427887376 | \n", + "\t\t-30.30756652342805 | \n", + "\t\t1 | \n", + "\t
1 | \n", + "\t\t35 | \n", + "\t\t101079.20526104877 | \n", + "\t\t187.26783471658626 | \n", + "\t\t358.73415881062687 | \n", + "\t\t281.55767482186286 | \n", + "\t\t49.84128428232629 | \n", + "\t\t2484.153618911666 | \n", + "\t\t-0.2591425375150598 | \n", + "\t\t-1.3078073722006127 | \n", + "\t\t-45.314281342120864 | \n", + "\t\t1 | \n", + "\t
1 | \n", + "\t\t9 | \n", + "\t\t82501.00500514822 | \n", + "\t\t91.14168264526084 | \n", + "\t\t365.18561530292624 | \n", + "\t\t229.80781338481398 | \n", + "\t\t89.14456616163214 | \n", + "\t\t7946.7536761456095 | \n", + "\t\t-0.18283169109922082 | \n", + "\t\t-1.6351004381704022 | \n", + "\t\t-33.84552053148343 | \n", + "\t\t0 | \n", + "\t
1 | \n", + "\t\t21 | \n", + "\t\t87651.6631041502 | \n", + "\t\t100.06115552653354 | \n", + "\t\t370.50674200883566 | \n", + "\t\t244.15505042938773 | \n", + "\t\t77.73139844534938 | \n", + "\t\t6042.170304269665 | \n", + "\t\t-0.10823631050168349 | \n", + "\t\t-1.2142935096694525 | \n", + "\t\t-35.92449265501136 | \n", + "\t\t0 | \n", + "\t
7.2 Train RandomForest Classifier
\n", + "Train dataset is created using sampleid = 1.
" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "5eef1a68-6211-4b4a-a870-083f6aff1633", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bead7343638f493fa1e329c078fee65f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | anomaly_int | \n", + "
---|---|---|---|---|---|---|---|---|---|---|
9 | \n", + "\t\t82501.00500514822 | \n", + "\t\t91.14168264526084 | \n", + "\t\t365.18561530292624 | \n", + "\t\t229.80781338481398 | \n", + "\t\t89.14456616163214 | \n", + "\t\t7946.7536761456095 | \n", + "\t\t-0.18283169109922082 | \n", + "\t\t-1.6351004381704022 | \n", + "\t\t-33.84552053148343 | \n", + "\t\t0 | \n", + "\t
12 | \n", + "\t\t106817.32190479447 | \n", + "\t\t191.7494296610409 | \n", + "\t\t385.8402275855924 | \n", + "\t\t297.5412866428815 | \n", + "\t\t52.55922597785767 | \n", + "\t\t2762.4722353915085 | \n", + "\t\t-0.1999867594892224 | \n", + "\t\t-1.3022494693454283 | \n", + "\t\t-31.826116223990255 | \n", + "\t\t1 | \n", + "\t
6 | \n", + "\t\t105201.78708839459 | \n", + "\t\t199.73777915742855 | \n", + "\t\t369.66316282209505 | \n", + "\t\t293.0411896612663 | \n", + "\t\t48.937854771632054 | \n", + "\t\t2394.91362964935 | \n", + "\t\t-0.2972221842013988 | \n", + "\t\t-1.2708881859616288 | \n", + "\t\t-44.07554059750447 | \n", + "\t\t1 | \n", + "\t
10 | \n", + "\t\t95127.1795645449 | \n", + "\t\t144.88744123637863 | \n", + "\t\t356.36420112865824 | \n", + "\t\t264.97821605722817 | \n", + "\t\t54.9337378770633 | \n", + "\t\t3017.7155571458993 | \n", + "\t\t-0.2307254513118324 | \n", + "\t\t-1.1858367926106634 | \n", + "\t\t-32.0320370169095 | \n", + "\t\t1 | \n", + "\t
3 | \n", + "\t\t76967.210140556 | \n", + "\t\t104.14666690623334 | \n", + "\t\t287.7835576436938 | \n", + "\t\t214.39334300990527 | \n", + "\t\t51.09115870026167 | \n", + "\t\t2610.306497335324 | \n", + "\t\t-0.4833111981907075 | \n", + "\t\t-0.8441517264101307 | \n", + "\t\t-22.450629480628777 | \n", + "\t\t0 | \n", + "\t
Test dataset is created using sampleid = 2.
" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "be644b0e-dcfd-40ff-ba0a-b5800cfd0875", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "492a163cd1f34097ab58c864fbea4d6c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | anomaly_int | \n", + "
---|---|---|---|---|---|---|---|---|---|---|
23 | \n", + "\t\t80968.38566460606 | \n", + "\t\t91.32685504056975 | \n", + "\t\t362.0288544976556 | \n", + "\t\t225.5386787314932 | \n", + "\t\t76.16092244254854 | \n", + "\t\t5800.486107299894 | \n", + "\t\t-0.06229470076093798 | \n", + "\t\t-1.2994883640937822 | \n", + "\t\t-38.24098140508306 | \n", + "\t\t0 | \n", + "\t
26 | \n", + "\t\t103807.76297225684 | \n", + "\t\t152.857497527659 | \n", + "\t\t383.578384675018 | \n", + "\t\t289.15811412884915 | \n", + "\t\t57.608322823675024 | \n", + "\t\t3318.7188585567565 | \n", + "\t\t-0.21002500953619418 | \n", + "\t\t-1.3168188493421034 | \n", + "\t\t-42.325211030065475 | \n", + "\t\t1 | \n", + "\t
69 | \n", + "\t\t97411.15857106655 | \n", + "\t\t168.72998702644077 | \n", + "\t\t362.43335495393376 | \n", + "\t\t271.340274571216 | \n", + "\t\t55.08092368489766 | \n", + "\t\t3033.9081539815206 | \n", + "\t\t-0.19776981850595154 | \n", + "\t\t-1.3735577645369346 | \n", + "\t\t-33.54196500414861 | \n", + "\t\t1 | \n", + "\t
82 | \n", + "\t\t94048.76279519273 | \n", + "\t\t114.03951750210967 | \n", + "\t\t366.35161875486074 | \n", + "\t\t261.97426962449225 | \n", + "\t\t68.10084349437736 | \n", + "\t\t4637.72488464568 | \n", + "\t\t-0.4827546065748303 | \n", + "\t\t-0.8556540622845826 | \n", + "\t\t-49.74532255752476 | \n", + "\t\t0 | \n", + "\t
33 | \n", + "\t\t93185.74584100883 | \n", + "\t\t124.65019295111625 | \n", + "\t\t351.7031928317579 | \n", + "\t\t259.5703226769048 | \n", + "\t\t59.354738112601765 | \n", + "\t\t3522.9849364155407 | \n", + "\t\t-0.3229589598739873 | \n", + "\t\t-1.046640890827172 | \n", + "\t\t-40.22434101551627 | \n", + "\t\t1 | \n", + "\t
Copy the Train and Test datasets into Vantage.
" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "130cdab5-841a-4e65-b4f3-090414df65c3", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(data_train, table_name='data_train', if_exists='replace')\n", + "copy_to_sql(data_val, table_name='data_val', if_exists='replace')" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "f717da10-a24f-4345-a36e-8554fefcdc2e", + "metadata": {}, + "outputs": [], + "source": [ + "data_train=DataFrame('data_train')\n", + "data_val= DataFrame('data_val')" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "5217c805-010b-4184-b312-b22c7f0b1d49", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import td_sklearn as osml\n", + "X_train = data_train.drop(['anomaly_int','WELDING_ID'], axis = 1)\n", + "y_train = data_train.select([\"anomaly_int\"])\n", + "X_test = data_val.drop(['anomaly_int','WELDING_ID'], axis = 1)\n", + "y_test = data_val.select([\"anomaly_int\"])" + ] + }, + { + "cell_type": "markdown", + "id": "139fef94-8a5b-4cfc-964a-14c1498d8ba5", + "metadata": {}, + "source": [ + "Set the session to use the Analytic compute group and cluster to execute the OpenSourceML function.
" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "72c6f350-9dd5-4e61-af02-b368b8014414", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compute group set to GPUGroup\n" + ] + } + ], + "source": [ + "gpu_compute_group = env_vars.get(\"gpu_compute_group\")\n", + "execute_sql(f\"SET SESSION COMPUTE GROUP {gpu_compute_group};\")\n", + "print(f\"Compute group set to {gpu_compute_group}\")" + ] + }, + { + "cell_type": "markdown", + "id": "30978fe8-6c08-47b8-8b79-3157a74151e7", + "metadata": {}, + "source": [ + "Check the user environments and create an environment for the usecase.
" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "id": "bb089556-3f28-4ad8-8de7-1506cf7a4412", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No user environment(s) found.\n" + ] + } + ], + "source": [ + "list_user_envs()" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "8dcea3eb-0d96-45e4-b525-76b59dba9b98", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User environment 'osml_env' created.\n" + ] + }, + { + "data": { + "text/plain": [ + "\n", + "================================================\n", + "Environment Name: osml_env\n", + "Base Environment: python_3.9\n", + "Description: OAF Demo env for Anomaly OSML\n", + "\n", + "############ Libraries installed in User Environment ############\n", + "\n", + " name version\n", + "0 pip 25.0.1\n", + "1 setuptools 78.1.0\n", + "\n", + "================================================" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "try:\n", + " env = create_env(\n", + " env_name=\"osml_env\",\n", + " base_env=\"python_3.9\",\n", + " desc=\"OAF Demo env for Anomaly OSML\",\n", + " )\n", + "except:\n", + " remove_env(\"osml_env\")\n", + " env = create_env(\n", + " env_name=\"osml_env\",\n", + " base_env=\"python_3.9\",\n", + " desc=\"OAF Demo env for Anomaly OSML\",\n", + " )\n", + " \n", + "env " + ] + }, + { + "cell_type": "markdown", + "id": "c51c6334-d126-4668-94a3-e8e5f9c76b13", + "metadata": {}, + "source": [ + "Confirm that the versions in the local environment are same to the virtual environment.
" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "f529092a-43da-4a42-9eb1-9beceec29792", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "scikit-learn 1.1.3\n", + "scipy 1.11.2\n", + "numpy 1.24.2\n", + "geopandas 0.12.2\n", + "pandas 2.1.3\n", + "sklearn-pandas 2.2.0\n" + ] + } + ], + "source": [ + "!pip list | grep scikit-learn\n", + "!pip list | grep scipy\n", + "!pip list | grep numpy\n", + "!pip list | grep pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "id": "7a82e027-864c-4590-ba76-dc493311fdf4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Request to install libraries initiated successfully in the remote user environment osml_env. Check the status using status() with the claim id '68add8e1-7c66-406d-90f9-1ede4695e870'.\n" + ] + } + ], + "source": [ + "claim_id = env.install_lib([\"pandas==2.1.3\",\n", + " \"scipy==1.11.2\",\n", + " \"scikit-learn==1.1.3\",\n", + " \"numpy==1.24.2\"], asynchronous=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "id": "606ca6eb-845c-4fea-8bd6-ec2e9094b43f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | Claim Id | \n", + "File/Libs/Model | \n", + "Method Name | \n", + "Stage | \n", + "Timestamp | \n", + "Additional Details | \n", + "
---|---|---|---|---|---|---|
0 | \n", + "68add8e1-7c66-406d-90f9-1ede4695e870 | \n", + "pandas==2.1.3, scipy==1.11.2, scikit-learn==1.... | \n", + "install_lib | \n", + "Started | \n", + "2025-08-11T13:43:37Z | \n", + "\n", + " |
1 | \n", + "68add8e1-7c66-406d-90f9-1ede4695e870 | \n", + "pandas==2.1.3, scipy==1.11.2, scikit-learn==1.... | \n", + "install_lib | \n", + "Finished | \n", + "2025-08-11T13:47:04Z | \n", + "\n", + " |
\n", + " | name | \n", + "version | \n", + "
---|---|---|
0 | \n", + "joblib | \n", + "1.5.1 | \n", + "
1 | \n", + "numpy | \n", + "1.24.2 | \n", + "
2 | \n", + "pandas | \n", + "2.1.3 | \n", + "
3 | \n", + "pip | \n", + "25.0.1 | \n", + "
4 | \n", + "python-dateutil | \n", + "2.9.0.post0 | \n", + "
5 | \n", + "pytz | \n", + "2025.2 | \n", + "
6 | \n", + "scikit-learn | \n", + "1.1.3 | \n", + "
7 | \n", + "scipy | \n", + "1.11.2 | \n", + "
8 | \n", + "setuptools | \n", + "78.1.0 | \n", + "
9 | \n", + "six | \n", + "1.17.0 | \n", + "
10 | \n", + "threadpoolctl | \n", + "3.6.0 | \n", + "
11 | \n", + "tzdata | \n", + "2025.2 | \n", + "
Set the user environment to the created virtual environment and execute the RandomForestClassifier.
" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "e775073f-81b6-47bd-8696-efee242e3baf", + "metadata": {}, + "outputs": [], + "source": [ + "configure.openml_user_env = env" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "95aa7d9d-dadd-4267-9f13-2526d2d6989b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "RandomForestClassifier(max_depth=2, max_features='auto', max_leaf_nodes=2,\n", + " n_estimators=10)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=2, max_features='auto', max_leaf_nodes=2,\n", + " n_estimators=10)
Check the params for the Classifier.
" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "ef508629-f7e7-4210-9f8d-9d2f21530a85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'bootstrap': True,\n", + " 'ccp_alpha': 0.0,\n", + " 'class_weight': None,\n", + " 'criterion': 'gini',\n", + " 'max_depth': 2,\n", + " 'max_features': 'auto',\n", + " 'max_leaf_nodes': 2,\n", + " 'max_samples': None,\n", + " 'min_impurity_decrease': 0.0,\n", + " 'min_samples_leaf': 1,\n", + " 'min_samples_split': 2,\n", + " 'min_weight_fraction_leaf': 0.0,\n", + " 'n_estimators': 10,\n", + " 'n_jobs': None,\n", + " 'oob_score': False,\n", + " 'random_state': None,\n", + " 'verbose': 0,\n", + " 'warm_start': False}" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "RF_classifier.get_params()" + ] + }, + { + "cell_type": "markdown", + "id": "aca1ef54-8f11-48af-9d9f-ffe19a08b050", + "metadata": {}, + "source": [ + "7.3 Predict and Evaluate model
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "12ca4daf-6b7f-453b-b690-3ca59df0fb6b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7dba28565d8d45aea0fea458c4379def", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | anomaly_int | randomforestclassifier_predict_1 | \n", + "
---|---|---|---|---|---|---|---|---|---|---|
110341.49664250232 | \n", + "\t\t205.03522927809342 | \n", + "\t\t380.74742538916405 | \n", + "\t\t307.35792936630173 | \n", + "\t\t45.5209819478354 | \n", + "\t\t2072.159797495157 | \n", + "\t\t-0.3214105274578798 | \n", + "\t\t-1.1819447978660633 | \n", + "\t\t-49.25786591005914 | \n", + "\t\t1 | \n", + "\t\t1 | \n", + "\t
96649.6900333734 | \n", + "\t\t122.17637448667617 | \n", + "\t\t366.32142645670035 | \n", + "\t\t269.2191922935192 | \n", + "\t\t64.87601147648358 | \n", + "\t\t4208.896865096829 | \n", + "\t\t-0.5753868635026909 | \n", + "\t\t-0.6683953472392237 | \n", + "\t\t-41.076280458269196 | \n", + "\t\t0 | \n", + "\t\t0 | \n", + "\t
84099.21496523272 | \n", + "\t\t118.69683849065697 | \n", + "\t\t355.3352831442057 | \n", + "\t\t234.25965171374017 | \n", + "\t\t70.96890818596847 | \n", + "\t\t5036.585929108423 | \n", + "\t\t0.15449954336904378 | \n", + "\t\t-1.383727210170674 | \n", + "\t\t-31.554292692684527 | \n", + "\t\t0 | \n", + "\t\t0 | \n", + "\t
107360.61084072788 | \n", + "\t\t201.71277038171522 | \n", + "\t\t377.2026817832662 | \n", + "\t\t299.05462629729215 | \n", + "\t\t45.158285010493685 | \n", + "\t\t2039.2707050889785 | \n", + "\t\t-0.46398574196047354 | \n", + "\t\t-1.0083162755064483 | \n", + "\t\t-34.04093502152949 | \n", + "\t\t1 | \n", + "\t\t1 | \n", + "\t
119047.29002453877 | \n", + "\t\t207.8252169086635 | \n", + "\t\t410.920900680535 | \n", + "\t\t331.6080502076289 | \n", + "\t\t49.55794200324507 | \n", + "\t\t2455.989615597002 | \n", + "\t\t-0.6231397861594759 | \n", + "\t\t-0.8036020482778766 | \n", + "\t\t-33.43931357109807 | \n", + "\t\t1 | \n", + "\t\t1 | \n", + "\t
score | \n", + "
---|
0.9746835443037974 | \n", + "\t
8. Compare PMML and OpenSource ML model
\n", + "8.1 Show AUC-ROC Curve
\n", + "\n", + "The ROC curve shows the performance of a binary classification model as its discrimination threshold varies. For a range of thresholds, the curve plots the true positive rate against the false positive rate.
\n", + "\n", + "This function accepts a set of prediction-actual pairs as input and calculates the following values for a range of discrimination thresholds.
\n", + "ROC for PMML
" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "7c4b179b-a334-4dc0-b3f8-71c35f87283e", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ROC \n", + "roc_pmml = ROC(data = pmml_predict_result, \n", + " probability_column = \"prob_1\",\n", + " observation_column = \"anomaly\",\n", + " positive_class=\"1\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "32b946fb-e09e-4e62-b78a-c5325d84c175", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | threshold_value | \n", + "tpr | \n", + "fpr | \n", + "
---|---|---|---|
9 | \n", + "0.183673 | \n", + "1.0 | \n", + "0.366795 | \n", + "
8 | \n", + "0.163265 | \n", + "1.0 | \n", + "0.447876 | \n", + "
7 | \n", + "0.142857 | \n", + "1.0 | \n", + "0.490347 | \n", + "
6 | \n", + "0.122449 | \n", + "1.0 | \n", + "0.548263 | \n", + "
5 | \n", + "0.102041 | \n", + "1.0 | \n", + "0.594595 | \n", + "
4 | \n", + "0.081633 | \n", + "1.0 | \n", + "0.687259 | \n", + "
3 | \n", + "0.061224 | \n", + "1.0 | \n", + "0.698842 | \n", + "
2 | \n", + "0.040816 | \n", + "1.0 | \n", + "0.949807 | \n", + "
1 | \n", + "0.020408 | \n", + "1.0 | \n", + "1.000000 | \n", + "
0 | \n", + "0.000000 | \n", + "1.0 | \n", + "1.000000 | \n", + "
ROC for tdmlOpenSource RandomForestClassifier
" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "02a1c9e2-be8c-44da-9e0a-9056a2ec8243", + "metadata": {}, + "outputs": [], + "source": [ + "roc_obj = ROC(data = predict_RF, \n", + " probability_column = \"randomforestclassifier_predict_1\",\n", + " observation_column = \"anomaly_int\",\n", + " positive_class=\"1\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "27834036-13cc-49e9-a34e-b2bcb2c192b1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | threshold_value | \n", + "tpr | \n", + "fpr | \n", + "
---|---|---|---|
15 | \n", + "0.306122 | \n", + "0.975 | \n", + "0.025641 | \n", + "
16 | \n", + "0.326531 | \n", + "0.975 | \n", + "0.025641 | \n", + "
17 | \n", + "0.346939 | \n", + "0.975 | \n", + "0.025641 | \n", + "
18 | \n", + "0.367347 | \n", + "0.975 | \n", + "0.025641 | \n", + "
19 | \n", + "0.387755 | \n", + "0.975 | \n", + "0.025641 | \n", + "
20 | \n", + "0.408163 | \n", + "0.975 | \n", + "0.025641 | \n", + "
21 | \n", + "0.428571 | \n", + "0.975 | \n", + "0.025641 | \n", + "
22 | \n", + "0.448980 | \n", + "0.975 | \n", + "0.025641 | \n", + "
12 | \n", + "0.244898 | \n", + "0.975 | \n", + "0.025641 | \n", + "
0 | \n", + "0.000000 | \n", + "1.000 | \n", + "1.000000 | \n", + "
Plot ROC Curves
" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "id": "93ab97d1-cbd3-4044-8546-0f170a5ca9ce", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "The closer the ROC curve is to the upper left corner of the graph, the higher the accuracy of the test because in the upper left corner, the sensitivity = 1 and the false positive rate = 0 (specificity = 1). The ideal ROC curve thus has an AUC = 1.0. As seen in the above graph the AUC for both the models is close to 1 so the accuracy of both models is very good.
\n", + "\n", + "8.2 Show Confusion Matrix
\n", + "\n", + "A confusion matrix is a performance measurement for a machine learning classification problem where the output can be two or more classes. For a binary problem, it is a table with 4 different combinations of predicted and actual values.
\n", + "\n", + "Confusion matrices represent counts from predicted and actual values. The output “TN” stands for True Negative which shows the number of negative examples classified accurately. Similarly, “TP” stands for True Positive which indicates the number of positive examples classified accurately. The term “FP” shows False Positive value, i.e., the number of actual negative examples classified as positive; and “FN” means a False Negative value which is the number of actual positive examples classified as negative.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "0cac3275-2854-464a-b240-03e7b836b96d", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "The confusion matrix for this binary class classification problem has the below 4 quadrants:
\n", + "\n", + "Conclusion
\n", + "We have seen an end-to-end exploration process for labelling anomalous time series using ClearScape Analytics on Teradata Vantage. Thanks to the in-database capabilities offered by Teradata Vantage with ClearScape Analytics, we were able to run this exploration with the smallest notebook instance. The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.
\n", + "In this particular use case, we have observed that, with a large volume of machine sensor data, millions of ML models were created to derive analytic features, which ultimately led to the deployment of tens of thousands of models for real-time scoring. This scale is only possible by combining the power of Vantage with native ClearScape Analytics functions.
" + ] + }, + { + "cell_type": "markdown", + "id": "4cb4409a-847a-4501-95bb-8268958315ec", + "metadata": {}, + "source": [ + "9. Model Explainability
\n", + "Trusted AI
\n", + "\n", + "Trusted AI is important for the in-database functions and data pipelines used in predictive AI/ML, providing significant benefits when applied. One way to enhance the benefits: Teradata VantageCloud, the only platform to offer the massively parallel processing (MPP) architecture that enables best-in-class vertical and horizontal scaling of models.
\n", + "\n", + "LIME stands for Local Interpretable Model-agnostic Explanations. LIME focuses on training local surrogate models to explain individual predictions. Local surrogate models are interpretable models that are used to explain individual predictions of black box machine learning models. Surrogate models are trained to approximate the predictions of the underlying black box model. Instead of training a global surrogate model, LIME focuses on training local surrogate models.
\n", + "\n", + "In practice, LIME only optimizes the loss part. The user has to determine the complexity, e.g. by selecting the maximum number of features that the linear regression model may use.
\n", + "\n", + "So, the recipe for training local surrogate models is as follows:
\n", + "\n", + "Here we will use the model which is created using the teradataml open-source ML functions to create the explainer and explain the model parameters. LIME has an attribute lime_tabular that can interpret how the features correlate to the target outcome. We can also set the mode to classification, training_label to the target outcome (Anomaly), and the features to those that we selected in the training process.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "908bc562-13e9-4af4-893f-bf0097d22cc9", + "metadata": {}, + "outputs": [], + "source": [ + "import lime.lime_tabular\n", + "explainer = lime.lime_tabular.LimeTabularExplainer(X_train.get_values(), feature_names=X_train.columns, \n", + " class_names=['Anomaly','NoAnomaly'], verbose=True, mode='classification')" + ] + }, + { + "cell_type": "markdown", + "id": "dcfdd23a-a708-4954-9499-16da43b8c2ae", + "metadata": {}, + "source": [ + "We will choose 1 instance of the data and use it to explain the predictions.
\n", + "Note: Please replace the WELDING_ID with the ID for which we need an explanation.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "56dc3b86-06ba-4599-838b-37e2c5b193fa", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "13cf6ec9c9da4a76af1198f2f374eb6f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "WELDING_ID | sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | anomaly_int | \n", + "
---|---|---|---|---|---|---|---|---|---|---|
31 | \n", + "\t\t86754.86978245218 | \n", + "\t\t118.8968714267842 | \n", + "\t\t362.48279195869617 | \n", + "\t\t241.65701889262445 | \n", + "\t\t69.87842670568841 | \n", + "\t\t4882.994518862268 | \n", + "\t\t0.08507576069901807 | \n", + "\t\t-1.42337112306267 | \n", + "\t\t-35.30970378099613 | \n", + "\t\t0 | \n", + "\t
49 | \n", + "\t\t91432.81224184402 | \n", + "\t\t88.86290355992934 | \n", + "\t\t363.8854353719825 | \n", + "\t\t254.68749928090256 | \n", + "\t\t70.82363815606475 | \n", + "\t\t5015.98772166119 | \n", + "\t\t-0.407115172919324 | \n", + "\t\t-0.9634199725189564 | \n", + "\t\t-35.26534136945912 | \n", + "\t\t0 | \n", + "\t
66 | \n", + "\t\t103051.29313790884 | \n", + "\t\t135.40482535648476 | \n", + "\t\t380.15990390532056 | \n", + "\t\t287.05095581590206 | \n", + "\t\t62.96273728425751 | \n", + "\t\t3964.3062863264304 | \n", + "\t\t-0.6235690534090123 | \n", + "\t\t-0.5482607552102269 | \n", + "\t\t-40.70364856459071 | \n", + "\t\t1 | \n", + "\t
69 | \n", + "\t\t97411.15857106655 | \n", + "\t\t168.72998702644077 | \n", + "\t\t362.43335495393376 | \n", + "\t\t271.340274571216 | \n", + "\t\t55.08092368489766 | \n", + "\t\t3033.9081539815206 | \n", + "\t\t-0.19776981850595154 | \n", + "\t\t-1.3735577645369346 | \n", + "\t\t-33.54196500414861 | \n", + "\t\t1 | \n", + "\t
134 | \n", + "\t\t94420.8995976158 | \n", + "\t\t94.11009786670994 | \n", + "\t\t375.05861641361037 | \n", + "\t\t263.01086238890196 | \n", + "\t\t80.60147862863109 | \n", + "\t\t6496.598357121674 | \n", + "\t\t-0.4659654036466217 | \n", + "\t\t-1.005563091732258 | \n", + "\t\t-33.038380967269234 | \n", + "\t\t0 | \n", + "\t
159 | \n", + "\t\t82057.37975225871 | \n", + "\t\t90.58182588355331 | \n", + "\t\t336.17284608259644 | \n", + "\t\t228.5720884464031 | \n", + "\t\t76.26957280821834 | \n", + "\t\t5817.0477363481195 | \n", + "\t\t-0.30761862548752666 | \n", + "\t\t-1.254185147630323 | \n", + "\t\t-30.42841019493534 | \n", + "\t\t0 | \n", + "\t
161 | \n", + "\t\t84858.52334969501 | \n", + "\t\t78.97502599970588 | \n", + "\t\t398.92248591342815 | \n", + "\t\t236.37471685151814 | \n", + "\t\t98.29532213355554 | \n", + "\t\t9661.970353339455 | \n", + "\t\t-0.047871194645529774 | \n", + "\t\t-1.3935480197759396 | \n", + "\t\t-42.87126718108749 | \n", + "\t\t0 | \n", + "\t
181 | \n", + "\t\t77288.85013686282 | \n", + "\t\t86.43693248109098 | \n", + "\t\t347.45875485406475 | \n", + "\t\t215.2892761472502 | \n", + "\t\t82.09548498215857 | \n", + "\t\t6739.668654455822 | \n", + "\t\t-0.00877229297708187 | \n", + "\t\t-1.3780222101024648 | \n", + "\t\t-32.23392513119438 | \n", + "\t\t0 | \n", + "\t
183 | \n", + "\t\t84737.54845905438 | \n", + "\t\t69.41522063694657 | \n", + "\t\t414.25927750342277 | \n", + "\t\t236.0377394402629 | \n", + "\t\t111.3020148836517 | \n", + "\t\t12388.138517160622 | \n", + "\t\t0.04790406345700506 | \n", + "\t\t-1.5726990344048148 | \n", + "\t\t-41.171495496964624 | \n", + "\t\t0 | \n", + "\t
191 | \n", + "\t\t70922.03807495922 | \n", + "\t\t85.22354420250932 | \n", + "\t\t324.8828641407507 | \n", + "\t\t197.55442360712874 | \n", + "\t\t73.60226748752646 | \n", + "\t\t5417.293779305394 | \n", + "\t\t0.0890401373922395 | \n", + "\t\t-1.390612017613927 | \n", + "\t\t-28.39684234679362 | \n", + "\t\t0 | \n", + "\t
197 | \n", + "\t\t89400.06752725711 | \n", + "\t\t101.29585436346969 | \n", + "\t\t361.75865663789 | \n", + "\t\t249.02525773609221 | \n", + "\t\t73.94867759539977 | \n", + "\t\t5468.40691810838 | \n", + "\t\t-0.24937134877639008 | \n", + "\t\t-1.150863210615582 | \n", + "\t\t-37.831497680579446 | \n", + "\t\t0 | \n", + "\t
213 | \n", + "\t\t77113.92086390035 | \n", + "\t\t95.5138494285472 | \n", + "\t\t310.05832297655536 | \n", + "\t\t214.8020079774383 | \n", + "\t\t62.3636669442054 | \n", + "\t\t3889.2269547277774 | \n", + "\t\t-0.18709413696807978 | \n", + "\t\t-1.201959178021247 | \n", + "\t\t-31.09975538009448 | \n", + "\t\t0 | \n", + "\t
220 | \n", + "\t\t80431.80679541029 | \n", + "\t\t99.8457585154082 | \n", + "\t\t318.63240984263155 | \n", + "\t\t224.04403007078074 | \n", + "\t\t62.14756504135687 | \n", + "\t\t3862.319840569682 | \n", + "\t\t-0.3778431806772692 | \n", + "\t\t-1.0024887103133715 | \n", + "\t\t-34.43533812140225 | \n", + "\t\t0 | \n", + "\t
246 | \n", + "\t\t75325.58109881892 | \n", + "\t\t87.41145030453134 | \n", + "\t\t328.385988902767 | \n", + "\t\t209.820560163841 | \n", + "\t\t72.69350084131506 | \n", + "\t\t5284.345064566274 | \n", + "\t\t-0.16540605044478487 | \n", + "\t\t-1.3743463677707606 | \n", + "\t\t-34.40794546664989 | \n", + "\t\t0 | \n", + "\t
288 | \n", + "\t\t82600.6500639613 | \n", + "\t\t105.11201031767013 | \n", + "\t\t321.9040522651045 | \n", + "\t\t230.0853762227334 | \n", + "\t\t60.675474581119914 | \n", + "\t\t3681.5132156441286 | \n", + "\t\t-0.2672838739091899 | \n", + "\t\t-1.1058119595257525 | \n", + "\t\t-39.32726399334618 | \n", + "\t\t0 | \n", + "\t
212 | \n", + "\t\t83454.39580858205 | \n", + "\t\t100.12825272685797 | \n", + "\t\t361.5405819021655 | \n", + "\t\t232.4634980740447 | \n", + "\t\t75.20271193179973 | \n", + "\t\t5655.447881897253 | \n", + "\t\t0.024830506918099376 | \n", + "\t\t-1.293111687332468 | \n", + "\t\t-32.1240297553486 | \n", + "\t\t0 | \n", + "\t
82 | \n", + "\t\t94048.76279519273 | \n", + "\t\t114.03951750210967 | \n", + "\t\t366.35161875486074 | \n", + "\t\t261.97426962449225 | \n", + "\t\t68.10084349437736 | \n", + "\t\t4637.72488464568 | \n", + "\t\t-0.4827546065748303 | \n", + "\t\t-0.8556540622845826 | \n", + "\t\t-49.74532255752476 | \n", + "\t\t0 | \n", + "\t
33 | \n", + "\t\t93185.74584100883 | \n", + "\t\t124.65019295111625 | \n", + "\t\t351.7031928317579 | \n", + "\t\t259.5703226769048 | \n", + "\t\t59.354738112601765 | \n", + "\t\t3522.9849364155407 | \n", + "\t\t-0.3229589598739873 | \n", + "\t\t-1.046640890827172 | \n", + "\t\t-40.22434101551627 | \n", + "\t\t1 | \n", + "\t
26 | \n", + "\t\t103807.76297225684 | \n", + "\t\t152.857497527659 | \n", + "\t\t383.578384675018 | \n", + "\t\t289.15811412884915 | \n", + "\t\t57.608322823675024 | \n", + "\t\t3318.7188585567565 | \n", + "\t\t-0.21002500953619418 | \n", + "\t\t-1.3168188493421034 | \n", + "\t\t-42.325211030065475 | \n", + "\t\t1 | \n", + "\t
23 | \n", + "\t\t80968.38566460606 | \n", + "\t\t91.32685504056975 | \n", + "\t\t362.0288544976556 | \n", + "\t\t225.5386787314932 | \n", + "\t\t76.16092244254854 | \n", + "\t\t5800.486107299894 | \n", + "\t\t-0.06229470076093798 | \n", + "\t\t-1.2994883640937822 | \n", + "\t\t-38.24098140508306 | \n", + "\t\t0 | \n", + "\t
Please replace the IDs in the below cell with any 2 WELDING_IDs from the above output dataframe.
" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "6ab9cc59-a9a7-4445-be5d-09665230b782", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "ID1: 31\n", + "ID2: 23\n" + ] + } + ], + "source": [ + "ID1=input('ID1:')\n", + "ID2=input('ID2:')" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "id": "ba156ce1-34a1-4a24-a04d-882d0cd1a082", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4d95d72933ad4cdf9900b837a844f5ed", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | \n", + "
---|---|---|---|---|---|---|---|---|
86754.86978245218 | \n", + "\t\t118.8968714267842 | \n", + "\t\t362.48279195869617 | \n", + "\t\t241.65701889262445 | \n", + "\t\t69.87842670568841 | \n", + "\t\t4882.994518862268 | \n", + "\t\t0.08507576069901807 | \n", + "\t\t-1.42337112306267 | \n", + "\t\t-35.30970378099613 | \n", + "\t
Next, we call the explainer using the selected instance and the model object created using the RandomForestClassifier.
" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "1e463fab-ef93-42e7-888a-93d560fec61c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intercept 0.3255261119033892\n", + "Prediction_local [0.16654005]\n", + "Right: 0.049446324561220036\n" + ] + } + ], + "source": [ + "exp = explainer.explain_instance(df.get_values().flatten(), RF_classifier.modelObj.predict_proba, num_features=9)" + ] + }, + { + "cell_type": "markdown", + "id": "36c413e2-fbea-4c14-9a6d-c57439d02db7", + "metadata": {}, + "source": [ + "We display the results using the show_in_notebook function of the explainer
" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "id": "eace0bf3-be59-47ab-a99f-007c2c2829b6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "This gives a result as shown in the image above. There are three parts to the explanation :
\n", + "\n", + "We will repeat the same steps for 1 more instance
" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "id": "c7900aca-d181-41a0-a114-5a812273d657", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3dddb984c51840a68d35ff77a82aa51a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "sum_RESISTANCE | min_RESISTANCE | max_RESISTANCE | mean_RESISTANCE | std_RESISTANCE | var_RESISTANCE | skew_RESISTANCE | kurtosis_RESISTANCE | min_resistance_diff | \n", + "
---|---|---|---|---|---|---|---|---|
80968.38566460606 | \n", + "\t\t91.32685504056975 | \n", + "\t\t362.0288544976556 | \n", + "\t\t225.5386787314932 | \n", + "\t\t76.16092244254854 | \n", + "\t\t5800.486107299894 | \n", + "\t\t-0.06229470076093798 | \n", + "\t\t-1.2994883640937822 | \n", + "\t\t-38.24098140508306 | \n", + "\t
Next, we call the explainer using the selected instance and the model object created using the RandomForestClassifier.
" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "1ee238a5-3122-44c8-8bc3-286df4f8216f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Intercept 0.3665620653425659\n", + "Prediction_local [0.04139662]\n", + "Right: 0.049446324561220036\n" + ] + } + ], + "source": [ + "exp = explainer.explain_instance(df.get_values().flatten(), RF_classifier.modelObj.predict_proba, num_features=9)" + ] + }, + { + "cell_type": "markdown", + "id": "521cc53a-4afb-498a-84ae-4fe45de60c53", + "metadata": {}, + "source": [ + "We display the results using the show_in_notebook function of the explainer
" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "e7844ab3-87ab-44d6-a1c9-eb43818d5ad7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " " + ], + "text/plain": [ + "Similar to the previous example, the above image shows three graphs that each show essential information about the anomaly.
\n", + "\n", + "The left graph shows the prediction probabilities and the middle and right most show the features and their contribution towards the prediction.
\n", + "Thus, with the explainer functions we try to get explanations using the different feature values on why the weldings have an anomaly or do not have an anomaly.
" + ] + }, + { + "cell_type": "markdown", + "id": "29e90d19-1b71-44e8-b6d5-aa53e3b673c1", + "metadata": {}, + "source": [ + "10. Cleanup
\n", + "Work Tables
" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "48a959e6-319f-4592-93af-482d391224b4", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['ADS_train_data', 'ADS_test_data','DF_train', 'DF_Predict', 'DF_Predict_test','additional_metrics_test']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "c233387e-cff1-4e6e-81a7-2e3b3221b957", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "User environment 'osml_env' removed.\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "remove_env(\"osml_env\")" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "bbf8f9bc-9f3a-47e9-b2d4-81fd00291bc8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No user environment(s) found.\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "c5cea44c-e3e0-4634-bfa9-efa65c42ac44", + "metadata": {}, + "source": [ + "If you have updated the teradataml package, reinstall the package by uncommenting and running the below code cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "93311aa2-79b1-44bd-926d-5c5bc23a1999", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# !pip install teradataml==17.20.0.6 --force-reinstall\n", + "!pip install scikit-learn==1.0.2 --force-reinstall\n", + "!pip install numpy==1.24.2 --force-reinstall" + ] + }, + { + "cell_type": "markdown", + "id": "d51fd98f-b9b2-48b9-b639-16cc51f9116f", + "metadata": {}, + "source": [ + "11. Exploring the Versatility of this Analytical Approach in Alternative Use Case Settings
\n", + "How this analytic approach can be leveraged in other use case settings
\n", + "\n", + "The analytical approach of leveraging clustering followed by classification for anomaly detection in short time series data is highly adaptable and can be broadly applied across various industries, especially in settings where operations or processes are characterized by short, continuous time series with a defined start and end and where ground truth labels are not initially available.
\n", + "This method begins with unsupervised learning to explore and understand the data, identifying patterns, similarities, and potential outliers through techniques like Dynamic Time Warping (DTW). Such exploration is crucial in settings where anomalies are not predefined or where the data’s inherent complexity requires initial unsupervised insight to develop an understanding of what constitutes normal behavior versus an anomaly. Following the clustering phase, supervised classification models are trained on the newly identified labels to predict anomalies. This generic approach is particularly effective for short time series data, where each sequence represents a process or event whose normal operational parameters need to be defined through exploratory analysis before precise anomaly detection can occur.
\n", + "Potential Use Cases Across Industries:
\n", + "- Power Grid Load Monitoring:
Analyzing short time series of electricity load during peak usage times to identify anomalies that could indicate equipment failure, energy theft, or inefficiencies in power distribution. Each series could represent the load profile for a brief, high-demand period.- ECG or EEG Analysis:
Short segments of electrocardiogram (ECG) or electroencephalogram (EEG) readings can be analyzed to detect anomalies indicating cardiac arrhythmias or neurological issues, respectively. Each segment represents a complete heartbeat or a brief brain activity pattern.- CNC Machine Operations:
Monitoring the torque and force profiles of a CNC (Computer Numerical Control) machine during a single machining operation. Anomalies could indicate tool wear, material inconsistency, or operational errors.- Aircraft Engine Test Runs:
Analyzing the time series data of engine parameters (e.g., temperature, pressure, vibration) during short test runs to identify deviations from normal operational profiles, suggesting maintenance or safety issues.- Theme Park Ride Operations:
Analyzing sensor data from individual rides, where each ride cycle produces a time series of mechanical or operational parameters. Anomalies in these series could indicate safety concerns or maintenance needs.Conclusion
\n", + "In each of these scenarios, the focus is on analyzing the shape or behavior of a curve within a short time frame, similar to observing a spot welding curve. These curves are shaped by the specific activity taking place, whether it’s a machine at work, a health test running, financial trades happening, or people interacting with a service. The method begins by sorting these curves into groups based on their patterns, without needing to know ahead of time which ones are out of the ordinary. Then, it moves on to use a more detailed approach to pinpoint which curves don’t fit the expected pattern, labeling them as either normal or not normal. This way of doing things is great for quickly finding and addressing issues, and it also helps in getting a better grasp of how these processes work. This can lead to making things run more smoothly and keeping equipment in good shape before problems even start.
" + ] + }, + { + "cell_type": "markdown", + "id": "91bd8857-19e0-4200-b3ae-b2efdbca73d3", + "metadata": {}, + "source": [ + "Let’s look at the elements we have available for reference for this notebook:
\n", + "Filters: \n", + "\n",
+ " Telco Churn using Enterprise Feature Store in Vantage\n",
+ "
\n",
+ " \n",
+ "
Introduction
\n", + "\n", + "\n", + "Customer churn is a concern for all companies, but the complexity makes it difficult to track. Customers may leave due to various reasons such as dissatisfaction with service quality, pricing, customer service, or finding better alternatives from competitors. Although some churn may be expected, companies aim to retain their customers to avoid using additional resources to find new customers. Thus, with the help of Teradata Vantage, companies can attain their goal of identifying the factors contributing to the churn, so they can take appropriate measures to retain customers. Vantage’s capabilities allow companies to analyze large amounts of customer data, such as usage patterns, billing information, demographics, and interactions, to find patterns that may indicate customers who are at risk of churning. Plus, Teradata’s machine learning and predictive analytics can be used to build models to predict customers who are likely to churn in the future. This information will give companies the chance to intervene, including sending targeted marketing campaigns, personalized offers, improved customer service, or addressing customer concerns.
\n", + "\n", + "Successful AI/ML implementations face three main challenges:
\n", + "Addressing these challenges requires strategic planning, skilled talent, and integration with existing systems. Organizations with a history in Data Management recognize the benefits of reusable Data Products, making Enterprise Feature Stores a valuable investment.
\n", + "\n", + "A Feature Store is a curated repository of pre-calculated features, simplifying the journey from data to actionable insights. An Enterprise Feature Store extends across domains/teams, incorporating a Governance Framework for predictable feature delivery.
\n", + " \n", + "While most features are reusable, some need model-specific calculations before integration into a unified dataset.
\n", + " \n", + "The key difference between Feature Store (FS) and Enterprise Feature Store (EFS) is the scope across multiple domains/teams along with the Governance Framework (that gives an assurance that features are delivered under predictable SLAs and it also defines the operating model how the EFS is used across different teams/domains and how features lifecycle is managed). Although most Features are considered as re-usable, there is still some minor part of Features that must be calculated as model-specific (e.g., scaled variables, principal components, etc.) and then combined with the rest of the pre-calculated Features into a single data set (ADS). The figure below describes this co-existence of model-specific ADS(es) and model-independent EFS.
\n", + "\n", + "Business Values
\n", + "\n", + "Why Vantage?
\n", + "There are several reasons why EFS naturally fits to Teradata Vantage:
\n", + "The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.
\n", + "\n", + "\n", + "Methodology
\n", + "In this demo we have used a methodology which involves analyzing a time series of data, where each data point represents the outstanding amount at the end of each month. To detect anomalies, we use the following steps:
\n", + "\n", + "Z = (X - μ) / σ
where X is the value in question, μ is the mean, and σ is the standard deviation.
\n", + "\n", + "It's important to note that the computation of the Z-score and the anomaly flag is dependent on the values of the mean and standard deviation. These dependent features are not computed at the same time as the static features but are derived later, once the latest outstanding amount (the new data point) becomes available.
\n", + "\n", + "Feature Engineering
\n", + "Feature engineering is a crucial step in the entity-feature paradigm, as it involves creating and transforming features to better represent the underlying problem for predictive modeling. In our case, the feature engineering process is twofold, each with its specific inputs and outputs. Below are the processes that are a part of this feature engineering
\n", + "\n", + "1. Configure the environment
\n", + "\n",
+ "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n",
+ "
\n",
+ "Here's how we can do this:
Note: Please execute the above pip install to get the latest version of the required library. Be sure to restart the kernel after executing those lines to bring the installed libraries into memory. The simplest way to restart the Kernel is by typing zero zero: 0 0
\n", + "2. Connect to VantageCloud Lake
\n", + "Connect to VantageCloud using create_context
from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.
Setup a Feature Store
" + ] + }, + { + "cell_type": "markdown", + "id": "0776e2ee-bdb8-4927-9d14-5cff5583b6ee", + "metadata": {}, + "source": [ + "We can now set-up the feature store using the FeatureStore.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e72cc2f-2e36-47cd-a2da-19da50f892d2", + "metadata": {}, + "outputs": [], + "source": [ + "username=env_vars.get(\"username\")\n", + "fs = FeatureStore(repo=username)\n", + "fs.setup()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d7f40c3-8efe-4042-9ede-a767681f0fcf", + "metadata": {}, + "outputs": [], + "source": [ + "# List whether FeatureStore is setup or not.\n", + "fs.list_repos()" + ] + }, + { + "cell_type": "markdown", + "id": "e80e18e4-d009-4d88-8340-72636ca8f0dd", + "metadata": {}, + "source": [ + "3.Load the data
\n", + "\n", + "We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_Telco\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").
\n", + " \n", + "**Note: The tables are available in DEMO_Telco_DB database and we have created views in DEMO_Telco database which are used in the cells below
" + ] + }, + { + "cell_type": "markdown", + "id": "d8540286-8309-47c6-9aff-fe153700ee9d", + "metadata": {}, + "source": [ + "4. Feature Engineering
" + ] + }, + { + "cell_type": "markdown", + "id": "c4992424-3837-4a9f-b532-2e8d188d8c02", + "metadata": {}, + "source": [ + "The code creates a DataFrame named df
using the DataFrame function. The in_schema
function specifies the schema, which in this case is \"DEMO_Telco\", and the table name \"Customer_Churn\". Let us now start with feature engineering.
This code performs the following operations:
\n", + "df.assign()
function is used to create new columns or modify existing ones in the DataFrame df
.case
function to convert \"Yes\" to 1 and \"No\" to 0. If the value is neither \"Yes\" nor \"No\", it defaults to 0.df
statement displays the modified DataFrame.Let's store the transformed data to table.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df39f7d9-37b5-445f-bb29-4f4883c0d021", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(\n", + " df=df,\n", + " table_name='transformed_data',\n", + " if_exists='replace'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "3cf33828-2e9e-42e5-883b-469cb47dd515", + "metadata": {}, + "source": [ + "5. Save feature and feature processing to Feature Store
" + ] + }, + { + "cell_type": "markdown", + "id": "2ec83605-4d29-47d5-93c3-7c05791d1782", + "metadata": {}, + "source": [ + "Now we will proceed to save the features as well as the feature processing logic in feature store.
\n", + "This will allow us to re-use the features and processing later on, avoiding the need to re-write the processing logic.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19557186-c5be-4be6-a452-1ff6869865e3", + "metadata": {}, + "outputs": [], + "source": [ + "df = DataFrame('transformed_data')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edbe4e7e-0b44-4624-9657-da2f5b7aff02", + "metadata": {}, + "outputs": [], + "source": [ + "# Create FeatureGroup for this DataFrame.\n", + "fg = FeatureGroup.from_DataFrame(name='telcom', df=df, entity_columns='CustomerID')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0565b334-f84e-4843-92b2-7e784494ac07", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's look at Features\n", + "fg.features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9cb9c82f-8e99-4cca-bd7a-5d77c7764bd9", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's look at Entity.\n", + "fg.entity.columns" + ] + }, + { + "cell_type": "markdown", + "id": "191c113e-08af-4eca-bf7c-fcab24be71ec", + "metadata": {}, + "source": [ + "Here we will saving the features and processing with additional metadata such as project names as churn
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c5c15a2-ddca-4af5-919b-bc7cfd5fd11e", + "metadata": {}, + "outputs": [], + "source": [ + "# upload the features in the physical feature store\n", + "fs.apply(fg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37931c07-6843-4835-9b33-edd8b8fbc131", + "metadata": {}, + "outputs": [], + "source": [ + "fs.list_features()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e3a60da1-b8ea-4461-a953-2da276a11d8f", + "metadata": {}, + "outputs": [], + "source": [ + "fs.list_feature_groups()" + ] + }, + { + "cell_type": "markdown", + "id": "beda7104-0cc0-48f9-b98f-ad573be558ea", + "metadata": {}, + "source": [ + "6. Re-using features for machine learning
" + ] + }, + { + "cell_type": "markdown", + "id": "57de3efb-2956-4a2a-9fc9-9c9ddd7bc155", + "metadata": {}, + "source": [ + "Now that our features have been stores in feature store, let us re-use them to train a machine learning model
\n", + "We now just need to specify the feature name; we do not need to specify the processing logic
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dce440d3-7c72-4400-82b0-2ff09e8fc914", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = fs.get_dataset('telcom')\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "fcaca5d6-2e5a-400f-a653-cbf70c1564c8", + "metadata": {}, + "source": [ + "We have our training dataset which is created, with all the feature engineering
\n", + "We can see that the column Multiple lines has only two values, yes and no. The same features can also be re-used across multiple use-cases and models without any data preparation
" + ] + }, + { + "cell_type": "markdown", + "id": "46f6bf8f-b9e1-441a-b089-c1ceebdbc059", + "metadata": {}, + "source": [ + "We split the dataset in to training and testing dataset with 80:20 split ratio.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a5f3a51-60a2-4b80-8f82-a77d8adcc322", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Performing sampling to get 80% for trainning and 20% for testing\n", + "tdf_sample = df.sample(frac = [0.8, 0.2])\n", + "\n", + "# Fetching train and test data\n", + "tdf_train= tdf_sample[tdf_sample['sampleid'] == 1].drop('sampleid', axis=1)\n", + "tdf_test = tdf_sample[tdf_sample['sampleid'] == 2].drop('sampleid', axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "c86ae3c5-c44c-4c1d-bf08-c8e8d3d83d58", + "metadata": {}, + "source": [ + "AutoML (Automated Machine Learning) is an approach that automates the process of building, training, and validating machine learning models. It involves various algorithms to automate various aspects of the machine learning workflow, such as data preparation, feature engineering, model selection, hyperparameter tuning, and model deployment. It aims to simplify the process of building machine learning models, by automating some of the more time-consuming and labor-intensive tasks involved in the process.
\n", + "\n", + "We create a AutoClassifier
instance which is a special purpose AutoML feature to run classification specific tasks. We use the exclude
parameter to specify model algorithms to be excluded from model training phase. Here we exclude the 'knn' model. The max_runtime_secs
specifies the time limit in seconds for model training.\n",
+ "
\n",
+ "verbose
: specifies the detailed execution steps based on verbose level as follows:\n",
+ "
AutoML (Automated Machine Learning) is an approach that automates the process of building, training, and validating machine learning models. It involves various algorithms to automate various aspects of the machine learning workflow, such as data preparation, feature engineering, model selection, hyperparameter tuning, and model deployment. It aims to simplify the process of building machine learning models, by automating some of the more time-consuming and labor-intensive tasks involved in the process.
\n", + "\n", + "We create a AutoClassifier
instance which is a special purpose AutoML feature to run classification specific tasks. We use the exclude
parameter to specify model algorithms to be excluded from model training phase. Here we exclude the 'knn' model. The max_runtime_secs
specifies the time limit in seconds for model training.\n",
+ "
\n",
+ "verbose
: specifies the detailed execution steps based on verbose level as follows:\n",
+ "
Note: Since the AutoML functionality does a lot of steps like Feature exploration and Data Preparation along with Model Training and Evaluating to select the Best model the below step may take anywhere between 12-15 minutes
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "378743d2-daae-4a91-af06-fbd566072902", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fitting train data \n", + "aml.fit(data = tdf_train, target_column = 'Churn')" + ] + }, + { + "cell_type": "markdown", + "id": "3ea7b521-d7c1-4994-b380-b0308c85743d", + "metadata": {}, + "source": [ + "Here, we generate model leaderboard and leader for a given dataset. Leaderboard is a ranked table with a list of models with all their evaluation metrics.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22d69751-fe5e-46dd-9c24-3322a5bd1487", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching leaderboard\n", + "\n", + "aml.leaderboard()" + ] + }, + { + "cell_type": "markdown", + "id": "7a330d76-3671-42f0-a2e3-e0b7f2138048", + "metadata": {}, + "source": [ + "The following function displays the best performing model.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2054141f-3c2e-47be-b9db-aef8b4c3424d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching best performing model\n", + "aml.leader()" + ] + }, + { + "cell_type": "markdown", + "id": "55a1096f-7c8b-4e2f-b028-aa7f89c22f15", + "metadata": {}, + "source": [ + "The predict function generates predictions using either the default test data or any specified dataset, based on the model's rank in the leaderboard, and displays the performance metrics of the chosen model. If the test data contains a target column, both predictions and performance metrics are displayed; otherwise, only the predictions are shown.\n",
+ "
\n",
+ "You can also use the rank
parameter in the predict function. The rank
parameter specifies the model's rank in the leaderboard to be used for prediction. By default, the rank is set to 1, meaning the best-performing model is used.
Here, we specify the tdf_test
dataset for prediction. When using external data instead of the default test data, the predict function applies all the data transformation steps performed during the training phase on the external data before passing the data to the model for prediction.
We used the feature store to store features as well as their processing. We re-used them in model training. The features and processing can be re-used across multiple machine learning models and use cases, helping to improve data science productivity
\n", + "\n", + "Teradata's AutoML functionality plays a crucial role in this context by automating the complex process of building and deploying machine learning models. AutoML ensures the most optimal preparation and training of models, delivering high-quality machine learning models in minutes. Through hyperparameter tuning (HPT), Teradata's AutoML can automatically select the best parameters for machine learning algorithms using grid search and random search techniques, significantly enhancing model performance.\n",
+ "
\n",
+ "By leveraging Teradata's AutoML, companies can save time and reduce costs associated with manual model building and tuning. The technology not only improves the accuracy of predictive models but also democratizes the power of machine learning, allowing customers to utilize advanced analytics without requiring extensive coding or data science expertise. This capability enables companies to swiftly and effectively analyze customer churn data, develop predictive models, and implement proactive strategies to retain customers and enhance their satisfaction.\n",
+ "
\n",
+ "In conclusion, Teradata's AutoML functionality is a vital tool for banks aiming to reduce customer churn. By automating and optimizing the machine learning process, Teradata empowers various industries to make data-driven decisions that improve customer retention and drive long-term profitability.
Work Tables
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ff3fb9b-4d13-4628-988d-f82463d96537", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['transformed_data']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21fac6bb-3e3a-488c-848b-41473d6156e7", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "72bfa61c-3daa-4d47-b0d7-0a69ef13dc1a", + "metadata": {}, + "source": [ + "Let’s look at the elements we have available for reference for this notebook:
" + ] + }, + { + "cell_type": "markdown", + "id": "fc4938d2-5ce6-412e-a665-5d62a3b1a1b5", + "metadata": {}, + "source": [ + "Filters:
\n", + "Related Resources:
\n", + "Reference Links:
\n", + "\n",
+ " Telco Churn using Feature Store in Vantage\n",
+ "
\n",
+ " \n",
+ "
Introduction
\n", + "\n", + "\n", + "Customer churn is a concern for all companies, but the complexity makes it difficult to track. Customers may leave due to various reasons such as dissatisfaction with service quality, pricing, customer service, or finding better alternatives from competitors. Although some churn may be expected, companies aim to retain their customers to avoid using additional resources to find new customers. Thus, with the help of Teradata Vantage, companies can attain their goal of identifying the factors contributing to the churn, so they can take appropriate measures to retain customers. Vantage’s capabilities allow companies to analyze large amounts of customer data, such as usage patterns, billing information, demographics, and interactions, to find patterns that may indicate customers who are at risk of churning. Plus, Teradata’s machine learning and predictive analytics can be used to build models to predict customers who are likely to churn in the future. This information will give companies the chance to intervene, including sending targeted marketing campaigns, personalized offers, improved customer service, or addressing customer concerns.
\n", + "\n", + "Successful AI/ML implementations face three main challenges:
\n", + "Addressing these challenges requires strategic planning, skilled talent, and integration with existing systems. Organizations with a history in Data Management recognize the benefits of reusable Data Products, making Enterprise Feature Stores a valuable investment.
\n", + "\n", + "A Feature Store is a curated repository of pre-calculated features, simplifying the journey from data to actionable insights. An Enterprise Feature Store extends across domains/teams, incorporating a Governance Framework for predictable feature delivery.
\n", + " \n", + "While most features are reusable, some need model-specific calculations before integration into a unified dataset.
\n", + " \n", + "The key difference between Feature Store (FS) and Enterprise Feature Store (EFS) is the scope across multiple domains/teams along with the Governance Framework (that gives an assurance that features are delivered under predictable SLAs and it also defines the operating model how the EFS is used across different teams/domains and how features lifecycle is managed). Although most Features are considered as re-usable, there is still some minor part of Features that must be calculated as model-specific (e.g., scaled variables, principal components, etc.) and then combined with the rest of the pre-calculated Features into a single data set (ADS). The figure below describes this co-existence of model-specific ADS(es) and model-independent EFS.
\n", + "\n", + "Business Values
\n", + "\n", + "Why Vantage?
\n", + "There are several reasons why EFS naturally fits to Teradata Vantage:
\n", + "The unique massively-parallel architecture of Teradata Vantage allows users to prepare data, train, evaluate, and deploy models at unprecedented scale.
\n", + "\n", + "\n", + "Methodology
\n", + "In this demo we have used a methodology which involves analyzing a time series of data, where each data point represents the outstanding amount at the end of each month. To detect anomalies, we use the following steps:
\n", + "\n", + "Z = (X - μ) / σ
where X is the value in question, μ is the mean, and σ is the standard deviation.
\n", + "\n", + "It's important to note that the computation of the Z-score and the anomaly flag is dependent on the values of the mean and standard deviation. These dependent features are not computed at the same time as the static features but are derived later, once the latest outstanding amount (the new data point) becomes available.
\n", + "\n", + "Feature Engineering
\n", + "Feature engineering is a crucial step in the entity-feature paradigm, as it involves creating and transforming features to better represent the underlying problem for predictive modeling. In our case, the feature engineering process is twofold, each with its specific inputs and outputs. Below are the processes that are a part of this feature engineering
\n", + "\n", + "1. Configure the environment
\n", + "\n",
+ "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n",
+ "
\n",
+ "Here's how we can do this:
Note: Please execute the above pip install to get the latest version of the required library. Be sure to restart the kernel after executing those lines to bring the installed libraries into memory. The simplest way to restart the Kernel is by typing zero zero: 0 0
\n", + "2. Connect to VantageCloud Lake
\n", + "Connect to VantageCloud using create_context
from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.
Setup a Feature Store
" + ] + }, + { + "cell_type": "markdown", + "id": "0776e2ee-bdb8-4927-9d14-5cff5583b6ee", + "metadata": {}, + "source": [ + "We can now set-up the feature store using the tdfs4dslibrary.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e72cc2f-2e36-47cd-a2da-19da50f892d2", + "metadata": {}, + "outputs": [], + "source": [ + "username=env_vars.get(\"username\")\n", + "tdfs4ds.setup(database=username)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d7f40c3-8efe-4042-9ede-a767681f0fcf", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.connect(database=username)" + ] + }, + { + "cell_type": "markdown", + "id": "e80e18e4-d009-4d88-8340-72636ca8f0dd", + "metadata": {}, + "source": [ + "3.Load the data
\n", + "\n", + "We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_Telco\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").
\n", + " \n", + "**Note: The tables are available in DEMO_Telco_DB database and we have created views in DEMO_Telco database which are used in the cells below
" + ] + }, + { + "cell_type": "markdown", + "id": "d8540286-8309-47c6-9aff-fe153700ee9d", + "metadata": {}, + "source": [ + "4. Feature Engineering
" + ] + }, + { + "cell_type": "markdown", + "id": "c4992424-3837-4a9f-b532-2e8d188d8c02", + "metadata": {}, + "source": [ + "Let us now start with feature engineering. We will replace multiple values which indicate absence of a service by No
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "271b6d33-7792-442d-8c46-7e4d659e5920", + "metadata": {}, + "outputs": [], + "source": [ + "df = DataFrame(in_schema(\"DEMO_Telco\", \"Customer_Churn\"))\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a4cb4956-2260-4c65-a6fa-b91ec4048cc2", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(\n", + " oreplace_MultipleLines = func.oreplace(\n", + " df.MultipleLines.expression,\"No phone service\",\"No\"\n", + " ),\n", + " oreplace_OnlineSecurity = func.oreplace(\n", + " df.OnlineSecurity.expression, \"No internet service\",\"No\"\n", + " ),\n", + " oreplace_OnlineBackup = func.oreplace(\n", + " df.OnlineBackup.expression, \"No internet service\",\"No\"\n", + " ),\n", + " oreplace_DeviceProtection = func.oreplace(\n", + " df.DeviceProtection.expression, \"No internet service\",\"No\"\n", + " ), \n", + " oreplace_TechSupport = func.oreplace(\n", + " df.TechSupport.expression, \"No internet service\",\"No\"\n", + " ),\n", + " oreplace_StreamingTV = func.oreplace(\n", + " df.StreamingTV.expression, \"No internet service\",\"No\"\n", + " ),\n", + " oreplace_StreamingMovies = func.oreplace(\n", + " df.StreamingMovies.expression, \"No internet service\",\"No\"\n", + " )\n", + ")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "b702973d-8a23-4070-94cc-b031013a11e1", + "metadata": {}, + "source": [ + "We will also convert Churn column value from Yes / No to 1 or 0
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e491f404-c614-4c13-a2f0-31daa7305750", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(\n", + " drop_columns = True,\n", + " CustomerID = df.CustomerID,\n", + " Gender = df.Gender,\n", + " SeniorCitizen = df.SeniorCitizen,\n", + " Partner = df.Partner,\n", + " Dependents = df.Dependents,\n", + " Tenure = df.Tenure,\n", + " PhoneService = df.PhoneService,\n", + " MultipleLines = df.oreplace_MultipleLines,\n", + " InternetService = df.InternetService,\n", + " OnlineSecurity = df.oreplace_OnlineSecurity,\n", + " OnlineBackup = df.oreplace_OnlineBackup,\n", + " DeviceProtection = df.oreplace_DeviceProtection,\n", + " TechSupport = df.oreplace_TechSupport,\n", + " StreamingTV = df.oreplace_StreamingTV,\n", + " StreamingMovies = df.oreplace_StreamingMovies,\n", + " Contract = df.Contract,\n", + " PaperlessBilling = df.PaperlessBilling,\n", + " PaymentMethod = df.PaymentMethod,\n", + " MonthlyCharges = df.MonthlyCharges,\n", + " TotalCharges = df.TotalCharges,\n", + " Churn = case({ \"Yes\" : 1, \"No\" : 0},value=df.Churn,else_=0)\n", + ") \n", + "\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "3cf33828-2e9e-42e5-883b-469cb47dd515", + "metadata": {}, + "source": [ + "5. Save feature and feature processing to Feature Store
" + ] + }, + { + "cell_type": "markdown", + "id": "2ec83605-4d29-47d5-93c3-7c05791d1782", + "metadata": {}, + "source": [ + "Now we will proceed to save the features as well as the feature processing logic in feature store.
\n", + "This will allow us to re-use the features and processing later on, avoiding the need to rewrite the processing logic.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "edbe4e7e-0b44-4624-9657-da2f5b7aff02", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.DATA_DOMAIN='efs_telco'\n", + "tdfs4ds.VARCHAR_SIZE=50" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ed9af79-d838-4ac3-8d67-3bb80f380bb9", + "metadata": {}, + "outputs": [], + "source": [ + "df = crystallize_view(df, view_name = 'PROC_FEATURE_ENGINEERING', schema_name = username,output_view=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68668b95-3b30-4f34-99e0-a19e37a52aec", + "metadata": {}, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f61ac745-3c56-43c0-8e24-b32ed6c24246", + "metadata": {}, + "outputs": [], + "source": [ + "# define the set of columns defining the entity id\n", + "entity_id = ['CustomerID']\n", + "# list the columns dealing with the features\n", + "features = df.columns[1::]\n", + "features" + ] + }, + { + "cell_type": "markdown", + "id": "cc8da6b1-fa36-44f8-ab70-bc0aa0329897", + "metadata": {}, + "source": [ + "We will create a Data Domain for the feature store
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9665f2d9-4968-4ac6-b880-fc3f6fa46b12", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.DATA_DOMAIN" + ] + }, + { + "cell_type": "markdown", + "id": "191c113e-08af-4eca-bf7c-fcab24be71ec", + "metadata": {}, + "source": [ + "Here we will saving the features and processing with additional metadata such as project names as churn
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c5c15a2-ddca-4af5-919b-bc7cfd5fd11e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# upload the features in the physical feature store\n", + "tdfs4ds.upload_features(\n", + " df,\n", + " entity_id = entity_id,\n", + " feature_names = features,\n", + " metadata = {'project': 'churn'}\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "81390658-0330-423d-87b4-001311a51289", + "metadata": {}, + "source": [ + "We can now use the feature catalog command to visualize all features which have been saved in the feature store
\n", + "All features are time dependent, as seen by the column validity start and end
\n", + "This means you can change the processing logic, but still keep the history of the features
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37931c07-6843-4835-9b33-edd8b8fbc131", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.feature_catalog()" + ] + }, + { + "cell_type": "markdown", + "id": "beda7104-0cc0-48f9-b98f-ad573be558ea", + "metadata": {}, + "source": [ + "6. Re-using features for machine learning
" + ] + }, + { + "cell_type": "markdown", + "id": "57de3efb-2956-4a2a-9fc9-9c9ddd7bc155", + "metadata": {}, + "source": [ + "Now that our features have been stores in feature store, let us re-use them to train a machine learning model
\n", + "We now need to just specify the feature name, we do not need to specify the processing logic
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dce440d3-7c72-4400-82b0-2ff09e8fc914", + "metadata": {}, + "outputs": [], + "source": [ + "tdfs4ds.connect(database=username)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "537bc561-8519-41d3-88f9-37ebfa215836", + "metadata": {}, + "outputs": [], + "source": [ + "entity_id = ['CustomerID']\n", + "features = ['Gender',\n", + " 'SeniorCitizen',\n", + " 'Partner',\n", + " 'Dependents',\n", + " 'Tenure',\n", + " 'PhoneService',\n", + " 'MultipleLines',\n", + " 'InternetService',\n", + " 'OnlineSecurity',\n", + " 'OnlineBackup',\n", + " 'DeviceProtection',\n", + " 'TechSupport',\n", + " 'StreamingTV',\n", + " 'StreamingMovies',\n", + " 'Contract',\n", + " 'PaperlessBilling',\n", + " 'PaymentMethod',\n", + " 'MonthlyCharges',\n", + " 'TotalCharges',\n", + " 'Churn']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8a64b16-9913-4b10-a3f7-1f1be67fef8d", + "metadata": {}, + "outputs": [], + "source": [ + "selected_features = get_feature_versions(entity_name=entity_id,features=features)\n", + "selected_features" + ] + }, + { + "cell_type": "markdown", + "id": "f5dd9918-f5d1-4cf3-9f4e-cbf1241ff47f", + "metadata": {}, + "source": [ + "We can now build our training dataset by specifying the build_dataset command
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b8d3d05c-2d1d-4307-bce3-b879a8624f38", + "metadata": {}, + "outputs": [], + "source": [ + "df = tdfs4ds.build_dataset(\n", + " entity_id = entity_id,\n", + " selected_features = selected_features,\n", + " view_name = 'mydataset',\n", + " schema_name = username,\n", + " comment = 'dataset for churn prediction'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "fcaca5d6-2e5a-400f-a653-cbf70c1564c8", + "metadata": {}, + "source": [ + "We have our training dataset which is created, with all the feature engineering
\n", + "We can see that the column MultipleLines has only two values, yes and no. The same features can also be re-used across multiple use cases and models without any data preparation.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb1ff7df-5216-4c26-95ff-641e16d9ba9c", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "df = DataFrame(in_schema(username , 'mydataset'))\n", + "copy_to_sql(df, table_name='fs_dataset', if_exists ='replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73057d11-c8b9-4ce5-9cc3-1f7375d0f3f7", + "metadata": {}, + "outputs": [], + "source": [ + "df = DataFrame('fs_dataset')\n", + "df" + ] + }, + { + "cell_type": "markdown", + "id": "46f6bf8f-b9e1-441a-b089-c1ceebdbc059", + "metadata": {}, + "source": [ + "We split the dataset in to training and testing dataset with 80:20 split ratio.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a5f3a51-60a2-4b80-8f82-a77d8adcc322", + "metadata": {}, + "outputs": [], + "source": [ + "# Performing sampling to get 80% for trainning and 20% for testing\n", + "tdf_sample = df.sample(frac = [0.8, 0.2])\n", + "\n", + "# Fetching train and test data\n", + "tdf_train= tdf_sample[tdf_sample['sampleid'] == 1].drop('sampleid', axis=1)\n", + "tdf_test = tdf_sample[tdf_sample['sampleid'] == 2].drop('sampleid', axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "c86ae3c5-c44c-4c1d-bf08-c8e8d3d83d58", + "metadata": {}, + "source": [ + "AutoML (Automated Machine Learning) is an approach that automates the process of building, training, and validating machine learning models. It involves various algorithms to automate various aspects of the machine learning workflow, such as data preparation, feature engineering, model selection, hyperparameter tuning, and model deployment. It aims to simplify the process of building machine learning models, by automating some of the more time-consuming and labor-intensive tasks involved in the process.
\n", + "\n", + "We create an AutoClassifier
instance which is a special purpose AutoML feature to run classification specific tasks. We use the exclude
parameter to specify model algorithms to be excluded from model training phase. Here we exclude the 'knn' model. The max_runtime_secs
specifies the time limit in seconds for model training.\n",
+ "
\n",
+ "verbose
: specifies the detailed execution steps based on verbose level as follows:\n",
+ "
Note: Because the AutoML functionality performs many steps, such as feature exploration and data preparation along with model training and evaluation, to select the best model, the step below may take anywhere between 12 and 15 minutes.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "378743d2-daae-4a91-af06-fbd566072902", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fitting train data \n", + "aml.fit(data = tdf_train,target_column = 'Churn')" + ] + }, + { + "cell_type": "markdown", + "id": "3ea7b521-d7c1-4994-b380-b0308c85743d", + "metadata": {}, + "source": [ + "Here, we generate model leaderboard and leader for a given dataset. Leaderboard is a ranked table with a list of models with all their evaluation metrics.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22d69751-fe5e-46dd-9c24-3322a5bd1487", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching leaderboard\n", + "\n", + "aml.leaderboard()" + ] + }, + { + "cell_type": "markdown", + "id": "7a330d76-3671-42f0-a2e3-e0b7f2138048", + "metadata": {}, + "source": [ + "The following function displays the best performing model.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2054141f-3c2e-47be-b9db-aef8b4c3424d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Fetching best performing model\n", + "aml.leader()" + ] + }, + { + "cell_type": "markdown", + "id": "55a1096f-7c8b-4e2f-b028-aa7f89c22f15", + "metadata": {}, + "source": [ + "The predict function generates predictions using either the default test data or any specified dataset, based on the model's rank in the leaderboard, and displays the performance metrics of the chosen model. If the test data contains a target column, both predictions and performance metrics are displayed; otherwise, only the predictions are shown.\n",
+ "
\n",
+ "You can also use the rank
parameter in the predict function. The rank
parameter specifies the model's rank in the leaderboard to be used for prediction. By default, the rank is set to 1, meaning the best-performing model is used.
Here, we specify the tdf_test
dataset for prediction. When using external data instead of the default test data, the predict function applies all the data transformation steps performed during the training phase on the external data before passing the data to the model for prediction.
We used the feature store to store features as well as their processing logic. We re-used them in model training. The features and processing can be re-used across multiple machine learning models and use cases, helping to improve data science productivity.
\n", + "\n", + "Teradata's AutoML functionality plays a crucial role in this context by automating the complex process of building and deploying machine learning models. AutoML ensures the most optimal preparation and training of models, delivering high-quality machine learning models in minutes. Through hyperparameter tuning (HPT), Teradata's AutoML can automatically select the best parameters for machine learning algorithms using grid search and random search techniques, significantly enhancing model performance.\n",
+ "
\n",
+ "By leveraging Teradata's AutoML, companies can save time and reduce costs associated with manual model building and tuning. The technology not only improves the accuracy of predictive models but also democratizes the power of machine learning, allowing customers to utilize advanced analytics without requiring extensive coding or data science expertise. This capability enables companies to swiftly and effectively analyze customer churn data, develop predictive models, and implement proactive strategies to retain customers and enhance their satisfaction.\n",
+ "
\n",
+ "In conclusion, Teradata's AutoML functionality is a vital tool for banks aiming to reduce customer churn. By automating and optimizing the machine learning process, Teradata empowers various industries to make data-driven decisions that improve customer retention and drive long-term profitability.
Work Tables
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a07f45cf-ff45-4b95-810c-061fb4d1e528", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "list_of_tables = db_list_tables()\n", + "[execute_sql(f\"DROP VIEW {username}.{t}\") for t in list_of_tables.TableName if t.startswith('FS_V')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e0c61f8-ce5a-4d77-a8dc-ed1628b499d0", + "metadata": {}, + "outputs": [], + "source": [ + "list_of_tables = db_list_tables()\n", + "[execute_sql(f\"DROP TABLE {username}.{t}\") for t in list_of_tables.TableName if t.startswith('FS_T')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c9de55f-8b22-40f6-8044-8b63cf5617e6", + "metadata": {}, + "outputs": [], + "source": [ + "list_of_tables = db_list_tables()\n", + "[execute_sql(f\"DROP TABLE {username}.{t}\") for t in list_of_tables.TableName if t.startswith('FS_')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc241d13-6921-4582-8a9c-d1f5da3ec360", + "metadata": {}, + "outputs": [], + "source": [ + "[execute_sql(f\"DROP TABLE {username}.{t}\") for t in list_of_tables.TableName if t in ['temp','tdfs__fgjnojnsmdoignmosnig']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21fac6bb-3e3a-488c-848b-41473d6156e7", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "72bfa61c-3daa-4d47-b0d7-0a69ef13dc1a", + "metadata": {}, + "source": [ + "Let’s look at the elements we have available for reference for this notebook:
" + ] + }, + { + "cell_type": "markdown", + "id": "fc4938d2-5ce6-412e-a665-5d62a3b1a1b5", + "metadata": {}, + "source": [ + "Filters:
\n", + "Related Resources:
\n", + "Reference Links:
\n", + "\n",
+ " Financial Fraud Detection with Python and TeradataML\n",
+ "
\n",
+ " \n",
+ "
Introduction
\n", + "\n",
+ " In recent years we have seen a massive increase in Fraud attempts, making fraud detection imperative for Banking and Financial Institutions. Despite countless efforts and human supervision, hundreds of millions of dollars are lost due to fraud. Fraud can happen using various methods, i.e., stolen credit cards, misleading accounting, phishing emails, etc. Due to small cases in significant populations, fraud detection has become more and more challenging. \n",
+ "
\n",
+ "
\n",
+ " With ClearScape Analytics, data scientists can use their preferred language, tools and platform to develop models to identify this fraud. Even in large scale operations, users have the guarantee that Vantage can scale to their needs and reduce fraud.
Business Values
\n", + "Why Vantage?
\n", + "To maximize the business value of advanced analytic techniques including Machine Learning and Artificial Intelligence, it is estimated that organizations must scale their model development and deployment pipelines to 100s or 1000s of times greater amounts of data, models, or both.\n",
+ "
\n",
+ "
\n",
+ " ClearScape Analytics provides powerful, flexible end-to-end data connectivity, feature engineering, model training, evaluation, and operational functions that can be deployed at scale as enterprise data assets; treating the products of ML and AI as first-class analytic processes in the enterprise.
Here, we import the required libraries, set environment variables and environment paths (if required).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Standard Libraries\n", + "import os\n", + "import json\n", + "import getpass\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "\n", + "# Teradata Libraries\n", + "from teradataml import *\n", + "\n", + "from dotenv import load_dotenv, dotenv_values\n", + "# Configuration\n", + "spacing_large = \" \"*95\n", + "spacing_small = \" \"*12\n", + "display.max_rows = 5\n", + "configure.val_install_location = 'td_val'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be prompted to provide the password. We will enter the password, press the Enter key, and then use the down arrow to go to the next cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql(\"SET query_band='DEMO=VCL_Financial_Fraud_Detection_Python.ipynb;' UPDATE FOR SESSION;\")\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We begin running steps with Shift + Enter keys.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.Load the data and Data Exploration
\n", + "\n", + "We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_GLM_Fraud\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").
\n", + " \n", + "Note: The tables are available in DEMO_GLM_Fraud_DB databases and we have created views in DEMO_GLM_Fraud databases which are used in the cells below
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "We loaded the data from https://www.kaggle.com/code/georgepothur/4-financial-fraud-detection-xgboost/data into Vantage in a table named \"transaction_data\". We checked the data size and printed sample rows: 63k rows and 12 columns.
\n", + "*Please scroll down to the end of the notebook for detailed column descriptions of the dataset.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "txn_data = DataFrame(in_schema('DEMO_GLM_Fraud', 'transaction_data'))\n", + "\n", + "print(txn_data.shape)\n", + "txn_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this simulated scenario, deceptive agents engage in transactions with the objective of taking control of customers' accounts, transferring funds to another account, and ultimately cashing out for profit.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.1 How many fraudulent transactions do we have in our dataset?
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# There are 92 fraud transactions i.e. 0.14% of fraud transactions in the dataset.\n", + "print(\"No of fraud transactions: %d\\nPercentage of fraud transactions: %.2f%%\"%(\n", + " txn_data.loc[txn_data.isFraud == 1].shape[0],\n", + " txn_data.loc[txn_data.isFraud == 1].shape[0]/txn_data.shape[0]*100)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.2 How many transactions do we have group by transaction type?
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Filter data for fraud transactions and group by 'type'\n", + "transactions_by_type = txn_data.groupby('type').count().get(['type','count_txn_id'])\n", + "\n", + "\n", + "# Sort by 'count_step' column in descending order\n", + "transactions_by_type = transactions_by_type.sort('count_txn_id', ascending = False)\n", + "\n", + "transactions_by_type = transactions_by_type.assign(\n", + " type_int = case([\n", + " (transactions_by_type.type == 'CASH_IN', 0),\n", + " (transactions_by_type.type == 'CASH_OUT', 1),\n", + " (transactions_by_type.type == 'DEBIT', 2),\n", + " (transactions_by_type.type == 'PAYMENT ', 3),\n", + " (transactions_by_type.type == 'TRANSFER', 4),\n", + " ])\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transactions_by_type.plot(\n", + " x = transactions_by_type.type_int,\n", + " y = transactions_by_type.count_txn_id,\n", + " kind = 'bar',\n", + " legend = ['Count by Type'],\n", + " ylabel = 'Count of Transactions',\n", + " xlabel = spacing_small.join(sorted(list(transactions_by_type[['type']].get_values().flatten()))),\n", + " title = \"Number of Transactions per Transaction Type\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.3 How many fraudulent transactions do we have group by transaction type?
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Filter data for fraud transactions and group by 'type'\n", + "fraud_transactions_by_type = txn_data.loc[txn_data.isFraud == 1].groupby('type').count().get(['type','count_txn_id'])\n", + "\n", + "# Sort by 'count_step' column in descending order\n", + "fraud_transactions_by_type = fraud_transactions_by_type.sort('count_txn_id', ascending = False)\n", + "\n", + "fraud_transactions_by_type = fraud_transactions_by_type.assign(\n", + " total_fraud = txn_data.loc[txn_data.isFraud == 1].shape[0],\n", + " type_int = case([(fraud_transactions_by_type.type == 'TRANSFER', 0)], else_ = 1)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "fraud_transactions_by_type.plot(\n", + " x = fraud_transactions_by_type.type_int,\n", + " y = [fraud_transactions_by_type.total_fraud, fraud_transactions_by_type.count_txn_id],\n", + " kind = 'bar',\n", + " figsize = (800, 500),\n", + " legend = ['Total Fraud', 'Count by Type'],\n", + " ylabel = 'Count of Fraud Transactions',\n", + " xlabel = 'TRANSFER' + spacing_large + 'CASH_OUT',\n", + " title = \"Number of Fraud Transactions by Transaction Type\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the above result, we can see that out of the 92 fraud transactions, 47 are from transaction type \"TRANSFER\" and 45 are from \"CASH_OUT\".
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.4 What percentage of fraudulent transactions do we have where transaction amount is equal to old balance in the origin account?
\n", + "\n", + "This might be the case where the fraudster emptied the account of the victim.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"No of cleanout fraud transactions: %d\\nPercentage of cleanout fraud transactions: %.2f%%\"%(\n", + " txn_data.loc[txn_data['amount'] == txn_data.oldbalanceOrig].loc[txn_data['isFraud'] == 1].shape[0],\n", + " txn_data.loc[txn_data['amount'] == txn_data.oldbalanceOrig].loc[txn_data['isFraud'] == 1].shape[0] / txn_data.loc[txn_data.isFraud == 1].shape[0]*100)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the above result, we can see that out of 92 Fraud transactions, the amount involved in 90 fraud transactions was equal to the total balance in the account.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below are some insights about the dataset:
\n", + "3.5 Univariate statistics
\n", + "\n", + "The describe function computes the count, mean, std, min, percentiles, and max for numeric columns.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "txn_data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.6 Checking for Null Values
\n", + "The ColumnSummary() function can be used to take a quick look at the columns, their datatypes, and summary of NULLs/non-NULLs for a given table.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "colsum = ColumnSummary(\n", + " data = txn_data,\n", + " target_columns = [':']\n", + ")\n", + "colsum.result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.7 Checking for Outliers
\n", + "The OutlierFilterFit() function calculates the lower percentile, upper percentile, count of rows and median for all the \"target_columns\" provided by the user. These metrics for each column help the function OutlierTransform() detect outliers in data.
\n", + "\n", + "Here we are using teradataml syntax for the function. The same can be achieved using the following SQL as well.
\n", + "\n", + "SELECT * FROM TD_OutlierFilterFit(\n",
+ " ON \"DEMO_GLM_Fraud\".\"transaction_data\" AS InputTable\n",
+ " OUT TABLE OutputTable(\"DEMO_USER\".\"Outlier_output\")\n",
+ " USING\n",
+ " TargetColumns('amount','newbalanceOrig','oldbalanceDest','newbalanceDest','oldbalanceOrig')\n",
+ ") as dt;
\n",
+ "\n",
+ "*Please note that both the versions run in-database and there is no data transfer involved.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fit_object = OutlierFilterFit(\n", + " data = txn_data,\n", + " target_columns = ['amount','newbalanceOrig', 'oldbalanceDest','newbalanceDest','oldbalanceOrig']\n", + ")\n", + "\n", + "res = fit_object.transform(data = txn_data).result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Rows before removing outliers: {txn_data.shape[0]}\\n\\\n", + "Rows after removing outliers: {res.shape[0]}\\n\\\n", + "Total outliers: {txn_data.shape[0] - res.shape[0]}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "outliers = td_minus([txn_data, res])\n", + "outliers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll perform the following steps:
\n", + "We perform feature scaling during data pre-processing to handle highly varying magnitudes, values, or units. If feature scaling is not done, then a machine learning algorithm tends to weigh greater values higher and consider smaller values as lower ones, regardless of the unit of the values.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4.1 Drop redundant columns
\n", + "We don't need nameDest, nameOrig, and isFlaggedFraud for model training as they do not impact the outcome. We have txn_id to uniquely identify each transaction.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "txn_data = txn_data.drop(['nameDest', 'nameOrig', 'isFlaggedFraud'], axis = 1)\n", + "txn_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4.2 One-hot encoding
\n", + "\n", + "Here, we are one-hot encoding the \"type\" column. We find one-hot encoding necessary in many cases to represent categorical variables as binary values, enable numerical processing, ensure feature independence, handle non-numeric data, and improve the performance and interpretability of our machine learning models.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "txn_type_encoder = OneHotEncoder(\n", + " values = [\"CASH_IN\", \"CASH_OUT\", \"DEBIT\", \"PAYMENT\", \"TRANSFER\"],\n", + " columns = \"type\"\n", + ")\n", + "\n", + "retain = Retain(\n", + " columns = ['step', 'amount','newbalanceOrig','oldbalanceDest','newbalanceDest','oldbalanceOrig', 'isFraud']\n", + ")\n", + "\n", + "obj = valib.Transform(\n", + " data = txn_data,\n", + " one_hot_encode = txn_type_encoder,\n", + " retain = retain,\n", + " index_columns = 'txn_id'\n", + ")\n", + "txn_trans = obj.result\n", + "txn_trans" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above output shows that we have transformed the data into a transfromed dataset.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(txn_trans, table_name = 'clean_data', if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll create two datasets for training and testing in the ratio of 80:20.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "TrainTestSplit_out = TrainTestSplit(\n", + " data = txn_trans,\n", + " id_column = \"txn_id\",\n", + " train_size = 0.80,\n", + " test_size = 0.20,\n", + " seed = 25\n", + ")\n", + "\n", + "df_train = TrainTestSplit_out.result[TrainTestSplit_out.result['TD_IsTrainRow'] == 1].drop(['TD_IsTrainRow'], axis = 1)\n", + "df_test = TrainTestSplit_out.result[TrainTestSplit_out.result['TD_IsTrainRow'] == 0].drop(['TD_IsTrainRow'], axis = 1)\n", + "\n", + "print(\"Training Set = \" + str(df_train.shape[0]) + \". Testing Set = \" + str(df_test.shape[0]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "copy_to_sql(df_train, table_name = 'clean_data_train', if_exists = 'replace')\n", + "copy_to_sql(df_test, table_name = 'clean_data_test', if_exists = 'replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df_train" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above output shows that we have transformed the data into a scaled dataset. Scaling our data makes it easy for our model to learn and understand the problem.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The XGBoost() function, also known as eXtreme Gradient Boosting, is an implementation of the gradient boosted decision tree algorithm designed for speed and performance. It has recently been dominating applied machine learning.
\n", + "In gradient boosting, each iteration fits a model to the residuals (errors) of the previous iteration to correct the errors made by existing models. The predicted residual is multiplied by a learning rate (the shrinkage factor) and then added to the previous prediction. Models are added sequentially until no further improvements can be made. It is called gradient boosting because it uses a gradient descent algorithm to minimize the loss when adding new models.
\n", + "\n", + "Here we are using teradataml syntax for the function. The same can be achieved using the following SQL as well.
\n", + "\n", + "SELECT * FROM TD_XGBoost(\n",
+ "\tON \"DEMO_USER\".\"clean_data_train\" AS \"input\"\n",
+ "\tPARTITION BY ANY\n",
+ "\tUSING InputColumns('amount','newbalanceOrig','oldbalanceDest','newbalanceDest','oldbalanceOrig','CASH_IN_type','CASH_OUT_type','DEBIT_type','PAYMENT_type','TRANSFER_type')\n",
+ "\tResponseColumn('isFraud')\n",
+ "\tMaxDepth(7)\n",
+ "\tSeed(42)\n",
+ "\tModelType('Classification')\n",
+ "\tRegularizationLambda(120.0)\n",
+ "\tShrinkageFactor(0.1)\n",
+ ") as sqlmr
\n",
+ "\n",
+ "*Please note that both the versions run in-database and there is no data transfer involved.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cols = df_train.columns\n", + "cols.remove('txn_id')\n", + "cols.remove('step')\n", + "cols.remove('isFraud')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "XGBoost_out = XGBoost(\n", + " data=df_train,\n", + " input_columns=cols,\n", + " response_column = 'isFraud',\n", + " lambda1 = 120.0,\n", + " model_type='Classification',\n", + " seed=42,\n", + " shrinkage_factor=0.1,\n", + " max_depth=7\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "XGBoost_out.output_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The function output is a trained XGBoost model, and we can input it to the XGBoostPredict() function for prediction.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The XGBoostPredict() function runs the predictive algorithm based on the model generated by XGBoost(). The XGBoost() function, also known as eXtreme Gradient Boosting, performs classification or regression analysis on datasets.
\n", + "\n", + "When using the function, we should provide only numeric features. We need to convert the categorical features to numeric values before prediction.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "XGBoostPredict_out = XGBoostPredict(\n", + " newdata=df_test,\n", + " object=XGBoost_out.result,\n", + " model_type='Classification',\n", + " id_column='txn_id',\n", + " object_order_column=['task_index', 'tree_num',\n", + " 'iter', 'tree_order'],\n", + " accumulate='isFraud',\n", + " output_prob=True,\n", + " output_responses=['0', '1']\n", + ").result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "XGBoostPredict_out" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output above shows our prob_1, i.e., the transaction is fraud, and prob_0, i.e., the transaction is not a fraud. We use these probabilities in our prediction column to assign a class label.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "combined_df = df_test.join(XGBoostPredict_out, on='txn_id', lsuffix='test', rsuffix='pred')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "combined_df[combined_df['Prediction']==1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "out = XGBoostPredict_out.assign(Prediction = XGBoostPredict_out.Prediction.cast(type_ = BYTEINT))\n", + "out = out.assign(Prediction = out.Prediction.cast(type_ = VARCHAR(2)))\n", + "out = out.assign(isFraud = out.isFraud.cast(type_ = VARCHAR(2)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ClassificationEvaluator_obj = ClassificationEvaluator(\n", + " data = out,\n", + " observation_column = 'isFraud',\n", + " prediction_column = 'Prediction',\n", + " labels = ['0', '1']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ClassificationEvaluator_obj.output_data.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We create the ROC curve, which is a graph between TPR (True Positive Rate) and FPR (False Positive Rate). We use the area under the ROC curve as a metric to evaluate how well our model can distinguish between positive and negative classes. A higher AUC indicates better performance in distinguishing between the positive and negative categories. We generally consider an AUC above 0.75 as decent.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from teradataml import ROC\n", + "\n", + "roc_out = ROC(\n", + " probability_column = '\"Prob_1\"',\n", + " observation_column = \"isFraud\",\n", + " positive_class = \"1\",\n", + " data = XGBoostPredict_out,\n", + " num_thresholds=300\n", + ")\n", + "\n", + "roc_out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Assigning new index column\n", + "roc_out.result = roc_out.result.assign(row = 1)\n", + "# Changing the index label.\n", + "roc_out.result._index_label = [\"row\"]\n", + "auc = roc_out.result.get_values()[0][0]\n", + "\n", + "figure = Figure(width=500, height=400, heading=\"Receiver Operating Characteristic (ROC) Curve\")\n", + "\n", + "plot = roc_out.output_data.plot(\n", + " x=roc_out.output_data.fpr,\n", + " y=[roc_out.output_data.tpr, roc_out.output_data.fpr],\n", + " xlabel='False Positive Rate',\n", + " ylabel='True Positive Rate',\n", + " color='carolina blue',\n", + " figure=figure,\n", + " legend=[f'XGBoost AUC = {round(auc, 4)}', 'AUC Baseline'],\n", + " legend_style='lower right',\n", + " grid_linestyle='--',\n", + " grid_linewidth=0.5,\n", + " linestyle = ['-', '--']\n", + ")\n", + "\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the above ROC Curve, we can confidently say that our model has performed well on testing data. The AUC value is above 0.75 and resonates with our understanding that the model is performing well.
\n", + "\n", + "Conclusion
\n", + "\n", + "In this demonstration, we have illustrated a simplified - but complete - overview of how we can implement a typical machine learning workflow completely inside the database using Vantage. This allows us to leverage Vantage's operational scale, power, and stability.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Work Tables
\n", + "We need to clean up our work tables to prevent errors next time.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['clean_data', 'clean_data_train', 'clean_data_test']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name = table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s look at the elements we have available for reference for this notebook:
\n", + "\n", + "Filters:
\n", + "Related Resources:
\n", + "\n", + "Links:
\n", + "\n",
+ " Store Sales Forecasting with In-Database Time Series\n",
+ "
\n",
+ " \n",
+ "
Introduction
\n", + "\n", + "\n", + "Retail stores rely on sales and an accurate amount of inventory to support these sales. However, demand can be everchanging leading to stores being overstocked or out of stock. In these situations, retail stores need to quickly adjust to increase revenues and avoid additional unnecessary costs. The best way to keep ROI up is with retail demand forecasting in Teradata Vantage and ClearScape Analytics. Teradata’s capabilities allow users to combine and analyze sales and inventory data across all stores, while taking into consideration seasonal events, such as holidays or the weather. Bringing together all the components that influence customers to buy products allows retail stores to accurately predict sales and demand to ensure for precise inventory.
\n", + "\n", + "\n", + "Good Eats Grocery is a renowned retail corporation that operates a chain of hypermarkets. Here, Good Eats Grocery has provided a dataset combining data from 45 stores, including store information and monthly sales. The data is provided on a weekly basis. Good Eats Grocery wants to find the impact of holidays on store sales, so the dataset includes four holiday weeks: Christmas, Thanksgiving, Super Bowl, and Labor Day.
\n",
+ "
\n",
+ "Our main objective is to predict the sales of a store in a given week. Because the dataset provides store size and time-related data as features, we analyze whether sales are impacted by time-based and space-based factors. Most importantly, how much does the inclusion of a holiday in a week boost store sales?\n",
+ "
\n",
+ " \n",
+ "
Business Value
\n", + "Why Vantage?
\n", + "Unbounded Array Framework (UAF) is the Teradata framework for building end-to-end time series forecasting pipelines. It also provides functions for digital signal processing and 4D spatial analytics. The series can reside in any Teradata supported or Teradata accessible table or in an analytic result table (ART). The UAF architecture provides a range of unique benefits including:
\n", + "\n", + "UAF provides data scientists with the tools for all phases of forecasting:
\n", + "Plus, with Teradata Vantage, users can perform these functions at scale and analyze and forecast hundreds or thousands of series at once. Time Series analysis requires significant effort in analyzing, preparing, and testing forecast models. Traditional approaches require users to perform these laborious tasks multiple times for each prediction, so scaling forecasting efforts beyond a small number of different forecasts becomes prohibitive.
\n", + " \n", + "\n", + "Data
\n", + "The dataset contains historical sales data for 45 Good Eats Grocery stores located in different regions. Each store contains a number of departments, and you are tasked with predicting the department-wide sales for each store.
\n", + "\n", + "In addition, Good Eats Grocery runs several promotional markdown events throughout the year. These markdowns precede prominent holidays, the four largest of which are the Super Bowl, Labor Day, Thanksgiving, and Christmas. The weeks including these holidays are weighted five times higher in the evaluation than non-holiday weeks. Part of the challenge presented by this competition is modelling the effects of markdowns on these holiday weeks in the absence of complete/ideal historical data.
\n", + "\n", + "The basic idea of analyzing the Good Eats Grocery Forecasting dataset is to get a fair idea about the factors affecting the Sales of the Good Eats Grocery Store.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Configure the environment
\n", + "\n",
+ "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n",
+ "
\n",
+ "Here's how we can do this:
2. Connect to VantageCloud Lake
\n", + "Connect to VantageCloud using create_context
from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.
3.Load the data
\n", + "\n", + "We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_SalesForecasting\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").
\n", + " \n", + "**Note: The tables are available in DEMO_SalesForecasting_DB database and we have created views in DEMO_SalesForecasting database which are used in the cells below
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.1 Prepare data to do some basic Analysis of the Sales data.
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let us start by creating a \"Virtual DataFrame\" that points directly to the dataset in Vantage. We begin our analysis by obtaining the necessary data types for columns and extract values such as Sales_week, Sales_year, etc., from the Sales_date column. These extracted values will be used in our subsequent analysis.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df=DataFrame(in_schema('DEMO_SalesForecasting','Weekly_Sales'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml.dataframe.sql_functions import case\n", + "from teradatasqlalchemy import TIMESTAMP, VARCHAR, INTEGER\n", + "from sqlalchemy import func\n", + "df = df.assign(IsHoliday = case([(df.IsHoliday == 0, 'False')], else_ = 'True'))\n", + "df = df.assign(Sales_Week = func.td_week_of_year(df.Sales_Date.expression))\n", + "df = df.assign(Sales_Date = df.Sales_Date.cast(type_=TIMESTAMP))\n", + "df = df.assign(Sales_Year = df.Sales_Date.cast(type_=VARCHAR(10)))\n", + "df = StrApply(data=df,\n", + " target_columns='Sales_Year',\n", + " string_operation='SUBSTRING',\n", + " string_length = 4,\n", + " accumulate = ['Store', 'Dept', 'Sales_Date', 'Weekly_Sales', 'IsHoliday','Sales_Week'],\n", + " in_place=True).result\n", + "df = df.assign(Sales_Year = df.Sales_Year.cast(type_=INTEGER))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "testdf=df\n", + "testdf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Let's do some basic analysis of the dataset
\n", + "We group the weekly sales by Sales Date and calculate the Average Sales based on Sales date. Alongside aggregating the data, we leverage the InDB plot() function for teradataml dataframes to visualize the data. This allows us to avoid transferring data to the client side even for visualizations.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df=testdf.select(['Sales_Date','Weekly_Sales']).groupby('Sales_Date')\n", + "df_plot=df.avg()\n", + "df_plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=800, heading=\"Average Weekly Sales\")\n", + "plot = df_plot.plot(x=df_plot.Sales_Date, y=df_plot.avg_Weekly_Sales,\n", + " xtick_format='YYYY-MM',\n", + " xlabel='Week', ylabel='Sales', color=\"blue\",figure=figure)\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above graph shows the Average Sales per week. We can see that there are peaks mainly during the Year end period.
\n", + "Next, we get the average sales for each Store; to do that, we group the Weekly Sales by Store.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "weekly_sales = testdf.select(['Store','Weekly_Sales']).groupby('Store')\n", + "ws_plot=weekly_sales.avg()\n", + "ws_plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=800, heading=\"Average Sales per Store\")\n", + "plot = ws_plot.plot(x=ws_plot.Store, y=ws_plot.avg_Weekly_Sales,\n", + " kind='bar',\n", + " xlabel='Store', ylabel='Sales', figure=figure)\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above graph shows the Average Weekly Sales for each store. We can see that Store 4 shows highest weekly sales while Store 5 shows the lowest weekly sales.
\n", + "Next we try to get the Weekly Sales for each year separately. For this we group the data for all 3 years by Sales Date for each year
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "week_df = testdf.select(['Sales_Week','Sales_Year','Weekly_Sales'])\n", + "week_df = week_df.assign(Weekly_Sales_2010 = case([(week_df.Sales_Year == 2010, week_df.Weekly_Sales)], else_ = 0))\n", + "week_df = week_df.assign(Weekly_Sales_2011 = case([(week_df.Sales_Year == 2011, week_df.Weekly_Sales)], else_ = 0))\n", + "week_df = week_df.assign(Weekly_Sales_2012 = case([(week_df.Sales_Year == 2012, week_df.Weekly_Sales)], else_ = 0))\n", + "week_df = week_df.select(['Sales_Week','Weekly_Sales_2010','Weekly_Sales_2011','Weekly_Sales_2012'])\n", + "week_df = week_df.groupby('Sales_Week')\n", + "week_df = week_df.avg()\n", + "week_df = week_df[((week_df.avg_Weekly_Sales_2010 != 0.0 ) & (week_df.avg_Weekly_Sales_2011 != 0.0) &\n", + " (week_df.avg_Weekly_Sales_2012 != 0.0))]\n", + "week_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=600, heading=\"Average Weekly Sales per Year\")\n", + "week_df.plot(x=week_df.Sales_Week, y=[week_df.avg_Weekly_Sales_2010, week_df.avg_Weekly_Sales_2011, week_df.avg_Weekly_Sales_2012], \n", + " style=['dark orange', 'green','blue'], xlabel='Week', ylabel='Sales', grid_color='black',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", legend=['2010','2011','2012'],figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above graph shows the Average Weekly Sales for different years. We can see that there are peaks mainly during 10-15th week and 20-30th week.
\n", + "We try to get the comparison of Sales during Holidays and Other Working Days. We do a grouping of data for Sales based on whether the Sale is on Holiday or Working Day
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "week_holiday_df = testdf.select(['Sales_Date','Sales_Week','IsHoliday','Weekly_Sales'])\n", + "week_holiday_df = week_holiday_df.assign(Weekly_Sales_True = case([(week_holiday_df.IsHoliday == 'True', week_holiday_df.Weekly_Sales)], else_ = 0))\n", + "week_holiday_df = week_holiday_df.assign(Weekly_Sales_False = case([(week_holiday_df.IsHoliday == 'False', week_holiday_df.Weekly_Sales)], else_ = 0))\n", + "week_holiday_df = week_holiday_df.select(['Sales_Date','Sales_Week','Weekly_Sales_True','Weekly_Sales_False'])\n", + "week_holiday_df = week_holiday_df.groupby(['Sales_Date','Sales_Week'])\n", + "week_holiday_df = week_holiday_df.sum()\n", + "week_holiday_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=600, heading=\"Total Sales per Week\")\n", + "week_holiday_df.plot(x=week_holiday_df.Sales_Week, y=[week_holiday_df.sum_Weekly_Sales_True, week_holiday_df.sum_Weekly_Sales_False], \n", + " style=['blue','brown'], xlabel='Week', ylabel='Sales', grid_color='black',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", legend=['Holidays','Week Days'],kind='bar', figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above graph shows the Weekly Sales per Week. The Orange colored bars show weekly sales during working days while the Blue colored bars show weekly sales during holidays.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. Preparing Dataset by joining the datasets.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "
We prepare the dataset by creating a view by joining data from Weekly Sales, Stores and features. The view is created using SQL to reduce the number of steps to join and data preocessing which gets used in further steps.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query2='''REPLACE VIEW Weekly_Sales_Details AS\n", + "SELECT\n", + " w.Sales_date AS times,\n", + " CAST('2012-02-03' AS DATE) AS cutoff_date,\n", + " w.Dept,\n", + " w.Store,\n", + " CAST(w.Sales_Date AS TIMESTAMP) AS Sales_Date,\n", + " ZEROIFNULL(Weekly_Sales) AS Weekly_Sales,\n", + " ZEROIFNULL(Store_Size) AS Store_Size,\n", + " Store_Type AS Store_Type,\n", + " w.IsHoliday,\n", + " ZEROIFNULL(Temperature) AS Temperature,\n", + " ZEROIFNULL(MarkDown1) AS MarkDown1,\n", + " ZEROIFNULL(MarkDown2) AS MarkDown2,\n", + " ZEROIFNULL(MarkDown3) AS MarkDown3,\n", + " ZEROIFNULL(MarkDown4) AS MarkDown4,\n", + " ZEROIFNULL(MarkDown5) AS MarkDown5,\n", + " ZEROIFNULL(CPI) AS CPI,\n", + " ZEROIFNULL(Unemployment) AS Unemployment,\n", + " ZEROIFNULL(Fuel_Price) AS Fuel_Price,\n", + " CAST(TRIM(w.Dept) || TRIM(w.Store) AS INT) AS idcols\n", + "FROM\n", + " Demo_SalesForecasting.Weekly_Sales w\n", + "LEFT JOIN\n", + " Demo_SalesForecasting.Stores s ON w.Store = s.Store\n", + "LEFT JOIN\n", + " Demo_SalesForecasting.Features f ON w.Store = f.store AND w.Sales_Date = f.Sales_Date\n", + "WHERE\n", + " w.Store IN (20, 4);\n", + "'''\n", + "\n", + "execute_sql(query2)\n", + "modeldf=DataFrame.from_query('select * from Weekly_Sales_Details;')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dfacheck = modeldf.groupby([\"idcols\"])\n", + "dfacheck=dfacheck.count().select([\"idcols\",\"count_Sales_Date\"])\n", + "\n", + "dfa4=modeldf.join(dfacheck, on = 'idcols', how = \"left\", lsuffix = 't1', rsuffix = 't2').drop(['idcols_t2'],axis=1)\n", + "dfa4=dfa4.assign(idcols = dfa4['idcols_t1'])\n", + "dfa4=dfa4.drop(['idcols_t1'],axis=1)\n", + "\n", + "# filter out incomplete time series \n", + "\n", + "modeldf1 = dfa4[dfa4.count_Sales_Date == 143]\n", + "modeldf1.shape" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "modeldf1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "5. Checking for Stationarity of Time Series using the Dickey Fuller Test
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To be able to model a time series, it needs to be stationary. ARIMA models deal with non-stationary time series by differencing (The \"d' parameter in ARIMA determines the number of differences needed to make a series stationary)
\n", + "Here we will check for stationarity of all time series using the Dickey-Fuller Test. For more info on the test, see here. \n", + "
The null hypothesis for the test is that the data is non-stationary. We want to REJECT the null hypothesis for this test. So, we want a p-value of less than 0.05 (or smaller) and a negative coefficient value for the lag term in our regression model.
\n", + "The Dickey fuller function needs series data, so we use the TDSeries function to create a series and apply DickeyFuller to check the stationarity of the data.
\n", + "We use the OutlierFilterFit and the OutlierFilterTransform functions to remove the outliers in the series and then use the Rescaled Data to check the stationarity of the data using the DickeyFuller function.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_df=modeldf1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The OutlierFilterFit() function calculates the lower_percentile, upper_percentile, count of rows and median for all the \"target_columns\" provided by the user. These metrics for each column help the function OutlierFilterTransform() detect outliers in the input table. It also stores parameters from arguments into a FIT table used during transformation. The lower_percentile specifies the lower range of percentile to be used to detect whether a value is an outlier, and the upper_percentile specifies the upper range of percentile to be used to detect whether a value is an outlier.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import OutlierFilterFit\n", + "OutlierFilterFit_out = OutlierFilterFit(data = sales_df,\n", + " target_columns = \"Weekly_Sales\",\n", + " )\n", + "out_df=OutlierFilterFit_out.output_data\n", + "out_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The OutlierFilterfit creates a fit table with different values which need to be applied on the data to get the transformed data.
\n", + "\n", + "OutlierFilterTransform() function filters the outliers from the input teradataml DataFrame.
\n", + "OutlierFilterTransform() uses the result DataFrame from OutlierFilterFit() function to get statistics like median, count of rows, lower percentile and upper percentile for every column specified in target columns argument and filters the outliers in the input data.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import OutlierFilterFit, OutlierFilterTransform\n", + "obj = OutlierFilterTransform(data=sales_df,\n", + " object=OutlierFilterFit_out.result)\n", + "out_transform_df = obj.result\n", + "out_transform_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The OutlierFilterTransform transforms the data and creates the output data after applying the Fit Table details on the data.
\n", + "\n", + "The Resample() function transforms an irregular time series into a regular time series. It can also be used to alter the sampling interval for a time series. The Resample function requires a series as input, for which we use the TDSeries function.
\n", + "\n", + "TDSeries creates an object from a teradataml DataFrame representing a SERIES in time series, which is used as input to Unbounded Array Framework time series functions. A series is a one-dimensional array. They are the basic input of UAF functions. A series is identified by its series ID, i.e., \"id\" argument, and indexed by \"row_index\" argument. Series is passed to and returned from UAF functions as wavelets. Wavelets are collections of rows, grouped by one or more fields, and ordered on the \"row_index\" argument.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Resample\n", + "data_series_df = TDSeries(data=obj.result,\n", + " id=\"idcols\",\n", + " row_index=(\"Sales_Date\"),\n", + " row_index_style= \"TIMECODE\",\n", + " payload_field=\"Weekly_Sales\",\n", + " payload_content=\"REAL\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "uaf_out1 = Resample(data=data_series_df,\n", + " interpolate='LINEAR',\n", + " timecode_start_value=\"TIMESTAMP '2010-02-05 00:00:00'\",\n", + " timecode_duration=\"WEEKS(1)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df=uaf_out1.result\n", + "df1=df.select(['idcols','ROW_I', 'Weekly_Sales']).assign(Sales_Date=df.ROW_I)\n", + "df1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The DickeyFuller() function tests for the presence of one or more unit roots in a series to determine if the series is non-stationary. When a series contains unit roots, it is non-stationary. When a series contains no unit roots, whether the series is stationary is based on other factors.
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import DickeyFuller\n", + "data_series_df_1 = TDSeries(data=df1,\n", + " id=\"Sales_Date\",\n", + " row_index=(\"idcols\"),\n", + " row_index_style= \"SEQUENCE\",\n", + " payload_field=\"Weekly_Sales\",\n", + " payload_content=\"REAL\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_out = DickeyFuller( data=data_series_df_1,\n", + " algorithm='NONE')\n", + "\n", + "# Print the result DataFrame.\n", + "print(df_out.result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "In the above output the p-value corresponding to the calculated test statistic is less than 0.05. It means that the series is stationary. The output column NULL_HYP which means NULL HYPOTHESIS can have 2 values \n", + "
6. ARIMA Modelling
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "ARIMA stands for Autoregressive Integrated Moving Average. It is a statistical method used for time series forecasting and analysis. ARIMA is a form of regression analysis that gauges the strength of one dependent variable relative to other changing variables. ARIMA models are popular in various fields, including finance, economics, and environmental science, for predicting future points in a time series based on its historical values.
\n", + "The ArimaEstimate() function estimates the coefficients corresponding to an ARIMA (AutoRegressive Integrated Moving Average) model and fits a series with an existing ARIMA model. The function can also provide the \"goodness of fit\" and the residuals of the fitting operation. The function generates a model layer that is used as input for the ArimaValidate() and ArimaForecast() functions. This function is for univariate series.
\n", + "\n", + "The following procedure is an example of how to use ArimaEstimate() function:
\n", + "Here the input series to the ArimaEstimate is the output series of the Resample function. The series is created by using the output of Resample function and passed to ArimaEstimate.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ArimaEstimate\n", + "# Execute ArimaEstimate function.\n", + "arima_est_out = ArimaEstimate(data1=data_series_df_1,\n", + " nonseasonal_model_order=[2,1,1],\n", + " constant=False,\n", + " algorithm=\"MLE\",\n", + " coeff_stats=True,\n", + " fit_metrics=True,\n", + " residuals=True,\n", + " fit_percentage=80)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "est_result=arima_est_out.fitresiduals\n", + "est_result = est_result.groupby('Sales_Date').avg()\n", + "est_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We plot the Actual Value of Weekly Sales vs the Calculated Value of the ArimaEstimate function.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=700, heading=\"Comparison of Actual vs Predicted Sales\")\n", + "est_result.plot(x=est_result.Sales_Date, y=[est_result.avg_ACTUAL_VALUE, est_result.avg_CALC_VALUE], \n", + " style=['dark orange', 'green'], xlabel='Sales Date', ylabel='Sales', grid_color='black',xtick_format='YYYY-MM',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", legend=['Actual Value','Predicted Value'],figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ArimaValidate() function performs an in-sample forecast for both seasonal and non-seasonal auto-regressive (AR), moving-average (MA), ARIMA models and Box-Jenkins seasonal ARIMA model formula followed by an analysis of the produced residuals. The aim is to provide a collection of metrics useful to select the model and expose the produced residuals such that multiple model validation and statistical tests can be conducted.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ArimaValidate\n", + "data_art_df = TDAnalyticResult(data=arima_est_out.result)\n", + "\n", + "\n", + "arima_val_out = ArimaValidate(data=data_art_df, fit_metrics=True, residuals=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "val_result=arima_val_out.fitresiduals\n", + "val_result = val_result.groupby('Sales_Date').avg()\n", + "val_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We plot the Actual Value of Weekly Sales vs the Calculated Value of the ArimaValidate function.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=700, heading=\"Comparison of Actua vs Predicted\")\n", + "val_result.plot(x=val_result.Sales_Date, y=[val_result.avg_ACTUAL_VALUE, val_result.avg_CALC_VALUE], \n", + " style=['dark orange', 'green'], xlabel='Sales Date', ylabel='Sales', grid_color='black',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", legend=['Actual Value','Predicted Value'],figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The ArimaForecast() function is used to forecast a user-defined number of periods based on models fitted from the ArimaEstimate() function.
\n", + "Here we are considering 7 periods (forecast_periods=7)
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ArimaForecast\n", + "arima_estimate_op = ArimaEstimate(data1=data_series_df_1,\n", + " nonseasonal_model_order=[2,1,1],\n", + " constant=False,\n", + " algorithm=\"MLE\",\n", + " coeff_stats=True,\n", + " fit_metrics=True,\n", + " residuals=True,\n", + " fit_percentage=100)\n", + "\n", + "# Create teradataml TDAnalyticResult object over the result attribute of 'arima_estimate_op'\n", + "data_art_df = TDAnalyticResult(data=arima_estimate_op.result)\n", + " \n", + "arima_forcast_out = ArimaForecast(data=data_art_df, forecast_periods=7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "forecast_result=arima_forcast_out.result\n", + "forecast_result = forecast_result.groupby('ROW_I').avg()\n", + "forecast_result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We plot the Forecasted Value of Weekly Sales for the defined number of periods.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import Figure\n", + "figure = Figure(width=1000, height=700, heading=\"Forecast Sales\")\n", + "forecast_result.plot(x=forecast_result.ROW_I, y=forecast_result.avg_FORECAST_VALUE, \n", + " xlabel='Forecast Period', ylabel='Forecast Sales', grid_color='black',\n", + " grid_linewidth=0.5, grid_linestyle=\"-\", figure=figure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "7. Conclusion:
\n", + "We have trained and validated the ARIMA model on the Weekly Sales dataset, and the results closely match the actual data. The goodness of fit metrics calculated in the estimate and validate phase also resonate with our understanding that the model is well-trained to forecast. This can be observed in the Estimate and the Validate function graphs. So, we can say that the model is well trained to forecast the Weekly Sales.
\n", + "\n", + "Thus, with Teradata Vantage we can rapidly run data exploration, preparation, and testing functions that analyze massive amounts of data across an unlimited number of forecasts in parallel, drastically reducing development and testing times. We can create an unlimited number of forecasts in parallel, unlocking value in hyper-segmented (per-store-per-SKU inventory demand, per-household energy consumption) predictions, based on individualized models.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "8. Cleanup
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " db_drop_view('Weekly_Sales_Details')\n", + "except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let’s look at the elements we have available for reference for this notebook:
\n", + "\n", + "Dataset
\n", + "This is the historical data that covers sales from 2010-02-05 to 2012-11-01. Within this file you will find the following fields:
\n", + "\n", + "Filters:
\n", + "Related Resources:
\n", + "\n",
+ " Store Sales Forecasting with Prophet using Script Table Operator\n",
+ "
\n",
+ " \n",
+ "
Introduction
\n", + "\n", + "Rossmann operates over 3,000 drug stores in 7 European countries. Currently, Rossmann store managers are tasked with predicting their daily sales for up to six weeks in advance. Store sales are influenced by many factors, including promotions, competition, school and state holidays, seasonality, and locality. With thousands of individual managers predicting sales based on their unique circumstances, the accuracy of results can be quite varied.
\n", + "Our main objective is to predict the sales of a store for a week. We use the Python Prophet model together with the Open Analytics Framework (OAF) of VantageCloud Lake to forecast the store sales.
\n", + "The Open Analytics Framework builds on the existing Vantage facilities for data scientists and analysts to do the following:
\n", + "APPLY table operator is the VantageCloud Lake successor to the Vantage Enterprise SCRIPT and ExecR table operators. The APPLY table operator bears more similarities to the SCRIPT operating mode, in that APPLY takes an external language script as input to run, rather than ingesting external language statements in a contract function as ExecR does. The APPLY table operator is nevertheless designed to expand its features in the future in a way that encompasses additional key features from both the SCRIPT and ExecR table operators. The fastpath APPLY table operator runs a user-installed script or any Linux command inside the remote user environment using Open Analytics Framework. Installed script runs in parallel with data from Analytics Database.
\n", + "\n", + "An overview of the steps for using the Open Analytics Framework follows.
\n", + "Hence, as data science consultants, we showcase a complete approach to predicting sales for different stores in advance. We demonstrate how to train our models and use them for scoring on the ClearScape Analytics platform. The data we are using is a sample dataset, so the results and predictions may not be entirely accurate.\n", + "
\n", + "Data
\n", + "The dataset contains historical sales data for 1,115 Rossmann stores. The task is to forecast the \"Sales\" column for the test set. Note that some stores in the dataset were temporarily closed for refurbishment.
\n", + "\n", + "Most of the fields are self-explanatory. The following are descriptions for those that aren't.
\n", + "\n", + "1. Configure the environment
\n", + "\n",
+ "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n",
+ "
\n",
+ "Here's how we can do this:
Note: After installing the above libraries, please restart the kernel. The simplest way is to type zero zero: 0 0
\n", + "In this section, we import the required libraries and set environment variables and environment paths (if required).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings(\"ignore\")\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)\n", + "import getpass\n", + "import pandas as pd\n", + "\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "from time import time\n", + "import os\n", + "# from prophet import Prophet\n", + "import warnings\n", + "import itertools\n", + "from dotenv import load_dotenv, dotenv_values\n", + "import pickle\n", + "import base64\n", + "import time\n", + "from teradataml import *\n", + "from IPython.display import display as ipydisplay\n", + "from IPython.display import clear_output\n", + "from time import sleep\n", + "\n", + "display.max_rows=5\n", + "display.suppress_vantage_runtime_warnings = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2. Connect to VantageCloud Lake
\n", + "Connect to VantageCloud using create_context
from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.
3. Load the data
\n", + "\n", + "We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_ProphetSTO\". Your user should have read access to the database. In case of any issues, please send an email to the support group (\"SC230208@teradata.com\").
\n", + " \n", + "**Note: The tables are available in the DEMO_ProphetSTO_DB database, and we have created views in the DEMO_ProphetSTO database, which are used in the cells below
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3.1 Prepare data to do some basic Analysis of the Sales data.
\n", + "\n", + "We create dataframe for the Stores and the Sales Data using tables from Vantage. To gain insights into the data's characteristics, we display a sample of 5 rows each.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "store=DataFrame(in_schema('DEMO_ProphetSTO','Store'))\n", + "store " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Store dataset contains description of the Stores like, StoreType, distance from the Competition Store and also various Promotion codes and Details.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales=DataFrame(in_schema('DEMO_ProphetSTO','Sales_Data'))\n", + "sales " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Store Sales dataset contains the Store, DayofWeek, Date of Sales , Sales done, Customer involved, SalesOpen is a flag mentioning if the Store is Open or Closed and Promotion Code applied for the Sales.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "4. Data Analysis and Transformation
\n", + "In this first section we go through the Sales and store data, handle missing values and create new features for further analysis.
\n", + "We check for missing values in the CompetitionDistance column and replace them with the median value.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import SimpleImputeFit, SimpleImputeTransform\n", + "fit_obj = SimpleImputeFit(data=store,\n", + " stats_columns=\"CompetitionDistance\",\n", + " stats=\"median\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "obj = SimpleImputeTransform(data=store,\n", + " object=fit_obj.output)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "store=obj.result\n", + "store" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We join the Store and Sales dataset to get the required columns for our analysis.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales.merge(right = store, how = \"inner\", on = \"store=store\",lsuffix='l', rsuffix='r')\n", + "sales_store=sales_store.assign(Store=sales_store.Store_l)\n", + "sales_store=sales_store.drop(['Store_l', 'Store_r'], axis=1)\n", + "sales_store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The final dataset used for analysis contains 18 columns and 91,256 rows.
\n", + "Based on the data available, we do some transformations on the data and create various features. From the SalesDate, we generate columns like Year, Month, DayOfWeek, WeekOfYear, etc. Using the columns related to Competition, like CompetitionOpenSinceYear and CompetitionOpenSinceMonth, we calculate if the Competition Store is Open or not (CompetitionOpen). Similarly, we do the processing for Promotions and create a flag (PromoOpen).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales_store.assign(CompetitionOpenSinceYear = \n", + " case([(sales_store.CompetitionOpenSinceYear.isnull() == True, '0')], else_ = sales_store.CompetitionOpenSinceYear),\n", + " CompetitionOpenSinceMonth = \n", + " case([(sales_store.CompetitionOpenSinceMonth.isnull() == True, '0')], else_ = sales_store.CompetitionOpenSinceMonth),\n", + " Promo2SinceYear = \n", + " case([(sales_store.Promo2SinceYear.isnull() == True, '0')], else_ = sales_store.Promo2SinceYear),\n", + " Promo2SinceWeek = \n", + " case([(sales_store.Promo2SinceWeek.isnull() == True, '0')], else_ = sales_store.Promo2SinceWeek)\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales_store.assign(Year = sales_store.SalesDate.year(),\n", + " Month = sales_store.SalesDate.month(),\n", + " Day = sales_store.SalesDate.day_of_month(),\n", + " DayOfWeek = sales_store.SalesDate.day_of_week(),\n", + " WeekOfYear = sales_store.SalesDate.week_of_year())\n", + "\n", + "sales_store = sales_store.assign(CompetitionOpen = 12 * (sales_store.Year - sales_store.CompetitionOpenSinceYear)+\n", + " (sales_store.Month - sales_store.CompetitionOpenSinceMonth),\n", + " PromoOpen = 12 * (sales_store.Year - sales_store.Promo2SinceYear)+\n", + " (sales_store.WeekOfYear - sales_store.Promo2SinceWeek) / 4.0)\n", + "\n", + "\n", + "sales_store = sales_store.assign(CompetitionOpen = case([(sales_store.CompetitionOpen > 0, sales_store.CompetitionOpen)], else_ = 0),\n", + " PromoOpen = case([(sales_store.PromoOpen > 0, sales_store.PromoOpen)], else_ = 0))\n", + " \n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales_store.assign(StoreType = case([(sales_store.StoreType == '0', 0),(sales_store.StoreType == 'a', 1),\n", + " 
(sales_store.StoreType == 'b', 2),(sales_store.StoreType == 'c', 3),\n", + " (sales_store.StoreType == 'd', 4)]),\n", + " Assortment = case([(sales_store.Assortment == '0', 0),(sales_store.Assortment == 'a', 1),\n", + " (sales_store.Assortment == 'b', 2),(sales_store.Assortment == 'c', 3),\n", + " (sales_store.Assortment == 'd', 4)]),\n", + " StateHoliday = case([(sales_store.StateHoliday == '0', 0),(sales_store.StateHoliday == 'a', 1),\n", + " (sales_store.StateHoliday == 'b', 2),(sales_store.StateHoliday == 'c', 3),\n", + " (sales_store.StateHoliday == 'd', 4)])\n", + " \n", + " ) \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store = sales_store.assign(monthStr = case([(sales_store.Month == 1, 'Jan'),(sales_store.Month == 2, 'Feb'),\n", + " (sales_store.Month == 3, 'Mar'),(sales_store.Month == 4, 'Apr'),\n", + " (sales_store.Month == 5, 'May'),(sales_store.Month == 6, 'Jun'),\n", + " (sales_store.Month == 7, 'Jul'),(sales_store.Month == 8, 'Aug'),\n", + " (sales_store.Month == 9, 'Sep'),(sales_store.Month == 10, 'Oct'),\n", + " (sales_store.Month == 11,' Nov'),(sales_store.Month == 12, 'Dec')]),\n", + " IsPromoMonth = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_sales = sales_store.select(['Month','Sales']).groupby('Month').mean()\n", + "plot = plot_sales.plot(x=plot_sales.Month, y=plot_sales.mean_Sales,\n", + " kind='bar', xlabel='Month', ylabel='Sales', color=\"orange\")\n", + " \n", + "# Display the plot.\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above graph shows the total sales across months for all stores. We can see that the sales are highest in December which is the Holiday Season.
\n", + "Now we will see the same metrics across different Store types and also based on whether there was any Promotion available(Promo=1) or not (Promo=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#Catplot month Vs Sales\n", + "features_df = sales_store.to_pandas(all_rows=True)\n", + "sns.catplot(data = features_df, x = 'Month', y = \"Sales\", \n", + " col = 'StoreType', # per store type in cols\n", + " palette = 'plasma',\n", + " # hue = 'StoreType',\n", + " row = 'Promo' # per promo in the store in rows\n", + " # color ='Year'\n", + " ) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
The above graph shows the Sales per Month for each of the 4 StoreTypes(a,b,c,d) for all the 1,115 Stores. The Top row shows the sales for Promo=0 and the bottom row is for Promo=1. Each dot represents the sum of sales for a particular store in a month depending on the Store Type and Promo Code. We can see that there are peaks mainly during the Year end period.
\n", + "All store types follow the same trend but at different scales depending on the presence of the promotion `Promo` and `StoreType` except for the StoreType = b.\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Next we try to get four stores from store types to represent their group:
\n", + "It also makes sense to down sample the data from days to weeks using the `resample` method to see the present trends more clearly.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_df = sales_store.select(['Store','SalesDate','Sales'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_a = train_df[train_df.Store == 2].select(['SalesDate','Sales']).groupby('SalesDate').mean()\n", + "sales_b = train_df[train_df.Store == 85].select(['SalesDate','Sales']).groupby('SalesDate').sum()\n", + "# .sort_index(ascending = True) # solve the reverse order\n", + "sales_c = train_df[train_df.Store == 1].select(['SalesDate','Sales']).groupby('SalesDate').sum()\n", + "sales_d = train_df[train_df.Store == 15].select(['SalesDate','Sales']).groupby('SalesDate').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = subplots(nrows=4, ncols=1)\n", + " \n", + "plot = sales_a.plot(x=sales_a.SalesDate, y=sales_a.mean_Sales,\n", + " ax=axes[0], figure=fig, kind=\"line\",xlabel='Sales Date', ylabel='Sales',\n", + " title=\"Sales for Store 2\", color=\"blue\",figsize=(1200, 1600))\n", + " \n", + "plot = sales_b.plot(x=sales_b.SalesDate, y=sales_b.sum_Sales,\n", + " ax=axes[1],figure=fig, kind=\"line\",xlabel='Sales Date', ylabel='Sales',\n", + " title=\"Sales for Store 85\", color=\"blue\")\n", + " \n", + "plot = sales_c.plot(x=sales_c.SalesDate, y=sales_c.sum_Sales,\n", + " ax=axes[2],figure=fig, kind=\"line\",xlabel='Sales Date', ylabel='Sales',\n", + " title=\"Sales for Store 1\", color=\"blue\")\n", + "\n", + "plot = sales_d.plot(x=sales_d.SalesDate, y=sales_d.sum_Sales,\n", + " ax=axes[3],figure=fig, kind=\"line\",xlabel='Sales Date', ylabel='Sales',\n", + " title=\"Sales for Store 15\", color=\"blue\")\n", + " \n", + "# Display the plot.\n", + "plot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Retail sales for all store types tend to peak for the Christmas 
season and then decline after the holidays.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we check the yearly trend for these store types to see whether a trend is present in the series. Time series decomposition is the process of separating time series data into its core components. These components include a potential trend (overall rise or fall in the mean), seasonality (a recurring cycle), and the remaining random residual. Python’s statsmodels library has a method for time series decomposition called seasonal_decompose(). The model type parameter can be either additive or multiplicative; here we use additive, because the additive model is appropriate when the seasonality’s amplitude is independent of the level. The \"period\" parameter is the number of observations in a seasonal cycle. For example, if you have daily observations, the period is 1.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sorting with 'date'\n", + "pd_sales_store = features_df\n", + "train_df = pd_sales_store.set_index('SalesDate')\n", + "# Sales datacheck\n", + "train_df['Sales'] = train_df['Sales'] * 1.0\n", + "# storewise sales data\n", + "sales_a = train_df[train_df.Store == 2]['Sales']\n", + "sales_b = train_df[train_df.Store == 85]['Sales']\n", + "# .sort_index(ascending = True) # solve the reverse order\n", + "sales_c = train_df[train_df.Store == 1]['Sales']\n", + "sales_d = train_df[train_df.Store == 15]['Sales']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Seasonal decompose\n", + "from statsmodels.tsa.seasonal import seasonal_decompose\n", + "\n", + "f, (ax1, ax2, ax3, ax4) = plt.subplots(4, figsize = (15, 15))\n", + "\n", + "# monthly\n", + "decomposition_a = seasonal_decompose(sales_a, model = 'additive', extrapolate_trend='freq', period=1)\n", + "decomposition_a.trend.plot(ax = ax1)\n", + "\n", + "decomposition_b = seasonal_decompose(sales_b, model = 'additive', extrapolate_trend='freq', period=1)\n", + "decomposition_b.trend.plot( ax = ax2)\n", + "\n", + "decomposition_c = seasonal_decompose(sales_c, model = 'additive', extrapolate_trend='freq', period=1)\n", + "decomposition_c.trend.plot( ax = ax3)\n", + "\n", + "decomposition_d = seasonal_decompose(sales_d, model = 'additive', extrapolate_trend='freq', period=1)\n", + "decomposition_d.trend.plot( ax = ax4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Overall sales follow similar Trend for all StoreTypes as seen above. There are spikes around the year end which indicate higher sales over the year end holiday season.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "5. Creating the model and forecasting using Prophet in python (stoSalesForecastnew.py).
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prophet is a procedure for forecasting time series data based on an additive model where non-linear trends are fit with yearly, weekly, and daily seasonality, plus holiday effects. It works best with time series that have strong seasonal effects and several seasons of historical data. Prophet is robust to missing data and shifts in the trend, and typically handles outliers well.
\n", + "Prophet follows the sklearn model API. We create an instance of the Prophet class and then call its fit and predict methods.
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All the steps below, which include the Prophet model, are executed in the Python file stoSalesForecastnew.py. We then use this .py file in the Script command to get the forecasted values.
\n", + "The input to Prophet is always a dataframe with two columns: ds and y. The ds (datestamp) column should be of a format expected by Pandas, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp. The y column must be numeric and represents the measurement we wish to forecast.
\n", + "\n", + "The below code shows the creation of the Sales DataFrame and the holidays Dataframe which are used in the model creation and model fit.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create Sales data dataframe using data from Vantage
\n", + "\n", + "```python \n", + "# create Sales data \n", + "sales = pd_sales_store.rename(columns = {'SalesDate': 'ds','Sales': 'y'})\n", + "``` " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create holidays dataframe
\n", + "\n", + "```python\n", + "#create holidays dataframe\n", + " \n", + "\n", + "school_dates = df[df.SchoolHoliday == 1].loc[:, 'Date'].values\n", + "\n", + "school = pd.DataFrame({'holiday': 'school_holiday',\n", + " 'ds': pd.to_datetime(school_dates)})\n", + "\n", + "holidays = school \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We fit the model by instantiating a new Prophet object. Any settings to the forecasting procedure are passed into the constructor. Then you call its fit method and pass in the historical dataframe(sales).
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Instantiate and fit model using Prophet
\n", + "\n", + "```python\n", + "\n", + "# Prophet implementation \n", + "my_model = Prophet(interval_width = 0.95, \n", + " holidays = holidays.head(50000))\n", + "my_model.fit(sales) \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Predictions are then made on a dataframe with a column ds containing the dates for which a prediction is to be made. You can get a suitable dataframe that extends into the future a specified number of days using the helper method Prophet.make_future_dataframe. By default, it will also include the dates from the history, so we will see the model fit as well.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Create future dates for forecasting
\n", + "\n", + "```python\n", + "dt = min(sales['ds'].values)\n", + "date1 = datetime.datetime.strptime(dt, \"%y/%m/%d\").date()\n", + "\n", + "\n", + "\n", + "# # Subtract one month\n", + "start_date = date1 - relativedelta(months=1)\n", + "\n", + "# Get man date and then get future dates for 1 month\n", + "dt1 = max(sales['ds'].values)\n", + "date2 = datetime.datetime.strptime(dt1, \"%y/%m/%d\").date()\n", + "# date2 = datetime.datetime.strptime(datetime_str, \"%Y/%m/%dT%H:%M:%S.%f\").date()\n", + "end_date = date2 + relativedelta(months=1)\n", + "# end_date= str(end_value)\n", + "\n", + "\n", + "# # date_range = pd.date_range(start_date, periods=num_days)\n", + "date_range = pd.date_range(str(start_date), str(end_date))\n", + "\n", + "future_dates = pd.DataFrame({'ds': date_range}) \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The predict method will assign each row in future a predicted value which it names yhat. If you pass in historical dates, it will provide an in-sample fit. The forecast object here is a new dataframe that includes the \"yhat\" column, which is the forecast values for sales, as well as columns for components and uncertainty intervals.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
Create dataframe with forecast values
\n", + "\n", + "```python\n", + "# forecast\n", + "forecast = my_model.predict(future_dates.head(10000)) \n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The forecasted values will be sent back to Vantage using the Returns clause of the Script function as seen in the section below.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
6. Using APPLY Command to get the forecasted values back to Vantage.
\n", + "6.1 Create virtual environment for executing the script
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Function to set the Authentication token to connect to User Environment Service in VantageCloud Lake.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We've already loaded all the values into our environment variables and into a dictionary, env_vars.\n", + "# username=env_vars.get(\"username\") isn't required when using base_url, pat and pem.\n", + "\n", + "if set_auth_token(base_url=env_vars.get(\"ues_uri\"),\n", + " pat_token=env_vars.get(\"access_token\"), \n", + " pem_file=env_vars.get(\"pem_file\"),\n", + " valid_from=int(time.time())\n", + " ):\n", + " print(\"UES Authentication successful\")\n", + "else:\n", + " print(\"UES Authentication failed. Check credentials.\")\n", + " sys.exit(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the session to use the Analytic compute group and cluster to execute the OpenSourceML function.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "gpu_compute_group = env_vars.get(\"gpu_compute_group\")\n", + "execute_sql(f\"SET SESSION COMPUTE GROUP {gpu_compute_group};\")\n", + "print(f\"Compute group set to {gpu_compute_group}\") " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check the user environments and create an environment for the usecase.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list_user_envs()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "try:\n", + " env = create_env(\n", + " env_name=\"oaf_demo_env\",\n", + " base_env=\"python_3.9\",\n", + " desc=\"OAF Demo env for SalesForecasting Prophet\"\n", + " )\n", + "except:\n", + " remove_env(\"oaf_demo_env\")\n", + " env = create_env(\n", + " env_name=\"oaf_demo_env\",\n", + " base_env=\"python_3.9\",\n", + " desc=\"OAF Demo env for SalesForecasting Prophet\"\n", + " )\n", + " \n", + "env " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Confirm that the versions in the local environment are same to the virtual environment.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip list | grep scikit-learn\n", + "!pip list | grep scipy\n", + "!pip list | grep numpy\n", + "!pip list | grep pandas\n", + "!pip list | grep prophet" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "claim_id = env.install_lib([\"pandas==2.1.3\",\n", + " \"scipy==1.11.2\",\n", + " \"scikit-learn==1.1.3\",\n", + " \"numpy==1.24.2\",\n", + " \"sklearn-pandas==2.2.0\", \n", + " \"prophet==1.1.4\"], asynchronous=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Check the status of installation using status() API.\n", + "# Create a loop here for demo purposes\n", + "\n", + "ipydisplay(env.status(claim_id))\n", + "stage = env.status(claim_id)['Stage'].iloc[-1]\n", + "while stage == 'Started':\n", + " stage = env.status(claim_id)['Stage'].iloc[-1]\n", + " clear_output()\n", + " ipydisplay(env.status(claim_id))\n", + " sleep(5)\n", + " \n", + "# Verify the Python libraries have been installed correctly.\n", + "ipydisplay(env.libs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the user environment to the created virtual environment for the execution of the python script.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "configure.openml_user_env = env" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "6.2 Install the file and any additional artifacts
\n", + "\n", + "Use the install_file() method to install this python file to the container. As a reminder, this container is persistent, so these steps need only be done infrequently.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env.install_file(\"stoSalesForecastnew.py\", replace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "6.3 APPLY using Python
\n", + "The process is as follows
\n", + "\n", + "First we will create a dataset which can be passed to the Apply function.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "qry='''CREATE SET TABLE Store_Sales_ID \n", + " (\n", + " SlsID INTEGER,\n", + " Store INTEGER,\n", + " DayOfWeek INTEGER,\n", + " SalesDate DATE FORMAT 'yyyy/mm/dd',\n", + " Sales INTEGER,\n", + " Customers INTEGER,\n", + " SalesOpen INTEGER,\n", + " Promo INTEGER,\n", + " StateHoliday CHAR(1) CHARACTER SET LATIN NOT CASESPECIFIC,\n", + " SchoolHoliday INTEGER)\n", + " PRIMARY INDEX ( SlsID ); '''\n", + "qry1='''insert into Store_Sales_ID select 1, Store ,\n", + " DayOfWeek ,\n", + " SalesDate ,\n", + " Sales ,\n", + " Customers ,\n", + " SalesOpen ,\n", + " Promo ,\n", + " StateHoliday,\n", + " SchoolHoliday from DEMO_prophetSTO.Sales_Data where Store <= 5;'''\n", + "try:\n", + " execute_sql(qry)\n", + " execute_sql(qry1) \n", + "except:\n", + " db_drop_table('Store_Sales_ID')\n", + " execute_sql(qry)\n", + " execute_sql(qry1) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "final_table_df2 = DataFrame('Store_Sales_ID')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install the user script file on Vantage. In case of rerun if the file already exists we first remove it and then install again.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# return types\n", + "types_dict = OrderedDict({})\n", + "types_dict[\"ds\"] = VARCHAR(100)\n", + "types_dict[\"yhat\"] = VARCHAR(100)\n", + "types_dict[\"yhat_lower\"] = VARCHAR(100)\n", + "types_dict[\"yhat_upper\"] = VARCHAR(100)\n", + "types_dict[\"trend\"] = VARCHAR(100)\n", + "types_dict[\"trend_lower\"] = VARCHAR(100)\n", + "types_dict[\"trend_upper\"] = VARCHAR(100)\n", + "\n", + "# \"ds\":TIMESTAMP(0), \"yhat\": FLOAT(), \"yhat_lower\": FLOAT(), \"yhat_upper\": FLOAT() , \n", + "# \"trend\": FLOAT(), \"weekly\": FLOAT(), \"yearly\": FLOAT()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "env.install_file(\"stoSalesForecastnew.py\", replace=True)\n", + "apply_obj = Apply(\n", + " data=final_table_df2,\n", + " apply_command=\"python stoSalesForecastnew.py\",\n", + " # returns={\"ds\": VARCHAR(100)},\n", + " returns=types_dict,\n", + " env_name=env,\n", + " delimiter=\"\\t\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execute the script in SQL using APPLY command with the following SQL code:
\n", + "Since the entire process of model training, fitting, and scoring takes place in the .py file when used in the script command, the query below may take some time (approximately 50-60 seconds).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_forecast_df = apply_obj.execute_script()\n", + "sales_forecast_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sales_forecast_df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The output contains 5005 rows(1 for each date) and 7 columns.\n", + "
The forecasting output contains information for:\n", + "
\n", + "To plot the forecast Values we select only the required columns and convert the teradataml dataframe to pandas dataframe.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_output = sales_forecast_df.to_pandas(all_rows=True).reset_index()\n", + "plot_output[\"ds\"] = pd.to_datetime(plot_output['ds']).dt.date\n", + "plot_output[\"yhat\"] = pd.to_numeric(plot_output['yhat'])\n", + "plot_output[\"yhat_lower\"] = pd.to_numeric(plot_output['yhat_lower'])\n", + "plot_output[\"yhat_upper\"] = pd.to_numeric(plot_output['yhat_upper'])\n", + "plot_output[\"trend\"] = pd.to_numeric(plot_output['trend'])\n", + "plot_output[\"trend_lower\"] = pd.to_numeric(plot_output['trend_lower'])\n", + "plot_output[\"trend_upper\"] = pd.to_numeric(plot_output['trend_upper'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "plot_output_forecast = plot_output[['ds','yhat','yhat_lower','yhat_upper']].sort_values('ds', ascending=True)\n", + "# .tail(300)\n", + "plot_output_forecast = plot_output_forecast.reset_index()\n", + "plot_output_forecast.head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To plot the forecast Values and the confidence level we set the lower and upper bounds of the confidence interval to yhat_lower and yhat_upper.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "\n", + "# Create the data for the line graph, including the x-values and the corresponding upper and lower bounds\n", + "x_values = plot_output_forecast['ds'].values\n", + "y_values = plot_output_forecast['yhat'].values\n", + "lower_bounds = plot_output_forecast['yhat_lower'].values\n", + "upper_bounds = plot_output_forecast['yhat_upper'].values\n", + "\n", + " \n", + "plt.figure(figsize=(12, 8))\n", + "# Plot the line graph\n", + "plt.plot(x_values, y_values, color='black', label='Forecast Values')\n", + "plt.fill_between(x_values, lower_bounds, upper_bounds, color='lightblue', alpha=0.3, label='Confidence Interval')\n", + "\n", + " \n", + "\n", + "# Customize the plot\n", + "\n", + "plt.xlabel('Date')\n", + "plt.ylabel('Forecast Values')\n", + "plt.title('Forecast Sales Values with Confidence Interval')\n", + "plt.legend()\n", + "\n", + " \n", + "\n", + "# Show the plot\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above graph contains the Forecast values(black line) and the light blue area is the range of the lower(yhat_lower) and upper(yhat_upper) limits of the forecasted values.
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "7. Conclusion:
\n", + "We have trained and validated the Prophet model using the python script and used the APPLY Operator using OAF and data from Vantage. We get the forecasted data in Vantage using the python script.
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "8. Cleanup
\n", + "Work Tables
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "db_drop_table(table_name='Store_Sales_ID')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remove_env(\"oaf_demo_env\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you have updated the teradataml package, reinstall the package by uncommenting and running the below code cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "# !pip install teradataml==17.20.0.6 --force-reinstall\n", + "!pip install scikit-learn==1.0.2 --force-reinstall\n", + "!pip install numpy==1.24.2 --force-reinstall" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_Steps.png b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_Steps.png new file mode 100644 index 00000000..632c32c9 Binary files /dev/null and b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_Steps.png differ diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_flow.png b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_flow.png new file mode 100644 index 00000000..4dac803c Binary files /dev/null and b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/OAF_flow.png differ diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/STO.png b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/STO.png new file mode 100644 index 00000000..8916da63 Binary files /dev/null and b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/images/STO.png differ diff --git a/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/stoSalesForecastnew.py 
b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/stoSalesForecastnew.py new file mode 100644 index 00000000..aee10640 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Store_Sales_Forecasting_Prophet_OAF/stoSalesForecastnew.py @@ -0,0 +1,150 @@ +# ####################################################################################################################### +# The code in the file gets input from Vantage table and creates prophet model and forecats sales using the forecast +# function of the Prophet model. These forecasted values are passed back to Vantage when this script is called using the +# Vantage Script command. +# ####################################################################################################################### +# Import the necessary libraries +import sys +import numpy as np +import pandas as pd +import subprocess + +# Prophet Library +from prophet import Prophet +import pickle +import base64 +import sys, os + +from contextlib import contextmanager +import logging +import datetime +# from datetime import date +from dateutil.relativedelta import relativedelta + +logging.basicConfig(format='%(process)d-%(levelname)s-%(message)s') + +# create a class which will be used to supress the output of the model.fit function +class suppress_stdout_stderr(object): + """ + Filter out Prophet logs from stdout and stderr + + from https://stackoverflow.com/questions/11130156/suppress-stdout-stderr-print-from-python-functions + Update: https://github.com/facebook/prophet/issues/223 randlet, 2017-09-31 + """ + def __init__(self): + self.null_fds = [os.open(os.devnull, os.O_RDWR) for x in range(2)] + self.save_fds = [os.dup(1), os.dup(2)] + + def __enter__(self): + os.dup2(self.null_fds[0], 1) + os.dup2(self.null_fds[1], 2) + + def __exit__(self, *_): + os.dup2(self.save_fds[0], 1) + os.dup2(self.save_fds[1], 2) + for fd in self.null_fds + self.save_fds: + os.close(fd) + + +### +### Read input +### + + +delimiter = '\t' +inputData = 
[] + +for line in sys.stdin.read().splitlines(): + line = line.split(delimiter) + inputData.append(line) + +### +### If no data received, gracefully exit rather than producing an error later. +### + +if not inputData: + sys.exit() + +### +### Set up input DataFrame according to input schema +### + +# Know your data: You must know in advance the number of incoming columns from the database! + +columns = ['SlsID','Store','DayOfWeek', 'SalesDate', 'Sales', 'Customers', 'SalesOpen','Promo', 'StateHoliday', + 'SchoolHoliday'] + +df = pd.DataFrame(inputData, columns=columns).copy() + +del inputData + +# create sales dataframe using the SalesDate as 'ds' and Sales as 'y' which is needed as input to the Prophet model +sales = df.rename(columns = {'SalesDate': 'ds', + 'Sales': 'y'}) + +sales=sales[['ds','y']] + +# Get dates for school holidays + +school_dates_df=df[['SalesDate','SchoolHoliday']] +school_dates_df['SchoolHoliday'] = pd.to_numeric(school_dates_df['SchoolHoliday']) +school_dates = school_dates_df.loc[school_dates_df.SchoolHoliday == 1, 'SalesDate'].values + + +school = pd.DataFrame({'holiday': 'school_holiday', + 'ds': pd.to_datetime(school_dates)}) + + +holidays = school + + +# # Prophet implementation +# Train model +my_model = Prophet(interval_width = 0.70, changepoint_prior_scale=0.05,seasonality_prior_scale=0.03,holidays_prior_scale=0.03, + holidays = holidays.head(1000)) + + +# Fit model using the Sales data +with suppress_stdout_stderr(): + my_model.fit(sales) + + + +# dataframe that extends into future and history +# future_dates = my_model.make_future_dataframe(periods=365) + +# Get min date and then go back 1 month +dt = min(sales['ds'].values) +# date1 = datetime.datetime.strptime(dt, "%y/%m/%d").date() +date1 = datetime.datetime.strptime(dt, "%Y-%m-%d").date() + +# Subtract one month +start_date = date1 - relativedelta(months=1) + +# Get max date and then get future dates for 1 month +dt1 = max(sales['ds'].values) +# date2 = 
datetime.datetime.strptime(dt1, "%y/%m/%d").date() +date2 = datetime.datetime.strptime(dt1, "%Y-%m-%d").date() + +# Add one month +end_date = date2 + relativedelta(months=1) +# end_date= str(end_value) + +# Create date range using start date and end date +date_range = pd.date_range(str(start_date), str(end_date)) + +# Create data frame for the dates to be passed to predict function +future_dates = pd.DataFrame({'ds': date_range}) + +# forecast +forecast_df = my_model.predict(future_dates) +# df_5 = forecast_df.head(5) +# for index, row in sales.iterrows(): +# print(row['ds']) +# Export results to Advanced SQL Engine through standard output in expected format. +# for index, row in future_dates.iterrows(): +# print(row['ds']) +# for ind, column in enumerate(forecast_df.columns): +# print(column) +for index, row in forecast_df.iterrows(): + print(row['ds'], delimiter, row['yhat'], delimiter,row['yhat_lower'], delimiter, row['yhat_upper'], delimiter, row['trend'], delimiter, row['trend_lower'], delimiter, row['trend_upper']) diff --git a/VantageCloud_Lake/UseCases/Telco_Customer_Churn/VCL_Telco_Customer_Churn_Python.ipynb b/VantageCloud_Lake/UseCases/Telco_Customer_Churn/VCL_Telco_Customer_Churn_Python.ipynb new file mode 100644 index 00000000..eb4db4d3 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Telco_Customer_Churn/VCL_Telco_Customer_Churn_Python.ipynb @@ -0,0 +1,1576 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "78ad8a32", + "metadata": {}, + "source": [ + "\n",
+ " Telco Customer Churn\n",
+ "
\n",
+ " \n",
+ "
Introduction
\n", + "\n", + "\n", + "Customer churn is a concern for all companies, but its complexity makes it difficult to track. Customers may leave for various reasons, such as dissatisfaction with service quality, pricing, customer service, or finding better alternatives from competitors. Although some churn may be expected, companies aim to retain their customers to avoid using additional resources to find new customers. Thus, with the help of Teradata Vantage, companies can attain their goal of identifying the factors contributing to the churn, so they can take appropriate measures to retain customers. Vantage’s capabilities allow companies to analyze large amounts of customer data, such as usage patterns, billing information, demographics, and interactions, to find patterns that may indicate customers who are at risk of churning. Plus, Teradata’s machine learning and predictive analytics can be used to build models to predict which customers are likely to churn in the future. This information will give companies the chance to intervene, for example by sending targeted marketing campaigns and personalized offers, improving customer service, or addressing customer concerns.
\n", + "Business Values
\n", + "Why Vantage?
\n", + "\n",
+ "Traditional ML and AI development and deployment pipelines require users to manually combine various tools and techniques across the lifecycle. This leads to lengthy, fragile, manual, error-prone processes that are, in many cases, impossible to migrate out of the lab and into production in order to realize business value.
ClearScape Analytics helps to solve this “development to deployment gap” by providing highly scalable, performant, and easy-to-use analytic capabilities that address all aspects of the development lifecycle. The same tools and techniques that data scientists use in development can be seamlessly deployed into production using the same code, platform, and operational pipeline.
\n", + "Managing telco churn is complex and requires continuous monitoring, analysis, and proactive customer engagement strategies. By using data and advanced analytics, telecom companies can better understand customer behavior and preferences, and take proactive measures to retain customers and maintain profitability.
\n", + "\n", + "\n", + "Let's demonstrate this use case with sample data using InDb analytics in Vantage which can pre-process and analyze huge amounts of data and at scale. \n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "22173c1b-a4eb-4cd7-b0ae-ba68bc39aba2", + "metadata": {}, + "source": [ + "1. Configure the environment
\n", + "\n",
+ "Before we start working with our data, we need to set up our environment. This involves importing the necessary packages and establishing a connection to Vantage.\n",
+ "
\n",
+ "Here's how we can do this:
2. Connect to VantageCloud Lake
\n", + "Connect to VantageCloud using create_context
from the teradataml Python library. If this environment has been prepared for connecting to a VantageCloud Lake OAF Container, all the details required will be loaded and you will see an acknowledgement after executing this cell.
3. Load the data
\n", + "\n", + "We have provided data for this demo in the lake environment. The data is available in the database \"DEMO_Telco\". Your user should have read access to the database. In case of any issues please write a mail to the support group (\"SC230208@teradata.com\").
\n", + " \n", + "**Note: The tables are available in DEMO_Telco_DB database and we have created views in DEMO_Telco database which are used in the cells below
" + ] + }, + { + "cell_type": "markdown", + "id": "bdd6dd8c", + "metadata": {}, + "source": [ + "4. Data Exploration
" + ] + }, + { + "cell_type": "markdown", + "id": "13288769-f1b3-40a5-8cad-95e5f4ae92fd", + "metadata": {}, + "source": [ + "Customer Churn
\n", + "Let us start by creating a \"Virtual DataFrame\" that points directly to the dataset in Vantage. We then begin our analysis by checking the shape of the DataFrame and examining the data types of all its columns.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d40df274-d9cb-439b-93bb-343d317f052c", + "metadata": {}, + "outputs": [], + "source": [ + "tdf = DataFrame(in_schema(\"DEMO_Telco\", \"Customer_Churn\"))\n", + "tdf" + ] + }, + { + "cell_type": "markdown", + "id": "1d620292-c936-4546-89eb-59fd50c35221", + "metadata": {}, + "source": [ + "We can check the demographics of data by shape and info method.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "279ee6be-9288-41ae-b21a-f2389add4623", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Shape of the data: \", tdf.shape)\n", + "tdf.info()" + ] + }, + { + "cell_type": "markdown", + "id": "4d9927f9-f0f6-4f45-966d-7b7b2ca36f84", + "metadata": {}, + "source": [ + "As we can see from above result our dataset has 7043 rows with 21 columns.
" + ] + }, + { + "cell_type": "markdown", + "id": "b03454cf-d47a-4edc-aea4-5b517b7da9d6", + "metadata": {}, + "source": [ + "Summary of Columns
\n",
+ "
We can use the ColumnSummary function for quickly examining the columns, their datatypes, and summary of NULLs/non-NULLs for a given table.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21eece2c-533a-40e3-bcad-4ed4bb2b6cf3", + "metadata": {}, + "outputs": [], + "source": [ + "from teradataml import ColumnSummary\n", + "obj = ColumnSummary(data=tdf,\n", + " target_columns=[':']\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7eeb1e86-c00a-48d5-8e3b-363118c847ce", + "metadata": {}, + "outputs": [], + "source": [ + "obj.result.head(21)" + ] + }, + { + "cell_type": "markdown", + "id": "a91e3850-12c5-4b74-b17d-852092e81925", + "metadata": {}, + "source": [ + "4.1 Exploratory Data Analysis
" + ] + }, + { + "cell_type": "markdown", + "id": "befaaeb8-fab9-43f6-8a0a-efdb0e486377", + "metadata": {}, + "source": [ + "\n",
+ "Exploratory Data Analysis (EDA) is a process where we visually and statistically examine, analyze, and summarize data to comprehend its characteristics, patterns, and relationships. This approach is crucial for gaining insights and a deeper understanding of the dataset at hand.
First let us analyse the Gender and Churn distributions in our data.
\n", + "We can see that the aggregated data is available to us in a teradataml DataFrame. Let's visualize this data to better understand the Churn and gender distributions. ClearScape Analytics can easily integrate with 3rd party visualization tools like Tableau, PowerBI or many python modules available like plotly, seaborn etc. We can do all the calculations and pre-processing on Vantage and pass only the necessary information to visualization tools; this will not only make the calculation faster but also reduce the overall time due to less data movement between tools.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d38b54fe-bbb5-45dd-b685-c82be89211f3", + "metadata": {}, + "outputs": [], + "source": [ + "d1=d1.to_pandas().reset_index()\n", + "d2=d2.to_pandas().reset_index()\n", + "#Gender and Churn percentage distribution\n", + "# Create subplots: use 'domain' type for Pie subplot\n", + "fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])\n", + "fig.add_trace(go.Pie(labels=d1['Gender'], values=d1['Count'], name=\"Gender\"),\n", + " 1, 1)\n", + "fig.add_trace(go.Pie(labels=d2['Churn'], values=d2['Count'], name=\"Churn\"),\n", + " 1, 2)\n", + "\n", + "# Use `hole` to create a donut-like pie chart\n", + "fig.update_traces(hole=.4, hoverinfo=\"label+percent+name\", textfont_size=16)\n", + "\n", + "fig.update_layout(\n", + " title_text=\"Gender and Churn Distributions\",\n", + " # Add annotations in the center of the donut pies.\n", + " annotations=[dict(text='Gender', x=0.16, y=0.5, font_size=20, showarrow=False),\n", + " dict(text='Churn', x=0.84, y=0.5, font_size=20, showarrow=False)])\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "6d394641-ba1c-44de-9db3-be2286aa3d13", + "metadata": {}, + "source": [ + "From the above plot we can see that 26.6 % of customers switched to another firm.
Of the total customers, 49.5% are female and 50.5% are male.
Now, let us see the churn with respect to gender.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56d127df-8eea-4d51-9f68-7179a3969884", + "metadata": {}, + "outputs": [], + "source": [ + "d3=tdf.select(['Churn','Gender','CustomerID']).groupby(['Churn','Gender']).count()\n", + "d3 = d3.assign(drop_columns=True,\n", + " Churn=d3.Churn,\n", + " Gender=d3.Gender, \n", + " Count=d3.count_CustomerID)\n", + "d3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8075c4b0-f668-4924-bf32-ef8c3a375c92", + "metadata": {}, + "outputs": [], + "source": [ + "d3=d3.to_pandas().reset_index()\n", + "fig2=px.sunburst(d3,path=['Churn','Gender'],values='Count')\n", + "fig2.update_layout(\n", + " title_text=\"Churn Distribution w.r.t Gender\")\n", + "fig2.show()" + ] + }, + { + "cell_type": "markdown", + "id": "eedfa546-3b86-4aa8-a4b9-1c47f922c5db", + "metadata": {}, + "source": [ + "We can see that there is negligible difference in customer count who changed the service provider. Both genders behaved in similar fashion when it comes to migrating to another service provider.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43b27db1-3a04-440b-a382-da72669babb3", + "metadata": {}, + "outputs": [], + "source": [ + "d4=tdf.select(['Churn','Contract','CustomerID']).groupby(['Churn','Contract']).count()\n", + "d4 = d4.assign(drop_columns=True,\n", + " Churn=d4.Churn,\n", + " Contract=d4.Contract, \n", + " Count=d4.count_CustomerID)\n", + "d4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d0d4bd6b-f9fa-4930-ba88-379b1308c795", + "metadata": {}, + "outputs": [], + "source": [ + "d4=d4.to_pandas().reset_index()\n", + "fig4 = px.bar(d4,x=\"Churn\",y=\"Count\", color=\"Contract\", barmode=\"group\", title=\"Customer contract distribution\")\n", + "fig4.update_layout(width=700, height=500, bargap=0.1)\n", + "fig4.show()" + ] + }, + { + "cell_type": "markdown", + "id": "cb97cf05-b143-43e9-bf3a-b7ed267c1ad7", + "metadata": {}, + "source": [ + "We can see that about 75% of customer with Month-to-Month Contract opted to move out as compared to 13% of customers with One Year Contract and 3% with Two Year Contract.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57466f9f-ab11-4e32-ae9f-7c0fce65ed4d", + "metadata": {}, + "outputs": [], + "source": [ + "d5=tdf.select(['PaymentMethod','CustomerID']).groupby('PaymentMethod').count()\n", + "d5 = d5.assign(drop_columns=True,\n", + " PaymentMethod=d5.PaymentMethod,\n", + " Count=d5.count_CustomerID)\n", + "d5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "21d214d3-db22-4602-9fa2-9c694282d056", + "metadata": {}, + "outputs": [], + "source": [ + "d5=d5.to_pandas().reset_index()\n", + "fig5 = go.Figure(data=[go.Pie(labels=d5['PaymentMethod'], values=d5['Count'], hole=.3)])\n", + "fig5.update_layout(title_text=\"Payment Method Distribution\")\n", + "fig5.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea725ada-aa8c-4dec-9648-1cdc26a17cd9", + "metadata": {}, + "outputs": [], + "source": [ + "d6=tdf.select(['Churn','PaymentMethod','CustomerID']).groupby(['Churn','PaymentMethod']).count()\n", + "d6 = d6.assign(drop_columns=True,\n", + " Churn=d6.Churn,\n", + " PaymentMethod=d6.PaymentMethod, \n", + " Count=d6.count_CustomerID)\n", + "d6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a576fade-bb95-4c62-ab6f-94c06d0b5ddf", + "metadata": {}, + "outputs": [], + "source": [ + "d6=d6.to_pandas().reset_index()\n", + "fig6 = px.bar(d6,x=\"Churn\",y=\"Count\", color=\"PaymentMethod\", barmode=\"stack\", title=\"Customer Payment Method distribution w.r.t. Churn\")\n", + "fig6.update_layout(width=700, height=500, bargap=0.1)\n", + "fig6.show()" + ] + }, + { + "cell_type": "markdown", + "id": "8fd84375-3d85-47cd-9076-6a6b8ea3e496", + "metadata": {}, + "source": [ + "Major customers who moved out were having Electronic Check as Payment Method.\n",
+ "
Customers who opted for Credit Card automatic transfer, Bank automatic transfer, or Mailed Check as Payment Method were less likely to move out.
We can see that many more customers choose the Fiber optic service than DSL, but it is also evident that customers who use Fiber optic have a high churn rate; this might suggest dissatisfaction with this type of internet service.\n",
+ "
Customers with DSL service have a lower churn rate than those with Fiber optic service.
Customers without dependents are more likely to churn.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d4ca94e9-b31f-4fb5-824b-d21f58ef3055", + "metadata": {}, + "outputs": [], + "source": [ + "d9=tdf.select(['Churn','Partner','CustomerID']).groupby(['Churn','Partner']).count()\n", + "d9 = d9.assign(drop_columns=True,\n", + " Churn=d9.Churn,\n", + " Partner=d9.Partner,\n", + " Count=d9.count_CustomerID)\n", + "d9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e587308-8648-406b-9aca-f8906d8abe60", + "metadata": {}, + "outputs": [], + "source": [ + "d9=d9.to_pandas().reset_index()\n", + "color_map = {\"Yes\": '#FFA15A', \"No\": '#00CC96'}\n", + "fig9 = px.bar(d9, x=\"Churn\",y=\"Count\", color=\"Partner\", barmode=\"group\", title=\"Chrun distribution w.r.t. Partners\", color_discrete_map=color_map)\n", + "fig9.update_layout(width=700, height=500, bargap=0.1)\n", + "fig9.show()" + ] + }, + { + "cell_type": "markdown", + "id": "649567b1-3232-49bf-840a-8518b38c29b4", + "metadata": {}, + "source": [ + "Customers that don't have partners are more likely to churn.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5efe37b2-f14b-4499-abde-0f155cb8e3f0", + "metadata": {}, + "outputs": [], + "source": [ + "d10=tdf.select(['Churn','PaperlessBilling','CustomerID']).groupby(['Churn','PaperlessBilling']).count()\n", + "d10 = d10.assign(drop_columns=True,\n", + " Churn=d10.Churn,\n", + " PaperlessBilling=d10.PaperlessBilling,\n", + " Count=d10.count_CustomerID)\n", + "d10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f24940fb-b15f-4460-803a-8c82ffbef623", + "metadata": {}, + "outputs": [], + "source": [ + "d10=d10.to_pandas().reset_index()\n", + "color_map = {\"Yes\": '#FFA15A', \"No\": '#00CC96'}\n", + "fig10 = px.bar(d10, x=\"Churn\",y=\"Count\", color=\"PaperlessBilling\", title=\"Chrun distribution w.r.t. Paperless Billing\", color_discrete_map=color_map)\n", + "fig10.update_layout(width=700, height=500, bargap=0.1)\n", + "fig10.show()" + ] + }, + { + "cell_type": "markdown", + "id": "fd6697e7-1dcc-44b1-9428-290856a1cb0c", + "metadata": {}, + "source": [ + "Customers with Paperless Billing are most likely to churn.
" + ] + }, + { + "cell_type": "markdown", + "id": "fa5ea57f-b8ba-44eb-9d9b-cf07d37b77b5", + "metadata": {}, + "source": [ + "5. Data Preprocessing
" + ] + }, + { + "cell_type": "markdown", + "id": "b58490d2-1f7a-4941-a62a-e99a4b7f7543", + "metadata": {}, + "source": [ + "Before the data can be used for model creation; we will need to do some data cleansing and transformation on it. We can do this InDb with Teradata Vantage's inbuilt functions.
We will use the CategoricalSummary function to showcase the distinct values and their corresponding counts for each specified column in the input DataFrame. This function provides a concise summary of categorical data, aiding in a quick understanding of the distribution of values within the specified columns.
\n", + "As we can see from the sample data above and the categorical summary values, the columns
\n", + "are related to InternetService, wherever InternetService value is \"No\" the column have value of \"No internet service\". For our model let us replace \"No internet service\" to No in our column. We will do similar operation for replacing \"No phone service\" to \"No\".
We will use sqlalchemy's oreplace function to replace the respective strings with the desired values.
Onehotencoding & Ordinal encoding
\n", + "From our categorical attributes we can see that there are limited distinct values in each of these columns. We will use Teradata's OneHotEncodingFit and Transform and OrdinalEncodingFit and Transform functions to convert the categorical attributes to numerical.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc112884-d0bf-462d-9024-003bb4c2640a", + "metadata": {}, + "outputs": [], + "source": [ + "onehotfit_df = OneHotEncodingFit(data=tdf2,\n", + " is_input_dense=True,\n", + " approach=\"auto\",\n", + " target_column=[\"Gender\",\"Partner\",\"Dependents\",\"PhoneService\",\"MultipleLines\",\"OnlineSecurity\"\n", + " ,\"OnlineBackup\",\"DeviceProtection\",\"TechSupport\",\"StreamingTV\",\"StreamingMovies\",\n", + " \"PaperlessBilling\"],\n", + " category_counts=[2,2,2,2,2,2,2,2,2,2,2,2])" + ] + }, + { + "cell_type": "markdown", + "id": "2c136b69-41c1-42f1-bf47-89d1dae800a3", + "metadata": {}, + "source": [ + "\n", + "The other categorical columns
\n", + "have more values where we can apply ordinalencoding on it
\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e6b2946-b138-4ad6-aaa7-dff619fcf1e7", + "metadata": {}, + "outputs": [], + "source": [ + "ordinalfit_df = OrdinalEncodingFit(target_column=['InternetService','Contract','PaperlessBilling','PaymentMethod'],\n", + " default_value=-1,\n", + " data=tdf2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffe66188-9511-485c-84e5-2638f758f4c6", + "metadata": {}, + "outputs": [], + "source": [ + "ordinalfit_df.result" + ] + }, + { + "cell_type": "markdown", + "id": "7a437c68-b650-4300-a3e8-cd39a5e21054", + "metadata": {}, + "source": [ + "Scale the numerical values
For the numerical attributes we will use the ScaleFit and ScaleTransform functions to scale the specified input table columns, i.e. apply a specific scaling method (such as standard deviation or mean) to the input columns.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "059dd73a-3751-46a8-85bf-7a11a7eacb7c", + "metadata": {}, + "outputs": [], + "source": [ + "scalefit_df = ScaleFit(data=tdf2,\n", + " target_columns=['MonthlyCharges','TotalCharges'],\n", + " scale_method=\"MIDRANGE\",\n", + " miss_value=\"KEEP\",\n", + " global_scale=False)" + ] + }, + { + "cell_type": "markdown", + "id": "faec04de-8197-4c10-bcc5-3bc3f605e81e", + "metadata": {}, + "source": [ + "Putting it altogether
We will use the ColumnTransformer function to apply all the transformations from the fit tables created above in one go.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d108b118-0970-4c84-9371-df4ef125ce69", + "metadata": {}, + "outputs": [], + "source": [ + "ColumnTransformer_out = ColumnTransformer(fillrowid_column_name=\"output_value\",\n", + " input_data=tdf2,\n", + " onehotencoding_fit_data=onehotfit_df.result,\n", + " ordinalencoding_fit_data=ordinalfit_df.result,\n", + " scale_fit_data=scalefit_df.output)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4da18dae-fec0-4bba-b87c-ab7045fd21e1", + "metadata": {}, + "outputs": [], + "source": [ + "Transformed_data= ColumnTransformer_out.result.assign(drop_columns=True,\n", + " Churn=tdf2.Churn,\n", + " CustomerID=ColumnTransformer_out.result.CustomerID,\n", + " SeniorCitizen=ColumnTransformer_out.result.SeniorCitizen,\n", + " Tenure=ColumnTransformer_out.result.Tenure,\n", + " InternetService=ColumnTransformer_out.result.InternetService,\n", + " Contract=ColumnTransformer_out.result.Contract,\n", + " PaperlessBilling=ColumnTransformer_out.result.PaperlessBilling,\n", + " PaymentMethod=ColumnTransformer_out.result.PaymentMethod,\n", + " MonthlyCharges=ColumnTransformer_out.result.MonthlyCharges,\n", + " TotalCharges=ColumnTransformer_out.result.TotalCharges,\n", + " Gender_0=ColumnTransformer_out.result.Gender_0,\n", + " Gender_1=ColumnTransformer_out.result.Gender_1,\n", + " Partner_0=ColumnTransformer_out.result.Partner_0,\n", + " Partner_1=ColumnTransformer_out.result.Partner_1,\n", + " Dependents_0=ColumnTransformer_out.result.Dependents_0,\n", + " Dependents_1=ColumnTransformer_out.result.Dependents_1,\n", + " PhoneService_0=ColumnTransformer_out.result.PhoneService_0,\n", + " PhoneService_1=ColumnTransformer_out.result.PhoneService_1,\n", + " MultipleLines_0=ColumnTransformer_out.result.MultipleLines_0,\n", + " MultipleLines_1=ColumnTransformer_out.result.MultipleLines_1,\n", + " OnlineSecurity_0=ColumnTransformer_out.result.OnlineSecurity_0,\n", + " 
OnlineSecurity_1=ColumnTransformer_out.result.OnlineSecurity_1,\n", + " OnlineBackup_0=ColumnTransformer_out.result.OnlineBackup_0,\n", + " OnlineBackup_1=ColumnTransformer_out.result.OnlineBackup_1,\n", + " DeviceProtection_0=ColumnTransformer_out.result.DeviceProtection_0,\n", + " DeviceProtection_1=ColumnTransformer_out.result.DeviceProtection_1,\n", + " TechSupport_0=ColumnTransformer_out.result.TechSupport_0,\n", + " TechSupport_1=ColumnTransformer_out.result.TechSupport_1,\n", + " StreamingTV_0=ColumnTransformer_out.result.StreamingTV_0,\n", + " StreamingTV_1=ColumnTransformer_out.result.StreamingTV_1,\n", + " StreamingMovies_0=ColumnTransformer_out.result.StreamingMovies_0,\n", + " StreamingMovies_1=ColumnTransformer_out.result.StreamingMovies_1,\n", + " PaperlessBilling_0=ColumnTransformer_out.result.PaperlessBilling_0,\n", + " PaperlessBilling_1=ColumnTransformer_out.result.PaperlessBilling_1)\n", + " \n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e4c30b0-4989-4540-bf60-3e4631afeacd", + "metadata": {}, + "outputs": [], + "source": [ + "Transformed_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f38c7f0f-6da8-4ef9-acaf-0774d29c92e8", + "metadata": {}, + "outputs": [], + "source": [ + "Transformed_data.shape" + ] + }, + { + "cell_type": "markdown", + "id": "0edef2c8-e568-4626-9377-e189d66e3350", + "metadata": {}, + "source": [ + "We can see from above how our data is transformed from the original values.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05c50e66-97a9-4fc9-8bd1-20654dc318fc", + "metadata": {}, + "outputs": [], + "source": [ + "# Copying the intermediate table to database\n", + "Transformed_data.to_sql(\"Transformed_data\",primary_index = \"CustomerID\", if_exists = \"replace\")" + ] + }, + { + "cell_type": "markdown", + "id": "37551d5e-2366-42cf-83f9-1a48ee438c6c", + "metadata": {}, + "source": [ + "Create train and test data
Now that we have transformed our data and it is fit to be used in machine learning models, let us split the whole dataset into train and test sets for model training and scoring. We will use the TrainTestSplit function for this task.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b22bf0d0-255e-4ea9-8bdc-ffac9ea02f34", + "metadata": {}, + "outputs": [], + "source": [ + "TrainTestSplit_out = TrainTestSplit(\n", + " data = DataFrame('Transformed_data'),\n", + " id_column = \"CustomerID\",\n", + " train_size = 0.75,\n", + " test_size = 0.25,\n", + " seed = 21\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "19d382b8-7f46-43f8-aee9-5598d6f24ebf", + "metadata": {}, + "outputs": [], + "source": [ + "# Split into 2 virtual dataframes\n", + "df_train = TrainTestSplit_out.result[TrainTestSplit_out.result['TD_IsTrainRow'] == 1].drop(['TD_IsTrainRow'], axis = 1)\n", + "df_test = TrainTestSplit_out.result[TrainTestSplit_out.result['TD_IsTrainRow'] == 0].drop(['TD_IsTrainRow'], axis = 1)" + ] + }, + { + "cell_type": "markdown", + "id": "88e87734-1eb6-47a6-87d4-3b2d0585088a", + "metadata": {}, + "source": [ + "We have done our preprocessing of data and we created our training and test datasets, let's now create some predictive models." + ] + }, + { + "cell_type": "markdown", + "id": "3624ff0f-2e80-450e-a76e-85398a8c73da", + "metadata": {}, + "source": [ + "
6. InDb Model Training and Scoring
" + ] + }, + { + "cell_type": "markdown", + "id": "2893fa15-2812-473d-b91c-5949ba436461", + "metadata": {}, + "source": [ + "6.1 Logistic Regression
" + ] + }, + { + "cell_type": "markdown", + "id": "f4561645-5edd-4e2b-9983-59e77ab4745a", + "metadata": {}, + "source": [ + "For our model we will use logistic regression.
\n",
+ " Logistic regression is a statistical algorithm used for binary classification problems. It is a type of supervised learning algorithm that predicts the probability of an input belonging to a certain class (e.g., positive or negative) based on its features.
Logistic regression works by modeling the relationship between the input features and the probability of belonging to a certain class using a logistic function. The logistic function takes the input feature values and maps them onto a probability scale between 0 and 1, which represents the probability of belonging to the positive class.
\n",
+ " The GLM function is a generalized linear model (GLM) that performs regression and classification analysis on data sets.\n",
+ "
Please refer to GLM for function elements and output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc6639e3-2427-42d2-b302-08e18196b2b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e92723cc-1aec-4fee-97fe-96b3e86e7802",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from teradataml import GLM, TDGLMPredict\n",
+ "\n",
+ "glm_model = GLM(data = df_train,\n",
+ " #input_columns = train_col,\n",
+ " input_columns = ['1:8','10:33'], \n",
+ " response_column = 'Churn',\n",
+ " family = 'Binomial')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0e3b4239-40a5-4b2c-9589-7d974c574641",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "glm_model.result"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a24bb51b-7115-486e-957a-848bad5bc4d9",
+ "metadata": {},
+ "source": [
+ "
We have created our model, let's do the predictions on the test dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd7ac897-3b98-47f6-a9c8-33a75f6dac4f", + "metadata": {}, + "outputs": [], + "source": [ + "glm_prediction = TDGLMPredict(newdata = df_test, #test_dataset,\n", + " id_column = 'CustomerID',\n", + " object = glm_model.result,\n", + " accumulate = 'Churn',\n", + " family = 'Binomial',\n", + " output_prob=True,\n", + " output_responses = ['0', '1'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5280b57-9e01-42c4-9b89-5734dc0968bf", + "metadata": {}, + "outputs": [], + "source": [ + "glm_prediction.result\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "498799ce-2607-4cba-94e6-26a40abd7d0e", + "metadata": {}, + "outputs": [], + "source": [ + "out_glm = glm_prediction.result.assign(prediction = glm_prediction.result.prediction.cast(type_ = BYTEINT))\n", + "out_glm = out_glm.assign(prediction = out_glm.prediction.cast(type_ = VARCHAR(2)))\n", + "out_glm = out_glm.assign(Churn = out_glm.Churn.cast(type_ = VARCHAR(2)))\n", + "out_glm" + ] + }, + { + "cell_type": "markdown", + "id": "121887bb-4fb2-429f-99d7-120c6dd47e51", + "metadata": {}, + "source": [ + "
The output above shows prob_1, the probability that the customer will churn, and prob_0, the probability that the customer will not churn. The prediction column uses these probabilities to assign a class label.
" + ] + }, + { + "cell_type": "markdown", + "id": "7fa0a4c0-c4f8-499b-9e60-8bc87684a5c8", + "metadata": { + "tags": [] + }, + "source": [ + "6.2 Evaluation of Logistic Regression Model
\n", + "We will use the ClassificationEvaluator function to evaluate the trained glm model on test data. This will let us know how well our model has performed on unseen data.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "683015ad-d54b-4a33-a71f-345a348ee912", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_glm = ClassificationEvaluator(\n", + " data = out_glm,\n", + " observation_column = 'Churn',\n", + " prediction_column = 'prediction',\n", + " labels = ['0', '1']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "679b7a22-e08e-414a-aec3-e8ae6b1e0701", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_glm.output_data.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "d281b3fa-0793-4e86-986b-26bda09833ec", + "metadata": {}, + "source": [ + "The above output shows recall, and F1-score values of confusion matrix.
\n", + "Column | \n", + "Description | \n", + "
---|---|
Precision | \n", + "The positive predictive value. Refers to the fraction of relevant instances among\n", + "the total retrieved instances.\n", + " Precision answers the following question: what proportion of predicted Positives is truly Positive? \n", + " Precision = (TP)/(TP+FP) | \n", + "
Recall | \n", + "Refers to the fraction of relevant instances retrieved over the total amount of\n", + "relevant instances. Recall answers a different question: what proportion of actual Positives is correctly classified?\n", + "Recall = (TP)/(TP+FN) | \n", + "
F1 | \n", + "F1 score, defined as the harmonic mean of the precision and recall and is a number between 0 and 1. F1 score maintains a balance between the precision and recall for your classifier. \n", + " F1 = 2*(precision*recall)/(precision+recall) | \n", + "
Support | \n", + "The number of times a label displays in the Observation Column. | \n", + "
**TP:- True Positive , FP :- False Positive, TN :- True Negative , FN :- False Negative
" + ] + }, + { + "cell_type": "markdown", + "id": "6aaf3776-22c7-4697-a7f8-e5e334067b18", + "metadata": { + "tags": [] + }, + "source": [ + "We can also calculate mean absolute error and AUC(Area Under the Curve) for Receiver Operating Characteristic Curve.
Mean Absolute Error is the sum of the absolute differences between actual and predicted values, averaged over the number of observations.
The ROC curve is a graph between TPR(True Positive Rate) and FPR(False Positive Rate). The area under the ROC curve is a metric of how well the model can distinguish between positive and negative classes. The higher the AUC, the better the model's performance in distinguishing between the positive and negative classes.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5962c501-8e8e-4594-ab63-d6e0df2e07a8", + "metadata": {}, + "outputs": [], + "source": [ + "AUC = roc_auc_score(glm_pred['Churn'], glm_pred['prob_1'])\n", + "AUC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "483506fb-d59d-4df0-a2fe-9d1e86ffdf72", + "metadata": {}, + "outputs": [], + "source": [ + "fpr, tpr, thresholds = roc_curve(glm_pred['Churn'], glm_pred['prob_1'])\n", + "plt.plot(fpr, tpr, color='orange', label='ROC. AUC = {}'.format(str(AUC)))\n", + "plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')\n", + "plt.xlabel('False Positive Rate')\n", + "plt.ylabel('True Positive Rate')\n", + "plt.title('Receiver Operating Characteristic (ROC) Curve')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "ae6647ac-3dd6-406a-8d73-fb793b54d28f", + "metadata": {}, + "source": [ + "6.3 XGB
\n", + "\n",
+ " XGBoost (eXtreme Gradient Boosting) is based on the gradient boosting framework, which is an ensemble learning method that combines multiple weak or base models (typically decision trees) to create a more accurate and robust predictive model. XGBoost improves upon traditional gradient boosting by using a number of optimization techniques, including parallelization, regularization, and efficient handling of missing values, to achieve faster training times and better model performance.
\n",
+ " Teradata's XGBoost function is an implementation of the gradient boosted decision tree designed for speed and performance. In gradient boosting, each iteration fits a model to the residuals (errors) of the previous iteration to correct the errors made by existing models. The predicted residual is multiplied by this learning rate and then added to the previous prediction. Models are added sequentially until no further improvements can be made. It is called gradient boosting because it uses a gradient descent algorithm to minimize the loss when adding new models.\n",
+ "
Please refer to XGBoost for function elements and output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aff1ebc7-ecb6-44e7-8c8d-b1da9505f9bb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "XGBoost_model = XGBoost(\n",
+ " data = df_train,\n",
+ " input_columns = ['1:8','10:33'],\n",
+ " response_column = 'Churn',\n",
+ " model_type = 'CLASSIFICATION',\n",
+ " \n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "062f024d-5e3d-4a59-93d3-763812d70053",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "XGBoostPredict_out = XGBoostPredict(\n",
+ " newdata = df_test,\n",
+ " object = XGBoost_model.result,\n",
+ " id_column = 'CustomerID',\n",
+ " accumulate = 'Churn',\n",
+ " model_type = 'CLASSIFICATION',\n",
+ " object_order_column = ['task_index', 'tree_num', 'iter', 'class_num', 'tree_order'],\n",
+ " output_responses = ['0', '1'],\n",
+ " output_prob = True\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "0c70378d-9b30-41e9-bfc3-ae901884f9c4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "out_xgb = XGBoostPredict_out.result.assign(Prediction = XGBoostPredict_out.result.Prediction.cast(type_ = BYTEINT))\n",
+ "out_xgb = out_xgb.assign(Prediction = out_xgb.Prediction.cast(type_ = VARCHAR(2)))\n",
+ "out_xgb = out_xgb.assign(Churn = out_xgb.Churn.cast(type_ = VARCHAR(2)))\n",
+ "out_xgb"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "48a42d5b-5fba-4036-b094-19858e2c560f",
+ "metadata": {},
+ "source": [
+ "
We have created our model, let's do the predictions on the test dataset." + ] + }, + { + "cell_type": "markdown", + "id": "0bd173a4-10ef-4528-a6e2-b67d5f4df07e", + "metadata": {}, + "source": [ + "
6.4 Evaluation of XGB Model
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23fd8827-dc00-4604-9c9e-4ef7352cf834", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_xgb = ClassificationEvaluator(\n", + " data = out_xgb,\n", + " observation_column = 'Churn',\n", + " prediction_column = 'Prediction',\n", + " labels = ['0', '1']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aefe83b-aedc-4173-9396-68572258fcd4", + "metadata": {}, + "outputs": [], + "source": [ + "ClassificationEvaluator_xgb.output_data.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e7466ae-b42c-4005-a094-729bb1230b33", + "metadata": {}, + "outputs": [], + "source": [ + "xgb_pred = XGBoostPredict_out.result.to_pandas().reset_index().sort_values(\"CustomerID\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "26c69f90-824e-4f84-a5d1-46e771a0ee00", + "metadata": {}, + "outputs": [], + "source": [ + "print(mean_absolute_error(xgb_pred['Churn'], xgb_pred['Prob_1']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c71ab9ce-e83c-4f1b-82fb-550119a98704", + "metadata": {}, + "outputs": [], + "source": [ + "AUC = roc_auc_score(xgb_pred['Churn'], xgb_pred['Prob_1'])\n", + "AUC" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78aa9c8e-e5e0-4a89-af63-27e1e638023f", + "metadata": {}, + "outputs": [], + "source": [ + "fpr, tpr, thresholds = roc_curve(xgb_pred['Churn'], xgb_pred['Prob_1'])\n", + "plt.plot(fpr, tpr, color='orange', label='ROC. AUC = {}'.format(str(AUC)))\n", + "plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')\n", + "plt.xlabel('False Positive Rate')\n", + "plt.ylabel('True Positive Rate')\n", + "plt.title('Receiver Operating Characteristic (ROC) Curve')\n", + "plt.legend()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "e91ddacb-4ca3-405c-93c8-c587788191c4", + "metadata": {}, + "source": [ + "Conclusion
" + ] + }, + { + "cell_type": "markdown", + "id": "516e0588-5e98-4373-8f30-8e8f40898835", + "metadata": {}, + "source": [ + "In this demo we have seen how we can do analysis and pre-processing of the data in Vantage using InDb functions. We have also created two commonly used predictive models for classification and predicted the customers that are likely to churn. " + ] + }, + { + "cell_type": "markdown", + "id": "35ebb886-8da9-479a-8995-c6dd7ccebffd", + "metadata": {}, + "source": [ + "
7. Cleanup
" + ] + }, + { + "cell_type": "markdown", + "id": "f0b01f3e-03fa-4a14-b388-02eeb210b8c1", + "metadata": {}, + "source": [ + "Work Tables
\n", + "\n", + "We need to clean up our work tables to prevent errors next time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "914cfbcf-f229-496c-be13-b63c62729291", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['Transformed_data']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name = table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ad94d1e-d82d-4611-b5c7-4180397f6c94", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "10724002-0091-4ef1-b091-71c0a2fdda5a", + "metadata": {}, + "source": [ + "
Let’s look at the elements we have available for reference for this notebook:
" + ] + }, + { + "cell_type": "markdown", + "id": "90d6c2a3-92e9-4121-a46a-2beaba63cac2", + "metadata": {}, + "source": [ + "Filters:
\n", + "Related Resources:
\n", + "Reference Links:
\n", + "