diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaint_Analysis_Customer360.yaml b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaint_Analysis_Customer360.yaml
new file mode 100644
index 00000000..57799a8d
--- /dev/null
+++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaint_Analysis_Customer360.yaml
@@ -0,0 +1,10 @@
+inputs:
+ - type: env
+ value: 'AWS_ACCESS_KEY_ID'
+ cell: 12
+ - type: env
+ value: 'AWS_SECRET_ACCESS_KEY'
+ cell: 12
+ - type: env
+ value: 'AWS_DEFAULT_REGION'
+ cell: 12
diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaint_Summarization.yaml b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaint_Summarization.yaml
new file mode 100644
index 00000000..57799a8d
--- /dev/null
+++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaint_Summarization.yaml
@@ -0,0 +1,10 @@
+inputs:
+ - type: env
+ value: 'AWS_ACCESS_KEY_ID'
+ cell: 12
+ - type: env
+ value: 'AWS_SECRET_ACCESS_KEY'
+ cell: 12
+ - type: env
+ value: 'AWS_DEFAULT_REGION'
+ cell: 12
diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaints_Classification.yaml b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaints_Classification.yaml
new file mode 100644
index 00000000..57799a8d
--- /dev/null
+++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaints_Classification.yaml
@@ -0,0 +1,10 @@
+inputs:
+ - type: env
+ value: 'AWS_ACCESS_KEY_ID'
+ cell: 12
+ - type: env
+ value: 'AWS_SECRET_ACCESS_KEY'
+ cell: 12
+ - type: env
+ value: 'AWS_DEFAULT_REGION'
+ cell: 12
diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaints_Clustering.yaml b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaints_Clustering.yaml
new file mode 100644
index 00000000..57799a8d
--- /dev/null
+++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Complaints_Clustering.yaml
@@ -0,0 +1,10 @@
+inputs:
+ - type: env
+ value: 'AWS_ACCESS_KEY_ID'
+ cell: 12
+ - type: env
+ value: 'AWS_SECRET_ACCESS_KEY'
+ cell: 12
+ - type: env
+ value: 'AWS_DEFAULT_REGION'
+ cell: 12
diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Sentiment_Analysis.yaml b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Sentiment_Analysis.yaml
new file mode 100644
index 00000000..57799a8d
--- /dev/null
+++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Sentiment_Analysis.yaml
@@ -0,0 +1,10 @@
+inputs:
+ - type: env
+ value: 'AWS_ACCESS_KEY_ID'
+ cell: 12
+ - type: env
+ value: 'AWS_SECRET_ACCESS_KEY'
+ cell: 12
+ - type: env
+ value: 'AWS_DEFAULT_REGION'
+ cell: 12
diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Topic_Modelling.yaml b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Topic_Modelling.yaml
new file mode 100644
index 00000000..57799a8d
--- /dev/null
+++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/.Topic_Modelling.yaml
@@ -0,0 +1,10 @@
+inputs:
+ - type: env
+ value: 'AWS_ACCESS_KEY_ID'
+ cell: 12
+ - type: env
+ value: 'AWS_SECRET_ACCESS_KEY'
+ cell: 12
+ - type: env
+ value: 'AWS_DEFAULT_REGION'
+ cell: 12
diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaint_Analysis_Customer360.ipynb b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaint_Analysis_Customer360.ipynb
new file mode 100644
index 00000000..51006c8a
--- /dev/null
+++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaint_Analysis_Customer360.ipynb
@@ -0,0 +1,476 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "1f44f3bc-51cc-47e3-a033-f2883ed97408",
+ "metadata": {},
+ "source": [
+ " \n",
+ " In-Database Complaints Analysis Integration with Customer360 using LLMs\n",
+ "
\n",
+ " \n",
+ "
Introduction:
\n", + "Complaints Analysis Integration with Customer360 is a comprehensive approach to managing customer complaints and feedback within the framework of a Customer 360-degree view using Teradata Vantage and Amazon Bedrock. This integration aims to provide a seamless and personalized customer experience by leveraging data from various sources, including CRM systems, marketing platforms, and social media.
The key components of this integration include:
\n", + "\n", + "The benefits of this integration include:
By integrating complaints analysis with Customer 360, businesses can create a more comprehensive and personalized customer experience, driving business growth and customer satisfaction.
\n", + "\n", + "Steps in the analysis:
\n", + "Download and install additional software as needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebc0a97b-78f8-4274-9844-d3b1d31848e6", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install --upgrade -r requirements.txt --quiet" + ] + }, + { + "cell_type": "markdown", + "id": "7a8a5583-362f-4d11-8b3a-17b20637f517", + "metadata": {}, + "source": [ + "
Note: Please restart the kernel after executing these two lines. The simplest way to restart the Kernel is by typing zero zero: 0 0
" + ] + }, + { + "cell_type": "markdown", + "id": "7e7e5046-3c5f-4f6d-aeaf-60028655ff13", + "metadata": {}, + "source": [ + "Here, we import the required libraries, set environment variables and environment paths (if required).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83dc0922-f932-4378-a505-3bb2d1f1243b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import json, warnings\n", + "import getpass\n", + "from teradataml import *\n", + "from teradatagenai import TeradataAI, TextAnalyticsAI, VSManager, VectorStore, VSApi\n", + "\n", + "\n", + "# Set display options for dataframes, plots, and warnings\n", + "%matplotlib inline\n", + "warnings.filterwarnings('ignore')\n", + "display.max_rows = 5\n", + "pd.set_option('display.max_colwidth', None)\n", + "display.suppress_vantage_runtime_warnings = True" + ] + }, + { + "cell_type": "markdown", + "id": "768bf2ed-ae11-4969-b20a-88496e4a2b67", + "metadata": {}, + "source": [ + "Connection information has been defined in an external file - adjust as necessary.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "454a2e81-c377-4058-9e68-78abd801ad9c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=text_analytics_teradatagenai_aws_huggingface.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "c62b56de-5a86-4c82-9d47-17d7a3314974", + "metadata": { + "tags": [] + }, + "source": [ + "2. Set up the LLM connection
\n", + "\n", + "The teradatagenai python library can both connect to cloud-based LLM services as well as instantiate private models running at scale on local GPU compute. In this case we will use Anthropic's claude-v2 model for low-cost, high-throughput tasks.
\n", + "\n", + "3. Use the TextAnalyticsAI
API to Perform Various Text Analytics Tasks
You can execute the help function at the bottom of this notebook to read more about this API.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d47859af-aebc-41c7-a3a1-919dccf57584", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Provide model details\n", + "model_name=\"anthropic.claude-v2\"\n", + "\n", + "# Select in-database or external model\n", + "llm = TeradataAI(api_type = 'AWS',\n", + " model_name = model_name,\n", + " region = region_name,\n", + " # authorization = 'Repositories.BedrockAuth'\n", + " access_key = access_key,\n", + " secret_key = secret_key)\n", + "\n", + "obj = TextAnalyticsAI(llm=llm)" + ] + }, + { + "cell_type": "markdown", + "id": "69cdee3e-feb5-4cfd-b8d5-99baadcde44e", + "metadata": {}, + "source": [ + "Sentiment Analysis, Topic Modeling and Complaint Summarization using Large Language Models (LLMs) revolutionizes the way we understand and categorize vast collections of text data. LLMs excel in understanding the semantics and context of words, enabling sophisticated topic modeling techniques.
\n", + "\n", + "Sentiment Analysis Using Large Language Models (LLMs) is a cutting-edge approach to understanding customer opinions and emotions expressed through text-based data. This advanced technique leverages the capabilities of LLMs to accurately identify and categorize sentiment as positive, negative, or neutral, providing businesses with valuable insights into customer perceptions and preferences.
\n", + "\n", + "LLMs can generate coherent topics without needing predefined categories, making them ideal for exploratory analysis of diverse datasets. Moreover, their ability to capture subtle nuances in language allows for more precise topic identification, even in noisy or ambiguous texts.
\n", + "\n", + "4.1 Inspect source data
\n", + "\n", + "The Teradata python package (teradataml) allows users to work with data using common python syntax and methods without moving data to the client - all operations are pushed to the MPP platform, allowing rapid, performant analytics on data at any scale. In this case, the DataFrame object represents a table or query in-database which could contain millions or billions of records.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3506e344-bf92-44f4-9cee-7d85eac3290b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "customer_data = DataFrame('\"DEMO_ComplaintAnalysis\".\"Customer_360_Details\"')\n", + "customer_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb0b5121-6071-42cd-a928-af0116094289", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "complaints_data = DataFrame(in_schema('DEMO_ComplaintAnalysis', 'Customer_360_Complaints'))\n", + "complaints_data" + ] + }, + { + "cell_type": "markdown", + "id": "85336f1f-88af-4a03-ba6d-23a69df6a881", + "metadata": {}, + "source": [ + "4.2 Sentiment Analysis
\n", + "\n", + "Extract the sentiment (positive, negative, neutral) using in-database functions that can execute in-platform or call out to Large Language Models of choice.
" + ] + }, + { + "cell_type": "markdown", + "id": "d1eb6496-37ca-4060-82b9-4137b27a733d", + "metadata": {}, + "source": [ + "A simple method call will extract the sentiment for patient comments in-database using the desired LLM and CSP provider.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7da87396-1753-49ec-a9c2-cf4eed6ad304", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf_sentiment = obj.analyze_sentiment(column = 'consumer_complaint_narrative', \n", + " data = complaints_data)[['Customer_ID','Sentiment','consumer_complaint_narrative']]\n", + "tdf_sentiment" + ] + }, + { + "cell_type": "markdown", + "id": "c023695d-eb93-4a57-bb26-95d45b13152a", + "metadata": {}, + "source": [ + "4.3 Topic Modeling
\n", + "\n", + "\n", + "LLMs can generate coherent topics without needing predefined categories, making them ideal for exploratory analysis of diverse datasets. Moreover, their ability to capture subtle nuances in language allows for more precise topic identification, even in noisy or ambiguous texts. In this case, we are looking for specific topics to drive downstream analytics.
\n", + "Provide a list of topics to use for classification.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f93540a7-731a-4cfb-a047-a8dbd3241b32", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf_topics = obj.classify(column = 'consumer_complaint_narrative', \n", + " data = complaints_data,\n", + " labels = ['Mortgage Application',\n", + " 'Payment Trouble',\n", + " 'Mortgage Closing',\n", + " 'Report Inaccuracy',\n", + " 'Payment Struggle'])[['Customer_ID','Labels','consumer_complaint_narrative']]\n", + "tdf_topics" + ] + }, + { + "cell_type": "markdown", + "id": "394cf875-57a4-4f2a-8c5e-e4fa6424f037", + "metadata": {}, + "source": [ + "4.3 Summarization
\n", + "\n", + "\n", + "The summarize method uses the model to summarize the text in the specified column of a database table. It generates an abstractive summary for the input using different levels. The conciseness of the summary can be adjusted using different levels. Higher levels yield more concise summaries.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3bd5286-1dd5-4e59-8f36-edbfcde946bf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf_summary = obj.summarize(column = 'consumer_complaint_narrative', \n", + " data = complaints_data,\n", + " levels = 2 # higher values provide more concise summary\n", + " )[['Customer_ID','Summary','consumer_complaint_narrative']]\n", + "\n", + "tdf_summary" + ] + }, + { + "cell_type": "markdown", + "id": "6830c6a0-a260-4c0a-8643-d68fa5509d68", + "metadata": {}, + "source": [ + "The developer can now perform simple joins on the data in-database to provide a consolidated view of the complaint summary, sentiment, topic label, and customer information.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72d12627-7440-4988-bab3-cd4811474f18", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf_combined = customer_data.join(tdf_topics.drop('consumer_complaint_narrative', axis = 1), on = ['\"Customer Identifier\" = Customer_ID']).drop('Customer_ID', axis = 1)\n", + "tdf_combined = tdf_combined.join(tdf_summary.drop('consumer_complaint_narrative', axis = 1), on = ['\"Customer Identifier\" = Customer_ID']).drop('Customer_ID', axis = 1)\n", + "tdf_combined = tdf_combined.join(tdf_sentiment.drop('consumer_complaint_narrative', axis = 1), on = ['\"Customer Identifier\"= Customer_ID']).drop('Customer_ID', axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed87a62b-a38a-4acb-9e94-51475866afbd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf_combined[['Customer Identifier','Name','Sentiment','Labels','Summary']]" + ] + }, + { + "cell_type": "markdown", + "id": "06cbb0f4-d026-4a0b-8ab8-6982b7f7777a", + "metadata": {}, + "source": [ + "5.1 Persist the dataset
\n", + "Simple python methods will materialize the data to a permanent table if desired.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2aae2e32-0a4b-4fef-9e65-501c41f98564", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "copy_to_sql(tdf_combined, table_name = 'Customer360', temporary = True, if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "id": "561ff317-6468-4941-bf9b-840849bfb09d", + "metadata": {}, + "source": [ + "Databases and Tables
\n", + "The following code will clean up tables and databases created above.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "358f56a0-ad97-4317-ab4c-88be00b8d179", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "6cf670cd-4594-458d-af98-8efff5a72f73", + "metadata": {}, + "source": [ + "The dataset is sourced from Consumer Financial Protection Bureau
" + ] + }, + { + "cell_type": "markdown", + "id": "eeebf3ab-357c-488e-ba9d-78bf82f4d0dd", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaint_Summarization.ipynb b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaint_Summarization.ipynb new file mode 100644 index 00000000..84a2b743 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaint_Summarization.ipynb @@ -0,0 +1,1594 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1f44f3bc-51cc-47e3-a033-f2883ed97408", + "metadata": {}, + "source": [ + "\n",
+ " Complaints Summarization Using Vantage and LLM model\n",
+ "
\n",
+ " \n",
+ "
Introduction:
\n", + "\n", + "In this demo we'll deep dive on Complaints Summarization using Teradata Vantage and AWS Bedrock - Anthropic's Claude LLM model model. This cutting-edge solution empowers organizations to efficiently manage and analyze customer complaints, providing actionable insights to enhance customer satisfaction and improve business operations.
\n", + "\n", + "In this demo we'll deep dive on Complaints Summarization using Teradata Vantage and AWS Bedrock - Anthropic's Claude LLM model. This cutting-edge solution empowers organizations to efficiently manage and analyze customer complaints, providing actionable insights to enhance customer satisfaction and improve business operations.
\n", + "\n", + "Benefits:
\n", + "Steps in the analysis:
\n", + "1.1 Downloading and installing additional software needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebc0a97b-78f8-4274-9844-d3b1d31848e6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -r requirements.txt --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "id": "54b6bcb4-ea98-42f9-9088-b15b4ddd03bc", + "metadata": {}, + "source": [ + "
Note: Please restart the kernel after executing these two lines. The simplest way to restart the Kernel is by typing zero zero: 0 0
\n", + "1.2 Import the required libraries
\n", + "Here, we import the required libraries, set environment variables and environment paths (if required).
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "83dc0922-f932-4378-a505-3bb2d1f1243b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Data manipulation and analysis\n", + "import numpy as np\n", + "import pandas as pd\n", + "import getpass\n", + "\n", + "# Plotting\n", + "import plotly.express as px\n", + "\n", + "# Progress bar\n", + "from tqdm import tqdm\n", + "\n", + "# Machine learning and other utilities from Teradata\n", + "from teradataml import *\n", + "from teradatagenai import TeradataAI, TextAnalyticsAI, VSManager, VectorStore, VSApi\n", + "\n", + "# Requests\n", + "import requests\n", + "\n", + "# Display settings\n", + "display.max_rows = 5\n", + "pd.set_option('display.max_colwidth', None)\n", + "\n", + "# Set display options for dataframes, plots, and warnings\n", + "%matplotlib inline\n", + "warnings.filterwarnings('ignore')\n", + "display.suppress_vantage_runtime_warnings = True" + ] + }, + { + "cell_type": "markdown", + "id": "768bf2ed-ae11-4969-b20a-88496e4a2b67", + "metadata": {}, + "source": [ + "We will be prompted to provide the password. We will enter the password, press the Enter key, and then use the down arrow to go to the next cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "454a2e81-c377-4058-9e68-78abd801ad9c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking if this environment is ready to connect to VantageCloud Lake...\n", + "Your environment parameter file exist. Please proceed with this use case.\n", + "Connected to VantageCloud Lake with: Engine(teradatasql://CH255039:***@54.156.178.22)\n" + ] + } + ], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=text_analytics_teradatagenai_aws_huggingface.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "aed444a1-f0de-4bff-b0b9-d2c4e92573f5", + "metadata": {}, + "source": [ + "Begin running steps with Shift + Enter keys.
" + ] + }, + { + "cell_type": "markdown", + "id": "8f83a8eb-1bd1-4bdb-ab20-cb02bf1ac869", + "metadata": {}, + "source": [ + "2. Set up the LLM connection
\n", + "\n", + "The teradatagenai python library can both connect to cloud-based LLM services as well as instantiate private models running at scale on local GPU compute. In this case we will use Anthropic's claude-v2 model for low-cost, high-throughput tasks.
\n", + "\n", + "3. Use the TextAnalyticsAI
API to Perform Various Text Analytics Tasks
You can execute the help function at the bottom of this notebook to read more about this API.
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "00636e83-ffec-4415-9309-4b025bbb0276", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Provide model details\n", + "model_name=\"anthropic.claude-v2\"\n", + "\n", + "# Select in-database or external model\n", + "llm = TeradataAI(api_type = 'AWS',\n", + " model_name = model_name,\n", + " region = region_name,\n", + " # authorization = 'Repositories.BedrockAuth'\n", + " access_key = access_key,\n", + " secret_key = secret_key)\n", + "\n", + "obj = TextAnalyticsAI(llm=llm)" + ] + }, + { + "cell_type": "markdown", + "id": "596cfbe5-e6d7-4b14-9d46-08929397e1a3", + "metadata": {}, + "source": [ + "Complaints summarization with Language Model (LLM) models involves condensing lengthy complaints into concise, informative summaries. By leveraging advanced natural language processing techniques, LLMs efficiently extract key issues, sentiments, and resolutions, aiding in quicker understanding and response to customer grievances.
\n", + "\n", + "Streamlining the complaint summarization process, Language Model (LLM) models efficiently distill verbose grievances into concise, yet informative summaries. These summaries meticulously capture crucial elements including primary issues, prevalent sentiments, and possible resolutions. Harnessing advanced natural language processing capabilities, LLMs accelerate both comprehension and response to customer concerns, thereby elevating operational efficiency and bolstering overall customer satisfaction.
" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "30ac5e3a-9264-46f5-94eb-76e813503ac6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = DataFrame(in_schema('DEMO_ComplaintAnalysis', 'Consumer_Complaints'))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0174db51-edcc-42d2-b251-c3171648a0e2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf_summary = obj.summarize(column = 'consumer_complaint_narrative', \n", + " data = df.iloc[:5],\n", + " levels = 2 # higher values provide more concise summary\n", + " )[['complaint_id','Summary','consumer_complaint_narrative']]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "a33a52e0-fe44-4c6a-aeef-af8245a28b1d", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "882ab327761a4339a82d3574e992f984", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + "complaint_id | Summary | consumer_complaint_narrative | \n", + "
---|---|---|
1295007 | \n", + "\t\t'Discover Card payments were withdrawn over a month after being posted, causing frustration.' | \n", + "\t\tOn XXXX XXXX and XXXX, 2015 ( as well several phone calls and chat sessions with Discover Card customer service ), I made inquiries to why a payment of {$61.00} that was posted to my Discover Card account on XXXX XXXX, 2015, did not have the funds withdrawn from my financial institution, XXXX XXXX XXXX XXXX XXXX in XXXX, NC, until XXXX XXXX, 2015. Nobody at either Discover Card or XXXX XXXX ' XXXX XXXX could give me a direct answer. Instead, I got the \" pass the buck '' routine. Neither financial institution should be allowed to treat customers this way. I have documentation available as proof of these events taking place. | \n", + "\t
1294108 | \n", + "\t\tA Discover cardholder had his long-term account unexpectedly closed, losing his rewards, and though promised the rewards by check, has yet to receive it after 6 weeks. | \n", + "\t\tI have been a Discover credit card holder since 2007. During my entire membership with Discover, I was never late with payments, and always stayed under my credit line, and never charged my credit card for any purposes other than making a legitimate purchase. \n", + "\n", + "About a month ago, without any notice in advance, Discover closed my account and thereby wiped out my existing cashback rewards of {$300.00}. I contacted the company and demanded for an explanation. However, the only reason I got is \" we are no longer able to meet your servicing needs '', and I was told the rewards will be mailed to me in a check. Now, after 6 weeks, I still have n't received any check from Discover regarding my rewards. \n", + "\n", + "I respectfully urge the CFPB to take this matter seriously and to look into this case. We consumers are powerless to protect ourselves from discriminatory actions like this. | \n", + "\t
1294987 | \n", + "\t\t'I am falsely accused of owing debt for an unused credit card.' | \n", + "\t\tI am being accused of having a Discover Card debt that I did n't pay off, so Discover has turned over the account to another company ; XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX to harass me. I knew nothing about this, I 've NEVER used a credit card, the name is not even my legal name, now a sheriff served my mom papers where this attorney office is threatening to seize my properties! This is so wrong to do to people! I 'VE NEVER OPENED A CREDIT CARD!!!! | \n", + "\t
1294888 | \n", + "\t\t'In XXXX my checking account was compromised resulting in additional unexpected interest charges despite sending regular payments.' | \n", + "\t\tIn XXXX XXXX my checking acct was compromised that my credit card payment was auto-pay through. Since then I made monthly money order payments. My balance in XX/XX/XXXX was {$2000.00}. I sent 4 ) {$200.00} payments. I do NOT use the card for purchases & the interest is about {$3.00} a month. My balance is still {$1400.00}. I am clearly not credited for my XX/XX/XXXX payment. My bill went up that month instead of down. They insist that I was credited. There is some confusion because they repeatedly tried getting the payment through the bank, even though I told them not to. So they credited the bank payment then added it back on repeatedly. But it should n't take a genius to do the math. {$2000.00} minus {$800.00} is {$1200.00} not {$1400.00}. I called twice & wrote a letter & keep getting the same answer. | \n", + "\t
1294631 | \n", + "\t\t'XXXX did not fulfill a promotion although the customer satisfied the terms.' | \n", + "\t\tXXXX offered a {$100.00} gift card when applying for aDiscoverXXXX Card. I clicked on the link, applied and compliedwith the requirements. The requirement was to make XXXXpurchase with the card within 3 months. It did not say thatthe purchase had to be with XXXX. After a few months, Icontacted the bank, and they said they knew nothing of theoffer. I contacted them again and the management saidI did not apply for the right card. XXXX denied anyknowledge of the offer, although it was on their website. \n", + "I would not have applied for the card, but for the offer. | \n", + "\t
4.1 Graph for Complaint and Summary Lengths
A graph illustrating the Narrative length vs summary length. On the x-axis, you'd have \"Narrative length\" ranging from short to long complaints or narratives. On the y-axis, you'd have \"Summary length\" ranging from brief to detailed summaries. As narrative length increases, summary length would generally decrease, indicating the summarization process effectively condenses longer narratives into shorter summaries. This relationship would likely follow a downward trend, showcasing the summarization efficiency of the LLM models.
" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "3f8191c6-4ab5-4b6c-9a51-f8cbca2c9aba", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "customdata": [ + [ + 1294987, + "I am being accused of having a Discover Card debt ...", + " 'I am falsely accused of owing debt for an unused..." + ], + [ + 1294631, + "XXXX offered a {$100.00} gift card when applying f...", + " 'XXXX did not fulfill a promotion although the cu..." + ], + [ + 1295007, + "On XXXX XXXX and XXXX, 2015 ( as well several phon...", + " 'Discover Card payments were withdrawn over a mon..." + ], + [ + 1294888, + "In XXXX XXXX my checking acct was compromised that...", + " 'In XXXX my checking account was compromised resu..." + ], + [ + 1294108, + "I have been a Discover credit card holder since 20...", + " A Discover cardholder had his long-term account u..." + ] + ], + "hovertemplate": "Narrative Length=%{x}Now the results can be saved back to Vantage.
" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "8b9a58e0-ec2e-4e76-8f51-f1757ca5a8b5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "copy_to_sql(df = df, table_name = 'Complaints_Summaries', if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "id": "561ff317-6468-4941-bf9b-840849bfb09d", + "metadata": {}, + "source": [ + "Work Tables
\n", + "Cleanup work tables to prevent errors next time.
" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "56ae9d4d-5a68-496c-884c-911de451314e", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['Complaints_Summaries']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "476fb82b-d7a0-4e29-a6d4-8372a247cea8", + "metadata": {}, + "source": [ + "Databases and Tables
\n", + "The following code will clean up tables and databases created above.
" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "358f56a0-ad97-4317-ab4c-88be00b8d179", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "6cf670cd-4594-458d-af98-8efff5a72f73", + "metadata": {}, + "source": [ + "The dataset is sourced from Consumer Financial Protection Bureau
" + ] + }, + { + "cell_type": "markdown", + "id": "eeebf3ab-357c-488e-ba9d-78bf82f4d0dd", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaints_Classification.ipynb b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaints_Classification.ipynb new file mode 100644 index 00000000..46560a95 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaints_Classification.ipynb @@ -0,0 +1,555 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "9fa8099b-5aa9-4a85-ab26-21e7df9e5a9a", + "metadata": {}, + "source": [ + "\n",
+ " Complaints Classification using Vantage and LLM\n",
+ "
\n",
+ " \n",
+ "
Introduction:
\n", + "\n", + "Revolutionize customer complaint resolution with our pioneering solution, which seamlessly integrates the capabilities of Teradata Vantage and AWS Bedrock - Anthropic's Claude LLM model model as LLM. This powerful synergy enables businesses to classify customer complaints with unmatched precision and speed, allowing them to swiftly identify and address concerns, thereby elevating overall customer satisfaction and loyalty.
\n", + "\n", + "Key Features:
\n", + "Benefits:
\n", + "Experience the transformative power of Generative AI in complaints classification.
\n", + "\n", + "Steps in the analysis:
\n", + "1.1 Downloading and installing additional software needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74720dfc-1195-403e-a596-c9a55773fedc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -r requirements.txt --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "id": "3e8d47d7-1a44-4da1-afa5-46ce90312021", + "metadata": {}, + "source": [ + "
Note: Please restart the kernel after executing these two lines. The simplest way to restart the Kernel is by typing zero zero: 0 0
\n", + "1.2 Import the required libraries
\n", + "Here, we import the required libraries, set environment variables and environment paths (if required).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "83dc0922-f932-4378-a505-3bb2d1f1243b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Data manipulation and analysis\n", + "import numpy as np\n", + "import pandas as pd\n", + "import json, warnings\n", + "import getpass\n", + "\n", + "# Visualization\n", + "import plotly.express as px\n", + "import matplotlib.pyplot as plt\n", + "from wordcloud import WordCloud\n", + "\n", + "# Progress bar\n", + "from tqdm import tqdm\n", + "\n", + "# Machine learning and other utilities from Teradata\n", + "from teradataml import *\n", + "from teradatagenai import TeradataAI, TextAnalyticsAI, VSManager, VectorStore, VSApi\n", + "\n", + "# Requests\n", + "import requests\n", + "\n", + "# Display settings\n", + "display.max_rows = 5\n", + "pd.set_option('display.max_colwidth', None)\n", + "# Set display options for dataframes, plots, and warnings\n", + "%matplotlib inline\n", + "warnings.filterwarnings('ignore')\n", + "display.suppress_vantage_runtime_warnings = True" + ] + }, + { + "cell_type": "markdown", + "id": "e365debb-d67d-45b9-aaad-e54bcc474818", + "metadata": {}, + "source": [ + "We will be prompted to provide the password. We will enter the password, press the Enter key, and then use the down arrow to go to the next cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "454a2e81-c377-4058-9e68-78abd801ad9c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=text_analytics_teradatagenai_aws_huggingface.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "dee44870-6034-4353-96df-f5bd00970fa9", + "metadata": {}, + "source": [ + "Begin running steps with Shift + Enter keys.
" + ] + }, + { + "cell_type": "markdown", + "id": "6cbcc301-e792-46b4-84de-e775832be566", + "metadata": {}, + "source": [ + "2. Set up the LLM connection
\n", + "\n", + "The teradatagenai python library can both connect to cloud-based LLM services as well as instantiate private models running at scale on local GPU compute. In this case we will use anthropoc claude-instant-v1 for low-cost, high-throughput tasks.
\n", + "\n", + "3. Use the TextAnalyticsAI
API to Perform Various Text Analytics Tasks
You can execute the help function at the bottom of this notebook to read more about this API.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbb331fe-a3ff-42cf-a491-8347d53818e8", + "metadata": {}, + "outputs": [], + "source": [ + "# Provide model details\n", + "model_name=\"anthropic.claude-v2\"\n", + "\n", + "# Select in-database or external model\n", + "llm = TeradataAI(api_type = 'AWS',\n", + " model_name = model_name,\n", + " region = region_name,\n", + " # authorization = 'Repositories.BedrockAuth'\n", + " access_key = access_key,\n", + " secret_key = secret_key)\n", + "\n", + "obj = TextAnalyticsAI(llm=llm)" + ] + }, + { + "cell_type": "markdown", + "id": "3d1a07c7-a9d7-49a7-a2ac-1e68c0a1282c", + "metadata": {}, + "source": [ + "We'll use a sample of the data to classify complaints
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cac31f37-2eab-410a-b1ec-9fda12f141c5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf = DataFrame(in_schema('DEMO_ComplaintAnalysis', 'Consumer_Complaints'))\n", + "tdf = tdf.assign(id = tdf.complaint_id).drop('complaint_id', axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3f38a59-8ffb-4010-897d-5d571692365a", + "metadata": {}, + "outputs": [], + "source": [ + "tdf_classified = obj.classify(\n", + " column=\"consumer_complaint_narrative\",\n", + " data=tdf,\n", + " accumulate=\"0:5\",\n", + " labels=[\"Complaint\", \"Non-Complaint\"],\n", + " multi_label=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7734659a-5b9e-4989-b334-1e17b77754d4", + "metadata": {}, + "outputs": [], + "source": [ + "tdf_classified = tdf_classified.assign(Prediction = tdf_classified.Labels.oreplace(\"[\").oreplace(\"]\")).drop(columns=['Labels', 'Message'])\n", + "tdf_classified = tdf_classified.assign(Prediction = tdf_classified.Prediction.cast(type_=VARCHAR(15)))\n", + "tdf_classified = tdf_classified.assign(Prediction = tdf_classified.Prediction.str.strip())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60feb089-0011-44db-83cd-f4f70f5ff79b", + "metadata": {}, + "outputs": [], + "source": [ + "tdf_classified" + ] + }, + { + "cell_type": "markdown", + "id": "e8daa142-e48f-4caf-bd3a-4e85e93d2e6e", + "metadata": {}, + "source": [ + "4.1 Consumer Complaints Prediction vs Occurrences
\n", + "\n", + "A graph illustrating the relationship between consumer complaints prediction and the number of occurrences. This visual representation helps identify trends, patterns, and areas for improvement, enabling data-driven decision making.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32919a9a-7a29-410a-9207-341b0f857652", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import display, Markdown\n", + "def display_helper(msg):\n", + " return display(Markdown(\n", + " f\"\"\"Note: \n", + "{msg}
\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "141400a2-a4a6-42ea-9f44-651534e474d6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "df = tdf_classified.to_pandas()\n", + "data = Counter(df['Prediction'])\n", + "\n", + "# Convert Counter data to DataFrame\n", + "viz_df = pd.DataFrame.from_dict(data, orient='index', columns=['Count']).reset_index()\n", + "\n", + "# Rename columns\n", + "viz_df.columns = ['Prediction', 'Count']\n", + "\n", + "# Create bar graph using Plotly Express\n", + "fig = px.bar(viz_df, x='Prediction', y='Count', color='Prediction',\n", + " labels={'Count': 'Number of Occurrences', 'Prediction': 'Prediction'})\n", + "\n", + "# Show the plot\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "5cc01cc7-269f-4136-a945-fcc149b04e37", + "metadata": {}, + "source": [ + "4.2 Word Cloud for Consumer Complaints Prediction
\n", + "\n", + "A visual representation of consumer complaints prediction, highlighting the most frequent words and pain points in customer feedback. This word cloud helps identify trends, sentiment, and areas for improvement, enabling data-driven decision making.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df58cb69-27f5-4878-a0d3-132c501a95e8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "complaint = df[df['Prediction'] == 'Complaint']\n", + "complaint_text = ' '.join(complaint['consumer_complaint_narrative'])\n", + "\n", + "# Replace 'X' with blank space\n", + "modified_string = complaint_text.replace('X', '')\n", + "\n", + "if len(modified_string) > 0:\n", + " wordcloud = WordCloud(width=800, height=400, background_color='white').generate(modified_string)\n", + "\n", + " # Display the word cloud\n", + " plt.imshow(wordcloud, interpolation='bilinear')\n", + " plt.title(\"Complaints\")\n", + " plt.tight_layout()\n", + " plt.axis(\"off\")\n", + " plt.show()\n", + "else:\n", + " display_helper(\"We included both complaint and non-complaint options for completeness. But since this is a complaints dataset, we don't expect to see any complaints.\")" + ] + }, + { + "cell_type": "markdown", + "id": "afe85dc8-e179-4f31-93db-1ba497724aa4", + "metadata": {}, + "source": [ + "4.3 Word Cloud for Non-Complaints Prediction
\n", + "\n", + "A visual representation of non-complaints prediction, highlighting the most frequent words and positive sentiments in customer feedback. This word cloud helps identify trends, sentiment, and areas of satisfaction, enabling data-driven decision making.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef35daa-b6a9-44a8-8562-a464d49d92f5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "non_complaint = df[df['Prediction'] == 'Non-Complaint']\n", + "non_complaint_text = ' '.join(non_complaint['consumer_complaint_narrative'])\n", + "\n", + "# Replace 'X' with blank space\n", + "modified_string = non_complaint_text.replace('X', '')\n", + "\n", + "if len(modified_string) > 0:\n", + " wordcloud = WordCloud(width=800, height=400, background_color='white').generate(modified_string)\n", + "\n", + " # Display the word cloud\n", + " plt.imshow(wordcloud, interpolation='bilinear')\n", + " plt.title(\"Non-Complaints\")\n", + " plt.tight_layout()\n", + " plt.axis(\"off\")\n", + " plt.show()\n", + "else:\n", + " display_helper(\"We included both complaint and non-complaint options for completeness. But since this is a complaints dataset, we don't expect to see any non-complaints.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8522801e-48fb-4c6d-af93-3538e13ea294", + "metadata": {}, + "source": [ + "Now the results can be saved back to Vantage.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b2d6203-9526-43ed-8b25-f5548376a10c", + "metadata": {}, + "outputs": [], + "source": [ + "copy_to_sql(df = df, table_name = 'complaints_classified', if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "id": "d37c5d71-e309-496a-a0d9-5ac9b3567fe0", + "metadata": {}, + "source": [ + "Work Tables
\n", + "Cleanup work tables to prevent errors next time.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe618c2d-b637-4577-a6c7-01a417c2e27d", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['complaints_classified']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "3265d0a0-2b15-4383-b657-c6329d547ece", + "metadata": {}, + "source": [ + "Databases and Tables
\n", + "The following code will clean up tables and databases created above.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "358f56a0-ad97-4317-ab4c-88be00b8d179", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "b82f52a3-2aad-44d8-b4dd-606249c85962", + "metadata": {}, + "source": [ + "The dataset is sourced from Consumer Financial Protection Bureau
" + ] + }, + { + "cell_type": "markdown", + "id": "ffa325e2-7b1b-49c1-a932-e75f71bf002d", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaints_Clustering.ipynb b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaints_Clustering.ipynb new file mode 100644 index 00000000..fa69a4aa --- /dev/null +++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Complaints_Clustering.ipynb @@ -0,0 +1,804 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "473a7d4a-dc19-4b44-a062-7da705197114", + "metadata": {}, + "source": [ + "\n",
+ " Complaints Clustering using Vantage and LLM\n",
+ "
\n",
+ " \n",
+ "
Introduction:
\n", + "\n", + "This feature uses advanced clustering techniques powered by Teradata Vantage and AWS Bedrock - Amazon's Titan embeddings model model to group similar customer complaints together. By identifying common themes and patterns, this functionality provides valuable insights into the key issues and pain points experienced by customers.
\n", + "\n", + "\n", + "Key Features of Complaints Clustering:
\n", + "Unlock the revolutionary potential of Generative AI to categorize and analyze complaints with unparalleled efficiency.
\n", + "\n", + "Steps in the analysis:
\n", + "1.1 Downloading and installing additional software needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e97af3c-0907-4a05-adf1-40b4ea1cded0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -r requirements.txt --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "id": "3a81070d-da02-49db-a422-c36143018277", + "metadata": {}, + "source": [ + "
Note: Please restart the kernel after executing these two lines. The simplest way to restart the Kernel is by typing zero zero: 0 0
\n", + "1.2 Import the required libraries
\n", + "Here, we import the required libraries, set environment variables and environment paths (if required).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9f49cf5-f7b9-461a-a59c-867a414d3bcc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Data manipulation and analysis\n", + "import pandas as pd\n", + "\n", + "# Suppress warnings\n", + "import warnings\n", + "warnings.filterwarnings(\"ignore\", category=DeprecationWarning)\n", + "\n", + "# General imports\n", + "import os\n", + "import getpass\n", + "\n", + "# Plotting packages\n", + "import plotly.express as px\n", + "import plotly.graph_objects as go\n", + "\n", + "# Teradata library\n", + "from teradataml import *\n", + "from teradatagenai import TeradataAI, TextAnalyticsAI, VSManager, VectorStore, VSApi\n", + "from sqlalchemy import func\n", + "\n", + "# Display settings\n", + "display.max_rows = 5\n", + "display.print_sqlmr_query = False\n", + "display.suppress_vantage_runtime_warnings = True\n", + "configure.val_install_location = \"val\"\n", + "configure.byom_install_location = \"byom\"" + ] + }, + { + "cell_type": "markdown", + "id": "8452ad4b-84f7-4b7d-b523-5f24125264c7", + "metadata": {}, + "source": [ + "We will be prompted to provide the password. We will enter the password, press the Enter key, and then use the down arrow to go to the next cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c25f3d12-a0db-43ce-8244-294ce0132097", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=text_analytics_teradatagenai_aws_huggingface.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "11417221-3825-471d-af67-2f38de4059b7", + "metadata": {}, + "source": [ + "Begin running steps with Shift + Enter keys.
" + ] + }, + { + "cell_type": "markdown", + "id": "62bbb47c-6490-45a9-b3c0-f9281f49ae35", + "metadata": {}, + "source": [ + "2. Set up the LLM connection
\n", + "\n", + "The teradatagenai python library can both connect to cloud-based LLM services as well as instantiate private models running at scale on local GPU compute. In this case we will use anthropoc claude-instant-v1 for low-cost, high-throughput tasks.
\n", + "\n", + "3.1 Graph for Count of Product Complaints Over Years
\n", + "\n", + "The provided graph visualizes the count of complaints over the past few years, categorized by product names.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4ab30b1b-13e4-4720-9df2-a2901cf92785", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "viz_df = df.assign(year = func.td_year_of_calendar(df.date_received.expression))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b692e80-8976-4240-9599-5aac385c94ef", + "metadata": {}, + "outputs": [], + "source": [ + "pd_df = viz_df.select(['product','year','complaint_id']).groupby(['product', 'year']).agg(['count']).to_pandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4161f6ec-4250-4c0c-823f-c832daea1192", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Sorting the DataFrame by year for each product\n", + "pd_df_sorted = pd_df.sort_values(by = ['product', 'year'])\n", + "\n", + "# Plotting using Plotly\n", + "fig = px.line(\n", + " pd_df_sorted,\n", + " x = 'year',\n", + " y = 'count_complaint_id',\n", + " color = 'product',\n", + " markers = True,\n", + " title = 'Count of Product Complaints Over Years'\n", + ")\n", + "\n", + "fig.update_layout(\n", + " xaxis_title = 'Year',\n", + " yaxis_title = 'Count',\n", + " legend_title = 'Product',\n", + " width = 1200,\n", + " height = 600\n", + ")\n", + "\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "d595d54c-e906-4411-b744-3cc61585dd5e", + "metadata": {}, + "source": [ + "3.2 Graph for Count of Complaints by Months
\n", + "The provided graph visualizes the count of complaints by months. We can see that the mean count is above 500, and the July and August months have the maximum complaints count.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d99c576-05c8-470c-b7a5-afaa1c71086c", + "metadata": {}, + "outputs": [], + "source": [ + "df = df.assign(complaint_month = func.td_month_of_year(df.date_received.expression))\n", + "grp_gen = df.select(['complaint_month','complaint_id']).groupby(['complaint_month']).agg(['count']).to_pandas()\n", + "\n", + "# Define a reverse mapping dictionary\n", + "reverse_month_mapping = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June',\n", + " 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'}\n", + "\n", + "# Create a new column with month names based on reverse mapping\n", + "grp_gen['month'] = grp_gen['complaint_month'].map(reverse_month_mapping)\n", + "\n", + "\n", + "fig = px.bar(\n", + " grp_gen.sort_values(by = 'complaint_month'),\n", + " x = 'month', y = 'count_complaint_id',\n", + " labels = {\n", + " 'count_complaint_id': 'Number of Complaints',\n", + " 'month': 'Complaint Month'\n", + " },\n", + " title = 'Number of Complaints by Month'\n", + ")\n", + "\n", + "# Add hover information\n", + "fig.update_traces(hovertemplate = 'Month: %{x}3.3 Graph for Number of Complaints by Product
The graph displays the number of complaints received for different products. The data shows that the highest number of complaints are related to credit cards or prepaid cards, as well as credit reporting and credit repair services.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d1797ad2-aa25-4945-aa85-e30611ae96da", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "grp_gen = df.select(['product','complaint_id']).groupby(['product']).agg(['count']).to_pandas()\n", + "\n", + "fig = px.bar(\n", + " grp_gen,\n", + " x = 'product',\n", + " y = 'count_complaint_id',\n", + " labels = {\n", + " 'count_complaint_id': 'Number of Complaints',\n", + " 'product': 'Product'\n", + " },\n", + " title = 'Number of Complaints by Product'\n", + ")\n", + "\n", + "# Add hover information\n", + "fig.update_traces(hovertemplate = 'Product: %{x}3.4 Graph for Number of Complaints by Issue
The graph displays the number of complaints received for different issues. The data shows that the highest number of complaints are related to issue of incorrect information on your report.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eab68ba2-e74b-418d-bee4-46750a90a444", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "grp_gen = df.select(['issue','complaint_id']).groupby(['issue']).agg(['count']).to_pandas()\n", + "\n", + "grp_gen = grp_gen.sort_values('count_complaint_id', ascending = False)[:10]\n", + "\n", + "fig = px.bar(\n", + " grp_gen,\n", + " x = 'issue',\n", + " y = 'count_complaint_id',\n", + " labels = {\n", + " 'count_complaint_id': 'Number of Complaints',\n", + " 'issue': 'Issue'\n", + " },\n", + " title = 'Number of Complaints by Issue(Top 10)'\n", + ")\n", + "\n", + "# Add hover information\n", + "fig.update_traces(hovertemplate = 'Issue: %{x}3.5 Graph for Number of Complaints by Sub-Issue
\n", + "\n", + "The graph displays the number of complaints received for different sub-issues. The data shows that the highest number of complaints are related to issue of information belongs to someone else.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d456215-beee-4133-9ff5-70131299bdf0", + "metadata": {}, + "outputs": [], + "source": [ + "grp_gen = df.select(['sub_issue','complaint_id']).groupby(['sub_issue']).agg(['count']).to_pandas()\n", + "\n", + "grp_gen = grp_gen.sort_values('count_complaint_id', ascending = False)[:10]\n", + "\n", + "fig = px.bar(\n", + " grp_gen,\n", + " x = 'sub_issue',\n", + " y = 'count_complaint_id',\n", + " labels = {\n", + " 'count_complaint_id': 'Number of Complaints',\n", + " 'sub_issue': 'Sub-Issue'\n", + " },\n", + " title='Number of Complaints by Sub-Issue(Top 10)'\n", + ")\n", + "\n", + "# Add hover information\n", + "fig.update_traces(hovertemplate = 'Sub-Issue: %{x}3.6 Graph for Number of Complaints by Channel
\n", + "\n", + "The graph displays the number of complaints received for different issues. The data shows that the all the complaints are submitted by web channel.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c63e282f-4e6f-40d5-8edf-852c07b1cb4c", + "metadata": {}, + "outputs": [], + "source": [ + "grp_gen = df.select(['submitted_via','complaint_id']).groupby(['submitted_via']).agg(['count']).to_pandas()\n", + "\n", + "# Create a mapping of numbers to product names\n", + "product_mapping = {i: product for i, product in enumerate(grp_gen['submitted_via'])}\n", + "\n", + "# Replace product names with numbers in the DataFrame\n", + "grp_gen['product_num'] = grp_gen['submitted_via'].map(\n", + " {product: i for i, product in enumerate(grp_gen['submitted_via'])}\n", + ")\n", + "\n", + "fig = px.bar(\n", + " grp_gen,\n", + " x = 'submitted_via',\n", + " y = 'count_complaint_id',\n", + " labels = {\n", + " 'count_complaint_id': 'Number of Complaints',\n", + " 'submitted_via': 'Submitted Via'\n", + " },\n", + " title = 'Number of Complaints by Channel'\n", + ")\n", + "\n", + "# Add hover information\n", + "fig.update_traces(hovertemplate = 'Submitted Via: %{x}\n", + "
\n",
+ " The embeddings()
function generates vector representations of text from a specified column, capturing the semantic meaning of each entry.\n",
+ "
\n", + " These embeddings can then be used for tasks such as semantic similarity, clustering, retrieval, or as input features for downstream machine learning models.\n", + "
\n", + "For our complaint clustering task, we'll be using a sample of the data to cluster the complaints. This approach will allow us to effectively analyze and categorize the complaints without using the entire dataset.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8511855-c7f5-4783-8e5a-5a2b14b79b4f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "KMeans_Model = KMeans(\n", + " data = DataFrame('complaints_embeddings'),\n", + " id_column = \"complaint_id\",\n", + " target_columns = [\"Embedding\"],\n", + " output_cluster_assignment = True,\n", + " num_clusters = 5\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a55e88-1632-46eb-b86c-3ee3d16edd8a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Data information: \\n\", KMeans_Model.model_data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d5a3743-a51a-46cd-888c-eecd85e3f3ab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "KMeans_Model.result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cd95cbc-7434-4248-84f8-20260410bd7e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "embeddings_cluster = DataFrame('complaints_embeddings').join(\n", + " other = KMeans_Model.result,\n", + " how = \"inner\",\n", + " on = \"complaint_id=complaint_id\",\n", + " lprefix = \"L_\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94563f0e-6d80-4b73-ac5a-8aa787ddfc9b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# View complaints in cluster 1\n", + "embeddings_cluster[['td_clusterid_kmeans','complaint_id','consumer_complaint_narrative']] \\\n", + " .loc[embeddings_cluster.td_clusterid_kmeans == 1]" + ] + }, + { + "cell_type": "markdown", + "id": "f8f29be4-9609-4c96-a7ab-e2f4a60ec106", + "metadata": {}, + "source": [ + "Work Tables
\n", + "Cleanup work tables to prevent errors next time.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9da55306-cfe5-4d34-978d-16c236504630", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tables = ['complaints_embeddings']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "73a8ead8-0698-4449-bc11-5a07351e3fb6", + "metadata": {}, + "source": [ + "Databases and Tables
\n", + "The following code will clean up tables and databases created above.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e697922-8d63-4709-b7b3-0f705a938255", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "75e7e6a6-1413-4b18-b800-4899bcdfb31a", + "metadata": {}, + "source": [ + "The dataset is sourced from Consumer Financial Protection Bureau
" + ] + }, + { + "cell_type": "markdown", + "id": "f9586bbb-6521-4988-bc00-40a67b08b695", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Sentiment_Analysis.ipynb b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Sentiment_Analysis.ipynb new file mode 100644 index 00000000..a2573934 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Sentiment_Analysis.ipynb @@ -0,0 +1,761 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5ce3dd8a-f3e5-40d5-ab4c-d40cc358cf7f", + "metadata": {}, + "source": [ + "\n",
+ " Sentiment Analysis Using Vantage and LLM\n",
+ "
\n",
+ " \n",
+ "
Introduction:
\n", + "\n", + "Sentiment analysis using Teradata Vantage and the advanced AWS Bedrock - Anthropic's Claude LLM model model involves leveraging cutting-edge technologies to extract insights from unstructured data. This process empowers businesses to swiftly identify and address customer concerns, enhancing overall customer satisfaction and loyalty.
\n", + "\n", + "Key Features:
\n", + "Benefits:
\n", + "Experience the transformative power of Generative AI in complaints classification.
\n", + "\n", + "Steps in the analysis:
\n", + "1.1 Downloading and installing additional software needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03eb4587-6632-4b08-a121-c98ea5a8fc48", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -r requirements.txt --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "id": "c1fd81e4-0df9-4360-b3ac-72214f135296", + "metadata": {}, + "source": [ + "
Note: Please restart the kernel after executing these two lines. The simplest way to restart the Kernel is by typing zero zero: 0 0
\n", + "1.2 Import the required libraries
\n", + "Here, we import the required libraries, set environment variables and environment paths (if required).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70311ba4-fad6-42b7-b0bc-bf6f45b4d73b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Data manipulation and analysis\n", + "import numpy as np\n", + "import pandas as pd\n", + "import getpass\n", + "\n", + "# Visualization\n", + "import plotly.express as px\n", + "import matplotlib.pyplot as plt\n", + "from wordcloud import WordCloud\n", + "\n", + "# Progress bar\n", + "from tqdm import tqdm\n", + "\n", + "# Machine learning and other utilities from Teradata\n", + "from teradataml import *\n", + "from sqlalchemy import func\n", + "from teradatagenai import TeradataAI, TextAnalyticsAI, VSManager, VectorStore, VSApi\n", + "\n", + "# Requests\n", + "import requests\n", + "\n", + "# Display settings\n", + "display.max_rows = 5\n", + "pd.set_option('display.max_colwidth', None)\n", + "\n", + "# Set display options for dataframes, plots, and warnings\n", + "%matplotlib inline\n", + "warnings.filterwarnings('ignore')\n", + "display.suppress_vantage_runtime_warnings = True" + ] + }, + { + "cell_type": "markdown", + "id": "1763fd95-5000-4a2d-8b86-7be261e20847", + "metadata": {}, + "source": [ + "We will be prompted to provide the password. We will enter the password, press the Enter key, and then use the down arrow to go to the next cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de5eba56-d38f-4204-b30d-232c7d894eb0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=text_analytics_teradatagenai_aws_huggingface.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "0ce81699-6345-4dbf-a3a8-b9402c7b6a98", + "metadata": {}, + "source": [ + "Begin running steps with Shift + Enter keys.
" + ] + }, + { + "cell_type": "markdown", + "id": "c4aedced-59c4-4887-9ff1-25cfe56e0403", + "metadata": {}, + "source": [ + "2. Set up the LLM connection
\n", + "\n", + "The teradatagenai python library can both connect to cloud-based LLM services as well as instantiate private models running at scale on local GPU compute. In this case we will use anthropoc claude-instant-v1 for low-cost, high-throughput tasks.
\n", + "\n", + "3. Use the TextAnalyticsAI
API to Perform Various Text Analytics Tasks
You can execute the help function at the bottom of this notebook to read more about this API.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9ee8d70-231f-4fff-b416-6588acd6ac88", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Provide model details\n", + "model_name=\"anthropic.claude-v2\"\n", + "\n", + "# Select in-database or external model\n", + "llm = TeradataAI(api_type = 'AWS',\n", + " model_name = model_name,\n", + " region = region_name,\n", + " # authorization = 'Repositories.BedrockAuth'\n", + " access_key = access_key,\n", + " secret_key = secret_key)\n", + "\n", + "obj = TextAnalyticsAI(llm=llm)" + ] + }, + { + "cell_type": "markdown", + "id": "4332348c-f45e-4f03-9184-81746d39f566", + "metadata": {}, + "source": [ + "We'll analyze the sentiments of a sample of customer complaints data.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff462b32-8742-4ac0-a40f-3e201120b3bc", + "metadata": {}, + "outputs": [], + "source": [ + "tdf = DataFrame(in_schema('DEMO_ComplaintAnalysis', 'Consumer_Complaints'))\n", + "tdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d091617-4179-4953-ac50-7f9e966e4c0d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf_sentiment = obj.analyze_sentiment(column = 'consumer_complaint_narrative', \n", + " data = tdf)[['date_received','complaint_id','Sentiment','consumer_complaint_narrative', 'product']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f09bdb52-f057-4cad-b82d-8b92c29ca10d", + "metadata": {}, + "outputs": [], + "source": [ + "tdf_sentiment" + ] + }, + { + "cell_type": "markdown", + "id": "9162a5c4-a3ee-44c0-be09-5bfaa9b5d827", + "metadata": {}, + "source": [ + "Now the results can be saved back to Vantage.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1626db2a-1159-4560-8ae3-704abcd9e081", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "copy_to_sql(df = tdf_sentiment, table_name = 'complaints_sentiment', if_exists = 'replace')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5245af62-abf4-4ed5-b4c2-7f06620587a8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sentiment_df = DataFrame('complaints_sentiment')\n", + "sentiment_df = sentiment_df.assign(date_received = sentiment_df.date_received.cast(type_=DATE))\n", + "sentiment_df = sentiment_df.assign(Sentiment = sentiment_df.Sentiment.str.strip())\n", + "print('Before: ', sentiment_df.shape)\n", + "sentiment_df = sentiment_df.loc[sentiment_df.Sentiment.isin(['positive', 'negative', 'neutral'])]\n", + "print('After: ', sentiment_df.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "c29ca4fd-3282-4eac-a402-0c72542a5a30", + "metadata": {}, + "source": [ + "4.1 Consumer Sentiments Prediction vs Occurrences
\n", + "\n", + "A graph illustrating the relationship between consumer sentiments (positive, negative, neutral) prediction and the number of occurrences. This visual representation helps identify trends, patterns, and areas for improvement, enabling data-driven decision making.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31955976-23c1-4897-9140-77768fd4400c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from IPython.display import display, Markdown\n", + "def display_helper(msg):\n", + " return display(Markdown(\n", + " f\"\"\"Note: \n", + "{msg}
\"\"\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "faa3be08-0b27-4748-a7d8-32023b183761", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "data = Counter(sentiment_df[['Sentiment']].get_values().flatten())\n", + "\n", + "# Convert Counter data to DataFrame\n", + "df = pd.DataFrame.from_dict(data, orient='index', columns=['Count']).reset_index()\n", + "\n", + "# Rename columns\n", + "df.columns = ['Sentiment', 'Count']\n", + "\n", + "# Create bar graph using Plotly Express\n", + "fig = px.bar(df, x='Sentiment', y='Count', color='Sentiment',\n", + " labels={'Count': 'Number of Occurrences', 'Sentiment': 'Sentiment'})\n", + "\n", + "# Show the plot\n", + "fig.show()" + ] + }, + { + "cell_type": "markdown", + "id": "1211a149-ec81-4cd3-8423-963952da4d01", + "metadata": {}, + "source": [ + "4.2 Word Cloud for Negative Consumer Sentiment Prediction
\n", + "\n", + "Unlock the power of customer feedback with our intuitive word cloud visualization, which provides a comprehensive snapshot of negative consumer complaints sentiment. This innovative tool highlights the most frequently occurring words and pain points in customer feedback, empowering businesses to:
By leveraging this word cloud, businesses can proactively address customer concerns, refine their products and services, and ultimately drive growth through a deeper understanding of their customers' needs and preferences.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7f3644c-c55d-4864-87ee-57403c86e9b7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "neg = sentiment_df[sentiment_df['Sentiment'] == 'negative'].to_pandas()\n", + "neg_text = ' '.join(neg['consumer_complaint_narrative'])\n", + "\n", + "# Replace 'X' with blank space\n", + "modified_string = neg_text.replace('X', '')\n", + "\n", + "if len(modified_string) > 0:\n", + " wordcloud = WordCloud(width=800, height=400, background_color='white').generate(modified_string)\n", + "\n", + " # Display the word cloud\n", + " plt.imshow(wordcloud, interpolation='bilinear')\n", + " plt.tight_layout()\n", + " plt.axis(\"off\")\n", + " plt.show()\n", + "else:\n", + " display_helper(\"We included positive, negative, and neutral categories to cover all bases. But in this sample, it's possible that none of the complaints are actually negative.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a3c9017a-4470-4184-b7c3-334a2dd7ef4d", + "metadata": {}, + "source": [ + "4.3 Word Cloud for Neutral Consumer Sentiment Prediction
\n", + "\n", + "Tap into the insights of customer feedback with our intuitive word cloud visualization, which offers a detailed overview of neutral consumer complaints sentiment
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02aede14-b229-4835-a878-aed4bf1fef94", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "neu = sentiment_df[sentiment_df['Sentiment'] == 'neutral'].to_pandas()\n", + "neu_text = ' '.join(neu['consumer_complaint_narrative'])\n", + "\n", + "# Replace 'X' with blank space\n", + "modified_string = neu_text.replace('X', '')\n", + "\n", + "if len(modified_string) > 0:\n", + " wordcloud = WordCloud(width=800, height=400, background_color='white').generate(modified_string)\n", + "\n", + " # Display the word cloud\n", + " plt.imshow(wordcloud, interpolation='bilinear')\n", + " plt.tight_layout()\n", + " plt.axis(\"off\")\n", + " plt.show()\n", + "else:\n", + " display_helper(\"To cover all possible scenarios, we included positive, negative, and neutral categories in our analysis. However, given that this dataset consists of complaints, it's expected that the model would rarely, if ever, encounter positive or neutral responses.\")" + ] + }, + { + "cell_type": "markdown", + "id": "d63d3d37-cf05-4497-9d74-a612762fe3e7", + "metadata": {}, + "source": [ + "4.4 Word Cloud for Positive Consumer Sentiment Prediction
\n", + "\n", + "Explore customer feedback insights with our intuitive word cloud visualization, providing a detailed overview of consumer sentiment.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f881a57-4421-436b-9d2b-1103ba6ad005", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pos = sentiment_df[sentiment_df['Sentiment'] == 'positive'].to_pandas()\n", + "pos_text = ' '.join(pos['consumer_complaint_narrative'])\n", + "\n", + "# Replace 'X' with blank space\n", + "modified_string = pos_text.replace('X', '')\n", + "\n", + "if len(modified_string) > 0:\n", + " wordcloud = WordCloud(width=800, height=400, background_color='white').generate(modified_string)\n", + "\n", + " # Display the word cloud\n", + " plt.imshow(wordcloud, interpolation='bilinear')\n", + " plt.tight_layout()\n", + " plt.axis(\"off\")\n", + " plt.show()\n", + "else:\n", + " display_helper(\"To cover all possible scenarios, we included positive, negative, and neutral categories in our analysis. However, given that this dataset consists of complaints, it's expected that the model would rarely, if ever, encounter positive or neutral responses.\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f81682d-268c-498a-b7d6-fa3d57698fd8", + "metadata": {}, + "source": [ + "4.5 Negative Sentiment per Product Over Years
\n", + "\n", + "This graph tracks the negative sentiment associated with different products over time, offering valuable insights into evolving customer perceptions and pain points.
\n", + "\n", + "We will use Vantage in-db function OrdinalEncodingFit which will identifies distinct categorical values from the input data or a user-defined list and generates the distinct categorical values along with the ordinal value for each category.
0:\n", + "\n", + " viz_senti = viz_neg.select(['product','Sentiment', 'year']).groupby(['product', 'year']).agg(['sum']).to_pandas()\n", + "\n", + " # Sorting the DataFrame by year for each product\n", + " pd_df_sorted = viz_senti.sort_values(by=['product', 'year'])\n", + "\n", + " # Plotting using Plotly\n", + " fig = px.line(pd_df_sorted, x='year', y='sum_Sentiment', color='product', markers=True, title='Negative Sentiment per Product Over Years')\n", + " fig.update_layout(xaxis_title='Year', yaxis_title='Count', legend_title='Product', width=1000, height=600)\n", + "\n", + " fig.show()\n", + "else:\n", + " display_helper(\"We included positive, negative, and neutral categories to cover all bases. But in this sample, it's possible that none of the complaints are actually negative.\")" + ] + }, + { + "cell_type": "markdown", + "id": "8fc49e14-3beb-4206-811e-558026246e50", + "metadata": {}, + "source": [ + "
4.6 Neutral Sentiment per Product Over Years
\n", + "\n", + "This graph tracks the neutral sentiment associated with different products over time, offering valuable insights into evolving customer perceptions and pain points.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04da72eb-c4c4-4087-9a1f-f47f0dc31e23", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "viz_neu = result[result['Sentiment'] == 0]\n", + "\n", + "if viz_neu.shape[0] > 0:\n", + " viz_senti = viz_neu.select(['product','Sentiment', 'year']).groupby(['product', 'year']).agg(['sum']).to_pandas()\n", + "\n", + " # Sorting the DataFrame by year for each product\n", + " pd_df_sorted = viz_senti.sort_values(by=['product', 'year'])\n", + "\n", + " # Plotting using Plotly\n", + " fig = px.line(pd_df_sorted, x='year', y='sum_Sentiment', color='product', markers=True, title='Neutral Sentiment per Product Over Years')\n", + " fig.update_layout(xaxis_title='Year', yaxis_title='Count', legend_title='Product', width=1000, height=600)\n", + "\n", + " fig.show()\n", + "else:\n", + " display_helper(\"To cover all possible scenarios, we included positive, negative, and neutral categories in our analysis. However, given that this dataset consists of complaints, it's expected that the model would rarely, if ever, encounter positive or neutral responses.\")" + ] + }, + { + "cell_type": "markdown", + "id": "389e811a-737d-4953-965b-56b591409aad", + "metadata": {}, + "source": [ + "4.7 Positive Sentiment per Product Over Years
\n", + "\n", + "This graph tracks the positive sentiment associated with different products over time, offering valuable insights into evolving customer perceptions and pain points.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b218073f-c7b5-4971-ac96-5bd2217e80dd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "viz_pos = result[result['Sentiment'] == 1]\n", + "\n", + "if viz_pos.shape[0] > 0:\n", + " viz_senti = viz_pos.select(['product','Sentiment', 'year']).groupby(['product', 'year']).agg(['sum']).to_pandas()\n", + "\n", + " # Sorting the DataFrame by year for each product\n", + " pd_df_sorted = viz_senti.sort_values(by=['product', 'year'])\n", + "\n", + " # Plotting using Plotly\n", + " fig = px.line(pd_df_sorted, x='year', y='sum_Sentiment', color='product', markers=True, title='Positive Sentiment per Product Over Years')\n", + " fig.update_layout(xaxis_title='Year', yaxis_title='Count', legend_title='Product', width=1000, height=600)\n", + "\n", + " fig.show()\n", + "else:\n", + " display_helper(\"To cover all possible scenarios, we included positive, negative, and neutral categories in our analysis. However, given that this dataset consists of complaints, it's expected that the model would rarely, if ever, encounter positive or neutral responses.\")" + ] + }, + { + "cell_type": "markdown", + "id": "a44bc090-a387-41b6-90a5-638a16dc3d4f", + "metadata": {}, + "source": [ + "Work Tables
\n", + "Cleanup work tables to prevent errors next time.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bc7d7637-276b-4c0e-a79c-82aaa33e9a51", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['complaints_sentiment']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "3b6ba28d-07d2-4322-9f59-e81855e1389c", + "metadata": {}, + "source": [ + "Databases and Tables
\n", + "The following code will clean up tables and databases created above.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7e82715-0855-4d59-990c-481d3a9d3f1b", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "ebbd137a-7bfc-49fa-a18e-ca34ba68919d", + "metadata": {}, + "source": [ + "The dataset is sourced from Consumer Financial Protection Bureau
" + ] + }, + { + "cell_type": "markdown", + "id": "19bdc0df-dc13-4a13-bf73-c7039a54c3ba", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Topic_Modelling.ipynb b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Topic_Modelling.ipynb new file mode 100644 index 00000000..e58410f2 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/Topic_Modelling.ipynb @@ -0,0 +1,507 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "877a4d39-8348-4330-9121-ec41f89e0b66", + "metadata": {}, + "source": [ + "\n",
+ " Topic Modelling using Vantage and LLM\n",
+ "
\n",
+ " \n",
+ "
Introduction:
\n", + "\n", + "In this comprehensive user demo, we will delve into the world of topic modeling using Teradata Vantage and AWS Bedrock - Anthropic's Claude LLM model. This cutting-edge technology empowers businesses to uncover hidden insights from vast amounts of consumer complaints data, enabling them to identify trends, improve customer satisfaction, and enhance their overall brand reputation.
\n", + "\n", + "Key Features:
\n", + "\n", + "Benefits:
\n", + "\n", + "Steps in the analysis:
\n", + "1.1 Downloading and installing additional software needed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef4ea764-3ec3-43bf-ae6a-528ab9fd5883", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -r requirements.txt --upgrade --quiet" + ] + }, + { + "cell_type": "markdown", + "id": "d700c07d-c611-455f-9335-35312dbc12a2", + "metadata": {}, + "source": [ + "
Note: Please restart the kernel after executing these two lines. The simplest way to restart the Kernel is by typing zero zero: 0 0
\n", + "1.2 Import the required libraries
\n", + "Here, we import the required libraries, set environment variables and environment paths (if required).
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d327004-e3cd-4c48-90d2-e3b1c4602e24", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Data manipulation and analysis\n", + "import numpy as np\n", + "import pandas as pd\n", + "import json, warnings\n", + "import getpass\n", + "\n", + "# Visualization\n", + "import plotly.express as px\n", + "\n", + "# Progress bar\n", + "from tqdm import tqdm\n", + "\n", + "# Machine learning and other utilities from Teradata\n", + "from teradataml import *\n", + "from teradatagenai import TeradataAI, TextAnalyticsAI, VSManager, VectorStore, VSApi\n", + "\n", + "# Requests\n", + "import requests\n", + "\n", + "# Display settings\n", + "display.max_rows = 5\n", + "pd.set_option('display.max_colwidth', None)\n", + "\n", + "# Set display options for dataframes, plots, and warnings\n", + "%matplotlib inline\n", + "warnings.filterwarnings('ignore')\n", + "display.suppress_vantage_runtime_warnings = True" + ] + }, + { + "cell_type": "markdown", + "id": "3cc65823-8d76-4ce4-90c0-7005679e8dcc", + "metadata": {}, + "source": [ + "We will be prompted to provide the password. We will enter the password, press the Enter key, and then use the down arrow to go to the next cell.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bec3d84c-b60d-4bd8-9d85-9a905d1c01fa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(\"Checking if this environment is ready to connect to VantageCloud Lake...\")\n", + "\n", + "if os.path.exists(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\"):\n", + " print(\"Your environment parameter file exist. Please proceed with this use case.\")\n", + " # Load all the variables from the .env file into a dictionary\n", + " env_vars = dotenv_values(\"/home/jovyan/JupyterLabRoot/VantageCloud_Lake/.config/.env\")\n", + " # Create the Context\n", + " eng = create_context(host=env_vars.get(\"host\"), username=env_vars.get(\"username\"), password=env_vars.get(\"my_variable\"))\n", + " execute_sql('''SET query_band='DEMO=text_analytics_teradatagenai_aws_huggingface.ipynb;' UPDATE FOR SESSION;''')\n", + " print(\"Connected to VantageCloud Lake with:\", eng)\n", + "else:\n", + " print(\"Your environment has not been prepared for connecting to VantageCloud Lake.\")\n", + " print(\"Please contact the support team.\")" + ] + }, + { + "cell_type": "markdown", + "id": "cdd806c9-dcd4-4b6a-ae42-854b9cc74187", + "metadata": {}, + "source": [ + "Begin running steps with Shift + Enter keys.
" + ] + }, + { + "cell_type": "markdown", + "id": "7e0fc107-abf5-4d22-8ed5-6113299808da", + "metadata": {}, + "source": [ + "2. Set up the LLM connection
\n", + "\n", + "The teradatagenai python library can both connect to cloud-based LLM services as well as instantiate private models running at scale on local GPU compute. In this case we will use anthropoc claude-instant-v1 for low-cost, high-throughput tasks.
\n", + "\n", + "3. Use the TextAnalyticsAI
API to Perform Various Text Analytics Tasks
You can execute the help function at the bottom of this notebook to read more about this API.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67b80af4-55d3-48e5-95d5-b30224ad484d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Provide model details\n", + "model_name=\"anthropic.claude-v2\"\n", + "\n", + "# Select in-database or external model\n", + "llm = TeradataAI(api_type = 'AWS',\n", + " model_name = model_name,\n", + " region = region_name,\n", + " # authorization = 'Repositories.BedrockAuth'\n", + " access_key = access_key,\n", + " secret_key = secret_key)\n", + "\n", + "obj = TextAnalyticsAI(llm=llm)" + ] + }, + { + "cell_type": "markdown", + "id": "a9cabc44-d40d-4f27-ab0a-cfc09315e179", + "metadata": {}, + "source": [ + "Here we subset the data to get only the complaints related to Mortgage. We further analyze the issues of those complaints and pick the top 5 topics.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c11e365c-0c34-447b-9889-8c7681790cce", + "metadata": {}, + "outputs": [], + "source": [ + "df = df[df.product == 'Mortgage']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fd59f8a-349e-4f4f-b689-534b6cb9a97f", + "metadata": {}, + "outputs": [], + "source": [ + "df.select(['issue', 'sub_issue', 'complaint_id']).groupby(['issue', 'sub_issue']).agg(['count']).sort('count_complaint_id', ascending = False)" + ] + }, + { + "cell_type": "markdown", + "id": "deeba736-584f-4fff-8439-fe934764722e", + "metadata": {}, + "source": [ + "According to the result above, we can classify the issues into the following topics:
\n", + "\n", + "Topic modeling using Large Language Models (LLMs) revolutionizes the way we understand and categorize vast collections of text data. LLMs excel in understanding the semantics and context of words, enabling sophisticated topic modeling techniques.
\n", + "\n", + "Traditionally, topic modeling algorithms like Latent Dirichlet Allocation (LDA) rely on statistical patterns within documents to identify topics. However, LLMs offer a more nuanced approach. By leveraging their deep understanding of language, LLMs can extract complex themes and topics from unstructured text data with higher accuracy and flexibility.
\n", + "\n", + "LLMs can generate coherent topics without needing predefined categories, making them ideal for exploratory analysis of diverse datasets. Moreover, their ability to capture subtle nuances in language allows for more precise topic identification, even in noisy or ambiguous texts.
\n", + "\n", + "Reasoning with a Chain of Thought: Imagine you're trying to solve a problem. With a large language model, you start with an initial idea or question. Then, you use the model's capabilities to explore related concepts, gradually connecting them together. Each step builds upon the previous one, leading you closer to understanding or solving the problem. It's like putting together puzzle pieces, one by one, until you see the whole picture.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf854c5a-5b50-4950-b146-bcd137abfa75", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tdf_topics = obj.classify(column = 'consumer_complaint_narrative', \n", + " data = df,\n", + " labels = ['Mortgage Application',\n", + " 'Payment Trouble',\n", + " 'Mortgage Closing',\n", + " 'Report Inaccuracy',\n", + " 'Payment Struggle'])[['complaint_id','Labels','consumer_complaint_narrative']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0ae39c62-a536-40ed-aafd-59e2cda7dc41", + "metadata": {}, + "outputs": [], + "source": [ + "tdf_topics" + ] + }, + { + "cell_type": "markdown", + "id": "00cb0cb4-572d-49c4-9068-c4dcfc326c7e", + "metadata": {}, + "source": [ + "Now the results can be saved back to Vantage.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a394e4d4-c28a-411c-8b03-cc1cc28b6814", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "copy_to_sql(df = tdf_topics, table_name = 'topic_prediction', if_exists = 'replace')" + ] + }, + { + "cell_type": "markdown", + "id": "ea01a05e-2116-4c7c-94db-ac369a14231f", + "metadata": {}, + "source": [ + "5.1 Number of Complaints by Predicted Topic
\n", + "\n", + "A graph illustrating the Number of Complaints by Predicted Topic reveals that the majority of complaints are centered around Mortgage Application, while the fewest are related to Mortgage Closing.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67c13569-4302-400d-965f-288c1e42dcf4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "grp_gen = DataFrame('topic_prediction').select(['Labels','complaint_id']).groupby(['Labels']).agg(['count']).to_pandas()\n", + "\n", + "grp_gen = grp_gen.sort_values('count_complaint_id', ascending = False)[:10]\n", + "\n", + "fig = px.bar(grp_gen, x='Labels', y='count_complaint_id',\n", + " labels={'count_complaint_id': 'Number of Complaints', 'Labels': 'Labels'},\n", + " title='Number of Complaints by Predicted Topic')\n", + "\n", + "# Add hover information\n", + "fig.update_traces(hovertemplate='Issue: %{x}Work Tables
\n", + "Cleanup work tables to prevent errors next time.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f132db5b-4889-4089-9bd3-ef737aa51d9a", + "metadata": {}, + "outputs": [], + "source": [ + "tables = ['topic_prediction']\n", + "\n", + "# Loop through the list of tables and execute the drop table command for each table\n", + "for table in tables:\n", + " try:\n", + " db_drop_table(table_name=table)\n", + " except:\n", + " pass" + ] + }, + { + "cell_type": "markdown", + "id": "07a477ec-ea8e-4394-afcb-3fce4f3c0d19", + "metadata": {}, + "source": [ + "Databases and Tables
\n", + "The following code will clean up tables and databases created above.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef7255f3-2d27-4b26-b391-149e79ee0d50", + "metadata": {}, + "outputs": [], + "source": [ + "remove_context()" + ] + }, + { + "cell_type": "markdown", + "id": "b6cbe888-1ced-464e-b5e4-05d6af91dd5e", + "metadata": {}, + "source": [ + "The dataset is sourced from Consumer Financial Protection Bureau
" + ] + }, + { + "cell_type": "markdown", + "id": "6c5372e5-c7c9-4e9e-908a-6fba19b4e022", + "metadata": {}, + "source": [ + "" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/requirements.txt b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/requirements.txt new file mode 100644 index 00000000..7a3fb086 --- /dev/null +++ b/VantageCloud_Lake/UseCases/Customer_Complaints_Analyzer/requirements.txt @@ -0,0 +1,4 @@ +teradataml==20.0.0.5 +teradatagenai>=20.0.0.1 +wordcloud +Pillow \ No newline at end of file