4 changes: 3 additions & 1 deletion .gitignore
@@ -1,4 +1,6 @@
*.pyc
.DS_Store
backup
chroma
chroma
venv/

119 changes: 118 additions & 1 deletion README.md
@@ -1 +1,118 @@
# rag-tutorial-v2
# RAG Tutorial v2

## Best Practices

### 🔑 Key Requirements

- **Use the same embedding function in both places**: the exact same vector embedding function MUST be used for (a short sketch follows this list):
  - Storing data in the vector database
  - Querying the database
  - LangChain has many different embedding functions; refer to the [LangChain documentation on embedding functions](https://python.langchain.com/docs/integrations/text_embedding/)

- **Document Loaders**: Various document loaders are available for different document types:
  - CSV
  - Markdown
  - HTML
  - MS Office
  - JSON
  - Refer to the [LangChain documentation on document loaders](https://python.langchain.com/docs/integrations/document_loaders/)
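
A minimal sketch of the first requirement, reusing this repo's `get_embedding_function()` and the `data`/`chroma` folders (chunking is skipped here for brevity):

```python
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.vectorstores import Chroma

from get_embedding_function import get_embedding_function

# One embedding function, created once and reused everywhere.
embedding_function = get_embedding_function()

# Indexing: load the PDFs and store them with that embedding function.
documents = PyPDFDirectoryLoader("data").load()
db = Chroma(persist_directory="chroma", embedding_function=embedding_function)
db.add_documents(documents)

# Querying: the SAME embedding function must back the database you search.
results = db.similarity_search("How many clues can I give in Codenames?", k=5)
print(results[0].page_content)
```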

## Current Problem: HuggingFace Models in Two Places

### Overview

HuggingFace is used in **two different places** in your RAG system:

### 1. 📊 Embeddings (`get_embedding_function.py`)

```python
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)
```

**Purpose**: Convert text into numerical vectors (embeddings) for similarity search

- **Input**: Text chunks from your PDFs
- **Output**: Vector representations (arrays of numbers)
- **Used for**: Finding similar documents when you search
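
A tiny usage sketch to make the input/output concrete (the 384-dimension size is specific to `all-MiniLM-L6-v2`):

```python
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vector = embeddings.embed_query("How many clues can I give in Codenames?")
print(len(vector))  # 384 numbers for all-MiniLM-L6-v2
print(vector[:3])   # small floats, e.g. [0.01, -0.08, 0.04]
```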

### 2. 🤖 Language Model (`query_data.py`)

```python
from langchain_community.llms import HuggingFacePipeline

model = HuggingFacePipeline.from_model_id(
    model_id="microsoft/DialoGPT-small",
    task="text-generation",
    model_kwargs={"temperature": 0.7, "max_length": 512}
)
```

**Purpose**: Generate human-like responses based on the retrieved context

- **Input**: Your question + relevant document chunks
- **Output**: Natural language answer
- **Used for**: Creating the final response to your question
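
Continuing the snippet above, a rough sketch of how the question and the retrieved context are combined into one prompt (the real template lives in `query_data.py`; the context string here is made up):

```python
# `model` is the HuggingFacePipeline created in the snippet above.
context = "A spymaster's clue must be a single word plus a number..."  # made-up retrieved chunk
question = "How many clues can I give in Codenames?"

prompt = f"Answer the question based only on the following context:\n{context}\n\nQuestion: {question}"
print(model.invoke(prompt))  # natural-language answer generated from question + context
```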

## 🔄 Why Both Are Needed

```mermaid
graph TD
    A["Your Question: How many clues can I give in Codenames?"] --> B["HuggingFace Embeddings: Convert question to vector"]
    B --> C[Find similar docs]
    C --> D[Retrieved relevant text chunks from database]
    D --> E["HuggingFace LLM: Generate answer from question + context"]
    E --> F[Final Answer]
```
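
The same flow in code, as a sketch that wires together the pieces shown above (paths and helpers are the ones from this repo):

```python
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import Chroma

from get_embedding_function import get_embedding_function

question = "How many clues can I give in Codenames?"

# 1-2. Embed the question (done internally by Chroma) and retrieve similar chunks.
db = Chroma(persist_directory="chroma", embedding_function=get_embedding_function())
results = db.similarity_search_with_score(question, k=5)

# 3. Assemble the retrieved chunks into a single context string.
context = "\n\n---\n\n".join(doc.page_content for doc, _score in results)

# 4-5. Ask the language model to answer from that context.
model = HuggingFacePipeline.from_model_id(
    model_id="microsoft/DialoGPT-small",
    task="text-generation",
    model_kwargs={"temperature": 0.7, "max_length": 512},
)
prompt = f"Answer the question based only on the following context:\n{context}\n\nQuestion: {question}"
print(model.invoke(prompt))
```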

## ⚠️ The Problem in Your Case

1. **✅ Embeddings work fine** - they retrieved documents, just not the most relevant ones
2. **❌ Language Model is poor** - DialoGPT-small produces garbled, low-quality responses

## 💡 Better Alternatives

### For Embeddings (Keep This - It's Working)

```python
# This is fine, keep using it
HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
```

### For Language Model (Replace This)

```python
# Instead of HuggingFacePipeline, use:
from langchain_openai import ChatOpenAI
model = ChatOpenAI(model="gpt-3.5-turbo") # Much better responses
```
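
Swapping it into `query_rag` is then a one-line change; a sketch, assuming `OPENAI_API_KEY` is set (note that chat models return a message object, so the text is on `.content`):

```python
from langchain_openai import ChatOpenAI

model = ChatOpenAI(model="gpt-3.5-turbo")
response = model.invoke(prompt)  # `prompt` is the same context + question string as before
print(response.content)          # chat models return a message; the text lives in .content
```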

## 📋 Installation Requirements

```bash
# For improved embeddings
pip install -U langchain-huggingface langchain-chroma

# For OpenAI integration
pip install langchain-openai

# Set your API key
export OPENAI_API_KEY="your-api-key-here"
```
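
A quick, throwaway check that the package and the API key are picked up (not part of the tutorial code; it makes one small paid API call):

```python
from langchain_openai import ChatOpenAI

# Should print a short reply if langchain-openai is installed and OPENAI_API_KEY is valid.
print(ChatOpenAI(model="gpt-3.5-turbo").invoke("Reply with the single word: ready").content)
```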

## 🎯 Conclusion

The **embeddings part** works well with HuggingFace, but the **text generation part** needs a better model like OpenAI GPT or Claude for good results.

### Recommended Setup:
- **Embeddings**: HuggingFace (`all-MiniLM-L6-v2`)
- **Language Model**: OpenAI (`gpt-3.5-turbo`) or Claude

This combination gives you:
- Fast, local embeddings for document retrieval
- High-quality language model for response generation
20 changes: 13 additions & 7 deletions get_embedding_function.py
@@ -1,10 +1,16 @@
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain_community.embeddings.bedrock import BedrockEmbeddings

from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings

def get_embedding_function():
    embeddings = BedrockEmbeddings(
        credentials_profile_name="default", region_name="us-east-1"
    )
    # Option 1: Use Ollama embeddings (requires Ollama to be installed and running)
    # Uncomment this if you have Ollama installed with an embedding model
    # embeddings = OllamaEmbeddings(model="nomic-embed-text")
    return embeddings

    # Option 2: Use HuggingFace embeddings (downloads model locally, no API key needed)
    embeddings = HuggingFaceEmbeddings(
        model_name="all-MiniLM-L6-v2", # Good balance of performance and speed
        model_kwargs={'device': 'cpu'}, # Use 'cuda' if you have GPU
        encode_kwargs={'normalize_embeddings': True}
    )

    return embeddings
18 changes: 4 additions & 14 deletions populate_database.py
@@ -1,20 +1,17 @@
import argparse
import os
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from get_embedding_function import get_embedding_function
from langchain.vectorstores.chroma import Chroma

from langchain_community.vectorstores import Chroma

CHROMA_PATH = "chroma"
DATA_PATH = "data"


def main():

    # Check if the database should be cleared (using the --clear flag).
    # Check if the database should be cleared (using the --reset flag).
    parser = argparse.ArgumentParser()
    parser.add_argument("--reset", action="store_true", help="Reset the database.")
    args = parser.parse_args()
@@ -27,12 +24,10 @@ def main():
    chunks = split_documents(documents)
    add_to_chroma(chunks)


def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()


def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
@@ -42,7 +37,6 @@ def split_documents(documents: list[Document]):
    )
    return text_splitter.split_documents(documents)


def add_to_chroma(chunks: list[Document]):
    # Load the existing database.
    db = Chroma(
@@ -71,9 +65,7 @@ def add_to_chroma(chunks: list[Document]):
    else:
        print("✅ No new documents to add")


def calculate_chunk_ids(chunks):

    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index

@@ -100,11 +92,9 @@ def calculate_chunk_ids(chunks):

    return chunks


def clear_database():
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)


if __name__ == "__main__":
    main()
    main()
47 changes: 32 additions & 15 deletions query_data.py
@@ -1,8 +1,7 @@
import argparse
from langchain.vectorstores.chroma import Chroma
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama

from langchain_community.llms import HuggingFacePipeline
from get_embedding_function import get_embedding_function

CHROMA_PATH = "chroma"
@@ -17,7 +16,6 @@
Answer the question based on the above context: {question}
"""


def main():
    # Create CLI.
    parser = argparse.ArgumentParser()
@@ -26,28 +24,47 @@ def main():
    query_text = args.query_text
    query_rag(query_text)


def query_rag(query_text: str):
    # Prepare the DB.
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_score(query_text, k=5)

    if not results:
        print("No relevant documents found.")
        return "No relevant documents found."

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Uncomment this line if you want to see the prompt being sent to the model
    # print("Prompt being sent to model:")
    # print(prompt)

    model = Ollama(model="mistral")
    response_text = model.invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return response_text

    # print("="*50)

    try:
        # Option 1: Use HuggingFace Pipeline (completely local, no API key needed)
        model = HuggingFacePipeline.from_model_id(
            model_id="microsoft/DialoGPT-small", # Small model for faster loading
            task="text-generation",
            model_kwargs={"temperature": 0.7, "max_length": 512}
        )

        response_text = model.invoke(prompt)

        sources = [doc.metadata.get("id", None) for doc, _score in results]
        formatted_response = f"Response: {response_text}\nSources: {sources}"
        print(formatted_response)
        return response_text

    except Exception as e:
        print(f"Error with HuggingFace model: {e}")
        print("Installing required packages...")
        print("Run: pip install transformers torch")
        return None

if __name__ == "__main__":
    main()
    main()