diff --git a/.gitignore b/.gitignore
index ea09c3ecf..b1ea714df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 *.pyc
 .DS_Store
 backup
-chroma
\ No newline at end of file
+chroma
+venv/
+
\ No newline at end of file
diff --git a/README.md b/README.md
index 6af3a5251..0abf3d32e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,118 @@
-# rag-tutorial-v2
+# RAG Tutorial v2
+
+## Best Practices
+
+### 🔑 Key Requirements
+
+- **Use the same embedding function for indexing and querying**: The same vector embedding function MUST be used for:
+  - Storing data in the vector database
+  - Querying the database
+  - LangChain has many different embedding functions - refer to the [LangChain documentation on embedding functions](https://python.langchain.com/docs/integrations/text_embedding/)
+
+- **Document Loaders**: Various document loaders are available for different document types:
+  - CSV
+  - Markdown
+  - HTML
+  - MS Office
+  - JSON
+  - Refer to the [LangChain documentation on document loaders](https://python.langchain.com/docs/integrations/document_loaders/)
+
+## Current Problem: HuggingFace Models in Two Places
+
+### Overview
+
+HuggingFace is used in **two different places** in your RAG system:
+
+### 1. 📊 Embeddings (`get_embedding_function.py`)
+
+```python
+from langchain_huggingface import HuggingFaceEmbeddings
+
+embeddings = HuggingFaceEmbeddings(
+    model_name="all-MiniLM-L6-v2",
+    model_kwargs={'device': 'cpu'},
+    encode_kwargs={'normalize_embeddings': True}
+)
+```
+
+**Purpose**: Convert text into numerical vectors (embeddings) for similarity search
+
+- **Input**: Text chunks from your PDFs
+- **Output**: Vector representations (arrays of numbers)
+- **Used for**: Finding similar documents when you search
+
+### 2. 🤖 Language Model (`query_data.py`)
+
+```python
+from langchain_community.llms import HuggingFacePipeline
+
+model = HuggingFacePipeline.from_model_id(
+    model_id="microsoft/DialoGPT-small",
+    task="text-generation",
+    model_kwargs={"temperature": 0.7, "max_length": 512}
+)
+```
+
+**Purpose**: Generate human-like responses based on the retrieved context
+
+- **Input**: Your question + relevant document chunks
+- **Output**: Natural language answer
+- **Used for**: Creating the final response to your question
+
+## 🔄 Why Both Are Needed
+
+```mermaid
+graph TD
+    A["Your Question: How many clues can I give in Codenames?"] --> B["HuggingFace Embeddings: Convert question to vector"]
+    B --> C["Find similar docs"]
+    C --> D["Retrieved relevant text chunks from database"]
+    D --> E["HuggingFace LLM: Generate answer from question + context"]
+    E --> F["Final Answer"]
+```
+
+## ⚠️ The Problem in Your Case
+
+1. **✅ Embeddings work fine** - they found documents (just the wrong ones)
+2. **❌ Language model is poor** - DialoGPT-small gives garbled responses
+
+## 💡 Better Alternatives
+
+### For Embeddings (Keep This - It's Working)
+
+```python
+# This is fine, keep using it
+HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+```
+
+### For Language Model (Replace This)
+
+```python
+# Instead of HuggingFacePipeline, use:
+from langchain_openai import ChatOpenAI
+model = ChatOpenAI(model="gpt-3.5-turbo")  # Much better responses
+```
+
+## 📋 Installation Requirements
+
+```bash
+# For HuggingFace embeddings and the Chroma vector store
+pip install -U langchain-huggingface langchain-chroma
+
+# For OpenAI integration
+pip install langchain-openai
+
+# Set your API key
+export OPENAI_API_KEY="your-api-key-here"
+```
+
+## 🎯 Conclusion
+
+The **embeddings part** works well with HuggingFace, but the **text generation part** needs a better model like OpenAI GPT or Claude for good results.
+
+### Recommended Setup:
+- **Embeddings**: HuggingFace (`all-MiniLM-L6-v2`)
+- **Language Model**: OpenAI (`gpt-3.5-turbo`) or Claude
+
+This combination gives you:
+- Fast, local embeddings for document retrieval
+- High-quality language model for response generation
\ No newline at end of file
diff --git a/get_embedding_function.py b/get_embedding_function.py
index 79d04113b..244c5f201 100644
--- a/get_embedding_function.py
+++ b/get_embedding_function.py
@@ -1,10 +1,16 @@
-from langchain_community.embeddings.ollama import OllamaEmbeddings
-from langchain_community.embeddings.bedrock import BedrockEmbeddings
-
+from langchain_community.embeddings import OllamaEmbeddings
+from langchain_community.embeddings import HuggingFaceEmbeddings
 
 def get_embedding_function():
-    embeddings = BedrockEmbeddings(
-        credentials_profile_name="default", region_name="us-east-1"
-    )
+    # Option 1: Use Ollama embeddings (requires Ollama to be installed and running).
+    # Uncomment this if you have Ollama installed with an embedding model.
     # embeddings = OllamaEmbeddings(model="nomic-embed-text")
-    return embeddings
+
+    # Option 2: Use HuggingFace embeddings (downloads the model locally, no API key needed).
+    embeddings = HuggingFaceEmbeddings(
+        model_name="all-MiniLM-L6-v2",  # Good balance of quality and speed
+        model_kwargs={'device': 'cpu'},  # Use 'cuda' if you have a GPU
+        encode_kwargs={'normalize_embeddings': True}
+    )
+
+    return embeddings
\ No newline at end of file
diff --git a/populate_database.py b/populate_database.py
index 3d2a1ab8a..6f966e258 100644
--- a/populate_database.py
+++ b/populate_database.py
@@ -1,20 +1,17 @@
 import argparse
 import os
 import shutil
-from langchain.document_loaders.pdf import PyPDFDirectoryLoader
+from langchain_community.document_loaders import PyPDFDirectoryLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
 from langchain.schema.document import Document
 from get_embedding_function import get_embedding_function
-from langchain.vectorstores.chroma import Chroma
-
+from langchain_community.vectorstores import Chroma
 
 CHROMA_PATH = "chroma"
 DATA_PATH = "data"
-
 
 def main():
-
-    # Check if the database should be cleared (using the --clear flag).
+    # Check if the database should be cleared (using the --reset flag).
     parser = argparse.ArgumentParser()
     parser.add_argument("--reset", action="store_true", help="Reset the database.")
     args = parser.parse_args()
@@ -27,12 +24,10 @@ def main():
     chunks = split_documents(documents)
     add_to_chroma(chunks)
-
 
 def load_documents():
     document_loader = PyPDFDirectoryLoader(DATA_PATH)
     return document_loader.load()
-
 
 def split_documents(documents: list[Document]):
     text_splitter = RecursiveCharacterTextSplitter(
         chunk_size=800,
@@ -42,7 +37,6 @@ def split_documents(documents: list[Document]):
     )
     return text_splitter.split_documents(documents)
-
 
 def add_to_chroma(chunks: list[Document]):
     # Load the existing database.
     db = Chroma(
@@ -71,9 +65,7 @@ def add_to_chroma(chunks: list[Document]):
     else:
         print("✅ No new documents to add")
-
 
 def calculate_chunk_ids(chunks):
-
     # This will create IDs like "data/monopoly.pdf:6:2"
     # Page Source : Page Number : Chunk Index
@@ -100,11 +92,9 @@ def calculate_chunk_ids(chunks):
     return chunks
-
 
 def clear_database():
     if os.path.exists(CHROMA_PATH):
         shutil.rmtree(CHROMA_PATH)
-
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/query_data.py b/query_data.py
index 33299e582..190009501 100644
--- a/query_data.py
+++ b/query_data.py
@@ -1,8 +1,7 @@
 import argparse
-from langchain.vectorstores.chroma import Chroma
+from langchain_community.vectorstores import Chroma
 from langchain.prompts import ChatPromptTemplate
-from langchain_community.llms.ollama import Ollama
-
+from langchain_community.llms import HuggingFacePipeline
 from get_embedding_function import get_embedding_function
 
 CHROMA_PATH = "chroma"
@@ -17,7 +16,6 @@
 Answer the question based on the above context: {question}
 """
-
 
 def main():
     # Create CLI.
     parser = argparse.ArgumentParser()
@@ -26,7 +24,6 @@ def main():
     query_text = args.query_text
     query_rag(query_text)
-
 
 def query_rag(query_text: str):
     # Prepare the DB.
     embedding_function = get_embedding_function()
@@ -34,20 +31,40 @@ def query_rag(query_text: str):
 
     # Search the DB.
     results = db.similarity_search_with_score(query_text, k=5)
+
+    if not results:
+        print("No relevant documents found.")
+        return "No relevant documents found."
 
     context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
     prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
     prompt = prompt_template.format(context=context_text, question=query_text)
+
+    # Uncomment these lines if you want to see the prompt being sent to the model
+    # print("Prompt being sent to model:")
     # print(prompt)
-
-    model = Ollama(model="mistral")
-    response_text = model.invoke(prompt)
-
-    sources = [doc.metadata.get("id", None) for doc, _score in results]
-    formatted_response = f"Response: {response_text}\nSources: {sources}"
-    print(formatted_response)
-    return response_text
-
+    # print("="*50)
+
+    try:
+        # Option 1: Use HuggingFace Pipeline (completely local, no API key needed)
+        model = HuggingFacePipeline.from_model_id(
+            model_id="microsoft/DialoGPT-small",  # Small model for faster loading
+            task="text-generation",
+            model_kwargs={"temperature": 0.7, "max_length": 512}
+        )
+
+        response_text = model.invoke(prompt)
+
+        sources = [doc.metadata.get("id", None) for doc, _score in results]
+        formatted_response = f"Response: {response_text}\nSources: {sources}"
+        print(formatted_response)
+        return response_text
+
+    except Exception as e:
+        print(f"Error with HuggingFace model: {e}")
+        print("Make sure the required packages are installed:")
+        print("Run: pip install transformers torch")
+        return None
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
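
Below is a minimal sketch of the README's recommended combination (local HuggingFace embeddings for retrieval, `ChatOpenAI` for generation) wired into this repo's query flow. It is not part of the diff above: it assumes `langchain-openai` is installed and `OPENAI_API_KEY` is set, that the Chroma index has already been built by `populate_database.py`, and the helper name `query_rag_openai` is hypothetical.

```python
# Sketch only, not part of the diff above: the README's recommended setup of
# HuggingFace embeddings for retrieval plus ChatOpenAI for generation.
# Assumes `pip install langchain-openai` and OPENAI_API_KEY in the environment;
# `query_rag_openai` is a hypothetical helper name.
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from get_embedding_function import get_embedding_function
from query_data import CHROMA_PATH, PROMPT_TEMPLATE


def query_rag_openai(query_text: str) -> str:
    # Open the existing Chroma index with the SAME embedding function used to build it.
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embedding_function())

    # Retrieve the top-k most similar chunks and join them into one context block.
    results = db.similarity_search_with_score(query_text, k=5)
    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

    # Fill the repo's prompt template and let the hosted chat model answer.
    prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE).format(
        context=context_text, question=query_text
    )
    response = ChatOpenAI(model="gpt-3.5-turbo").invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    print(f"Response: {response.content}\nSources: {sources}")
    return response.content


if __name__ == "__main__":
    query_rag_openai("How many clues can I give in Codenames?")
```

Note that the README's key requirement still holds here: the Chroma index is opened with the same `get_embedding_function()` that built it, so only the generation step changes.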