From 6a2f80a411840a3eb9c1efa1c31e830dc4b5774d Mon Sep 17 00:00:00 2001 From: Martin Hirzel Date: Mon, 7 Oct 2024 06:20:02 -0400 Subject: [PATCH] Use simple TFIDF for RAG example. Signed-off-by: Martin Hirzel --- examples/rag/rag.pdl | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/examples/rag/rag.pdl b/examples/rag/rag.pdl index 955091293..ea6e92462 100644 --- a/examples/rag/rag.pdl +++ b/examples/rag/rag.pdl @@ -2,21 +2,14 @@ description: Retrieval-augmented generation for NL-to-Code generation task. text: - lang: python code: | # initialize PDL_SESSION.vec_db and PDL_SESSION.embed() function - import datasets, numpy, os, requests - genai_key, genai_api = os.environ["WATSONX_KEY"], os.environ["WATSONX_API"] - def embed(text): - endpoint = f"{genai_api}/v1/text/embeddings?version=2024-05-02" - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {genai_key}", - } - json_data = { - "model_id": "sentence-transformers/all-minilm-l6-v2", - "input": text, - } - response = requests.post(endpoint, headers=headers, json=json_data) - return numpy.asarray(response.json()["results"][0]) + import datasets, sklearn.feature_extraction.text train_in = datasets.load_dataset("mbpp", "sanitized", split="train") + corpus = [row["prompt"] for row in train_in] + tfidf = sklearn.feature_extraction.text.TfidfVectorizer().fit(corpus) + def embed(text): + singleton_batch = [text] + sparse_result = tfidf.transform(raw_documents=singleton_batch) + return sparse_result.toarray().flatten() train_em = train_in.map(lambda row: {"embeddings": embed(row["prompt"])}) PDL_SESSION.vec_db = train_em.add_faiss_index("embeddings") PDL_SESSION.embed = embed @@ -50,6 +43,6 @@ text: Q: ${ TEST_PROMPT } A: ``` -- model: watsonx/ibm/granite-20b-code-instruct-v2 +- model: watsonx/ibm/granite-34b-code-instruct parameters: stop: ["```"]