From d3abfc3c0ce5bdd6d5ef1ffe04c647ec7e32a6ca Mon Sep 17 00:00:00 2001
From: Daniel Jannai
Date: Thu, 20 Jun 2024 16:58:26 +0300
Subject: [PATCH 1/4] feat: Added `chat_template` and `template_kwargs` to the
 `ChatCompletionRequest` class to allow full control over HF's
 `apply_chat_template`

---
 vllm/entrypoints/openai/protocol.py     | 12 ++++++++++++
 vllm/entrypoints/openai/serving_chat.py |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index b57d79859aec..7c8a797972df 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -190,6 +190,18 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "special tokens so this should be set to False (as is the "
             "default)."),
     )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "If this is not passed, the model's default chat template will be "
+            "used instead."),
+    )
+    template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
     include_stop_str_in_output: Optional[bool] = Field(
         default=False,
         description=(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 744e1d94511b..990d546ceb6d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -222,6 +222,8 @@ async def create_chat_completion(
                 conversation=conversation,
                 tokenize=False,
                 add_generation_prompt=request.add_generation_prompt,
+                chat_template=request.chat_template,
+                **request.template_kwargs,
             )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)

From 172f596eb9130c594bc4966d4a7bdcc794991169 Mon Sep 17 00:00:00 2001
From: Daniel Jannai
Date: Thu, 20 Jun 2024 17:48:55 +0300
Subject: [PATCH 2/4] fix: renamed `template_kwargs` to `chat_template_kwargs`

---
 vllm/entrypoints/openai/protocol.py     | 2 +-
 vllm/entrypoints/openai/serving_chat.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 7c8a797972df..4bfe4d26ebcd 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -197,7 +197,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "If this is not passed, the model's default chat template will be "
             "used instead."),
     )
-    template_kwargs: Optional[Dict[str, Any]] = Field(
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
         default=None,
         description=("Additional kwargs to pass to the template renderer. "
                      "Will be accessible by the chat template."),
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 990d546ceb6d..0333d03d6a64 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -223,7 +223,7 @@ async def create_chat_completion(
                 tokenize=False,
                 add_generation_prompt=request.add_generation_prompt,
                 chat_template=request.chat_template,
-                **request.template_kwargs,
+                **request.chat_template_kwargs,
             )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)

From bc1753b94c7a6d72ab6a745ffce360976ff4c3d4 Mon Sep 17 00:00:00 2001
From: Daniel Jannai
Date: Mon, 1 Jul 2024 18:02:18 +0300
Subject: [PATCH 3/4] feat: bumped transformers and added `tools` and
 `documents` parameters

---
 requirements-common.txt                 | 2 +-
 vllm/entrypoints/openai/protocol.py     | 9 +++++++++
 vllm/entrypoints/openai/serving_chat.py | 6 +++++-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 05969cfa5d65..8762491338ea 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
 requests
 tqdm
 py-cpuinfo
-transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
+transformers >= 4.42.3 # Required for StarCoder2 & Llava, Llama 3 and for additional chat template parameters.
 tokenizers >= 0.19.1 # Required for Llama 3.
 fastapi
 aiohttp
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 4bfe4d26ebcd..fc8b44137a1c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -190,6 +190,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "special tokens so this should be set to False (as is the "
             "default)."),
     )
+    documents: Optional[List[Dict[str, str]]] = Field(
+        default=None,
+        description=
+        ("A list of dicts representing documents that will be accessible to "
+         "the model if it is performing RAG (retrieval-augmented generation)."
+         " If the template does not support RAG, this argument will have no "
+         "effect. We recommend that each document should be a dict containing "
+         "\"title\" and \"text\" keys."),
+    )
     chat_template: Optional[str] = Field(
         default=None,
         description=(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 0333d03d6a64..b81647cd2018 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -218,12 +218,16 @@ async def create_chat_completion(
             conversation.extend(chat_parsed_result.messages)
             image_futures.extend(chat_parsed_result.image_futures)
 
+            tool_dicts = [tool.model_dump() for tool in (request.tools or [])]
+
             prompt = self.tokenizer.apply_chat_template(
                 conversation=conversation,
                 tokenize=False,
                 add_generation_prompt=request.add_generation_prompt,
+                tools=tool_dicts,
+                documents=request.documents,
                 chat_template=request.chat_template,
-                **request.chat_template_kwargs,
+                **(request.chat_template_kwargs or {}),
             )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)

From d2ec754cf538269e5338bd7cde61994045a1a08d Mon Sep 17 00:00:00 2001
From: Daniel Jannai
Date: Mon, 1 Jul 2024 18:06:15 +0300
Subject: [PATCH 4/4] fix: make sure `tool_dicts` is `None` if `request.tools`
 is `None`

---
 vllm/entrypoints/openai/serving_chat.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index b81647cd2018..4a960fd7ebe1 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -218,7 +218,9 @@ async def create_chat_completion(
             conversation.extend(chat_parsed_result.messages)
             image_futures.extend(chat_parsed_result.image_futures)
 
-            tool_dicts = [tool.model_dump() for tool in (request.tools or [])]
+            tool_dicts = None if request.tools is None else [
+                tool.model_dump() for tool in request.tools
+            ]
 
             prompt = self.tokenizer.apply_chat_template(
                 conversation=conversation,
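
---

For reference, here is a minimal client-side sketch of how the request fields
added by this series (`chat_template`, `chat_template_kwargs`, and `documents`)
could be exercised together, assuming a vLLM OpenAI-compatible server is
running locally. The server URL, model name, and the toy Jinja template are
illustrative assumptions, not taken from the patches.

# Sketch only: assumes a local vLLM OpenAI-compatible server; the URL and
# model name below are placeholders, not values from the patches.
import requests

# A toy Jinja template exercising all three new fields: it reads
# `system_prefix` (supplied via `chat_template_kwargs`), renders any RAG
# `documents`, then renders the conversation itself.
TOY_TEMPLATE = (
    "{% if system_prefix %}{{ system_prefix }}\n{% endif %}"
    "{% for doc in documents %}"
    "[document] {{ doc['title'] }}: {{ doc['text'] }}\n"
    "{% endfor %}"
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}assistant:{% endif %}"
)

response = requests.post(
    "http://localhost:8000/v1/chat/completions",  # placeholder address
    json={
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
        "messages": [{"role": "user", "content": "What does vLLM do?"}],
        # The three fields added by this patch series:
        "chat_template": TOY_TEMPLATE,
        "chat_template_kwargs": {"system_prefix": "You are terse."},
        "documents": [
            {"title": "README", "text": "vLLM is a fast LLM serving engine."},
        ],
    },
    timeout=60,
)
print(response.json()["choices"][0]["message"]["content"])

Because `chat_template_kwargs` is splatted into `apply_chat_template` after the
named arguments, keys that collide with arguments the server already passes
(`add_generation_prompt`, `chat_template`, `tools`, `documents`) would raise a
`TypeError`; callers should restrict it to template-specific variables such as
the `system_prefix` used above.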