From d3abfc3c0ce5bdd6d5ef1ffe04c647ec7e32a6ca Mon Sep 17 00:00:00 2001
From: Daniel Jannai
Date: Thu, 20 Jun 2024 16:58:26 +0300
Subject: [PATCH 1/4] feat: Added `chat_template` and `template_kwargs` to the
 `ChatCompletionRequest` class to allow full control over HF's
 `apply_chat_template`

---
 vllm/entrypoints/openai/protocol.py     | 12 ++++++++++++
 vllm/entrypoints/openai/serving_chat.py |  2 ++
 2 files changed, 14 insertions(+)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index b57d79859aec..7c8a797972df 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -190,6 +190,18 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "special tokens so this should be set to False (as is the "
             "default)."),
     )
+    chat_template: Optional[str] = Field(
+        default=None,
+        description=(
+            "A Jinja template to use for this conversion. "
+            "If this is not passed, the model's default chat template will be "
+            "used instead."),
+    )
+    template_kwargs: Optional[Dict[str, Any]] = Field(
+        default=None,
+        description=("Additional kwargs to pass to the template renderer. "
+                     "Will be accessible by the chat template."),
+    )
     include_stop_str_in_output: Optional[bool] = Field(
         default=False,
         description=(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 744e1d94511b..990d546ceb6d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -222,6 +222,8 @@ async def create_chat_completion(
                 conversation=conversation,
                 tokenize=False,
                 add_generation_prompt=request.add_generation_prompt,
+                chat_template=request.chat_template,
+                **request.template_kwargs,
             )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)

From 172f596eb9130c594bc4966d4a7bdcc794991169 Mon Sep 17 00:00:00 2001
From: Daniel Jannai
Date: Thu, 20 Jun 2024 17:48:55 +0300
Subject: [PATCH 2/4] fix: renamed `template_kwargs` to `chat_template_kwargs`

---
 vllm/entrypoints/openai/protocol.py     | 2 +-
 vllm/entrypoints/openai/serving_chat.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 7c8a797972df..4bfe4d26ebcd 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -197,7 +197,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "If this is not passed, the model's default chat template will be "
             "used instead."),
     )
-    template_kwargs: Optional[Dict[str, Any]] = Field(
+    chat_template_kwargs: Optional[Dict[str, Any]] = Field(
         default=None,
         description=("Additional kwargs to pass to the template renderer. "
                      "Will be accessible by the chat template."),
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 990d546ceb6d..0333d03d6a64 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -223,7 +223,7 @@ async def create_chat_completion(
                 tokenize=False,
                 add_generation_prompt=request.add_generation_prompt,
                 chat_template=request.chat_template,
-                **request.template_kwargs,
+                **request.chat_template_kwargs,
             )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)

From bc1753b94c7a6d72ab6a745ffce360976ff4c3d4 Mon Sep 17 00:00:00 2001
From: Daniel Jannai
Date: Mon, 1 Jul 2024 18:02:18 +0300
Subject: [PATCH 3/4] feat: bumped transformers and added `tools` and
 `documents` parameters

---
 requirements-common.txt                 | 2 +-
 vllm/entrypoints/openai/protocol.py     | 9 +++++++++
 vllm/entrypoints/openai/serving_chat.py | 6 +++++-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index 05969cfa5d65..8762491338ea 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -6,7 +6,7 @@ numpy < 2.0.0
 requests
 tqdm
 py-cpuinfo
-transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
+transformers >= 4.42.3 # Required for StarCoder2 & Llava, Llama 3 and for additional chat template parameters.
 tokenizers >= 0.19.1 # Required for Llama 3.
 fastapi
 aiohttp
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 4bfe4d26ebcd..fc8b44137a1c 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -190,6 +190,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
             "special tokens so this should be set to False (as is the "
             "default)."),
     )
+    documents: Optional[List[Dict[str, str]]] = Field(
+        default=None,
+        description=
+        ("A list of dicts representing documents that will be accessible to "
+         "the model if it is performing RAG (retrieval-augmented generation)."
+         " If the template does not support RAG, this argument will have no "
+         "effect. We recommend that each document should be a dict containing "
+         "\"title\" and \"text\" keys."),
+    )
     chat_template: Optional[str] = Field(
         default=None,
         description=(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 0333d03d6a64..b81647cd2018 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -218,12 +218,16 @@ async def create_chat_completion(
             conversation.extend(chat_parsed_result.messages)
             image_futures.extend(chat_parsed_result.image_futures)
 
+            tool_dicts = [tool.model_dump() for tool in (request.tools or [])]
+
             prompt = self.tokenizer.apply_chat_template(
                 conversation=conversation,
                 tokenize=False,
                 add_generation_prompt=request.add_generation_prompt,
+                tools=tool_dicts,
+                documents=request.documents,
                 chat_template=request.chat_template,
-                **request.chat_template_kwargs,
+                **(request.chat_template_kwargs or {}),
             )
         except Exception as e:
             logger.error("Error in applying chat template from request: %s", e)

From d2ec754cf538269e5338bd7cde61994045a1a08d Mon Sep 17 00:00:00 2001
From: Daniel Jannai
Date: Mon, 1 Jul 2024 18:06:15 +0300
Subject: [PATCH 4/4] fix: make sure `tool_dicts` is `None` if `request.tools`
 is `None`

---
 vllm/entrypoints/openai/serving_chat.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index b81647cd2018..4a960fd7ebe1 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -218,7 +218,9 @@ async def create_chat_completion(
             conversation.extend(chat_parsed_result.messages)
             image_futures.extend(chat_parsed_result.image_futures)
 
-            tool_dicts = [tool.model_dump() for tool in (request.tools or [])]
+            tool_dicts = None if request.tools is None else [
+                tool.model_dump() for tool in request.tools
+            ]
 
             prompt = self.tokenizer.apply_chat_template(
                 conversation=conversation,
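
---

For reference, here is a minimal client-side sketch of how the request fields
added by this series (`chat_template`, `chat_template_kwargs`, and `documents`)
could be exercised together, assuming a vLLM OpenAI-compatible server is
running locally. The server URL, model name, and the toy Jinja template are
illustrative assumptions, not taken from the patches.

# Sketch only: assumes a local vLLM OpenAI-compatible server; the URL and
# model name below are placeholders, not values from the patches.
import requests

# A toy Jinja template exercising all three new fields: it reads
# `system_prefix` (supplied via `chat_template_kwargs`), renders any RAG
# `documents`, then renders the conversation itself.
TOY_TEMPLATE = (
    "{% if system_prefix %}{{ system_prefix }}\n{% endif %}"
    "{% for doc in documents %}"
    "[document] {{ doc['title'] }}: {{ doc['text'] }}\n"
    "{% endfor %}"
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "{% if add_generation_prompt %}assistant:{% endif %}"
)

response = requests.post(
    "http://localhost:8000/v1/chat/completions",  # placeholder address
    json={
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model
        "messages": [{"role": "user", "content": "What does vLLM do?"}],
        # The three fields added by this patch series:
        "chat_template": TOY_TEMPLATE,
        "chat_template_kwargs": {"system_prefix": "You are terse."},
        "documents": [
            {"title": "README", "text": "vLLM is a fast LLM serving engine."},
        ],
    },
    timeout=60,
)
print(response.json()["choices"][0]["message"]["content"])

Because `chat_template_kwargs` is splatted into `apply_chat_template` after the
named arguments, keys that collide with arguments the server already passes
(`add_generation_prompt`, `chat_template`, `tools`, `documents`) would raise a
`TypeError`; callers should restrict it to template-specific variables such as
the `system_prefix` used above.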