
Commit f6e95fb

bofenghuang authored and joerunde committed
[Frontend] OpenAI API server: Do not add bos token by default when encoding (vllm-project#4688)
1 parent fd979cb · commit f6e95fb

2 files changed: +22 additions, -12 deletions


vllm/entrypoints/openai/serving_chat.py

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ async def create_chat_completion(
         try:
             # Tokenize/detokenize depending on prompt format (string/token list)
             prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
-                request, prompt=prompt)
+                request, prompt=prompt, add_special_tokens=False)
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
             decoding_config = await self.engine.get_decoding_config()
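
The prompt string reaching this call was rendered by the tokenizer's chat template, which already contains the BOS token, so encoding it again with special tokens would duplicate the BOS. A minimal standalone sketch of the duplication this avoids, using a plain Hugging Face tokenizer (the Llama-2 chat checkpoint and the variable names are only illustrative, not part of this change):

    from transformers import AutoTokenizer

    # Illustrative checkpoint: any model whose chat template emits bos_token.
    tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
    messages = [{"role": "user", "content": "Hello!"}]

    # The chat template renders the BOS token into the prompt string itself.
    prompt = tok.apply_chat_template(messages, tokenize=False)

    with_bos = tok(prompt, add_special_tokens=True).input_ids
    without_bos = tok(prompt, add_special_tokens=False).input_ids

    # Encoding with add_special_tokens=True prepends a second BOS token,
    # e.g. [1, 1, ...] instead of [1, ...] for Llama-2.
    print(with_bos[:2], without_bos[:2])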

vllm/entrypoints/openai/serving_engine.py

Lines changed: 21 additions & 11 deletions
@@ -1,7 +1,7 @@
 import json
 from dataclasses import dataclass
 from http import HTTPStatus
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from pydantic import Field
 from typing_extensions import Annotated
@@ -165,24 +165,34 @@ def _maybe_get_lora(
         raise ValueError(f"The model `{request.model}` does not exist.")
 
     def _validate_prompt_and_tokenize(
-        self,
-        request: Union[ChatCompletionRequest, CompletionRequest,
-                       EmbeddingRequest],
-        prompt: Optional[str] = None,
-        prompt_ids: Optional[List[int]] = None,
-        truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
-    ) -> Tuple[List[int], str]:
+            self,
+            request: Union[ChatCompletionRequest, CompletionRequest,
+                           EmbeddingRequest],
+            prompt: Optional[str] = None,
+            prompt_ids: Optional[List[int]] = None,
+            truncate_prompt_tokens: Optional[Annotated[int,
+                                                       Field(ge=1)]] = None,
+            add_special_tokens: bool = True) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
         if (prompt and prompt_ids):
             raise ValueError(
                 "Only one of prompt or prompt_ids should be provided.")
 
         if prompt_ids is None:
-            tokenizer_kwargs = {} if truncate_prompt_tokens is None else {
-                "truncation": True,
-                "max_length": truncate_prompt_tokens,
+            # When using OpenAIServingChat for chat completions, the
+            # special tokens (e.g., BOS) have already been added by the
+            # chat template. Therefore, we do not need to add them again.
+            # Set add_special_tokens to False to avoid adding the BOS tokens
+            # again.
+            tokenizer_kwargs: Dict[str, Any] = {
+                "add_special_tokens": add_special_tokens
             }
+            if truncate_prompt_tokens is not None:
+                tokenizer_kwargs.update({
+                    "truncation": True,
+                    "max_length": truncate_prompt_tokens,
+                })
             input_ids = self.tokenizer(prompt, **tokenizer_kwargs).input_ids
         elif truncate_prompt_tokens is not None:
             input_ids = prompt_ids[-truncate_prompt_tokens:]
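
For reference, the new keyword-argument handling can be exercised outside the server with a Hugging Face tokenizer; the encode_prompt helper and the gpt2 checkpoint below are illustrative choices, not vLLM code:

    from typing import Any, Dict, List, Optional

    from transformers import AutoTokenizer


    def encode_prompt(tokenizer,
                      prompt: str,
                      truncate_prompt_tokens: Optional[int] = None,
                      add_special_tokens: bool = True) -> List[int]:
        # Mirrors the kwargs construction above: add_special_tokens is always
        # passed through, truncation only when a limit is requested.
        tokenizer_kwargs: Dict[str, Any] = {
            "add_special_tokens": add_special_tokens
        }
        if truncate_prompt_tokens is not None:
            tokenizer_kwargs.update({
                "truncation": True,
                "max_length": truncate_prompt_tokens,
            })
        return tokenizer(prompt, **tokenizer_kwargs).input_ids


    if __name__ == "__main__":
        tok = AutoTokenizer.from_pretrained("gpt2")  # any public checkpoint works
        # Completion path: keep the tokenizer defaults and cap the prompt length.
        print(encode_prompt(tok, "Hello, world!", truncate_prompt_tokens=4))
        # Chat path: the chat template has already inserted any special tokens,
        # so the server now encodes with add_special_tokens=False.
        print(encode_prompt(tok, "Hello, world!", add_special_tokens=False))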

0 commit comments

Comments
 (0)