From 23a6b41c395798de4b745cfb3fa129696b75fea6 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Thu, 30 May 2024 10:23:50 +0000 Subject: [PATCH 1/6] [Frontend] Add tokenize/detokenize endpoints --- tests/entrypoints/test_openai_server.py | 35 +++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 20 ++++++++++- vllm/entrypoints/openai/protocol.py | 17 +++++++++ vllm/entrypoints/openai/serving_completion.py | 18 +++++++++- vllm/entrypoints/openai/serving_engine.py | 11 ++++-- 5 files changed, 97 insertions(+), 4 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index 972137030f46..dd189d951780 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -8,6 +8,7 @@ # using Ray for overall ease of process management, parallel requests, # and debugging. import ray +import requests import torch # downloading lora to test lora requests from huggingface_hub import snapshot_download @@ -1154,5 +1155,39 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI, assert embeddings.usage.total_tokens == 17 +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_tokenize(server, client: openai.AsyncOpenAI, model_name: str): + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast") + + for add_special in [False, True]: + prompt = "This is a test prompt." + tokens = tokenizer.encode(prompt, add_special_tokens=add_special) + + response = requests.post("http://localhost:8000/tokenize", + json={ + "add_special_tokens": add_special, + "prompt": prompt + }) + assert response.json() == {"tokens": tokens} + + +@pytest.mark.parametrize( + "model_name", + [MODEL_NAME], +) +async def test_detokenize(server, client: openai.AsyncOpenAI, model_name: str): + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + + prompt = "This is a test prompt." 
+ tokens = tokenizer.encode(prompt, add_special_tokens=False) + + response = requests.post("http://localhost:8000/detokenize", + json={"tokens": tokens}) + assert response.json() == {"prompt": prompt} + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 97b35262329e..23c0ad760502 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -23,7 +23,11 @@ from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, - EmbeddingRequest, ErrorResponse) + DetokenizeRequest, + DetokenizeResponse, + EmbeddingRequest, ErrorResponse, + TokenizeRequest, + TokenizeResponse) from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding @@ -85,6 +89,20 @@ async def health() -> Response: return Response(status_code=200) +@app.post("/tokenize") +async def tokenize(request: TokenizeRequest): + response = openai_serving_completion.create_tokenize(request) + assert isinstance(response, TokenizeResponse) + return JSONResponse(content=response.model_dump()) + + +@app.post("/detokenize") +async def detokenize(request: DetokenizeRequest): + response = openai_serving_completion.create_detokenize(request) + assert isinstance(response, DetokenizeResponse) + return JSONResponse(content=response.model_dump()) + + @app.get("/v1/models") async def show_available_models(): models = await openai_serving_chat.show_available_models() diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e380212a4d76..4f1d26acaaf4 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -602,3 +602,20 @@ class BatchRequestOutput(OpenAIBaseModel): # For requests that failed with a non-HTTP error, this will contain more # information on the cause of the failure. 
error: Optional[Any] + + +class TokenizeRequest(OpenAIBaseModel): + prompt: str + add_special_tokens: bool = Field(default=True) + + +class TokenizeResponse(OpenAIBaseModel): + tokens: List[int] + + +class DetokenizeRequest(OpenAIBaseModel): + tokens: List[int] + + +class DetokenizeResponse(OpenAIBaseModel): + prompt: str diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 2fb122edaf98..706b616e8b52 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -15,7 +15,10 @@ CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, - UsageInfo) + DetokenizeRequest, + DetokenizeResponse, + TokenizeRequest, + TokenizeResponse, UsageInfo) # yapf: enable from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing) @@ -413,3 +416,16 @@ def _create_completion_logprobs( tokens=out_tokens, top_logprobs=out_top_logprobs, ) + + def create_tokenize(self, request: TokenizeRequest) -> TokenizeResponse: + (input_ids, input_text) = self._validate_prompt_and_tokenize( + request, + prompt=request.prompt, + add_special_tokens=request.add_special_tokens) + return TokenizeResponse(tokens=input_ids) + + def create_detokenize(self, + request: DetokenizeRequest) -> DetokenizeResponse: + (input_ids, input_text) = self._validate_prompt_and_tokenize( + request, prompt_ids=request.tokens) + return DetokenizeResponse(prompt=input_text) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 066acdf1c019..307c6d868eac 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -10,9 +10,10 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, + DetokenizeRequest, EmbeddingRequest, ErrorResponse, ModelCard, ModelList, - ModelPermission) + ModelPermission, TokenizeRequest) from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.sequence import Logprob @@ -125,7 +126,8 @@ def _maybe_get_lora( def _validate_prompt_and_tokenize( self, request: Union[ChatCompletionRequest, CompletionRequest, - EmbeddingRequest], + DetokenizeRequest, EmbeddingRequest, + TokenizeRequest], prompt: Optional[str] = None, prompt_ids: Optional[List[int]] = None, truncate_prompt_tokens: Optional[Annotated[int, @@ -171,6 +173,11 @@ def _validate_prompt_and_tokenize( f"generation. 
Please reduce the length of the input.", ) return input_ids, input_text + # Note: TokenizeRequest and DetokenizeRequest doesn't have max_tokens + # and does not require model context length validation + if isinstance(request, (TokenizeRequest, DetokenizeRequest)): + return input_ids, input_text + if request.max_tokens is None: if token_num >= self.max_model_len: raise ValueError( From 6326e74f5176be5e71619f4a6dc983ef5106dc00 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Tue, 4 Jun 2024 10:40:24 +0000 Subject: [PATCH 2/6] fix yapf error --- vllm/entrypoints/openai/api_server.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 23c0ad760502..b5c7b345d414 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -20,6 +20,8 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.entrypoints.openai.cli_args import make_arg_parser +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, ChatCompletionResponse, CompletionRequest, @@ -28,6 +30,7 @@ EmbeddingRequest, ErrorResponse, TokenizeRequest, TokenizeResponse) +# yapf: enable from vllm.entrypoints.openai.serving_chat import OpenAIServingChat from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion from vllm.entrypoints.openai.serving_embedding import OpenAIServingEmbedding From 093447b73e40ef485055929fac0a50119e3418d1 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Wed, 5 Jun 2024 17:04:45 +0000 Subject: [PATCH 3/6] add count and max_model_len to tokenize response --- vllm/entrypoints/openai/protocol.py | 2 ++ vllm/entrypoints/openai/serving_completion.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 64d0a2128113..f8ce771239a8 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -665,6 +665,8 @@ class TokenizeRequest(OpenAIBaseModel): class TokenizeResponse(OpenAIBaseModel): tokens: List[int] + count: int + max_model_len: int class DetokenizeRequest(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a9f8b4a6a71c..8ef8caba658e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -422,7 +422,9 @@ def create_tokenize(self, request: TokenizeRequest) -> TokenizeResponse: request, prompt=request.prompt, add_special_tokens=request.add_special_tokens) - return TokenizeResponse(tokens=input_ids) + return TokenizeResponse(tokens=input_ids, + count=len(input_ids), + max_model_len=self.max_model_len) def create_detokenize(self, request: DetokenizeRequest) -> DetokenizeResponse: From 72b640d99e351c11c85bb85db86a96c3f06c63ce Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Tue, 25 Jun 2024 18:47:39 +0000 Subject: [PATCH 4/6] check model name --- vllm/entrypoints/openai/protocol.py | 2 ++ vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 12 +++++++++++- vllm/entrypoints/openai/serving_embedding.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 7 ++++--- 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index e54f6eaab332..91d279aaf569 100644 --- 
a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -691,6 +691,7 @@ class BatchRequestOutput(OpenAIBaseModel): class TokenizeRequest(OpenAIBaseModel): + model: str prompt: str add_special_tokens: bool = Field(default=True) @@ -702,6 +703,7 @@ class TokenizeResponse(OpenAIBaseModel): class DetokenizeRequest(OpenAIBaseModel): + model: str tokens: List[int] diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 76940612496a..b6b3d6eee05a 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -202,7 +202,7 @@ async def create_chat_completion( NOTE: Currently we do not support the following feature: - function_call (Users should implement this by themselves) """ - error_check_ret = await self._check_model(request) + error_check_ret = self._check_model(request) if error_check_ret is not None: return error_check_ret diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index a8470966c4b4..657f198c199f 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -82,7 +82,7 @@ async def create_completion(self, request: CompletionRequest, - suffix (the language models we currently support do not support suffix) """ - error_check_ret = await self._check_model(request) + error_check_ret = self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -437,16 +437,26 @@ def _create_completion_logprobs( ) def create_tokenize(self, request: TokenizeRequest) -> TokenizeResponse: + error_check_ret = self._check_model(request) + if error_check_ret is not None: + return error_check_ret + (input_ids, input_text) = self._validate_prompt_and_tokenize( request, prompt=request.prompt, add_special_tokens=request.add_special_tokens) + return TokenizeResponse(tokens=input_ids, count=len(input_ids), max_model_len=self.max_model_len) def create_detokenize(self, request: DetokenizeRequest) -> DetokenizeResponse: + error_check_ret = self._check_model(request) + if error_check_ret is not None: + return error_check_ret + (input_ids, input_text) = self._validate_prompt_and_tokenize( request, prompt_ids=request.tokens) + return DetokenizeResponse(prompt=input_text) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index cbf09f173fb6..f79b40ce34f2 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -68,7 +68,7 @@ async def create_embedding(self, request: EmbeddingRequest, See https://platform.openai.com/docs/api-reference/embeddings/create for the API specification. This API mimics the OpenAI Embedding API. 
""" - error_check_ret = await self._check_model(request) + error_check_ret = self._check_model(request) if error_check_ret is not None: return error_check_ret diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index fe4582f52118..8d53eb0357a0 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -99,9 +99,10 @@ def create_streaming_error_response( }) return json_str - async def _check_model( - self, request: Union[CompletionRequest, ChatCompletionRequest, - EmbeddingRequest] + def _check_model( + self, request: Union[ChatCompletionRequest, CompletionRequest, + DetokenizeRequest, EmbeddingRequest, + TokenizeRequest] ) -> Optional[ErrorResponse]: if request.model in self.served_model_names: return None From 6c4908d80ae73b8d080ac288ee498a7767e893c2 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Wed, 26 Jun 2024 07:49:53 +0000 Subject: [PATCH 5/6] fixes --- tests/entrypoints/test_openai_server.py | 24 +++++++++++++++++++----- vllm/entrypoints/openai/api_server.py | 20 ++++++++++++++------ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py index efaf17ee77b0..6d3a5f204b9b 100644 --- a/tests/entrypoints/test_openai_server.py +++ b/tests/entrypoints/test_openai_server.py @@ -1367,37 +1367,51 @@ async def test_long_seed(client: openai.AsyncOpenAI): or "less_than_equal" in exc_info.value.message) +@pytest.mark.asyncio @pytest.mark.parametrize( "model_name", [MODEL_NAME], ) async def test_tokenize(server, client: openai.AsyncOpenAI, model_name: str): + base_url = str(client.base_url)[:-3] tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast") for add_special in [False, True]: prompt = "This is a test prompt." tokens = tokenizer.encode(prompt, add_special_tokens=add_special) - response = requests.post("http://localhost:8000/tokenize", + response = requests.post(base_url + "/tokenize", json={ "add_special_tokens": add_special, + "model": model_name, "prompt": prompt }) - assert response.json() == {"tokens": tokens} + response.raise_for_status() + assert response.json() == { + "tokens": tokens, + "count": len(tokens), + "max_model_len": 8192 + } +@pytest.mark.asyncio @pytest.mark.parametrize( "model_name", [MODEL_NAME], ) async def test_detokenize(server, client: openai.AsyncOpenAI, model_name: str): - tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME) + base_url = str(client.base_url)[:-3] + tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME, tokenizer_mode="fast") prompt = "This is a test prompt." 
tokens = tokenizer.encode(prompt, add_special_tokens=False) - response = requests.post("http://localhost:8000/detokenize", - json={"tokens": tokens}) + response = requests.post(base_url + "detokenize", + json={ + "model": model_name, + "tokens": tokens + }) + response.raise_for_status() assert response.json() == {"prompt": prompt} diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 23e169e7edbe..d13bd44623eb 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -94,16 +94,24 @@ async def health() -> Response: @app.post("/tokenize") async def tokenize(request: TokenizeRequest): - response = openai_serving_completion.create_tokenize(request) - assert isinstance(response, TokenizeResponse) - return JSONResponse(content=response.model_dump()) + generator = openai_serving_completion.create_tokenize(request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + else: + assert isinstance(generator, TokenizeResponse) + return JSONResponse(content=generator.model_dump()) @app.post("/detokenize") async def detokenize(request: DetokenizeRequest): - response = openai_serving_completion.create_detokenize(request) - assert isinstance(response, DetokenizeResponse) - return JSONResponse(content=response.model_dump()) + generator = openai_serving_completion.create_detokenize(request) + if isinstance(generator, ErrorResponse): + return JSONResponse(content=generator.model_dump(), + status_code=generator.code) + else: + assert isinstance(generator, DetokenizeResponse) + return JSONResponse(content=generator.model_dump()) @app.get("/v1/models") From 6815ccee80c25cb37cd81a9c4df29cd33988c339 Mon Sep 17 00:00:00 2001 From: sasha0552 Date: Wed, 26 Jun 2024 13:19:35 +0000 Subject: [PATCH 6/6] restore asyncs --- vllm/entrypoints/openai/api_server.py | 4 ++-- vllm/entrypoints/openai/serving_chat.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 13 +++++++------ vllm/entrypoints/openai/serving_embedding.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 2 +- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index d13bd44623eb..a708176c254e 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -94,7 +94,7 @@ async def health() -> Response: @app.post("/tokenize") async def tokenize(request: TokenizeRequest): - generator = openai_serving_completion.create_tokenize(request) + generator = await openai_serving_completion.create_tokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) @@ -105,7 +105,7 @@ async def tokenize(request: TokenizeRequest): @app.post("/detokenize") async def detokenize(request: DetokenizeRequest): - generator = openai_serving_completion.create_detokenize(request) + generator = await openai_serving_completion.create_detokenize(request) if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 3d1e27b3939f..744e1d94511b 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -204,7 +204,7 @@ async def create_chat_completion( NOTE: Currently we do not support the following feature: - function_call (Users 
should implement this by themselves) """ - error_check_ret = self._check_model(request) + error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index d9c12d55e9fc..8741893c9271 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -84,7 +84,7 @@ async def create_completion(self, request: CompletionRequest, - suffix (the language models we currently support do not support suffix) """ - error_check_ret = self._check_model(request) + error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -447,8 +447,9 @@ def _create_completion_logprobs( top_logprobs=out_top_logprobs, ) - def create_tokenize(self, request: TokenizeRequest) -> TokenizeResponse: - error_check_ret = self._check_model(request) + async def create_tokenize(self, + request: TokenizeRequest) -> TokenizeResponse: + error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret @@ -461,9 +462,9 @@ def create_tokenize(self, request: TokenizeRequest) -> TokenizeResponse: count=len(input_ids), max_model_len=self.max_model_len) - def create_detokenize(self, - request: DetokenizeRequest) -> DetokenizeResponse: - error_check_ret = self._check_model(request) + async def create_detokenize( + self, request: DetokenizeRequest) -> DetokenizeResponse: + error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index f79b40ce34f2..cbf09f173fb6 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -68,7 +68,7 @@ async def create_embedding(self, request: EmbeddingRequest, See https://platform.openai.com/docs/api-reference/embeddings/create for the API specification. This API mimics the OpenAI Embedding API. """ - error_check_ret = self._check_model(request) + error_check_ret = await self._check_model(request) if error_check_ret is not None: return error_check_ret diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 8d53eb0357a0..84e4127725bb 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -99,7 +99,7 @@ def create_streaming_error_response( }) return json_str - def _check_model( + async def _check_model( self, request: Union[ChatCompletionRequest, CompletionRequest, DetokenizeRequest, EmbeddingRequest, TokenizeRequest]
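
Example client usage of the two endpoints added by this series, as a minimal sketch only: it assumes a vLLM OpenAI-compatible server is already running at http://localhost:8000 (the address used by the tests in PATCH 1/6) and that MODEL_NAME is replaced with the actual served model name. The request and response fields mirror TokenizeRequest/TokenizeResponse and DetokenizeRequest/DetokenizeResponse in vllm/entrypoints/openai/protocol.py as of the final patch, including the model field required by the model check added in PATCH 4/6 and the count/max_model_len fields added in PATCH 3/6.

import requests

BASE_URL = "http://localhost:8000"   # assumed local server address
MODEL_NAME = "<served-model-name>"   # placeholder: the model the server was started with

# /tokenize returns the prompt's token ids plus their count and the model's context length.
tokenize_resp = requests.post(BASE_URL + "/tokenize",
                              json={
                                  "model": MODEL_NAME,
                                  "prompt": "This is a test prompt.",
                                  "add_special_tokens": True,
                              })
tokenize_resp.raise_for_status()
body = tokenize_resp.json()
print(body["tokens"], body["count"], body["max_model_len"])

# /detokenize decodes a list of token ids back into a prompt string.
detokenize_resp = requests.post(BASE_URL + "/detokenize",
                                json={
                                    "model": MODEL_NAME,
                                    "tokens": body["tokens"],
                                })
detokenize_resp.raise_for_status()
print(detokenize_resp.json()["prompt"])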