From 6becba34fcbc4861ba0cfe3d570a167bcbc7a66f Mon Sep 17 00:00:00 2001 From: gongdao123 Date: Wed, 14 Aug 2024 17:12:41 +0800 Subject: [PATCH 1/4] add quantization param to embedding checking method --- vllm/entrypoints/openai/api_server.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index d89b87534320..86c97d8995fa 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -60,11 +60,12 @@ _running_tasks: Set[asyncio.Task] = set() -def model_is_embedding(model_name: str, trust_remote_code: bool) -> bool: +def model_is_embedding(model_name: str, trust_remote_code: bool, quantization: str) -> bool: return ModelConfig(model=model_name, tokenizer=model_name, tokenizer_mode="auto", trust_remote_code=trust_remote_code, + quantization=quantization, seed=0, dtype="float16").embedding_mode @@ -97,7 +98,7 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # If manually triggered or embedding model, use AsyncLLMEngine in process. # TODO: support embedding model via RPC. - if (model_is_embedding(args.model, args.trust_remote_code) + if (model_is_embedding(args.model, args.trust_remote_code, args.quantization) or args.disable_frontend_multiprocessing): async_engine_client = AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER) From e51a32dd69073ad4055eb0adb6645957a5f3fd3c Mon Sep 17 00:00:00 2001 From: gongdao123 Date: Wed, 14 Aug 2024 17:50:02 +0800 Subject: [PATCH 2/4] lint fix --- vllm/entrypoints/openai/api_server.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 86c97d8995fa..0f2ba240e6a0 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -60,7 +60,11 @@ _running_tasks: Set[asyncio.Task] = set() -def model_is_embedding(model_name: str, trust_remote_code: bool, quantization: str) -> bool: +def model_is_embedding( + model_name: str, + trust_remote_code: bool, + quantization: str +) -> bool: return ModelConfig(model=model_name, tokenizer=model_name, tokenizer_mode="auto", @@ -98,7 +102,10 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # If manually triggered or embedding model, use AsyncLLMEngine in process. # TODO: support embedding model via RPC. 
- if (model_is_embedding(args.model, args.trust_remote_code, args.quantization) + if (model_is_embedding( + args.model, + args.trust_remote_code, + args.quantization) or args.disable_frontend_multiprocessing): async_engine_client = AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER) From 049ebc81c88b9b841082303f7ca4062be70a46bc Mon Sep 17 00:00:00 2001 From: gongdao123 Date: Wed, 14 Aug 2024 18:03:42 +0800 Subject: [PATCH 3/4] fix the yapf --- vllm/entrypoints/openai/api_server.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 0f2ba240e6a0..f03b887b88e5 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -60,11 +60,8 @@ _running_tasks: Set[asyncio.Task] = set() -def model_is_embedding( - model_name: str, - trust_remote_code: bool, - quantization: str -) -> bool: +def model_is_embedding(model_name: str, trust_remote_code: bool, + quantization: str) -> bool: return ModelConfig(model=model_name, tokenizer=model_name, tokenizer_mode="auto", @@ -102,10 +99,8 @@ async def build_async_engine_client(args) -> AsyncIterator[AsyncEngineClient]: # If manually triggered or embedding model, use AsyncLLMEngine in process. # TODO: support embedding model via RPC. - if (model_is_embedding( - args.model, - args.trust_remote_code, - args.quantization) + if (model_is_embedding(args.model, args.trust_remote_code, + args.quantization) or args.disable_frontend_multiprocessing): async_engine_client = AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER) From 62782e310618f5da7ec313cb99633804849af8cb Mon Sep 17 00:00:00 2001 From: gongdao123 Date: Fri, 16 Aug 2024 09:10:47 +0800 Subject: [PATCH 4/4] retrigger checks
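
For reference, a minimal sketch of the helper as it reads after PATCH 3/4, with a commented call site mirroring build_async_engine_client. The import path and the elided engine fallback are illustrative assumptions, not part of the diff itself:

    from vllm.config import ModelConfig


    def model_is_embedding(model_name: str, trust_remote_code: bool,
                           quantization: str) -> bool:
        # Build a throwaway ModelConfig purely to read its embedding_mode flag;
        # quantization is forwarded so the probe matches the server's actual
        # --quantization setting instead of assuming an unquantized model.
        return ModelConfig(model=model_name,
                           tokenizer=model_name,
                           tokenizer_mode="auto",
                           trust_remote_code=trust_remote_code,
                           quantization=quantization,
                           seed=0,
                           dtype="float16").embedding_mode


    # Call site, as in api_server.py after this series:
    # if (model_is_embedding(args.model, args.trust_remote_code,
    #                        args.quantization)
    #         or args.disable_frontend_multiprocessing):
    #     ...  # fall back to the in-process AsyncLLMEngine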