From 5c174806f785443397b2a76957d2e09de2f9e604 Mon Sep 17 00:00:00 2001
From: Joe G
Date: Thu, 13 Jun 2024 13:04:58 -0700
Subject: [PATCH 1/3] Fix async engine

---
 vllm/engine/async_llm_engine.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 60ccff09abe5..510c8c1a7602 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -260,6 +260,7 @@ async def process_model_inputs_async(
         request_id: str,
         inputs: PromptInputs,
         lora_request: Optional[LoRARequest] = None,
+        prompt_adapter_request: Optional[PromptAdapterRequest] = None,
     ) -> LLMInputs:
         if isinstance(inputs, str):
             inputs = {"prompt": inputs}
@@ -275,6 +276,11 @@ async def process_model_inputs_async(
         else:
             prompt_token_ids = inputs["prompt_token_ids"]

+        if prompt_adapter_request:
+            prompt_token_ids = [
+                0
+            ] * prompt_adapter_request.prompt_adapter_num_virtual_tokens + prompt_token_ids
+
         return LLMInputs(prompt_token_ids=prompt_token_ids,
                          prompt=inputs.get("prompt"),
                          multi_modal_data=inputs.get("multi_modal_data"))
@@ -295,7 +301,10 @@ async def add_request_async(
             arrival_time = time.time()

         processed_inputs = await self.process_model_inputs_async(
-            request_id=request_id, inputs=inputs, lora_request=lora_request)
+            request_id=request_id,
+            inputs=inputs,
+            lora_request=lora_request,
+            prompt_adapter_request=prompt_adapter_request)

         self._add_processed_request(
             request_id=request_id,
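
Illustration (not part of the patch): the new prompt_adapter_request branch in
process_model_inputs_async left-pads the prompt with one placeholder token ID
per virtual token of the adapter, so the engine has slots reserved ahead of the
real prompt tokens. A minimal standalone sketch of that list arithmetic; the
virtual-token count and prompt IDs below are made up for illustration:

    # Standalone sketch; the values are hypothetical.
    num_virtual_tokens = 8          # prompt_adapter_num_virtual_tokens
    prompt_token_ids = [101, 2009, 2003, 102]

    # Same expression as the patch: placeholder 0s are prepended.
    padded = [0] * num_virtual_tokens + prompt_token_ids
    assert padded[:num_virtual_tokens] == [0] * 8
    assert padded[num_virtual_tokens:] == prompt_token_ids
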
From e62cbb5692a1c884d36d185054bdee521395b39e Mon Sep 17 00:00:00 2001
From: Joe G
Date: Thu, 13 Jun 2024 14:01:07 -0700
Subject: [PATCH 2/3] Initial implementation of openai entrypoint

Assumes the interface for prompt adapters and lora modules remains completely separate.
---
 vllm/entrypoints/openai/api_server.py         |  5 +-
 vllm/entrypoints/openai/cli_args.py           | 21 ++++++-
 vllm/entrypoints/openai/serving_completion.py | 11 +++-
 vllm/entrypoints/openai/serving_engine.py     | 59 +++++++++++++++++--
 4 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index e7503b965583..ec35a81c0b89 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -87,7 +87,7 @@ async def health() -> Response:

 @app.get("/v1/models")
 async def show_available_models():
-    models = await openai_serving_chat.show_available_models()
+    models = await openai_serving_completion.show_available_models()
     return JSONResponse(content=models.model_dump())


@@ -216,7 +216,8 @@ async def authentication(request: Request, call_next):
                                             args.lora_modules,
                                             args.chat_template)
     openai_serving_completion = OpenAIServingCompletion(
-        engine, model_config, served_model_names, args.lora_modules)
+        engine, model_config, served_model_names, args.lora_modules,
+        args.prompt_adapters)
     openai_serving_embedding = OpenAIServingEmbedding(engine, model_config,
                                                       served_model_names)
     app.root_path = args.root_path
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index 4c0cb1e4f3e4..fd142643aec7 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -9,7 +9,8 @@
 import ssl

 from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str
-from vllm.entrypoints.openai.serving_engine import LoRAModulePath
+from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
+                                                    PromptAdapterPath)


 class LoRAParserAction(argparse.Action):
@@ -22,6 +23,16 @@ def __call__(self, parser, namespace, values, option_string=None):
         setattr(namespace, self.dest, lora_list)


+class PromptAdapterParserAction(argparse.Action):
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        adapter_list = []
+        for item in values:
+            name, path = item.split('=')
+            adapter_list.append(PromptAdapterPath(name, path))
+        setattr(namespace, self.dest, adapter_list)
+
+
 def make_arg_parser():
     parser = argparse.ArgumentParser(
         description="vLLM OpenAI-Compatible RESTful API server.")
@@ -64,6 +75,14 @@ def make_arg_parser():
         action=LoRAParserAction,
         help="LoRA module configurations in the format name=path. "
         "Multiple modules can be specified.")
+    parser.add_argument(
+        "--prompt-adapters",
+        type=nullable_str,
+        default=None,
+        nargs='+',
+        action=PromptAdapterParserAction,
+        help="Prompt adapter configurations in the format name=path. "
+        "Multiple adapters can be specified.")
     parser.add_argument("--chat-template",
                         type=nullable_str,
                         default=None,
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 64671e21a724..a91df755f4bc 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -18,7 +18,8 @@
                                               CompletionStreamResponse,
                                               UsageInfo)
 from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
-                                                    OpenAIServing)
+                                                    OpenAIServing,
+                                                    PromptAdapterPath)
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
@@ -61,11 +62,13 @@ class OpenAIServingCompletion(OpenAIServing):

     def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
                  served_model_names: List[str],
-                 lora_modules: Optional[List[LoRAModulePath]]):
+                 lora_modules: Optional[List[LoRAModulePath]],
+                 prompt_adapters: Optional[List[PromptAdapterPath]]):
         super().__init__(engine=engine,
                          model_config=model_config,
                          served_model_names=served_model_names,
-                         lora_modules=lora_modules)
+                         lora_modules=lora_modules,
+                         prompt_adapters=prompt_adapters)

     async def create_completion(self, request: CompletionRequest,
                                 raw_request: Request):
@@ -96,6 +99,7 @@ async def create_completion(self, request: CompletionRequest,
         try:
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
+            prompt_adapter_request = self._maybe_get_prompt_adapter(request)
             decoding_config = await self.engine.get_decoding_config()
             guided_decoding_backend = request.guided_decoding_backend \
                 or decoding_config.guided_decoding_backend
@@ -133,6 +137,7 @@ async def create_completion(self, request: CompletionRequest,
                 sampling_params,
                 f"{request_id}-{i}",
                 lora_request=lora_request,
+                prompt_adapter_request=prompt_adapter_request,
             )

             generators.append(generator)
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 6b5a62efc7f2..a0cc0d04f44c 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -15,12 +15,19 @@
                                               ModelPermission)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
+from vllm.prompt_adapter.request import PromptAdapterRequest
 from vllm.sequence import Logprob
 from vllm.transformers_utils.tokenizer import get_tokenizer

 logger = init_logger(__name__)


+@dataclass
+class PromptAdapterPath:
+    name: str
+    local_path: str
+
+
 @dataclass
 class LoRAModulePath:
     name: str
@@ -29,9 +36,14 @@ class LoRAModulePath:

 class OpenAIServing:

-    def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
-                 served_model_names: List[str],
-                 lora_modules: Optional[List[LoRAModulePath]]):
+    def __init__(
+        self,
+        engine: AsyncLLMEngine,
+        model_config: ModelConfig,
+        served_model_names: List[str],
+        lora_modules: Optional[List[LoRAModulePath]],
+        prompt_adapters: Optional[List[PromptAdapterPath]] = None,
+    ):
         super().__init__()

         self.engine = engine
@@ -58,6 +70,19 @@ def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
                 ) for i, lora in enumerate(lora_modules, start=1)
             ]

+        self.prompt_adapter_requests = []
+        if prompt_adapters is not None:
+            for i, prompt_adapter in enumerate(prompt_adapters, start=1):
+                with open(prompt_adapter.local_path) as f:
+                    adapter_config = json.load(f)
+                    num_virtual_tokens = adapter_config["num_virtual_tokens"]
+                self.prompt_adapter_requests.append(
+                    PromptAdapterRequest(
+                        prompt_adapter_name=prompt_adapter.name,
+                        prompt_adapter_id=i,
+                        prompt_adapter_local_path=prompt_adapter.local_path,
+                        prompt_adapter_num_virtual_tokens=num_virtual_tokens))
+
     async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [
@@ -73,6 +98,13 @@ async def show_available_models(self) -> ModelList:
                       permission=[ModelPermission()])
             for lora in self.lora_requests
         ]
+        prompt_adapter_cards = [
+            ModelCard(id=prompt_adapter.prompt_adapter_name,
+                      root=self.served_model_names[0],
+                      permission=[ModelPermission()])
+            for prompt_adapter in self.prompt_adapter_requests
+        ]
+        model_cards.extend(prompt_adapter_cards)
         model_cards.extend(lora_cards)
         return ModelList(data=model_cards)

@@ -106,6 +138,11 @@ async def _check_model(
             return None
         if request.model in [lora.lora_name for lora in self.lora_requests]:
             return None
+        if request.model in [
+                prompt_adapter.prompt_adapter_name
+                for prompt_adapter in self.prompt_adapter_requests
+        ]:
+            return None
         return self.create_error_response(
             message=f"The model `{request.model}` does not exist.",
             err_type="NotFoundError",
@@ -120,8 +157,22 @@ def _maybe_get_lora(
         for lora in self.lora_requests:
             if request.model == lora.lora_name:
                 return lora
+        return None
+        # if _check_model has been called earlier, this will be unreachable
+        #raise ValueError(f"The model `{request.model}` does not exist.")
+
+    def _maybe_get_prompt_adapter(
+        self, request: Union[CompletionRequest, ChatCompletionRequest,
+                             EmbeddingRequest]
+    ) -> Optional[PromptAdapterRequest]:
+        if request.model in self.served_model_names:
+            return None
+        for prompt_adapter in self.prompt_adapter_requests:
+            if request.model == prompt_adapter.prompt_adapter_name:
+                return prompt_adapter
+        return None
         # if _check_model has been called earlier, this will be unreachable
-        raise ValueError(f"The model `{request.model}` does not exist.")
+        #raise ValueError(f"The model `{request.model}` does not exist.")

     def _validate_prompt_and_tokenize(
         self,
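
Illustration (not part of the patch): with these changes, an adapter registered
via the new --prompt-adapters name=path flag is listed by /v1/models alongside
the base model and LoRA modules, and a completion request whose "model" field
equals the adapter name is routed through _maybe_get_prompt_adapter. A rough
usage sketch; the server address, the adapter name "my_adapter", and its path
are hypothetical, and the server is assumed to have been started with
--prompt-adapters my_adapter=/path/to/adapter:

    # Standalone sketch against a running vLLM OpenAI-compatible server.
    import requests

    # The prompt adapter now shows up as a model card in the model list.
    models = requests.get("http://localhost:8000/v1/models").json()
    print([m["id"] for m in models["data"]])

    # Selecting the adapter name as the model attaches the PromptAdapterRequest
    # to the generate call in create_completion.
    resp = requests.post(
        "http://localhost:8000/v1/completions",
        json={"model": "my_adapter", "prompt": "Hello", "max_tokens": 16},
    )
    print(resp.json())
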
From 612d6c5ef4fa5490d0292eb4c5a7f8af64d1cb38 Mon Sep 17 00:00:00 2001
From: Joe G
Date: Thu, 13 Jun 2024 14:23:34 -0700
Subject: [PATCH 3/3] Fixes

---
 vllm/engine/async_llm_engine.py           | 3 ++-
 vllm/entrypoints/openai/serving_engine.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 510c8c1a7602..f5bea488a04b 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -279,7 +279,8 @@ async def process_model_inputs_async(
         if prompt_adapter_request:
             prompt_token_ids = [
                 0
-            ] * prompt_adapter_request.prompt_adapter_num_virtual_tokens + prompt_token_ids
+            ] * prompt_adapter_request.prompt_adapter_num_virtual_tokens + \
+                prompt_token_ids

         return LLMInputs(prompt_token_ids=prompt_token_ids,
                          prompt=inputs.get("prompt"),
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index a0cc0d04f44c..e6d99686e72d 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -73,7 +73,7 @@ def __init__(
         self.prompt_adapter_requests = []
         if prompt_adapters is not None:
             for i, prompt_adapter in enumerate(prompt_adapters, start=1):
-                with open(prompt_adapter.local_path) as f:
+                with open(f"./{prompt_adapter.local_path}/adapter_config.json") as f:
                     adapter_config = json.load(f)
                     num_virtual_tokens = adapter_config["num_virtual_tokens"]
                 self.prompt_adapter_requests.append(
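
Illustration (not part of the patch): the serving_engine.py fix above makes the
loader read adapter_config.json from inside the adapter directory instead of
treating local_path itself as the config file; the only key read here is
"num_virtual_tokens". A small standalone sketch of the layout this assumes; the
directory and the config value are hypothetical:

    # Build a throwaway adapter directory shaped like what the loader expects.
    import json
    import os
    import tempfile

    adapter_dir = tempfile.mkdtemp()  # stands in for the path given as name=path
    with open(os.path.join(adapter_dir, "adapter_config.json"), "w") as f:
        json.dump({"num_virtual_tokens": 8}, f)

    # Read it back the same way __init__ does to size the placeholder padding.
    with open(os.path.join(adapter_dir, "adapter_config.json")) as f:
        num_virtual_tokens = json.load(f)["num_virtual_tokens"]
    print(num_virtual_tokens)  # -> 8
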