From 0c872b3fd92f07376f8ff08ad99550eb8b7eea11 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Oct 2024 04:59:22 +0000 Subject: [PATCH 01/22] Make encoder-decoder inputs a composed structure --- tests/core/utils.py | 57 +++---- tests/test_cache_block_hashing.py | 7 +- tests/tokenization/test_detokenize.py | 6 +- vllm/engine/llm_engine.py | 42 ++--- vllm/inputs/__init__.py | 12 +- vllm/inputs/data.py | 48 +++--- vllm/inputs/parse.py | 11 +- vllm/inputs/preprocess.py | 237 +++++++++++++------------- vllm/inputs/registry.py | 10 +- vllm/model_executor/models/mllama.py | 51 +++--- vllm/sequence.py | 102 ++++------- 11 files changed, 275 insertions(+), 308 deletions(-) diff --git a/tests/core/utils.py b/tests/core/utils.py index a95a573db7c..cd0caa4704e 100644 --- a/tests/core/utils.py +++ b/tests/core/utils.py @@ -4,6 +4,7 @@ from typing import Tuple from vllm import SamplingParams +from vllm.inputs import EncoderDecoderInputs, token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import Logprob, Sequence, SequenceGroup @@ -27,10 +28,7 @@ def create_dummy_prompt( prompt_tokens = list(range(prompt_length)) prompt_str = " ".join([str(t) for t in prompt_tokens]) prompt = Sequence(int(request_id), - inputs={ - "prompt": prompt_str, - "prompt_token_ids": prompt_tokens, - }, + inputs=token_inputs(prompt_tokens, prompt=prompt_str), block_size=block_size) seq_group = SequenceGroup(request_id=request_id, seqs=[prompt], @@ -63,23 +61,21 @@ def create_dummy_prompt_encoder_decoder( encoder_prompt_tokens = list(reversed(list(range(encoder_prompt_length)))) encoder_prompt_str = " ".join([str(t) for t in encoder_prompt_tokens]) - inputs = { - "prompt": decoder_prompt_str, - "prompt_token_ids": decoder_prompt_tokens, - "encoder_prompt": encoder_prompt_str, - "encoder_prompt_token_ids": encoder_prompt_tokens, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(decoder_prompt_tokens, + prompt=decoder_prompt_str), + "encoder": token_inputs(encoder_prompt_tokens, + prompt=encoder_prompt_str), } decoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=True) + inputs=inputs["decoder"], + block_size=block_size) encoder_prompt = Sequence(int(request_id), - inputs=inputs, - block_size=block_size, - from_decoder_prompt=False) + inputs=inputs["encoder"], + block_size=block_size) + seq_group = SequenceGroup(request_id=request_id, seqs=[decoder_prompt], sampling_params=SamplingParams(best_of=best_of), @@ -108,7 +104,7 @@ def create_seq_group( for seq_id_offset, output_len in enumerate(seq_output_lens): seq = Sequence( seq_id=seq_id_start + seq_id_offset, - inputs={"prompt_token_ids": prompt_token_ids}, + inputs=token_inputs(prompt_token_ids), block_size=16, ) @@ -143,21 +139,19 @@ def create_seq_group_encoder_decoder( prompt_token_ids = [0] * seq_prompt_len - inputs = { - "prompt": "", - "prompt_token_ids": prompt_token_ids, - "encoder_prompt": "", - "encoder_prompt_token_ids": prompt_token_ids, - "multi_modal_data": None, + inputs: EncoderDecoderInputs = { + "decoder": token_inputs(prompt_token_ids), + "encoder": token_inputs(prompt_token_ids), } seqs = [] for seq_id_offset, output_len in enumerate(seq_output_lens): # Construct decoder input sequences - seq = Sequence(seq_id=seq_id_start + seq_id_offset, - inputs=inputs, - block_size=16, - from_decoder_prompt=True) + seq = Sequence( + seq_id=seq_id_start + seq_id_offset, + inputs=inputs["decoder"], + block_size=16, + ) for i in range(output_len): 
seq.append_token_id( @@ -167,10 +161,11 @@ def create_seq_group_encoder_decoder( seqs.append(seq) # Encoder input sequence - encoder_seq = Sequence(seq_id=seq_id_start + len(seq_output_lens), - inputs=inputs, - block_size=16, - from_decoder_prompt=False) + encoder_seq = Sequence( + seq_id=seq_id_start + len(seq_output_lens), + inputs=inputs["encoder"], + block_size=16, + ) return SequenceGroup(request_id=request_id, seqs=seqs, diff --git a/tests/test_cache_block_hashing.py b/tests/test_cache_block_hashing.py index 3576a4834eb..e8f8499aa88 100644 --- a/tests/test_cache_block_hashing.py +++ b/tests/test_cache_block_hashing.py @@ -6,6 +6,7 @@ import pytest +from vllm.inputs import token_inputs from vllm.lora.request import LoRARequest from vllm.sequence import Sequence from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -70,10 +71,8 @@ def test_auto_prefix_caching(model: str, block_size: int, max_num_seqs: int, hashes[-1].append([]) prompt_token_ids = tokenizer.encode(prompt) seq = Sequence(seq_id, - inputs={ - "prompt": prompt, - "prompt_token_ids": prompt_token_ids, - }, + inputs=token_inputs(prompt_token_ids, + prompt=prompt), block_size=block_size, eos_token_id=tokenizer.tokenizer.eos_token_id, lora_request=lora_request) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index f4551ed42ef..921ce6b0973 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -3,6 +3,7 @@ import pytest from transformers import AutoTokenizer +from vllm.inputs import token_inputs from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup from vllm.transformers_utils.detokenizer import (Detokenizer, detokenize_incrementally) @@ -123,10 +124,7 @@ def create_sequence(prompt_token_ids=None): prompt_token_ids = prompt_token_ids or [1] return Sequence( seq_id=0, - inputs={ - "prompt": "", - "prompt_token_ids": prompt_token_ids, - }, + inputs=token_inputs(prompt_token_ids, prompt=""), block_size=16, ) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 25c4e76d9b1..2a302b058f6 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -30,8 +30,9 @@ from vllm.executor.executor_base import ExecutorBase from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderInputs, InputRegistry, PromptType) +from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, + PromptType) +from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -639,7 +640,7 @@ def _verify_args(self) -> None: def _add_processed_request( self, request_id: str, - processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs], + processed_inputs: ProcessorInputs, params: Union[SamplingParams, PoolingParams], arrival_time: float, lora_request: Optional[LoRARequest], @@ -656,18 +657,19 @@ def _add_processed_request( seq_id = next(self.seq_counter) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - seq = Sequence(seq_id, processed_inputs, block_size, eos_token_id, + if is_encoder_decoder_inputs(processed_inputs): + decoder_inputs = processed_inputs["decoder"] + encoder_inputs = processed_inputs["encoder"] + else: + decoder_inputs = processed_inputs + encoder_inputs = None + + seq = Sequence(seq_id, decoder_inputs, 
block_size, eos_token_id, lora_request, prompt_adapter_request) - encoder_seq = None - if 'encoder_prompt_token_ids' in processed_inputs: - encoder_seq = Sequence(seq_id, - processed_inputs, - block_size, - eos_token_id, - lora_request, - prompt_adapter_request, - from_decoder_prompt=False) + encoder_seq = (None if encoder_inputs is None else Sequence( + seq_id, encoder_inputs, block_size, eos_token_id, lora_request, + prompt_adapter_request)) # Create a SequenceGroup based on SamplingParams or PoolingParams if isinstance(params, SamplingParams): @@ -1909,16 +1911,16 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: def is_encoder_decoder_model(self): return self.input_preprocessor.is_encoder_decoder_model() - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderInputs]): - if self.model_config.is_multimodal_model: + def _validate_model_inputs(self, inputs: ProcessorInputs): + if is_encoder_decoder_inputs(inputs): # For encoder-decoder multimodal models, the max_prompt_len # restricts the decoder prompt length - prompt_ids = inputs.get("prompt_token_ids") - elif self.is_encoder_decoder_model(): - prompt_ids = inputs.get("encoder_prompt_token_ids") + prompt_inputs = inputs["decoder" if self.model_config. + is_multimodal_model else "encoder"] else: - prompt_ids = inputs.get("prompt_token_ids") + prompt_inputs = inputs + + prompt_ids = prompt_inputs.get("prompt_token_ids") if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 7b73922ddd2..57793349780 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,8 +1,8 @@ from .data import (DecoderOnlyInputs, EncoderDecoderInputs, - ExplicitEncoderDecoderPrompt, PromptType, SingletonInputs, - SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, - build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, - token_inputs, zip_enc_dec_prompts) + ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, + SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, + TokensPrompt, build_explicit_enc_dec_prompt, + to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) from .registry import InputContext, InputRegistry INPUT_REGISTRY = InputRegistry() @@ -22,9 +22,9 @@ "ExplicitEncoderDecoderPrompt", "TokenInputs", "token_inputs", - "SingletonInputs", - "DecoderOnlyInputs", "EncoderDecoderInputs", + "ProcessorInputs", + "SingletonInputs", "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompts", diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 9a094191eda..8f91d386704 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,4 +1,4 @@ -from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, +from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, Optional, Tuple, Union, cast) from typing_extensions import NotRequired, TypedDict, TypeVar @@ -122,21 +122,25 @@ class ExplicitEncoderDecoderPrompt(TypedDict, Generic[_T1_co, _T2_co]): class TokenInputs(TypedDict): """Represents token-based inputs.""" + + type: Literal["token"] + """The type of inputs.""" + prompt_token_ids: List[int] """The token IDs of the prompt.""" - prompt: NotRequired[Optional[str]] + prompt: NotRequired[str] """ The original prompt text corresponding to the token IDs, if available. 
""" - multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] + multi_modal_data: NotRequired["MultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. """ - mm_processor_kwargs: NotRequired[Optional[Dict[str, Any]]] + mm_processor_kwargs: NotRequired[Dict[str, Any]] """ Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities @@ -152,7 +156,7 @@ def token_inputs( mm_processor_kwargs: Optional[Dict[str, Any]] = None, ) -> TokenInputs: """Construct :class:`TokenInputs` from optional values.""" - inputs = TokenInputs(prompt_token_ids=prompt_token_ids) + inputs = TokenInputs(type="token", prompt_token_ids=prompt_token_ids) if prompt is not None: inputs["prompt"] = prompt @@ -164,12 +168,6 @@ def token_inputs( return inputs -SingletonInputs = TokenInputs -""" -A processed :class:`SingletonPrompt` which can be passed to -:class:`vllm.sequence.Sequence`. -""" - DecoderOnlyInputs = TokenInputs """ The inputs in :class:`~vllm.LLMEngine` before they are @@ -178,28 +176,30 @@ def token_inputs( """ -class EncoderDecoderInputs(TokenInputs): +class EncoderDecoderInputs(TypedDict): """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. This specifies the required data for encoder-decoder models. """ - encoder_prompt_token_ids: List[int] - """The token IDs of the encoder prompt.""" + encoder: TokenInputs + """The inputs for the encoder portion.""" - encoder_prompt: NotRequired[Optional[str]] - """ - The original encoder prompt text corresponding to the token IDs, if - available. - """ + decoder: TokenInputs + """The inputs for the decoder portion.""" - encoder_multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] - """ - Optional multi-modal data to pass to the encoder model, - if the model supports it. - """ +SingletonInputs = TokenInputs +""" +A processed :class:`SingletonPrompt` which can be passed to +:class:`vllm.sequence.Sequence`. +""" + +ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] +""" +The inputs to :data:`vllm.inputs.InputProcessor`. 
+""" _T1 = TypeVar("_T1", bound=SingletonPrompt, default=SingletonPrompt) _T2 = TypeVar("_T2", bound=SingletonPrompt, default=SingletonPrompt) diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index e79d2c813bb..b11a151c4a5 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -4,9 +4,9 @@ from vllm.utils import is_list_of -from .data import (DecoderOnlyInputs, EncoderDecoderInputs, - ExplicitEncoderDecoderPrompt, PromptType, SingletonPrompt, - TextPrompt, TokensPrompt) +from .data import (EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, + ProcessorInputs, PromptType, SingletonPrompt, TextPrompt, + TokensPrompt) class ParsedText(TypedDict): @@ -104,6 +104,5 @@ def is_explicit_encoder_decoder_prompt( def is_encoder_decoder_inputs( - inputs: Union[DecoderOnlyInputs, EncoderDecoderInputs], -) -> TypeIs[EncoderDecoderInputs]: - return "encoder_prompt_token_ids" in inputs + inputs: ProcessorInputs) -> TypeIs[EncoderDecoderInputs]: + return "encoder" in inputs and "decoder" in inputs diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 82ce7d392b7..9681c7dc548 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,5 +1,5 @@ import asyncio -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import List, Optional from typing_extensions import assert_never @@ -10,22 +10,12 @@ from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_warning_once -from .data import (DecoderOnlyInputs, EncoderDecoderInputs, PromptType, - SingletonPrompt) +from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, + PromptType, SingletonPrompt, TokenInputs, token_inputs) from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt -if TYPE_CHECKING: - from vllm.multimodal import MultiModalDataDict - logger = init_logger(__name__) -PromptComponents = Tuple[Optional[str], List[int], - Optional["MultiModalDataDict"], Optional[Dict[str, - Any]]] -DecoderPromptComponents = Tuple[Optional[str], Optional[List[int]], - Optional["MultiModalDataDict"], - Optional[Dict[str, Any]]] - class InputPreprocessor: @@ -115,7 +105,7 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: "default" decoder prompt be . However, it is possible that in the future - other models may have different or more + other models may have different or more complex logic for the default decoder prompt. This motivates having a special helper method for default decoder prompts. @@ -209,12 +199,12 @@ async def _tokenize_prompt_async( prompt=prompt, lora_request=lora_request) - def _extract_prompt_components( + def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> PromptComponents: + ) -> DecoderOnlyInputs: ''' Extract the components of any single encoder or decoder input prompt. 
@@ -241,14 +231,24 @@ def _extract_prompt_components( request_id=request_id, lora_request=lora_request, ) - multi_modal_data = None - mm_processor_kwargs = None - elif parsed["type"] == "tokens": - prompt_text = None + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if parsed["type"] == "tokens": prompt_token_ids = parsed["content"]["prompt_token_ids"] multi_modal_data = parsed["content"].get("multi_modal_data") mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - elif parsed["type"] == "text": + + return token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + if parsed["type"] == "text": prompt_text = parsed["content"]["prompt"] prompt_token_ids = self._tokenize_prompt( prompt_text, @@ -257,18 +257,22 @@ def _extract_prompt_components( ) multi_modal_data = parsed["content"].get("multi_modal_data") mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - else: - assert_never(parsed) - return (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + assert_never(parsed) - async def _extract_prompt_components_async( + async def _prompt_to_llm_inputs_async( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> PromptComponents: + ) -> DecoderOnlyInputs: """Async version of :meth:`_extract_prompt_components`.""" parsed = parse_singleton_prompt(prompt) @@ -279,14 +283,24 @@ async def _extract_prompt_components_async( request_id=request_id, lora_request=lora_request, ) - multi_modal_data = None - mm_processor_kwargs = None - elif parsed["type"] == "tokens": - prompt_text = None + + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + ) + + if parsed["type"] == "tokens": prompt_token_ids = parsed["content"]["prompt_token_ids"] multi_modal_data = parsed["content"].get("multi_modal_data") mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - elif parsed["type"] == "text": + + return token_inputs( + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + if parsed["type"] == "text": prompt_text = parsed["content"]["prompt"] prompt_token_ids = await self._tokenize_prompt_async( prompt_text, @@ -295,43 +309,49 @@ async def _extract_prompt_components_async( ) multi_modal_data = parsed["content"].get("multi_modal_data") mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") - else: - assert_never(parsed) - return (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) + return token_inputs( + prompt=prompt_text, + prompt_token_ids=prompt_token_ids, + multi_modal_data=multi_modal_data, + mm_processor_kwargs=mm_processor_kwargs, + ) + + assert_never(parsed) def _build_enc_dec_llm_inputs( self, - encoder_comps: PromptComponents, - decoder_comps: DecoderPromptComponents, - mm_processor_kwargs: Dict[str, Any], + encoder_inputs: TokenInputs, + decoder_inputs: Optional[TokenInputs], ) -> EncoderDecoderInputs: - encoder_prompt, encoder_prompt_ids, encoder_mm_data, _ = encoder_comps - decoder_prompt, decoder_prompt_ids, decoder_mm_data, _ = decoder_comps - - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if decoder_mm_data is not None: 
- raise ValueError( - "Multi-modality decoder inputs of encoder-decoder models are " - "not supported yet") - - # For Multi-Modal models (e.g., mllama), the text input can be - # <|image|><|begin_of_text|>hello world. And we should not add - # another <|begin_of_text|> to the beginning. - decoder_prompt_ids = (self._prepare_decoder_input_ids_for_generation( - decoder_prompt_ids, - force_bos=(encoder_mm_data is None and decoder_mm_data is None))) + if encoder_inputs["type"] == "token": + pass + else: + assert_never(encoder_inputs) + + if decoder_inputs is None: + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + None, + force_bos="multi_modal_data" not in encoder_inputs, + ) + decoder_inputs = token_inputs(dec_token_ids) + elif decoder_inputs["type"] == "token": + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + decoder_inputs["prompt_token_ids"], + force_bos=("multi_modal_data" not in encoder_inputs + and "multi_modal_data" not in decoder_inputs), + ) + decoder_inputs["prompt_token_ids"] = dec_token_ids + + if "multi_modal_data" in decoder_inputs: + raise ValueError("Multi-modal decoder inputs of encoder-" + "decoder models are not supported yet") + else: + assert_never(encoder_inputs) return EncoderDecoderInputs( - prompt_token_ids=decoder_prompt_ids, - prompt=decoder_prompt, - multi_modal_data=decoder_mm_data, - mm_processor_kwargs=mm_processor_kwargs, - encoder_prompt_token_ids=encoder_prompt_ids, - encoder_prompt=encoder_prompt, - encoder_multi_modal_data=encoder_mm_data, + encoder=encoder_inputs, + decoder=decoder_inputs, ) def _process_encoder_decoder_prompt( @@ -341,8 +361,7 @@ def _process_encoder_decoder_prompt( ) -> EncoderDecoderInputs: ''' For encoder/decoder models only: - Process an input prompt into an - :class:`EncoderDecoderInputs` instance. + Process an input prompt into an :class:`EncoderDecoderInputs` instance. There are two types of input prompts: singleton prompts which carry only the @@ -361,7 +380,7 @@ def _process_encoder_decoder_prompt( have any possible singleton type; thus this method relies on helper functions to obtain token ids for the sub-prompts. 
- + Arguments: * prompt: an input prompt @@ -372,40 +391,31 @@ def _process_encoder_decoder_prompt( * :class:`EncoderDecoderInputs` instance ''' - encoder_comps: PromptComponents - decoder_comps: DecoderPromptComponents + encoder_inputs: TokenInputs + decoder_inputs: Optional[TokenInputs] if is_explicit_encoder_decoder_prompt(prompt): - encoder_comps = self._extract_prompt_components( + encoder_inputs = self._prompt_to_llm_inputs( prompt["encoder_prompt"], request_id=request_id, ) if (decoder_input := prompt["decoder_prompt"]) is None: - decoder_comps = None, None, None, None + decoder_inputs = None else: - decoder_comps = self._extract_prompt_components( + decoder_inputs = self._prompt_to_llm_inputs( decoder_input, request_id=request_id, ) - # Handle this carefully in case it was directly initialized by user - mm_processor_kwargs = prompt.get("mm_processor_kwargs", {}) else: - encoder_comps = self._extract_prompt_components( + encoder_inputs = self._prompt_to_llm_inputs( prompt, request_id=request_id, ) - # If there are no decoder components, we assume the - # mm_processor_kwargs are in the encoder prompt - mm_processor_kwargs = encoder_comps[-1] if encoder_comps[ - -1] is not None else {} - decoder_comps = None, None, None, None - - return self._build_enc_dec_llm_inputs( - encoder_comps, - decoder_comps, - mm_processor_kwargs, - ) + + decoder_inputs = None + + return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) async def _process_encoder_decoder_prompt_async( self, @@ -413,59 +423,50 @@ async def _process_encoder_decoder_prompt_async( request_id: str, ) -> EncoderDecoderInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" - encoder_comps: PromptComponents - decoder_comps: DecoderPromptComponents + encoder_inputs: TokenInputs + decoder_inputs: Optional[TokenInputs] if is_explicit_encoder_decoder_prompt(prompt): - encoder_task = self._extract_prompt_components_async( + encoder_task = self._prompt_to_llm_inputs_async( prompt["encoder_prompt"], request_id=request_id, ) if (decoder_input := prompt["decoder_prompt"]) is None: - encoder_comps = await encoder_task - decoder_comps = None, None, None, None + encoder_inputs = await encoder_task + decoder_inputs = None else: - decoder_task = self._extract_prompt_components_async( + decoder_task = self._prompt_to_llm_inputs_async( decoder_input, request_id=request_id, ) - encoder_comps, decoder_comps = await asyncio.gather( + encoder_inputs, decoder_inputs = await asyncio.gather( encoder_task, decoder_task) - mm_processor_kwargs = prompt["mm_processor_kwargs"] else: - encoder_comps = await self._extract_prompt_components_async( + encoder_inputs = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, ) - # If there are no decoder components, we assume the - # mm_processor_kwargs are in the encoder prompt - mm_processor_kwargs = encoder_comps[-1] if encoder_comps[ - -1] is not None else {} - decoder_comps = None, None, None, None - - return self._build_enc_dec_llm_inputs( - encoder_comps, - decoder_comps, - mm_processor_kwargs, - ) + + decoder_inputs = None + + return self._build_enc_dec_llm_inputs(encoder_inputs, decoder_inputs) def _build_decoder_only_llm_inputs( self, - prompt_comps: PromptComponents, + prompt_inputs: DecoderOnlyInputs, prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: - (prompt, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) = prompt_comps - - prompt_token_ids = self._apply_prompt_adapter( - prompt_token_ids, 
prompt_adapter_request=prompt_adapter_request) + if prompt_inputs["type"] == "token": + prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( + prompt_inputs["prompt_token_ids"], + prompt_adapter_request=prompt_adapter_request, + ) + else: + assert_never(prompt_inputs) - return DecoderOnlyInputs(prompt_token_ids=prompt_token_ids, - prompt=prompt, - multi_modal_data=multi_modal_data, - mm_processor_kwargs=mm_processor_kwargs) + return prompt_inputs def _process_decoder_only_prompt( self, @@ -490,7 +491,7 @@ def _process_decoder_only_prompt( * :class:`DecoderOnlyInputs` instance ''' - prompt_comps = self._extract_prompt_components( + prompt_comps = self._prompt_to_llm_inputs( prompt, request_id=request_id, lora_request=lora_request, @@ -509,7 +510,7 @@ async def _process_decoder_only_prompt_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: """Async version of :meth:`_process_decoder_only_prompt`.""" - prompt_comps = await self._extract_prompt_components_async( + prompt_comps = await self._prompt_to_llm_inputs_async( prompt, request_id=request_id, lora_request=lora_request, @@ -526,7 +527,7 @@ def preprocess( request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]: + ) -> ProcessorInputs: """Preprocess the input prompt.""" if self.is_encoder_decoder_model(): # Encoder-decoder model requires special mapping of @@ -554,7 +555,7 @@ async def preprocess_async( request_id: str, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - ) -> Union[DecoderOnlyInputs, EncoderDecoderInputs]: + ) -> ProcessorInputs: """Async version of :meth:`preprocess`.""" if self.is_encoder_decoder_model(): # Encoder-decoder model requires special mapping of diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 4cebc91ce71..41af8456f53 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -12,7 +12,7 @@ from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once, resolve_mm_processor_kwargs) -from .data import DecoderOnlyInputs +from .data import ProcessorInputs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -100,7 +100,7 @@ def __getitem__(self, key: str) -> int: raise KeyError(msg) from exc -InputProcessor = Callable[[InputContext, DecoderOnlyInputs], DecoderOnlyInputs] +InputProcessor = Callable[[InputContext, ProcessorInputs], ProcessorInputs] """Preprocess the inputs to the model.""" @@ -248,8 +248,8 @@ def dummy_data_for_profiling( def _default_input_processor( self, ctx: InputContext, - inputs: DecoderOnlyInputs, - ) -> DecoderOnlyInputs: + inputs: ProcessorInputs, + ) -> ProcessorInputs: """The default input processor is a no-op.""" return inputs @@ -282,7 +282,7 @@ def _get_model_input_processor(self, model_cls: Type[nn.Module]): .get(model_cls, self._default_input_processor) def process_input(self, model_config: "ModelConfig", - inputs: DecoderOnlyInputs) -> DecoderOnlyInputs: + inputs: ProcessorInputs) -> ProcessorInputs: """ Apply an input processor to an instance of model inputs. 
diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 378231f1445..39034b4e9d6 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -36,8 +36,7 @@ from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderInputs, InputContext) +from vllm.inputs import INPUT_REGISTRY, EncoderDecoderInputs, InputContext from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -52,6 +51,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import SequenceData +from vllm.utils import is_list_of from .clip import CLIPMLP from .interfaces import SupportsMultiModal @@ -87,34 +87,37 @@ def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int: def input_processor_for_mllama(ctx: InputContext, - inputs: Union[DecoderOnlyInputs, - EncoderDecoderInputs]): - # move encoder_prompt to prompt - if inputs.get("prompt") is None: - inputs["prompt"] = inputs["encoder_prompt"] - inputs["prompt_token_ids"] = inputs["encoder_prompt_token_ids"] - - # process multi-modal data - multi_modal_data = inputs.get("encoder_multi_modal_data") - - if multi_modal_data is None or "image" not in multi_modal_data \ - or multi_modal_data["image"] is None: + inputs: EncoderDecoderInputs): + enc_inputs = inputs["encoder"] + dec_inputs = inputs["decoder"] + + # move encoder prompt to decoder + if dec_inputs.get("prompt") is None: + dec_inputs["prompt"] = enc_inputs["prompt"] + dec_inputs["prompt_token_ids"] = enc_inputs["prompt_token_ids"] + + multi_modal_data = enc_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: # text-only - inputs["encoder_prompt"] = "" - inputs["encoder_prompt_token_ids"] = [] - inputs["encoder_multi_modal_data"] = {} + enc_inputs["prompt"] = "" + enc_inputs["prompt_token_ids"] = [] + enc_inputs["multi_modal_data"] = {} return inputs - if isinstance(multi_modal_data['image'], Image.Image): - multi_modal_data['image'] = [multi_modal_data['image']] + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_data = [image_data] + + assert is_list_of(image_data, Image.Image) + # Since only the last group of consecutive images # are attended by the decoded tokens, we only need to # get the number of tiles for those images. num_decode_images = _get_num_image_in_last_group( - inputs["prompt_token_ids"]) + dec_inputs["prompt_token_ids"]) hf_config = ctx.model_config.hf_config num_tiles = 0 - for image in multi_modal_data["image"][::-1]: + for image in image_data[::-1]: width, height = image.size tile_size = hf_config.vision_config.image_size canvas_height, canvas_width = get_optimal_tiled_canvas( @@ -129,7 +132,6 @@ def input_processor_for_mllama(ctx: InputContext, num_decode_images -= 1 if num_decode_images == 0: break - # Set encoder prompt length based on the number of tiles. # This tells the block manager to allocate correct number # of slots for encoder tokens. 
@@ -137,8 +139,9 @@ def input_processor_for_mllama(ctx: InputContext, "chunk size should be multiple of 14" token_per_chunk = (hf_config.vision_config.image_size // 14)**2 + 1 num_tokens = num_tiles * token_per_chunk - inputs["encoder_prompt"] = MLLAMA_IMAGE_TOKEN * num_tokens - inputs["encoder_prompt_token_ids"] = [MLLAMA_IMAGE_TOKEN_ID] * num_tokens + + enc_inputs["prompt"] = MLLAMA_IMAGE_TOKEN * num_tokens + enc_inputs["prompt_token_ids"] = [MLLAMA_IMAGE_TOKEN_ID] * num_tokens return inputs diff --git a/vllm/sequence.py b/vllm/sequence.py index 93f58f00ef7..cc7e8f8fef1 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -8,12 +8,12 @@ from functools import cached_property, reduce from typing import TYPE_CHECKING, Any, Callable, Dict, List, Mapping, Optional from typing import Sequence as GenericSequence -from typing import Set, Tuple, Union, cast +from typing import Set, Tuple, Union import msgspec import torch +from typing_extensions import assert_never -from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.lora.request import LoRARequest from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -378,15 +378,10 @@ def __repr__(self) -> str: class Sequence: """Stores the data, status, and block information of a sequence. - - The sequence is constructed from the :code:`SingletonInputs` instance - passed in through the :code:`inputs` constructor argument. - - For encoder/decoder models, SingletonInputs encapsulates both a - decoder and encoder prompt, creating an ambiguity about which - prompt to construct the sequence from. The `from_decoder_prompt` - constructor argument signals whether to construct the Sequence - from the SingletonInputs decoder prompt, or encoder prompt. + + The sequence is constructed from the :data:`DecoderOnlyInputs` + (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) + instance passed in through the :code:`inputs` constructor argument. Args: seq_id: The ID of the sequence. @@ -396,10 +391,6 @@ class Sequence: eos_token_id: The end-of-sequence (EOS) token id recognized by this LLM. lora_request: LoRA request. prompt_adapter_request: Prompt Adapter request. - from_decoder_prompt: Construct Sequence from SingletonInputs decoder - prompt (True) or encoder prompt (False.) Must be - True for decoder-only model. - """ def __init__( @@ -410,7 +401,6 @@ def __init__( eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, - from_decoder_prompt: bool = True, ) -> None: self.seq_id = seq_id self.inputs = inputs @@ -418,33 +408,6 @@ def __init__( self.eos_token_id = eos_token_id self.lora_request = lora_request self.prompt_adapter_request = prompt_adapter_request - self.from_decoder_prompt = from_decoder_prompt - - # For decoder-only models, a Sequence is constructed - # from an DecoderOnlyInputs instance (the `inputs` arg.) - # - # For encoder/decoder models the same `inputs` - # instance could be utilized to construct either an - # encoder sequence or a decoder sequence, because - # `DecoderOnlyInputs` has both decoder- and encoder-oriented - # member variables (i.e. it encapsulates both an encoder - # and a decoder prompt.) The decision of which type of sequence - # to generate is determined by the `from_decoder_prompt` argument. 
- # - # When constructing a encoder sequence - # (`from_decoder_prompt` False) it matters that - # the `DecoderOnlyInputs` instance stored in `inputs` is valid - # in the sense that its encoder-related member variables are - # populated; below, an exception is raised if this is - # not the case. - # - # When constructing a decoder sequence (`from_decoder_prompt` True) - # it does not matter whether `inputs` has its encoder-related - # member variables populated. - if not (from_decoder_prompt or is_encoder_decoder_inputs(inputs)): - raise ValueError("Cannot extract encoder input prompt from " - f"invalid input {inputs}; did you forget the " - "encoder input prompt fields?") self.data = SequenceData.from_seqs(self.prompt_token_ids) self.output_logprobs: SampleLogprobs = [] @@ -469,41 +432,48 @@ def n_blocks(self) -> int: @cached_property def prompt(self) -> Optional[str]: - # Select decoder or encoder input prompt str, as appropriate - prompt_key: str = ("prompt" - if self.from_decoder_prompt else "encoder_prompt") + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("prompt") - return cast(Optional[str], self.inputs.get(prompt_key)) + assert_never(inputs) @cached_property def prompt_token_ids(self) -> List[int]: - # Select decoder or encoder input prompt token ids, as appropriate - prompt_token_ids_key: str = ("prompt_token_ids" - if self.from_decoder_prompt else - "encoder_prompt_token_ids") + inputs = self.inputs - # Cache computed prompt token ids - return cast(List[int], self.inputs.get(prompt_token_ids_key)) + if inputs["type"] == "token": + return inputs.get("prompt_token_ids", []) - @property + assert_never(inputs) + + @cached_property + def prompt_embeds(self) -> Optional[torch.Tensor]: + inputs = self.inputs + + if inputs["type"] == "token": + return None + + assert_never(inputs) + + @cached_property def multi_modal_data(self) -> "MultiModalDataDict": inputs = self.inputs - if (inputs.get("multi_modal_data") - and inputs.get("encoder_multi_modal_data")): - raise ValueError( - "Multi-modal data in both encoder and decoder is not supported." 
- ) + if inputs["type"] == "token": + return inputs.get("multi_modal_data", {}) - return cast( - "MultiModalDataDict", - (inputs.get("multi_modal_data") - or inputs.get("encoder_multi_modal_data") or {}), - ) + assert_never(inputs) - @property + @cached_property def mm_processor_kwargs(self) -> Dict[str, Any]: - return self.inputs.get("mm_processor_kwargs") or {} + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("mm_processor_kwargs", {}) + + assert_never(inputs) @property def lora_int_id(self) -> int: From fa5ad179b1010c0ba257050025e125a234369e7a Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Oct 2024 05:11:57 +0000 Subject: [PATCH 02/22] Rename --- vllm/inputs/preprocess.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 9681c7dc548..73db916dfc2 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -11,7 +11,7 @@ from vllm.utils import print_warning_once from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ProcessorInputs, - PromptType, SingletonPrompt, TokenInputs, token_inputs) + PromptType, SingletonInputs, SingletonPrompt, token_inputs) from .parse import is_explicit_encoder_decoder_prompt, parse_singleton_prompt logger = init_logger(__name__) @@ -204,7 +204,7 @@ def _prompt_to_llm_inputs( prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> DecoderOnlyInputs: + ) -> SingletonInputs: ''' Extract the components of any single encoder or decoder input prompt. @@ -272,7 +272,7 @@ async def _prompt_to_llm_inputs_async( prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, - ) -> DecoderOnlyInputs: + ) -> SingletonInputs: """Async version of :meth:`_extract_prompt_components`.""" parsed = parse_singleton_prompt(prompt) @@ -321,8 +321,8 @@ async def _prompt_to_llm_inputs_async( def _build_enc_dec_llm_inputs( self, - encoder_inputs: TokenInputs, - decoder_inputs: Optional[TokenInputs], + encoder_inputs: SingletonInputs, + decoder_inputs: Optional[SingletonInputs], ) -> EncoderDecoderInputs: if encoder_inputs["type"] == "token": pass @@ -391,8 +391,8 @@ def _process_encoder_decoder_prompt( * :class:`EncoderDecoderInputs` instance ''' - encoder_inputs: TokenInputs - decoder_inputs: Optional[TokenInputs] + encoder_inputs: SingletonInputs + decoder_inputs: Optional[SingletonInputs] if is_explicit_encoder_decoder_prompt(prompt): encoder_inputs = self._prompt_to_llm_inputs( @@ -423,8 +423,8 @@ async def _process_encoder_decoder_prompt_async( request_id: str, ) -> EncoderDecoderInputs: """Async version of :meth:`_process_encoder_decoder_prompt`.""" - encoder_inputs: TokenInputs - decoder_inputs: Optional[TokenInputs] + encoder_inputs: SingletonInputs + decoder_inputs: Optional[SingletonInputs] if is_explicit_encoder_decoder_prompt(prompt): encoder_task = self._prompt_to_llm_inputs_async( From 44fd058d17237abb4b3948cddd119ab4eb5f9564 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Oct 2024 05:12:49 +0000 Subject: [PATCH 03/22] Fix type error --- vllm/inputs/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 41af8456f53..1b531b2ab51 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -2,7 +2,7 @@ from collections import UserDict from dataclasses import dataclass from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, - Protocol, Tuple, 
Type) + Protocol, Tuple, Type, cast) from torch import nn from transformers import PretrainedConfig @@ -302,7 +302,7 @@ def process_input(self, model_config: "ModelConfig", # If it's empty, it'll fall back to the default kwarg values mm_processor_kwargs = resolve_mm_processor_kwargs( model_config.mm_processor_kwargs, - inputs.get("mm_processor_kwargs"), + cast(Dict[str, Any], inputs.get("mm_processor_kwargs")), processor, ) From b73a345757b9fc24c9f47c9e8edda65031a9e862 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Oct 2024 07:05:07 +0000 Subject: [PATCH 04/22] Fix test --- tests/engine/output_processor/test_stop_checker.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/engine/output_processor/test_stop_checker.py b/tests/engine/output_processor/test_stop_checker.py index 0d84443c51f..cc14e8cbf75 100644 --- a/tests/engine/output_processor/test_stop_checker.py +++ b/tests/engine/output_processor/test_stop_checker.py @@ -4,6 +4,7 @@ from transformers import PreTrainedTokenizer from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob, Sequence, SequenceStatus @@ -15,7 +16,7 @@ def sequence_with_eos(text: str, eos_token: str, """ seq = Sequence( seq_id=0, - inputs={"prompt_token_ids": []}, + inputs=token_inputs([]), block_size=16, eos_token_id=eos_token_id, ) From fa968b59b5a951893ce55aad538f9a5b1169c1aa Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Wed, 23 Oct 2024 07:37:04 +0000 Subject: [PATCH 05/22] Fix llama-3.2 --- vllm/model_executor/models/mllama.py | 58 +++++++++++++++++++--------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 39034b4e9d6..a7e5f0d106f 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -36,7 +36,8 @@ from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import get_tensor_model_parallel_world_size -from vllm.inputs import INPUT_REGISTRY, EncoderDecoderInputs, InputContext +from vllm.inputs import (INPUT_REGISTRY, EncoderDecoderInputs, InputContext, + TokenInputs, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -88,21 +89,32 @@ def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int: def input_processor_for_mllama(ctx: InputContext, inputs: EncoderDecoderInputs): - enc_inputs = inputs["encoder"] - dec_inputs = inputs["decoder"] + # Example inputs when initially passed to processor: + # { + # 'encoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 + # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # 'decoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000], + # }, + # } # move encoder prompt to decoder - if dec_inputs.get("prompt") is None: - dec_inputs["prompt"] = enc_inputs["prompt"] - dec_inputs["prompt_token_ids"] = enc_inputs["prompt_token_ids"] + inputs["decoder"] = TokenInputs(**inputs["encoder"]) + + dec_inputs = inputs["decoder"] - multi_modal_data = enc_inputs.get("multi_modal_data") + multi_modal_data = dec_inputs.get("multi_modal_data") if 
multi_modal_data is None or "image" not in multi_modal_data: # text-only - enc_inputs["prompt"] = "" - enc_inputs["prompt_token_ids"] = [] - enc_inputs["multi_modal_data"] = {} - return inputs + return EncoderDecoderInputs( + encoder=token_inputs([]), + decoder=dec_inputs, + ) image_data = multi_modal_data["image"] if isinstance(image_data, Image.Image): @@ -115,15 +127,18 @@ def input_processor_for_mllama(ctx: InputContext, # get the number of tiles for those images. num_decode_images = _get_num_image_in_last_group( dec_inputs["prompt_token_ids"]) + hf_config = ctx.model_config.hf_config + vision_config = hf_config.vision_config + num_tiles = 0 for image in image_data[::-1]: width, height = image.size - tile_size = hf_config.vision_config.image_size + tile_size = vision_config.image_size canvas_height, canvas_width = get_optimal_tiled_canvas( image_height=height, image_width=width, - max_image_tiles=hf_config.vision_config.max_num_tiles, + max_image_tiles=vision_config.max_num_tiles, tile_size=tile_size, ) num_tiles_height = canvas_height // tile_size @@ -132,18 +147,23 @@ def input_processor_for_mllama(ctx: InputContext, num_decode_images -= 1 if num_decode_images == 0: break + # Set encoder prompt length based on the number of tiles. # This tells the block manager to allocate correct number # of slots for encoder tokens. - assert hf_config.vision_config.image_size % 14 == 0, \ + assert vision_config.image_size % 14 == 0, \ "chunk size should be multiple of 14" - token_per_chunk = (hf_config.vision_config.image_size // 14)**2 + 1 + token_per_chunk = (vision_config.image_size // 14)**2 + 1 num_tokens = num_tiles * token_per_chunk - enc_inputs["prompt"] = MLLAMA_IMAGE_TOKEN * num_tokens - enc_inputs["prompt_token_ids"] = [MLLAMA_IMAGE_TOKEN_ID] * num_tokens - - return inputs + return EncoderDecoderInputs( + encoder=token_inputs( + prompt_token_ids=[MLLAMA_IMAGE_TOKEN_ID] * num_tokens, + prompt=MLLAMA_IMAGE_TOKEN * num_tokens, + multi_modal_data=multi_modal_data, + ), + decoder=dec_inputs, + ) def get_max_mllama_image_tokens(ctx: InputContext) -> int: From 906ee1ea9e79f46bd652abcdbf54422a3b5a05c3 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 24 Oct 2024 02:29:50 +0000 Subject: [PATCH 06/22] Remove force_bos --- vllm/inputs/preprocess.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 73db916dfc2..59441ecfd46 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -122,7 +122,6 @@ def _get_default_enc_dec_decoder_prompt(self) -> List[int]: def _prepare_decoder_input_ids_for_generation( self, decoder_input_ids: Optional[List[int]], - force_bos: bool = True, ) -> List[int]: """ Prepares `decoder_input_ids` for generation with encoder-decoder models. 
@@ -152,10 +151,6 @@ def _prepare_decoder_input_ids_for_generation( # use decoder_start_token_id as decoder_input_ids decoder_input_ids = self._get_default_enc_dec_decoder_prompt() - if force_bos and (len(decoder_input_ids) == 0 - or decoder_input_ids[0] != decoder_start_token_id): - decoder_input_ids = [decoder_start_token_id] + decoder_input_ids - return decoder_input_ids def _apply_prompt_adapter( @@ -330,17 +325,11 @@ def _build_enc_dec_llm_inputs( assert_never(encoder_inputs) if decoder_inputs is None: - dec_token_ids = self._prepare_decoder_input_ids_for_generation( - None, - force_bos="multi_modal_data" not in encoder_inputs, - ) + dec_token_ids = self._prepare_decoder_input_ids_for_generation(None) decoder_inputs = token_inputs(dec_token_ids) elif decoder_inputs["type"] == "token": dec_token_ids = self._prepare_decoder_input_ids_for_generation( - decoder_inputs["prompt_token_ids"], - force_bos=("multi_modal_data" not in encoder_inputs - and "multi_modal_data" not in decoder_inputs), - ) + decoder_inputs["prompt_token_ids"]) decoder_inputs["prompt_token_ids"] = dec_token_ids if "multi_modal_data" in decoder_inputs: From 005ad95003ff57af6f921ac33a3507780ac756d2 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 24 Oct 2024 02:32:46 +0000 Subject: [PATCH 07/22] Add example output --- vllm/model_executor/models/mllama.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index a7e5f0d106f..845700bc49d 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -89,7 +89,7 @@ def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int: def input_processor_for_mllama(ctx: InputContext, inputs: EncoderDecoderInputs): - # Example inputs when initially passed to processor: + # Example input to processor: # { # 'encoder': { # 'type': 'token', @@ -156,6 +156,21 @@ def input_processor_for_mllama(ctx: InputContext, token_per_chunk = (vision_config.image_size // 14)**2 + 1 num_tokens = num_tiles * token_per_chunk + # Example output from processor: + # { + # 'encoder': { + # 'type': 'token', + # 'prompt_token_ids': [128256, 128256, ..., 128256], + # 'prompt': '<|image|><|image|>...<|image|>', + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # 'decoder': { + # 'type': 'token', + # 'prompt_token_ids': [128000, 128256, 128000, 3923, 374, 279, 2262, 315, 420, 2217, 30], # noqa: E501 + # 'prompt': '<|image|><|begin_of_text|>What is the content of this image?', # noqa: E501 + # 'multi_modal_data': {'image': }, # noqa: E501 + # }, + # } return EncoderDecoderInputs( encoder=token_inputs( prompt_token_ids=[MLLAMA_IMAGE_TOKEN_ID] * num_tokens, From a5f0c163ad810f5bbaae465956f68e16cbb46092 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 24 Oct 2024 02:47:12 +0000 Subject: [PATCH 08/22] format --- vllm/inputs/preprocess.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 59441ecfd46..4bdbcb88e65 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -325,7 +325,8 @@ def _build_enc_dec_llm_inputs( assert_never(encoder_inputs) if decoder_inputs is None: - dec_token_ids = self._prepare_decoder_input_ids_for_generation(None) + dec_token_ids = self._prepare_decoder_input_ids_for_generation( + None) decoder_inputs = token_inputs(dec_token_ids) elif decoder_inputs["type"] == "token": dec_token_ids = 
self._prepare_decoder_input_ids_for_generation( From 6ab44e4829278d17039651572a0708e3b5f21d97 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 24 Oct 2024 04:24:37 +0000 Subject: [PATCH 09/22] Fix --- vllm/inputs/preprocess.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 4bdbcb88e65..c501b5490c9 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -151,6 +151,10 @@ def _prepare_decoder_input_ids_for_generation( # use decoder_start_token_id as decoder_input_ids decoder_input_ids = self._get_default_enc_dec_decoder_prompt() + if (len(decoder_input_ids) == 0 + or decoder_input_ids[0] != decoder_start_token_id): + decoder_input_ids = [decoder_start_token_id] + decoder_input_ids + return decoder_input_ids def _apply_prompt_adapter( From 760db0549b82b51e83cd3e4905ac99475c6863a4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 01:31:54 +0000 Subject: [PATCH 10/22] Fix merge --- vllm/inputs/parse.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index b11a151c4a5..3438effe6d4 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -98,6 +98,10 @@ def parse_singleton_prompt( raise TypeError("inputs must be a string, TextPrompt, or TokensPrompt") +def is_token_prompt(prompt: PromptType) -> TypeIs[TokensPrompt]: + return isinstance(prompt, dict) and "prompt_token_ids" in prompt + + def is_explicit_encoder_decoder_prompt( prompt: PromptType) -> TypeIs[ExplicitEncoderDecoderPrompt]: return isinstance(prompt, dict) and "encoder_prompt" in prompt @@ -105,4 +109,4 @@ def is_explicit_encoder_decoder_prompt( def is_encoder_decoder_inputs( inputs: ProcessorInputs) -> TypeIs[EncoderDecoderInputs]: - return "encoder" in inputs and "decoder" in inputs + return "encoder" in inputs and "decoder" in inputs \ No newline at end of file From acb8e6f160fb744c68fa46e18f24a76341453f49 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 01:32:26 +0000 Subject: [PATCH 11/22] Update mllama processing --- vllm/model_executor/models/mllama.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index e26c5f5dfb6..4ecd96d3e91 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -87,8 +87,10 @@ def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int: return num_images -def input_processor_for_mllama(ctx: InputContext, - inputs: EncoderDecoderInputs): +def input_processor_for_mllama( + ctx: InputContext, + inputs: EncoderDecoderInputs, +)-> EncoderDecoderInputs: # Example input to processor: # { # 'encoder': { @@ -104,9 +106,7 @@ def input_processor_for_mllama(ctx: InputContext, # } # move encoder prompt to decoder - inputs["decoder"] = TokenInputs(**inputs["encoder"]) - - dec_inputs = inputs["decoder"] + dec_inputs = TokenInputs(**inputs["encoder"]) multi_modal_data = dec_inputs.get("multi_modal_data") if multi_modal_data is None or "image" not in multi_modal_data: From 3bed51909b3bddff3e84a5c0c2f0931e51ef8d32 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 01:32:42 +0000 Subject: [PATCH 12/22] Fix line --- vllm/inputs/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/inputs/parse.py b/vllm/inputs/parse.py index 3438effe6d4..09f1ff2cb42 100644 --- a/vllm/inputs/parse.py +++ b/vllm/inputs/parse.py @@ -109,4 +109,4 @@ def 
is_explicit_encoder_decoder_prompt( def is_encoder_decoder_inputs( inputs: ProcessorInputs) -> TypeIs[EncoderDecoderInputs]: - return "encoder" in inputs and "decoder" in inputs \ No newline at end of file + return "encoder" in inputs and "decoder" in inputs From ea861e015ce5f365fb958f981238a43ef56bc984 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 01:33:55 +0000 Subject: [PATCH 13/22] format --- vllm/model_executor/models/mllama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 4ecd96d3e91..83899c3b200 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -90,7 +90,7 @@ def _get_num_image_in_last_group(prompt_token_ids: List[int]) -> int: def input_processor_for_mllama( ctx: InputContext, inputs: EncoderDecoderInputs, -)-> EncoderDecoderInputs: +) -> EncoderDecoderInputs: # Example input to processor: # { # 'encoder': { From f654421cd980452a51e53e95d0271181357f0d16 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 01:36:56 +0000 Subject: [PATCH 14/22] Avoid repeated lookups --- vllm/inputs/preprocess.py | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index c501b5490c9..453ff36bd65 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -237,9 +237,11 @@ def _prompt_to_llm_inputs( ) if parsed["type"] == "tokens": - prompt_token_ids = parsed["content"]["prompt_token_ids"] - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") + content = parsed["content"] + + prompt_token_ids = content["prompt_token_ids"] + multi_modal_data = content.get("multi_modal_data") + mm_processor_kwargs = content.get("mm_processor_kwargs") return token_inputs( prompt_token_ids=prompt_token_ids, @@ -248,14 +250,16 @@ def _prompt_to_llm_inputs( ) if parsed["type"] == "text": - prompt_text = parsed["content"]["prompt"] + content = parsed["content"] + + prompt_text = content["prompt"] prompt_token_ids = self._tokenize_prompt( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") + multi_modal_data = content.get("multi_modal_data") + mm_processor_kwargs = content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, @@ -289,9 +293,11 @@ async def _prompt_to_llm_inputs_async( ) if parsed["type"] == "tokens": - prompt_token_ids = parsed["content"]["prompt_token_ids"] - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") + content = parsed["content"] + + prompt_token_ids = content["prompt_token_ids"] + multi_modal_data = content.get("multi_modal_data") + mm_processor_kwargs = content.get("mm_processor_kwargs") return token_inputs( prompt_token_ids=prompt_token_ids, @@ -300,14 +306,16 @@ async def _prompt_to_llm_inputs_async( ) if parsed["type"] == "text": - prompt_text = parsed["content"]["prompt"] + content = parsed["content"] + + prompt_text = content["prompt"] prompt_token_ids = await self._tokenize_prompt_async( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = parsed["content"].get("multi_modal_data") - mm_processor_kwargs = parsed["content"].get("mm_processor_kwargs") + 
multi_modal_data = content.get("multi_modal_data") + mm_processor_kwargs = content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, From 594794e24344f1be5c0386f7bf1ca12d5f421a87 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 01:37:47 +0000 Subject: [PATCH 15/22] Remove unused import --- vllm/engine/llm_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 8d6c3bc7256..78c5baf8405 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -10,7 +10,7 @@ from typing import Set, Type, Union, cast, overload import torch -from typing_extensions import TypeIs, TypeVar +from typing_extensions import TypeVar import vllm.envs as envs from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, From 08ea824121160054da226760aeb27200a37c1fed Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 01:39:02 +0000 Subject: [PATCH 16/22] Fix mypy --- vllm/inputs/preprocess.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 453ff36bd65..a5c787a56b5 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -237,11 +237,11 @@ def _prompt_to_llm_inputs( ) if parsed["type"] == "tokens": - content = parsed["content"] + tokens_content = parsed["content"] - prompt_token_ids = content["prompt_token_ids"] - multi_modal_data = content.get("multi_modal_data") - mm_processor_kwargs = content.get("mm_processor_kwargs") + prompt_token_ids = tokens_content["prompt_token_ids"] + multi_modal_data = tokens_content.get("multi_modal_data") + mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") return token_inputs( prompt_token_ids=prompt_token_ids, @@ -250,16 +250,16 @@ def _prompt_to_llm_inputs( ) if parsed["type"] == "text": - content = parsed["content"] + text_content = parsed["content"] - prompt_text = content["prompt"] + prompt_text = text_content["prompt"] prompt_token_ids = self._tokenize_prompt( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = content.get("multi_modal_data") - mm_processor_kwargs = content.get("mm_processor_kwargs") + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, @@ -293,11 +293,11 @@ async def _prompt_to_llm_inputs_async( ) if parsed["type"] == "tokens": - content = parsed["content"] + tokens_content = parsed["content"] - prompt_token_ids = content["prompt_token_ids"] - multi_modal_data = content.get("multi_modal_data") - mm_processor_kwargs = content.get("mm_processor_kwargs") + prompt_token_ids = tokens_content["prompt_token_ids"] + multi_modal_data = tokens_content.get("multi_modal_data") + mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") return token_inputs( prompt_token_ids=prompt_token_ids, @@ -306,16 +306,16 @@ async def _prompt_to_llm_inputs_async( ) if parsed["type"] == "text": - content = parsed["content"] + text_content = parsed["content"] - prompt_text = content["prompt"] + prompt_text = text_content["prompt"] prompt_token_ids = await self._tokenize_prompt_async( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = content.get("multi_modal_data") - mm_processor_kwargs = content.get("mm_processor_kwargs") + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = 
text_content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, From 283bc2ccdd28bc02cab567631b326d59ae6e7f30 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 31 Oct 2024 10:00:31 +0000 Subject: [PATCH 17/22] Fix merge --- vllm/engine/protocol.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index 6a09361c568..e0b59d94cfd 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -1,11 +1,12 @@ import asyncio from abc import ABC, abstractmethod -from typing import AsyncGenerator, List, Mapping, Optional, Union +from typing import AsyncGenerator, List, Mapping, Optional from vllm.beam_search import BeamSearchSequence, create_sort_beams_key_function from vllm.config import DecodingConfig, ModelConfig from vllm.core.scheduler import SchedulerOutputs from vllm.inputs.data import PromptType, TokensPrompt +from vllm.inputs.parse import is_explicit_encoder_decoder_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest @@ -60,7 +61,7 @@ def generate( async def beam_search( self, - prompt: Union[PromptType, List[int]], + prompt: PromptType, model_config: ModelConfig, request_id: str, params: BeamSearchParams, @@ -76,11 +77,19 @@ async def beam_search( tokenizer = await self.get_tokenizer() input_preprocessor = InputPreprocessor(model_config, tokenizer) - (prompt_text, prompt_token_ids, multi_modal_data, - mm_processor_kwargs) = input_preprocessor._extract_prompt_components( - prompt, - request_id=request_id, - ) + if is_explicit_encoder_decoder_prompt(prompt): + raise NotImplementedError + else: + processed_inputs = input_preprocessor._prompt_to_llm_inputs( + prompt, + request_id=request_id, + ) + + prompt_token_ids = processed_inputs["prompt_token_ids"] + prompt_text = processed_inputs.get("prompt") + multi_modal_data = processed_inputs.get("multi_modal_data") + mm_processor_kwargs = processed_inputs.get("mm_processor_kwargs") + tokenized_length = len(prompt_token_ids) sort_beams_key = create_sort_beams_key_function( From b45cdc974c22d2022112119e846d7c16e4bc9085 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 3 Nov 2024 03:20:21 +0000 Subject: [PATCH 18/22] Fix missing import Signed-off-by: DarkLight1337 --- vllm/inputs/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index d9e853020ee..b19e419b9ca 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -3,7 +3,7 @@ SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, TokensPrompt, build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) -from .registry import InputContext, InputRegistry +from .registry import DummyData, InputContext, InputRegistry INPUT_REGISTRY = InputRegistry() """ From 4d33b1ec4d396bdbe889692827e4120348dba71b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 3 Nov 2024 03:25:44 +0000 Subject: [PATCH 19/22] Improve error message Signed-off-by: DarkLight1337 --- vllm/model_executor/models/registry.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f50ceaccb1b..02f2215ceaa 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -342,9 +342,13 @@ def register_model( def _raise_for_unsupported(self, architectures: 
List[str]): all_supported_archs = self.get_supported_archs() - raise ValueError( - f"Model architectures {architectures} are not supported for now. " - f"Supported architectures: {all_supported_archs}") + msg = (f"Model architectures {architectures} are not supported for " + f"now. Supported architectures: {all_supported_archs}") + if any(arch in all_supported_archs for arch in architectures): + msg += ("\n(Please check the logs to see why the model " + "failed to be inspected.)") + + raise ValueError(msg) def _try_load_model_cls(self, model_arch: str) -> Optional[Type[nn.Module]]: From 0a549e54ac9524a08b07ccd4846f96bf2b2ffb25 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Sun, 3 Nov 2024 03:28:10 +0000 Subject: [PATCH 20/22] Add missing export Signed-off-by: DarkLight1337 --- vllm/inputs/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index b19e419b9ca..68ac50a2c5a 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -22,6 +22,7 @@ "ExplicitEncoderDecoderPrompt", "TokenInputs", "token_inputs", + "DecoderOnlyInputs", "EncoderDecoderInputs", "ProcessorInputs", "SingletonInputs", From f741a75af1ca13bba44727d43963891e6fa473df Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 3 Nov 2024 11:46:28 +0800 Subject: [PATCH 21/22] Improve error message. --- vllm/model_executor/models/registry.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 02f2215ceaa..b3870244cb0 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -342,13 +342,16 @@ def register_model( def _raise_for_unsupported(self, architectures: List[str]): all_supported_archs = self.get_supported_archs() - msg = (f"Model architectures {architectures} are not supported for " - f"now. Supported architectures: {all_supported_archs}") if any(arch in all_supported_archs for arch in architectures): - msg += ("\n(Please check the logs to see why the model " - "failed to be inspected.)") - - raise ValueError(msg) + raise ValueError( + f"Model architectures {architectures} failed " + "to be inspected. Please check the logs for more details." + ) + + raise ValueError( + f"Model architectures {architectures} are not supported for now. " + f"Supported architectures: {all_supported_archs}" + ) def _try_load_model_cls(self, model_arch: str) -> Optional[Type[nn.Module]]: From cd231fa780a14f99ed337a6f550cb2b269991a9e Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 3 Nov 2024 11:47:35 +0800 Subject: [PATCH 22/22] Format --- vllm/model_executor/models/registry.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index b3870244cb0..1fd20307d92 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -345,13 +345,11 @@ def _raise_for_unsupported(self, architectures: List[str]): if any(arch in all_supported_archs for arch in architectures): raise ValueError( f"Model architectures {architectures} failed " - "to be inspected. Please check the logs for more details." - ) + "to be inspected. Please check the logs for more details.") raise ValueError( f"Model architectures {architectures} are not supported for now. 
" - f"Supported architectures: {all_supported_archs}" - ) + f"Supported architectures: {all_supported_archs}") def _try_load_model_cls(self, model_arch: str) -> Optional[Type[nn.Module]]: