From 7b5a49c01f9449a93be0dfadaebe2a5375c9a240 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 12 Feb 2025 23:35:31 -0800
Subject: [PATCH 1/7] rename

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core.py                                   | 2 +-
 vllm/v1/engine/{mm_input_mapper.py => mm_input_cache.py} | 0
 vllm/v1/engine/processor.py                              | 2 +-
 vllm/v1/worker/gpu_model_runner.py                       | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename vllm/v1/engine/{mm_input_mapper.py => mm_input_cache.py} (100%)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e4677681bd2b..ffe0eb98b3e1 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -20,7 +20,7 @@
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
                             EngineCoreRequestType)
-from vllm.v1.engine.mm_input_mapper import MMInputMapperServer
+from vllm.v1.engine.mm_input_cache import MMInputMapperServer
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_cache.py
similarity index 100%
rename from vllm/v1/engine/mm_input_mapper.py
rename to vllm/v1/engine/mm_input_cache.py
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 70876b03a823..5288f89e0d9e 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -17,7 +17,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
+from vllm.v1.engine.mm_input_cache import MMInputMapperClient
 
 
 class Processor:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9b1eab613bf7..786d79a2dbb5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -26,7 +26,7 @@
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
-from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
+from vllm.v1.engine.mm_input_cache import MMInputMapperClient
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import LogprobsTensors, ModelRunnerOutput

From 7c8f6eafcdee38cfb36aabe642ca472e9f925ab8 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 12 Feb 2025 23:36:50 -0800
Subject: [PATCH 2/7] rename

Signed-off-by: Roger Wang
---
 vllm/v1/engine/mm_input_cache.py   | 2 +-
 vllm/v1/engine/processor.py        | 4 ++--
 vllm/v1/worker/gpu_model_runner.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index 83a0d9db161d..2dc2b7b909a1 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -27,7 +27,7 @@
 MM_CACHE_SIZE = 256
 
 
-class MMInputMapperClient:
+class MMInputCacheClient:
 
     def __init__(
         self,
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 5288f89e0d9e..9e637cc95759 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -17,7 +17,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_cache import MMInputMapperClient
+from vllm.v1.engine.mm_input_cache import MMInputCacheClient
 
 
 class Processor:
@@ -46,7 +46,7 @@ def __init__(
             model_config)
 
         # Multi-modal (huggingface) input mapper
-        self.mm_input_mapper_client = MMInputMapperClient(model_config)
+        self.mm_input_mapper_client = MMInputCacheClient(model_config)
 
         # Multi-modal hasher (for images)
         self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 786d79a2dbb5..690e700c49d8 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -26,7 +26,7 @@
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
-from vllm.v1.engine.mm_input_cache import MMInputMapperClient
+from vllm.v1.engine.mm_input_cache import MMInputCacheClient
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import LogprobsTensors, ModelRunnerOutput
@@ -96,7 +96,7 @@ def __init__(
 
         # NOTE: Initialized input mapper is only used for processing dummy
         # multimodal data into multimodal kwargs for GPU memory profiling.
-        self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config)
+        self.mm_input_mapper_profiling = MMInputCacheClient(self.model_config)
         self.mm_input_mapper_profiling.use_cache = False
 
         encoder_compute_budget, encoder_cache_size = compute_encoder_budget(

From 51d553659924ff039bcbe39991bb70a9447430f8 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 12 Feb 2025 23:37:15 -0800
Subject: [PATCH 3/7] rename

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core.py           | 4 ++--
 vllm/v1/engine/mm_input_cache.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index ffe0eb98b3e1..9fdea9053c51 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -20,7 +20,7 @@
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
                             EngineCoreRequestType)
-from vllm.v1.engine.mm_input_cache import MMInputMapperServer
+from vllm.v1.engine.mm_input_cache import MMInputCacheServer
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
@@ -65,7 +65,7 @@ def __init__(
             log_stats=self.log_stats,
         )
 
-        self.mm_input_mapper_server = MMInputMapperServer(
+        self.mm_input_mapper_server = MMInputCacheServer(
             vllm_config.model_config)
 
     def _initialize_kv_caches(self,
diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index 2dc2b7b909a1..b7138e33de49 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -114,7 +114,7 @@ def process_inputs(
         return ret_inputs
 
 
-class MMInputMapperServer:
+class MMInputCacheServer:
 
     def __init__(self, model_config):
        self.use_cache = not model_config.disable_mm_preprocessor_cache

From 36bb6ee04cf2918b9d8e59df7215c8ebe6f41e0b Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 12 Feb 2025 23:52:27 -0800
Subject: [PATCH 4/7] clarify

Signed-off-by: Roger Wang
---
 vllm/v1/engine/mm_input_cache.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index b7138e33de49..d5763f3b5876 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -10,12 +10,18 @@
 
 logger = init_logger(__name__)
 
-# The idea of MM preprocessor caching is based on having a client and a server,
-# where the client executes in the frontend process (=P0) and the server in the
-# core process (=P1).
+# The idea of multimodal preprocessing caching is based on having a client and
+# a server, where the client executes in the frontend process (=P0) and the
+# server in the core process (=P1).
 #
-# -- Client: Executes the MM mapper and performs caching of the results.
-# -- Server: Performs caching of the results
+# -- Client:
+#  - Apply legacy input_mapper (if one exists) to generate MultiModalKwargs.
+#  - Perform caching of the generated MultiModalKwargs.
+#  - This client can be deprecated once all mutimodal models migrate to use
+#    merged preprocessor with built-in caching functionality.
+#
+# -- Server:
+#  - Perform caching of the received MultiModalKwargs.
 #
 # The caching for both client and server is mirrored/similar, and this allows us
 # to avoid the serialization of "mm_inputs" (like pixel values) between
@@ -27,6 +33,8 @@
 MM_CACHE_SIZE = 256
 
 
+# TODO(ywang96): Deprecate this class once all multimodal models migrate to use
+# merged preprocessor with built-in caching functionality.
 class MMInputCacheClient:
 
     def __init__(
@@ -54,7 +62,8 @@ def cache_hit_ratio(self, steps):
             logger.debug("MMInputMapper: cache_hit_ratio = %.2f ",
                          self.mm_cache_hits / self.mm_cache_total)
 
-    # TODO: Support modalities beyond image.
+    # NOTE: process_inputs only supports image inputs since all multimodal
+    # models with other modalities have migrated to use merged preprocessor.
     def process_inputs(
         self,
         mm_data: MultiModalDataDict,
@@ -95,7 +104,7 @@ def process_inputs(
                 # Reuse precomputed input (for merged preprocessor)
                 mm_input = precomputed_mm_inputs[input_id]
             else:
-                # Apply MM mapper
+                # Apply legacy input_mapper
                 mm_input = self.multi_modal_input_mapper(
                     {"image": [image_inputs[input_id]]},
                     mm_processor_kwargs=mm_processor_kwargs,

From 070d814bcfd8b148dbd5d6671b7ec3184e78048f Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 13 Feb 2025 00:30:14 -0800
Subject: [PATCH 5/7] update

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core.py           | 14 +++++++-------
 vllm/v1/engine/mm_input_cache.py |  2 +-
 vllm/v1/engine/processor.py      | 18 +++++++++++++-----
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 9fdea9053c51..15148b0bd76f 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -65,7 +65,7 @@ def __init__(
             log_stats=self.log_stats,
         )
 
-        self.mm_input_mapper_server = MMInputCacheServer(
+        self.mm_input_cache_server = MMInputCacheServer(
             vllm_config.model_config)
 
     def _initialize_kv_caches(self,
@@ -97,13 +97,13 @@ def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
 
         if request.mm_hashes is not None:
-            # Here, if hash exists for an image, then it will be fetched
-            # from the cache, else it will be added to the cache.
-            # Note that the cache here is mirrored with the client side of the
-            # MM mapper, so anything that has a hash must have a HIT cache
-            # entry here as well.
+            # Here, if hash exists for an multimodal input, then it will
+            # be fetched from the cache, else it will be added to the cache.
+            # Note that the cache here is mirrored with the client cache, so
+            # anything that has a hash must have a HIT cache entry here
+            # as well.
             assert request.mm_inputs is not None
-            request.mm_inputs = self.mm_input_mapper_server.process_inputs(
+            request.mm_inputs = self.mm_input_cache_server.get_and_update(
                 request.mm_inputs, request.mm_hashes)
 
         req = Request.from_engine_core_request(request)
diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index d5763f3b5876..e1b6679c284b 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -129,7 +129,7 @@ def __init__(self, model_config):
         self.use_cache = not model_config.disable_mm_preprocessor_cache
         self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
 
-    def process_inputs(
+    def get_and_update(
         self,
         mm_inputs: List[Optional[MultiModalKwargs]],
         mm_hashes: List[str],
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 9e637cc95759..b7eee5a39972 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -46,7 +46,7 @@ def __init__(
             model_config)
 
         # Multi-modal (huggingface) input mapper
-        self.mm_input_mapper_client = MMInputCacheClient(model_config)
+        self.mm_input_cache_client = MMInputCacheClient(model_config)
 
         # Multi-modal hasher (for images)
         self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
@@ -106,16 +106,24 @@ def process_inputs(
         assert priority == 0, "vLLM V1 does not support priority at the moment."
         assert trace_headers is None, "vLLM V1 does not support tracing yet."
 
-        # Process inputs.
+        # Process inputs, which includes:
+        # 1. Tokenize text prompt, with LoRA request if one exists.
+        # 2. For multimodal models with a merged preprocessor, preprocess
+        #    multimodal data and expand prompt token ids accordingly.
+        # 3. Apply prompt adapter to prompt token ids if one exists.
         preprocessed_inputs = self.input_preprocessor.preprocess(
             prompt,
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
         )
+        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
+
+        # Process prompt and prompt token ids.
+        # Only applicable to multimodal models with legacy input processor.
         processed_inputs = self.input_processor(preprocessed_inputs)
+
         self._validate_model_inputs(processed_inputs)
-        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
 
         if is_encoder_decoder_inputs(processed_inputs):
             decoder_inputs = SingletonInputsAdapter(
@@ -200,8 +208,8 @@ def process_inputs(
             key=lambda mm_input: modality_order_dict[list(
                 mm_input.modalities)[0]])
 
-        # Apply mm input cache update (and input mapper if necessary).
-        sorted_mm_inputs = self.mm_input_mapper_client.process_inputs(
+        # Apply mm input cache update and legacy input mapper if one exists.
+        sorted_mm_inputs = self.mm_input_cache_client.process_inputs(
             mm_data=decoder_mm_data,
             mm_hashes=sorted_mm_hashes,
             mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,

From 559ed7549658ea15a64a8008530b9eb30df2bba8 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 13 Feb 2025 00:38:17 -0800
Subject: [PATCH 6/7] typo

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 15148b0bd76f..0683bcb8c898 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -97,8 +97,8 @@ def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
 
         if request.mm_hashes is not None:
-            # Here, if hash exists for an multimodal input, then it will
-            # be fetched from the cache, else it will be added to the cache.
+            # Here, if hash exists for a multimodal input, then it will be
+            # fetched from the cache, else it will be added to the cache.
             # Note that the cache here is mirrored with the client cache, so
             # anything that has a hash must have a HIT cache entry here
             # as well.

From 4ba535a548594e0ad1ff7751d4da4397fe301d89 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 13 Feb 2025 00:44:12 -0800
Subject: [PATCH 7/7] cleanup

Signed-off-by: Roger Wang
---
 vllm/v1/worker/gpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 690e700c49d8..7a2f12a9075d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -94,8 +94,9 @@ def __init__(
         self.mm_registry = MULTIMODAL_REGISTRY
         self.uses_mrope = model_config.uses_mrope
 
-        # NOTE: Initialized input mapper is only used for processing dummy
+        # NOTE: Initialized client is only used for processing dummy
         # multimodal data into multimodal kwargs for GPU memory profiling.
+        # Only applicable to multimodal models with legacy input mapper.
         self.mm_input_mapper_profiling = MMInputCacheClient(self.model_config)
         self.mm_input_mapper_profiling.use_cache = False
 
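
Editor's note: the sketch below is an illustration and is not part of the patch series above. Under stated assumptions, it shows the mirrored client/server LRU idea that the renamed MMInputCacheClient (frontend process, P0) and MMInputCacheServer (core process, P1) rely on: when a multimodal input's hash hits the frontend cache, its kwargs are omitted from the wire, and the server's get_and_update() restores them from its own same-sized cache. The names MirroredLRUCache, SketchClient, SketchServer, CACHE_SIZE, and prepare() are hypothetical stand-ins; only the P0/P1 split, the 256-entry MM_CACHE_SIZE, and the get_and_update() name come from the diffs.

from collections import OrderedDict
from typing import Any, Dict, List, Optional


class MirroredLRUCache:
    """Tiny LRU cache; a stand-in for vllm.utils.LRUCache in this sketch."""

    def __init__(self, capacity: int) -> None:
        self.capacity = capacity
        self._data: "OrderedDict[str, Any]" = OrderedDict()

    def get(self, key: str) -> Optional[Any]:
        if key not in self._data:
            return None
        self._data.move_to_end(key)  # mark as most recently used
        return self._data[key]

    def put(self, key: str, value: Any) -> None:
        self._data[key] = value
        self._data.move_to_end(key)
        if len(self._data) > self.capacity:
            self._data.popitem(last=False)  # evict least recently used


CACHE_SIZE = 256  # mirrors MM_CACHE_SIZE in the diffs above


class SketchClient:
    """Frontend (P0) side: decides which kwargs actually need to be sent."""

    def __init__(self) -> None:
        self.cache = MirroredLRUCache(CACHE_SIZE)

    def prepare(self, kwargs_list: List[Dict[str, Any]],
                hashes: List[str]) -> List[Optional[Dict[str, Any]]]:
        wire: List[Optional[Dict[str, Any]]] = []
        for kwargs, mm_hash in zip(kwargs_list, hashes):
            if self.cache.get(mm_hash) is not None:
                # The server is guaranteed to hold this entry too,
                # so only the hash travels over the wire.
                wire.append(None)
            else:
                self.cache.put(mm_hash, kwargs)
                wire.append(kwargs)
        return wire


class SketchServer:
    """Core (P1) side: analogous to MMInputCacheServer.get_and_update()."""

    def __init__(self) -> None:
        self.cache = MirroredLRUCache(CACHE_SIZE)

    def get_and_update(self, wire: List[Optional[Dict[str, Any]]],
                       hashes: List[str]) -> List[Dict[str, Any]]:
        full: List[Dict[str, Any]] = []
        for payload, mm_hash in zip(wire, hashes):
            if payload is None:
                # Payload was omitted upstream, so it must be a cache hit here.
                cached = self.cache.get(mm_hash)
                assert cached is not None, "client/server caches out of sync"
                full.append(cached)
            else:
                self.cache.put(mm_hash, payload)
                full.append(payload)
        return full


if __name__ == "__main__":
    client, server = SketchClient(), SketchServer()
    kwargs = [{"pixel_values": [1, 2, 3]}]
    hashes = ["abc123"]
    # First request ships the payload; the repeat ships only the hash.
    assert server.get_and_update(client.prepare(kwargs, hashes), hashes) == kwargs
    assert client.prepare(kwargs, hashes) == [None]
    assert server.get_and_update([None], hashes) == kwargs

Because both sides observe the same hash sequence and use the same capacity, their LRU evictions stay in lockstep, which is what lets the server assert that any omitted payload is still present in its cache.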