From 7b5a49c01f9449a93be0dfadaebe2a5375c9a240 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 12 Feb 2025 23:35:31 -0800
Subject: [PATCH 1/7] rename

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core.py                                   | 2 +-
 vllm/v1/engine/{mm_input_mapper.py => mm_input_cache.py} | 0
 vllm/v1/engine/processor.py                              | 2 +-
 vllm/v1/worker/gpu_model_runner.py                       | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename vllm/v1/engine/{mm_input_mapper.py => mm_input_cache.py} (100%)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index e4677681bd2b..ffe0eb98b3e1 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -20,7 +20,7 @@
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
                             EngineCoreRequestType)
-from vllm.v1.engine.mm_input_mapper import MMInputMapperServer
+from vllm.v1.engine.mm_input_cache import MMInputMapperServer
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_cache.py
similarity index 100%
rename from vllm/v1/engine/mm_input_mapper.py
rename to vllm/v1/engine/mm_input_cache.py
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 70876b03a823..5288f89e0d9e 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -17,7 +17,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
+from vllm.v1.engine.mm_input_cache import MMInputMapperClient
 
 
 class Processor:
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9b1eab613bf7..786d79a2dbb5 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -26,7 +26,7 @@
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
-from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
+from vllm.v1.engine.mm_input_cache import MMInputMapperClient
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import LogprobsTensors, ModelRunnerOutput

From 7c8f6eafcdee38cfb36aabe642ca472e9f925ab8 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 12 Feb 2025 23:36:50 -0800
Subject: [PATCH 2/7] rename

Signed-off-by: Roger Wang
---
 vllm/v1/engine/mm_input_cache.py   | 2 +-
 vllm/v1/engine/processor.py        | 4 ++--
 vllm/v1/worker/gpu_model_runner.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index 83a0d9db161d..2dc2b7b909a1 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -27,7 +27,7 @@
 MM_CACHE_SIZE = 256
 
 
-class MMInputMapperClient:
+class MMInputCacheClient:
 
     def __init__(
         self,
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 5288f89e0d9e..9e637cc95759 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -17,7 +17,7 @@
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup
 from vllm.v1.engine import EngineCoreRequest
-from vllm.v1.engine.mm_input_cache import MMInputMapperClient
+from vllm.v1.engine.mm_input_cache import MMInputCacheClient
 
 
 class Processor:
@@ -46,7 +46,7 @@ def __init__(
             model_config)
 
         # Multi-modal (huggingface) input mapper
-        self.mm_input_mapper_client = MMInputMapperClient(model_config)
+        self.mm_input_mapper_client = MMInputCacheClient(model_config)
 
         # Multi-modal hasher (for images)
         self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 786d79a2dbb5..690e700c49d8 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -26,7 +26,7 @@
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
 from vllm.v1.core.encoder_cache_manager import compute_encoder_budget
-from vllm.v1.engine.mm_input_cache import MMInputMapperClient
+from vllm.v1.engine.mm_input_cache import MMInputCacheClient
 from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
                                         KVCacheSpec)
 from vllm.v1.outputs import LogprobsTensors, ModelRunnerOutput
@@ -96,7 +96,7 @@ def __init__(
 
         # NOTE: Initialized input mapper is only used for processing dummy
         # multimodal data into multimodal kwargs for GPU memory profiling.
-        self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config)
+        self.mm_input_mapper_profiling = MMInputCacheClient(self.model_config)
         self.mm_input_mapper_profiling.use_cache = False
 
         encoder_compute_budget, encoder_cache_size = compute_encoder_budget(

From 51d553659924ff039bcbe39991bb70a9447430f8 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 12 Feb 2025 23:37:15 -0800
Subject: [PATCH 3/7] rename

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core.py           | 4 ++--
 vllm/v1/engine/mm_input_cache.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index ffe0eb98b3e1..9fdea9053c51 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -20,7 +20,7 @@
 from vllm.v1.core.scheduler import Scheduler
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
                             EngineCoreRequestType)
-from vllm.v1.engine.mm_input_cache import MMInputMapperServer
+from vllm.v1.engine.mm_input_cache import MMInputCacheServer
 from vllm.v1.executor.abstract import Executor
 from vllm.v1.request import Request, RequestStatus
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
@@ -65,7 +65,7 @@ def __init__(
             log_stats=self.log_stats,
         )
 
-        self.mm_input_mapper_server = MMInputMapperServer(
+        self.mm_input_mapper_server = MMInputCacheServer(
             vllm_config.model_config)
 
     def _initialize_kv_caches(self,
diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index 2dc2b7b909a1..b7138e33de49 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -114,7 +114,7 @@ def process_inputs(
         return ret_inputs
 
 
-class MMInputMapperServer:
+class MMInputCacheServer:
 
     def __init__(self, model_config):
        self.use_cache = not model_config.disable_mm_preprocessor_cache

From 36bb6ee04cf2918b9d8e59df7215c8ebe6f41e0b Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 12 Feb 2025 23:52:27 -0800
Subject: [PATCH 4/7] clarify

Signed-off-by: Roger Wang
---
 vllm/v1/engine/mm_input_cache.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index b7138e33de49..d5763f3b5876 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -10,12 +10,18 @@
 
 logger = init_logger(__name__)
 
-# The idea of MM preprocessor caching is based on having a client and a server,
-# where the client executes in the frontend process (=P0) and the server in the
-# core process (=P1).
+# The idea of multimodal preprocessing caching is based on having a client and
+# a server, where the client executes in the frontend process (=P0) and the
+# server in the core process (=P1).
 #
-# -- Client: Executes the MM mapper and performs caching of the results.
-# -- Server: Performs caching of the results
+# -- Client:
+#  - Apply legacy input_mapper (if one exists) to generate MultiModalKwargs.
+#  - Perform caching of the generated MultiModalKwargs.
+#  - This client can be deprecated once all mutimodal models migrate to use
+#    merged preprocessor with built-in caching functionality.
+#
+# -- Server:
+#  - Perform caching of the received MultiModalKwargs.
 #
 # The caching for both client and server is mirrored/similar, and this allows us
 # to avoid the serialization of "mm_inputs" (like pixel values) between
@@ -27,6 +33,8 @@
 MM_CACHE_SIZE = 256
 
 
+# TODO(ywang96): Deprecate this class once all multimodal models migrate to use
+# merged preprocessor with built-in caching functionality.
 class MMInputCacheClient:
 
     def __init__(
@@ -54,7 +62,8 @@ def cache_hit_ratio(self, steps):
             logger.debug("MMInputMapper: cache_hit_ratio = %.2f ",
                          self.mm_cache_hits / self.mm_cache_total)
 
-    # TODO: Support modalities beyond image.
+    # NOTE: process_inputs only supports image inputs since all multimodal
+    # models with other modalities have migrated to use merged preprocessor.
     def process_inputs(
         self,
         mm_data: MultiModalDataDict,
@@ -95,7 +104,7 @@ def process_inputs(
                 # Reuse precomputed input (for merged preprocessor)
                 mm_input = precomputed_mm_inputs[input_id]
             else:
-                # Apply MM mapper
+                # Apply legacy input_mapper
                 mm_input = self.multi_modal_input_mapper(
                     {"image": [image_inputs[input_id]]},
                     mm_processor_kwargs=mm_processor_kwargs,

From 070d814bcfd8b148dbd5d6671b7ec3184e78048f Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 13 Feb 2025 00:30:14 -0800
Subject: [PATCH 5/7] update

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core.py           | 14 +++++++-------
 vllm/v1/engine/mm_input_cache.py |  2 +-
 vllm/v1/engine/processor.py      | 18 +++++++++++++-----
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 9fdea9053c51..15148b0bd76f 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -65,7 +65,7 @@ def __init__(
             log_stats=self.log_stats,
         )
 
-        self.mm_input_mapper_server = MMInputCacheServer(
+        self.mm_input_cache_server = MMInputCacheServer(
             vllm_config.model_config)
 
     def _initialize_kv_caches(self,
@@ -97,13 +97,13 @@ def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
 
         if request.mm_hashes is not None:
-            # Here, if hash exists for an image, then it will be fetched
-            # from the cache, else it will be added to the cache.
-            # Note that the cache here is mirrored with the client side of the
-            # MM mapper, so anything that has a hash must have a HIT cache
-            # entry here as well.
+            # Here, if hash exists for an multimodal input, then it will
+            # be fetched from the cache, else it will be added to the cache.
+            # Note that the cache here is mirrored with the client cache, so
+            # anything that has a hash must have a HIT cache entry here
+            # as well.
             assert request.mm_inputs is not None
-            request.mm_inputs = self.mm_input_mapper_server.process_inputs(
+            request.mm_inputs = self.mm_input_cache_server.get_and_update(
                 request.mm_inputs, request.mm_hashes)
 
         req = Request.from_engine_core_request(request)
diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py
index d5763f3b5876..e1b6679c284b 100644
--- a/vllm/v1/engine/mm_input_cache.py
+++ b/vllm/v1/engine/mm_input_cache.py
@@ -129,7 +129,7 @@ def __init__(self, model_config):
         self.use_cache = not model_config.disable_mm_preprocessor_cache
         self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
 
-    def process_inputs(
+    def get_and_update(
         self,
         mm_inputs: List[Optional[MultiModalKwargs]],
         mm_hashes: List[str],
diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py
index 9e637cc95759..b7eee5a39972 100644
--- a/vllm/v1/engine/processor.py
+++ b/vllm/v1/engine/processor.py
@@ -46,7 +46,7 @@ def __init__(
             model_config)
 
         # Multi-modal (huggingface) input mapper
-        self.mm_input_mapper_client = MMInputCacheClient(model_config)
+        self.mm_input_cache_client = MMInputCacheClient(model_config)
 
         # Multi-modal hasher (for images)
         self.use_hash = (not model_config.disable_mm_preprocessor_cache) or \
@@ -106,16 +106,24 @@ def process_inputs(
         assert priority == 0, "vLLM V1 does not support priority at the moment."
         assert trace_headers is None, "vLLM V1 does not support tracing yet."
 
-        # Process inputs.
+        # Process inputs, which includes:
+        # 1. Tokenize text prompt, with LoRA request if one exists.
+        # 2. For multimodal models with a merged preprocessor, preprocess
+        #    multimodal data and expand prompt token ids accordingly.
+        # 3. Apply prompt adapter to prompt token ids if one exists.
         preprocessed_inputs = self.input_preprocessor.preprocess(
             prompt,
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
         )
+        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
+
+        # Process prompt and prompt token ids.
+        # Only applicable to multimodal models with legacy input processor.
         processed_inputs = self.input_processor(preprocessed_inputs)
+
         self._validate_model_inputs(processed_inputs)
-        eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
 
         if is_encoder_decoder_inputs(processed_inputs):
             decoder_inputs = SingletonInputsAdapter(
@@ -200,8 +208,8 @@ def process_inputs(
             key=lambda mm_input: modality_order_dict[list(
                 mm_input.modalities)[0]])
 
-        # Apply mm input cache update (and input mapper if necessary).
-        sorted_mm_inputs = self.mm_input_mapper_client.process_inputs(
+        # Apply mm input cache update and legacy input mapper if one exists.
+        sorted_mm_inputs = self.mm_input_cache_client.process_inputs(
             mm_data=decoder_mm_data,
             mm_hashes=sorted_mm_hashes,
             mm_processor_kwargs=decoder_inputs.mm_processor_kwargs,

From 559ed7549658ea15a64a8008530b9eb30df2bba8 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 13 Feb 2025 00:38:17 -0800
Subject: [PATCH 6/7] typo

Signed-off-by: Roger Wang
---
 vllm/v1/engine/core.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
index 15148b0bd76f..0683bcb8c898 100644
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -97,8 +97,8 @@ def add_request(self, request: EngineCoreRequest):
         """Add request to the scheduler."""
 
         if request.mm_hashes is not None:
-            # Here, if hash exists for an multimodal input, then it will
-            # be fetched from the cache, else it will be added to the cache.
+            # Here, if hash exists for a multimodal input, then it will be
+            # fetched from the cache, else it will be added to the cache.
             # Note that the cache here is mirrored with the client cache, so
             # anything that has a hash must have a HIT cache entry here
             # as well.

From 4ba535a548594e0ad1ff7751d4da4397fe301d89 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 13 Feb 2025 00:44:12 -0800
Subject: [PATCH 7/7] cleanup

Signed-off-by: Roger Wang
---
 vllm/v1/worker/gpu_model_runner.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 690e700c49d8..7a2f12a9075d 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -94,8 +94,9 @@ def __init__(
         self.mm_registry = MULTIMODAL_REGISTRY
         self.uses_mrope = model_config.uses_mrope
 
-        # NOTE: Initialized input mapper is only used for processing dummy
+        # NOTE: Initialized client is only used for processing dummy
         # multimodal data into multimodal kwargs for GPU memory profiling.
+        # Only applicable to multimodal models with legacy input mapper.
         self.mm_input_mapper_profiling = MMInputCacheClient(self.model_config)
         self.mm_input_mapper_profiling.use_cache = False
 
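
Editor's note: the sketch below is an illustration and is not part of the patch series above. Under stated assumptions, it shows the mirrored client/server LRU idea that the renamed MMInputCacheClient (frontend process, P0) and MMInputCacheServer (core process, P1) rely on: when a multimodal input's hash hits the frontend cache, its kwargs are omitted from the wire, and the server's get_and_update() restores them from its own same-sized cache. The names MirroredLRUCache, SketchClient, SketchServer, CACHE_SIZE, and prepare() are hypothetical stand-ins; only the P0/P1 split, the 256-entry MM_CACHE_SIZE, and the get_and_update() name come from the diffs.

from collections import OrderedDict
from typing import Any, Dict, List, Optional


class MirroredLRUCache:
    """Tiny LRU cache; a stand-in for vllm.utils.LRUCache in this sketch."""

    def __init__(self, capacity: int) -> None:
        self.capacity = capacity
        self._data: "OrderedDict[str, Any]" = OrderedDict()

    def get(self, key: str) -> Optional[Any]:
        if key not in self._data:
            return None
        self._data.move_to_end(key)  # mark as most recently used
        return self._data[key]

    def put(self, key: str, value: Any) -> None:
        self._data[key] = value
        self._data.move_to_end(key)
        if len(self._data) > self.capacity:
            self._data.popitem(last=False)  # evict least recently used


CACHE_SIZE = 256  # mirrors MM_CACHE_SIZE in the diffs above


class SketchClient:
    """Frontend (P0) side: decides which kwargs actually need to be sent."""

    def __init__(self) -> None:
        self.cache = MirroredLRUCache(CACHE_SIZE)

    def prepare(self, kwargs_list: List[Dict[str, Any]],
                hashes: List[str]) -> List[Optional[Dict[str, Any]]]:
        wire: List[Optional[Dict[str, Any]]] = []
        for kwargs, mm_hash in zip(kwargs_list, hashes):
            if self.cache.get(mm_hash) is not None:
                # The server is guaranteed to hold this entry too,
                # so only the hash travels over the wire.
                wire.append(None)
            else:
                self.cache.put(mm_hash, kwargs)
                wire.append(kwargs)
        return wire


class SketchServer:
    """Core (P1) side: analogous to MMInputCacheServer.get_and_update()."""

    def __init__(self) -> None:
        self.cache = MirroredLRUCache(CACHE_SIZE)

    def get_and_update(self, wire: List[Optional[Dict[str, Any]]],
                       hashes: List[str]) -> List[Dict[str, Any]]:
        full: List[Dict[str, Any]] = []
        for payload, mm_hash in zip(wire, hashes):
            if payload is None:
                # Payload was omitted upstream, so it must be a cache hit here.
                cached = self.cache.get(mm_hash)
                assert cached is not None, "client/server caches out of sync"
                full.append(cached)
            else:
                self.cache.put(mm_hash, payload)
                full.append(payload)
        return full


if __name__ == "__main__":
    client, server = SketchClient(), SketchServer()
    kwargs = [{"pixel_values": [1, 2, 3]}]
    hashes = ["abc123"]
    # First request ships the payload; the repeat ships only the hash.
    assert server.get_and_update(client.prepare(kwargs, hashes), hashes) == kwargs
    assert client.prepare(kwargs, hashes) == [None]
    assert server.get_and_update([None], hashes) == kwargs

Because both sides observe the same hash sequence and use the same capacity, their LRU evictions stay in lockstep, which is what lets the server assert that any omitted payload is still present in its cache.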