
Commit cf445ee (1 parent: a419b77)

Enable video encoder and generalize finding mm_token_length

Signed-off-by: Chang Liu (Enterprise Products) <[email protected]>

File tree: 6 files changed, +286 -76 lines

tensorrt_llm/_torch/models/modeling_llava_next.py

Lines changed: 3 additions & 14 deletions
@@ -14,8 +14,8 @@
 
 from tensorrt_llm.inputs.multimodal import MultimodalParams
 
-from ...inputs import (ExtraProcessedInputs, InputProcessor,
-                       MultimodalPlaceholderMetadata,
+from ...inputs import (BaseMultimodalInputProcessor, ExtraProcessedInputs,
+                       InputProcessor, MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, TextPrompt,
                        register_input_processor)
 from ...llmapi.utils import download_hf_model
@@ -32,7 +32,7 @@
 DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1'
 
 
-class LlavaNextInputProcessor(InputProcessor):
+class LlavaNextInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
@@ -56,17 +56,6 @@ def __init__(self,
         self.vocab_size = model_config.vocab_size
         self.config = model_config.vision_config
 
-    def get_num_tokens_per_image(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        image_size = (image_height, image_width)
-        num_image_tokens = self.processor._get_num_multimodal_tokens(
-            [image_size])["num_image_tokens"][0]
-        return num_image_tokens
-
     def _postprocess(
         self, input_ids: torch.Tensor, mm_features: Union[torch.Tensor,
                                                           List[torch.Tensor]]
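
The deleted method is exactly the logic that now lives in BaseMultimodalInputProcessor, so the subclass simply inherits it. A minimal sketch of the inherited path, where FakeHFProcessor is a hypothetical stand-in for the HuggingFace AutoProcessor (the private _get_num_multimodal_tokens hook is the real method the base class probes for):

```python
from tensorrt_llm.inputs import BaseMultimodalInputProcessor


class FakeHFProcessor:
    """Hypothetical processor: charges a flat 576 tokens per image."""

    def _get_num_multimodal_tokens(self, image_sizes, **kwargs):
        return {"num_image_tokens": [576 for _ in image_sizes]}


class ToyLlavaNextProcessor(BaseMultimodalInputProcessor):

    def __init__(self):
        # The base class probes for a 'processor' attribute first.
        self.processor = FakeHFProcessor()


toy = ToyLlavaNextProcessor()
print(toy.get_num_tokens_per_image(image_width=336, image_height=336))  # 576
```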

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 5 additions & 36 deletions
@@ -7,14 +7,13 @@
 from transformers import (AutoProcessor, AutoTokenizer, PretrainedConfig,
                           PreTrainedModel, Qwen2_5_VLForConditionalGeneration,
                           Qwen2VLForConditionalGeneration)
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 
 from tensorrt_llm.inputs.multimodal import MultimodalParams
 
 from ..._utils import nvtx_range_debug
 from ...functional import RopeEmbeddingUtils, RotaryScalingType
-from ...inputs import (ExtraProcessedInputs, InputProcessor,
-                       MultimodalPlaceholderMetadata,
+from ...inputs import (BaseMultimodalInputProcessor, ExtraProcessedInputs,
+                       InputProcessor, MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, TextPrompt,
                        register_input_processor)
 from ...logger import logger
@@ -29,7 +28,7 @@
 DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1'
 
 
-class Qwen2VLInputProcessorBase(InputProcessor):
+class Qwen2VLInputProcessorBase(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
@@ -45,6 +44,8 @@ def __init__(self,
             trust_remote_code=trust_remote_code)
 
         self.tllm_multimodal_token_id = self.model_config.vocab_size + 1
+        self.temporal_patch_size = getattr(model_config.vision_config,
+                                           'temporal_patch_size', 1)
 
     @classmethod
     def get_rope_index(
@@ -220,38 +221,6 @@ def get_rope_index(
             mrope_position_deltas, device=input_ids.device).unsqueeze(1)
         return position_ids, mrope_position_deltas
 
-    def get_num_tokens_per_image(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-        num_frames: int = 1,
-        do_resize: bool = True,
-    ):
-        patch_size = self.model_config.vision_config.patch_size
-        merge_size = self.model_config.vision_config.spatial_merge_size
-        temporal_patch_size = self.model_config.vision_config.temporal_patch_size
-        if do_resize:
-            resized_height, resized_width = smart_resize(
-                height=image_height,
-                width=image_width,
-                factor=patch_size * merge_size,
-                min_pixels=self.processor.image_processor.min_pixels,
-                max_pixels=self.processor.image_processor.max_pixels,
-            )
-            image_width, image_height = resized_width, resized_height
-
-        padded_num_frames = num_frames + num_frames % temporal_patch_size
-
-        grid_t = max(padded_num_frames // temporal_patch_size, 1)
-        grid_h = image_height // patch_size
-        grid_w = image_width // patch_size
-
-        num_patches = grid_t * grid_h * grid_w
-        num_vision_tokens = num_patches // (merge_size**2)
-
-        return num_vision_tokens
-
     def _preprocess(self, text: dict[str, any], mm_data: dict[str, any],
                     mm_processor_kwargs: Dict[str, Any]):
         images = mm_data.get("image")
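
The removed Qwen-specific method encoded the patch-grid arithmetic (spatial patching, spatial merging, temporal merging). A standalone sketch of that arithmetic, without the smart_resize step; the default parameter values below are the usual Qwen2-VL vision-config settings and are assumptions here, not values read from a checkpoint:

```python
def qwen2vl_num_vision_tokens(width: int, height: int, num_frames: int = 1,
                              patch_size: int = 14, merge_size: int = 2,
                              temporal_patch_size: int = 2) -> int:
    # Pad the frame count so it divides evenly into temporal patches.
    padded_num_frames = num_frames + num_frames % temporal_patch_size
    grid_t = max(padded_num_frames // temporal_patch_size, 1)
    grid_h = height // patch_size
    grid_w = width // patch_size
    # merge_size**2 spatial patches collapse into a single vision token.
    return grid_t * grid_h * grid_w // (merge_size ** 2)


# A 448x448 single image: (448 // 14)**2 // 4 = 256 tokens.
assert qwen2vl_num_vision_tokens(448, 448) == 256
```

With the base class in place, this model-specific computation is replaced by the generic AutoProcessor path, and temporal_patch_size is cached on the instance so the base class's video fallback can use it.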

tensorrt_llm/inputs/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -1,7 +1,7 @@
 from .data import PromptInputs, TextPrompt, TokensPrompt, prompt_inputs
 from .multimodal import MultimodalInput
-from .registry import (ExtraProcessedInputs, InputProcessor,
-                       MultimodalPlaceholderMetadata,
+from .registry import (BaseMultimodalInputProcessor, ExtraProcessedInputs,
+                       InputProcessor, MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, create_input_processor,
                        create_input_processor_with_hash,
                        register_input_processor)
@@ -27,6 +27,7 @@
     "create_input_processor_with_hash",
     "register_input_processor",
     "ExtraProcessedInputs",
+    "BaseMultimodalInputProcessor",
     "MultimodalPlaceholderMetadata",
     "MultimodalPlaceholderPlacement",
     "ConversationMessage",

tensorrt_llm/inputs/multimodal.py

Lines changed: 32 additions & 21 deletions
@@ -435,13 +435,20 @@ def apply_mm_hashes(mm_data: Dict[str, Any],
     """Apply hashing to multimodal data items."""
 
     def _hash_image(image):
-        # only support single modality w/ PIL.Image.Image for now
         # TODO: possible hash collision w/ this simplified version (vllm/PR/17378)
         hasher = hash_lib()
         if isinstance(image, torch.Tensor):
-            # TODO: Device tensor hashing is an open issue. Limited hashing to CPU for now.
-            image = image.cpu()
-        hasher.update(serialize_item(image))
+            # Ensure tensor is on CPU and contiguous for consistent hashing
+            image = image.detach().cpu().contiguous()
+            hasher.update(serialize_item(image))
+        elif isinstance(image, list):
+            # Hash each frame with a separator to avoid collisions between [A,B] and [AB]
+            for frame in image:
+                hasher.update(b"<frame>")
+                hasher.update(serialize_item(frame))
+        else:
+            hasher.update(serialize_item(image))
+
         return hasher.hexdigest()
 
     mm_items = {
@@ -483,31 +490,35 @@ def find_mm_token_lengths(mm_data: Dict[str, Any],
     num_mm_tokens = {}
 
     for modality, items in mm_items.items():
-        if modality != "image":
-            #TODO: support other modalities
-            raise ValueError(
-                f"Unsupported modality: {modality}. Only 'image' modality is currently supported for hashing."
-            )
-        if not hasattr(input_processor, "get_num_tokens_per_image"):
-            #TODO: backward compatibility for models that don't yet have get_num_tokens_per_image implemented
-            #TODO: only support qwen2_vl for now
+        if not hasattr(input_processor, f"get_num_tokens_per_{modality}"):
             raise AttributeError(
-                f"Input processor {type(input_processor).__name__} does not have 'get_num_tokens_per_image' method required for multimodal hashing."
+                f"Input processor {type(input_processor).__name__} does not have 'get_num_tokens_per_{modality}' method required for multimodal hashing."
             )
 
         modality_token_lengths = []
         for item in items:
-            if isinstance(item, torch.Tensor):
-                item = ToPILImage()(item)
-            num_tokens = input_processor.get_num_tokens_per_image(
-                image_width=item.width,
-                image_height=item.height,
-            )
-            modality_token_lengths.append(num_tokens)
+            if modality == "image":
+                if isinstance(item, torch.Tensor):
+                    item = ToPILImage()(item)
+                num_tokens = input_processor.get_num_tokens_per_image(
+                    image_width=item.width,
+                    image_height=item.height,
+                )
+                modality_token_lengths.append(num_tokens)
+            elif modality == "video":
+                assert isinstance(item, list), "Video must be a list of frames"
+                if isinstance(item[0], torch.Tensor):
+                    item = [ToPILImage()(frame) for frame in item]
+                num_tokens = input_processor.get_num_tokens_per_video(
+                    video_width=item[0].width,
+                    video_height=item[0].height,
+                    num_frames=len(item),
+                )
+                modality_token_lengths.append(num_tokens)
 
         num_mm_tokens[modality] = modality_token_lengths
 
-    return num_mm_tokens['image']  # flatten all mm instances to a single list
+    return num_mm_tokens  # per-modality lists of token lengths
 
 
 def find_mm_token_positions(input_ids: Union[torch.Tensor, List[int],
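
The b"<frame>" separator is what keeps a list of frames from colliding with its concatenation. A self-contained sketch of the idea, with raw bytes standing in for serialize_item output and sha256 standing in for hash_lib:

```python
import hashlib


def hash_frames(frames, separator=b"<frame>"):
    """Hash a list of frame byte-strings with a per-frame separator."""
    hasher = hashlib.sha256()
    for frame in frames:
        hasher.update(separator)
        hasher.update(frame)
    return hasher.hexdigest()


def hash_frames_unseparated(frames):
    """Naive variant: concatenation erases the frame boundaries."""
    hasher = hashlib.sha256()
    for frame in frames:
        hasher.update(frame)
    return hasher.hexdigest()


# Without a separator, [b"AB"] and [b"A", b"B"] hash identically...
assert hash_frames_unseparated([b"AB"]) == hash_frames_unseparated([b"A", b"B"])
# ...with the separator, the boundary becomes part of the digest.
assert hash_frames([b"AB"]) != hash_frames([b"A", b"B"])
```

Note also the return-type change: find_mm_token_lengths now returns a dict keyed by modality (for example {"image": [...]} or {"video": [...]}) instead of a flat image list; the caller in registry.py unpacks it, as shown in the next file.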

tensorrt_llm/inputs/registry.py

Lines changed: 108 additions & 2 deletions
@@ -41,6 +41,110 @@ def __call__(
         ...
 
 
+class BaseMultimodalInputProcessor:
+    """
+    Base class for multimodal input processors with default implementations
+    of get_num_tokens_per_image and get_num_tokens_per_video methods.
+
+    This class provides default implementations that work with most AutoProcessor-based
+    models. Specific processors can override these methods if they need custom logic.
+    """
+
+    def get_num_tokens_per_image(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        **kwargs,
+    ):
+        """
+        Calculate the number of tokens generated for an image.
+
+        Default implementation assumes the processor has either:
+        1. A 'processor' attribute with _get_num_multimodal_tokens method
+        2. A '_processor' attribute with _get_num_multimodal_tokens method
+
+        Override this method for custom implementations.
+        """
+        if hasattr(self, 'processor') and hasattr(self.processor,
+                                                  '_get_num_multimodal_tokens'):
+            image_size = (image_height, image_width)
+            num_image_tokens = self.processor._get_num_multimodal_tokens(
+                [image_size], **kwargs)["num_image_tokens"][0]
+            return num_image_tokens
+        # Check for _processor attribute (e.g., Mistral3)
+        elif hasattr(self, '_processor') and hasattr(
+                self._processor, '_get_num_multimodal_tokens'):
+            image_size = (image_height, image_width)
+            num_image_tokens = self._processor._get_num_multimodal_tokens(
+                [image_size], **kwargs)["num_image_tokens"][0]
+            return num_image_tokens
+        else:
+            raise NotImplementedError(
+                f"get_num_tokens_per_image not implemented for {self.__class__.__name__}. "
+                "Please override this method or ensure the processor has _get_num_multimodal_tokens method."
+            )
+
+    def get_num_tokens_per_video(
+        self,
+        *,
+        video_width: int,
+        video_height: int,
+        num_frames: int,
+        **kwargs,
+    ):
+        """
+        Calculate the number of tokens generated for a video.
+
+        Default implementation assumes the processor has either:
+        1. A 'processor' attribute with _get_num_multimodal_tokens method
+        2. A '_processor' attribute with _get_num_multimodal_tokens method
+
+        Override this method for custom implementations.
+        """
+        if hasattr(self, 'processor') and hasattr(self.processor,
+                                                  '_get_num_multimodal_tokens'):
+            video_size = (num_frames, video_height, video_width)
+            # Try to get video tokens directly
+            try:
+                num_video_tokens = self.processor._get_num_multimodal_tokens(
+                    video_sizes=[video_size], **kwargs)["num_video_tokens"][0]
+                return num_video_tokens
+            except Exception:
+                # Fallback: treat video as sequence of frames
+                num_tokens_per_frame = self.get_num_tokens_per_image(
+                    image_width=video_width,
+                    image_height=video_height,
+                    **kwargs)
+                temporal_patch_size = self.temporal_patch_size if hasattr(
+                    self, 'temporal_patch_size') else 1
+                return num_tokens_per_frame * num_frames // temporal_patch_size
+        # Check for _processor attribute (e.g., Mistral3)
+        # TODO: unify the naming convention for the processor attribute
+        elif hasattr(self, '_processor') and hasattr(
+                self._processor, '_get_num_multimodal_tokens'):
+            video_size = (num_frames, video_height, video_width)
+            # Try to get video tokens directly
+            try:
+                num_video_tokens = self._processor._get_num_multimodal_tokens(
+                    video_sizes=[video_size], **kwargs)["num_video_tokens"][0]
+                return num_video_tokens
+            except Exception:
+                # Fallback: treat video as sequence of frames
+                num_tokens_per_frame = self.get_num_tokens_per_image(
+                    image_width=video_width,
+                    image_height=video_height,
+                    **kwargs)
+                temporal_patch_size = self.temporal_patch_size if hasattr(
+                    self, 'temporal_patch_size') else 1
+                return num_tokens_per_frame * num_frames // temporal_patch_size
+        else:
+            raise NotImplementedError(
+                f"get_num_tokens_per_video not implemented for {self.__class__.__name__}. "
+                "Please override this method or ensure the processor has _get_num_multimodal_tokens method."
+            )
+
+
 class DefaultInputProcessor(InputProcessor):
     """Preprocess the inputs to the model."""
 
@@ -327,6 +431,8 @@ def multimodal_hashing_process(
         assert 'multi_modal_data' in inputs, "multi_modal_data must be provided for hashing support."
         mm_data = inputs['multi_modal_data']
         num_mm_tokens = find_mm_token_lengths(mm_data, input_processor)
+        # TODO: here we assume there is only one modality for now
+        num_mm_tokens = next(iter(num_mm_tokens.values()))
        if len(num_mm_tokens) > 0:
             mm_hashes = apply_mm_hashes(mm_data, hash_lib)
             prompt_token_ids, extra_processed_inputs = input_processor(
@@ -358,8 +464,8 @@ def input_processor_wrapper(
         modalities = list(set(inputs['multi_modal_data'].keys())
                           ) if 'multi_modal_data' in inputs else []
         if len(modalities) > 0:
-            # NOTE: tensorrt_llm/inputs/multimodal.py:find_mm_token_lengths only supports image data for now
-            if len(modalities) == 1 and modalities[0] == "image":
+            # TODO: support multiple modalities for multimodal hashing (for kv cache reuse, chunked prefill, etc.)
+            if len(modalities) == 1:
                 # only try multimodal hashing if the inputs only contain image data
                 if input_processor.multimodal_hashing_supported is not None:
                     use_multimodal_hashing = input_processor.multimodal_hashing_supported
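
The video path first asks the underlying HF processor for num_video_tokens and only falls back to per-frame image pricing, divided by temporal_patch_size, when that call fails. A hedged sketch of the fallback, with ImageOnlyProcessor as a hypothetical processor that rejects video sizes:

```python
from tensorrt_llm.inputs import BaseMultimodalInputProcessor


class ImageOnlyProcessor:
    """Hypothetical processor that only understands images (512 tokens each)."""

    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
        if image_sizes is None:
            raise TypeError("videos not supported")  # triggers the fallback
        return {"num_image_tokens": [512 for _ in image_sizes]}


class ToyVideoProcessor(BaseMultimodalInputProcessor):

    def __init__(self):
        self.processor = ImageOnlyProcessor()
        self.temporal_patch_size = 2  # as cached by Qwen2VLInputProcessorBase


toy = ToyVideoProcessor()
# Direct video query raises, so the base class prices the video as
# 8 frames * 512 tokens per frame // temporal_patch_size 2 = 2048.
print(toy.get_num_tokens_per_video(video_width=448, video_height=448,
                                   num_frames=8))
```

The broad except Exception keeps the fallback working across transformers versions whose _get_num_multimodal_tokens does not accept video_sizes, at the cost of also masking genuine errors in that call.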
