Address comments

chang-l · chang-l · commit b859c4d7011c · 2025-09-02T10:51:21.000-07:00
Signed-off-by: Chang Liu (Enterprise Products) <9713593+chang-l@users.noreply.github.com>
diff --git a/examples/llm-api/quickstart_multimodal.py b/examples/llm-api/quickstart_multimodal.py
@@ -145,7 +145,7 @@ def parse_arguments():
     parser = add_lora_args(parser)
     args = parser.parse_args()
 
-    args.disable_kv_cache_reuse = True  # kv cache reuse does not work for multimodal, force overwrite
+    args.disable_kv_cache_reuse = False  # force overwrite: kv cache reuse is left enabled. NOTE(review): earlier comment said reuse does not work for multimodal - confirm this is intended
     if args.kv_cache_fraction is None:
         args.kv_cache_fraction = 0.6  # lower the default kv cache fraction for multimodal
 
@@ -177,6 +177,19 @@ def main():
 
     llm, sampling_params = setup_llm(args, lora_config=lora_config)
 
+#    from tensorrt_llm import MultimodalEncoder, SamplingParams
+#    sampling_params = SamplingParams(max_tokens=args.max_tokens)
+#    llm = MultimodalEncoder(
+#        model=args.model_dir,
+#        backend='pytorch',
+#        disable_overlap_scheduler=args.disable_overlap_scheduler,
+#        max_seq_len=args.max_seq_len,
+#        max_batch_size=args.max_batch_size,
+#        max_num_tokens=args.max_num_tokens,
+#        trust_remote_code=args.trust_remote_code,
+#        )
+
+
     image_format = args.image_format
     if args.model_type is not None:
         model_type = args.model_type
@@ -197,8 +210,8 @@ def main():
                                              model_dir=str(llm._hf_model_dir),
                                              model_type=model_type,
                                              modality=args.modality,
-                                             prompts=args.prompt,
-                                             media=args.media,
+                                             prompts=[args.prompt[0], args.prompt[0]],
+                                             media=[args.media[0], args.media[0]],
                                              image_data_format=image_format,
                                              num_frames=args.num_frames,
                                              device=args.device)
@@ -211,7 +224,6 @@ def main():
     outputs = llm.generate(
         inputs,
         sampling_params,
-        lora_request=lora_request,
     )
 
     for i, output in enumerate(outputs):
diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py
@@ -44,6 +44,7 @@ def __init__(self,
             trust_remote_code=trust_remote_code)
 
         self.tllm_multimodal_token_id = self.model_config.vocab_size + 1
+        # temporal patch size for video frames
         self.temporal_patch_size = getattr(model_config.vision_config,
                                            'temporal_patch_size', 1)
 
diff --git a/tensorrt_llm/inputs/multimodal.py b/tensorrt_llm/inputs/multimodal.py
@@ -445,6 +445,8 @@ def _hash_image(image):
             # Hash each frame with a separator to avoid collisions between [A,B] and [AB]
             for frame in image:
                 hasher.update(b"<frame>")
+                if isinstance(frame, torch.Tensor):
+                    frame = frame.detach().cpu().contiguous()
                 hasher.update(serialize_item(frame))
         else:
             hasher.update(serialize_item(image))
diff --git a/tensorrt_llm/inputs/registry.py b/tensorrt_llm/inputs/registry.py
@@ -50,6 +50,24 @@ class BaseMultimodalInputProcessor:
     models. Specific processors can override these methods if they need custom logic.
     """
 
+    @property
+    def get_num_multimodal_tokens(self):
+        """
+        Get the Hugging Face processor's '_get_num_multimodal_tokens' method.
+
+        """
+        if hasattr(self, 'processor') and hasattr(self.processor,
+                                                  '_get_num_multimodal_tokens'):
+            return self.processor._get_num_multimodal_tokens
+        elif hasattr(self, '_processor') and hasattr(
+                self._processor, '_get_num_multimodal_tokens'):
+            return self._processor._get_num_multimodal_tokens
+        else:
+            raise NotImplementedError(
+                f"get_num_multimodal_tokens not implemented for {self.__class__.__name__}. "
+                "Please override this method or ensure the processor has _get_num_multimodal_tokens method."
+            )
+
     def get_num_tokens_per_image(
         self,
         *,
@@ -60,30 +78,14 @@ def get_num_tokens_per_image(
         """
         Calculate the number of tokens generated for an image.
 
-        Default implementation assumes the processor has either:
-        1. A 'processor' attribute with _get_num_multimodal_tokens method
-        2. A '_processor' attribute with _get_num_multimodal_tokens method
+        This (default) method delegates to the Hugging Face processor's '_get_num_multimodal_tokens' method.
+        Returns the token count for the given image.
 
-        Override this method for custom implementations.
+        Subclasses can override this method to provide custom logic to calculate the number of tokens.
         """
-        if hasattr(self, 'processor') and hasattr(self.processor,
-                                                  '_get_num_multimodal_tokens'):
-            image_size = (image_height, image_width)
-            num_image_tokens = self.processor._get_num_multimodal_tokens(
-                [image_size], **kwargs)["num_image_tokens"][0]
-            return num_image_tokens
-        # Check for _processor attribute (e.g., Mistral3)
-        elif hasattr(self, '_processor') and hasattr(
-                self._processor, '_get_num_multimodal_tokens'):
-            image_size = (image_height, image_width)
-            num_image_tokens = self._processor._get_num_multimodal_tokens(
-                [image_size], **kwargs)["num_image_tokens"][0]
-            return num_image_tokens
-        else:
-            raise NotImplementedError(
-                f"get_num_tokens_per_image not implemented for {self.__class__.__name__}. "
-                "Please override this method or ensure the processor has _get_num_multimodal_tokens method."
-            )
+        image_size = (image_height, image_width)
+        return self.get_num_multimodal_tokens([image_size],
+                                              **kwargs)["num_image_tokens"][0]
 
     def get_num_tokens_per_video(
         self,
@@ -96,53 +98,23 @@ def get_num_tokens_per_video(
         """
         Calculate the number of tokens generated for a video.
 
-        Default implementation assumes the processor has either:
-        1. A 'processor' attribute with _get_num_multimodal_tokens method
-        2. A '_processor' attribute with _get_num_multimodal_tokens method
+        This (default) method delegates to the Hugging Face processor's '_get_num_multimodal_tokens' method.
+        Returns the token count for the given video.
 
-        Override this method for custom implementations.
+        Subclasses can override this method to provide custom logic to calculate the number of tokens.
         """
-        if hasattr(self, 'processor') and hasattr(self.processor,
-                                                  '_get_num_multimodal_tokens'):
-            video_size = (num_frames, video_height, video_width)
-            # Try to get video tokens directly
-            try:
-                num_video_tokens = self.processor._get_num_multimodal_tokens(
-                    video_sizes=[video_size], **kwargs)["num_video_tokens"][0]
-                return num_video_tokens
-            except Exception:
-                # Fallback: treat video as sequence of frames
-                num_tokens_per_frame = self.get_num_tokens_per_image(
-                    image_width=video_width,
-                    image_height=video_height,
-                    **kwargs)
-                temporal_patch_size = self.temporal_patch_size if hasattr(
-                    self, 'temporal_patch_size') else 1
-                return num_tokens_per_frame * num_frames // temporal_patch_size
-        # Check for _processor attribute (e.g., Mistral3)
-        # TODO: unify the naming convention for the processor attribute
-        elif hasattr(self, '_processor') and hasattr(
-                self._processor, '_get_num_multimodal_tokens'):
-            video_size = (num_frames, video_height, video_width)
-            # Try to get video tokens directly
-            try:
-                num_video_tokens = self._processor._get_num_multimodal_tokens(
-                    video_sizes=[video_size], **kwargs)["num_video_tokens"][0]
-                return num_video_tokens
-            except Exception:
-                # Fallback: treat video as sequence of frames
-                num_tokens_per_frame = self.get_num_tokens_per_image(
-                    image_width=video_width,
-                    image_height=video_height,
-                    **kwargs)
-                temporal_patch_size = self.temporal_patch_size if hasattr(
-                    self, 'temporal_patch_size') else 1
-                return num_tokens_per_frame * num_frames // temporal_patch_size
-        else:
-            raise NotImplementedError(
-                f"get_num_tokens_per_video not implemented for {self.__class__.__name__}. "
-                "Please override this method or ensure the processor has _get_num_multimodal_tokens method."
-            )
+        video_size = (num_frames, video_height, video_width)
+        try:
+            num_video_tokens = self.get_num_multimodal_tokens(
+                video_sizes=[video_size], **kwargs)["num_video_tokens"][0]
+            return num_video_tokens
+        except Exception:
+            # Fallback: treat video as sequence of frames
+            num_tokens_per_frame = self.get_num_tokens_per_image(
+                image_width=video_width, image_height=video_height, **kwargs)
+            temporal_patch_size = self.temporal_patch_size if hasattr(
+                self, 'temporal_patch_size') else 1
+            return num_tokens_per_frame * num_frames // temporal_patch_size
 
 
 class DefaultInputProcessor(InputProcessor):
diff --git a/tests/unittest/_torch/multimodal/test_find_num_image_tokens.py b/tests/unittest/_torch/multimodal/test_find_num_image_tokens.py
@@ -8,7 +8,6 @@
 from tensorrt_llm import MultimodalEncoder
 from tensorrt_llm._torch.models.modeling_llava_next import \
     LlavaNextInputProcessor
-from tensorrt_llm._torch.models.modeling_mistral import Mistral3InputProcessor
 from tensorrt_llm._torch.models.modeling_qwen2vl import \
     Qwen2VLInputProcessorBase
 from tensorrt_llm._torch.shared_tensor import SharedTensorContainer
@@ -47,11 +46,6 @@ def multimodal_model_configs():
             'hf_model_dir': 'Qwen/Qwen2.5-VL-3B-Instruct',
             'model_type': 'qwen2_5_vl',
         },
-        'mistral-small-3.1': {
-            'hf_model_dir':
-            '/home/scratch.trt_llm_data/llm-models/Mistral-Small-3.1-24B-Instruct-2503',
-            'model_type': 'mistral3',
-        },
     }
     return model_configs
 
@@ -149,9 +143,6 @@ def test_get_num_tokens_per_image(model_key, multimodal_model_configs):
                     image_height=image_height,
                     num_frames=1,
                     do_resize=True)
-            elif model_type == 'mistral':
-                predicted_num_tokens = input_processor.get_num_tokens_per_image(
-                    image_width=image_width, image_height=image_height)
             else:
                 raise ValueError(f"Unsupported model type: {model_type}")
 
@@ -216,12 +207,6 @@ def test_get_num_tokens_per_video(model_key, multimodal_model_configs):
                 model_config=model_config_dict,
                 tokenizer=tokenizer,
                 trust_remote_code=True)
-        elif model_type == 'mistral':
-            input_processor = Mistral3InputProcessor(
-                model_path=encoder_model_dir,
-                model_config=model_config_dict,
-                tokenizer=tokenizer,
-                trust_remote_code=True)
         else:
             pytest.fail(f"Unsupported model type: {model_type}")