@@ -909,29 +909,37 @@ def preprocess(self, pre_prompt, post_prompt, image, other_vision_inputs,
         elif self.model_type == 'pixtral':
             # Hold on to pixel_values and input_ids.
             dtype = str_dtype_to_torch(self.vision_precision)
-            pixel_values = image["pixel_values"].to(device="cuda", dtype=dtype)
-            input_ids = image["input_ids"].to(device="cuda")
-
             # Shape of pixel values from the processor varies with the raw image.
             # So we create a new tensor with a fixed shape as expected by the vision
             # encoder and create a corresponding attention mask.
             image_size = self.image_size
             patch_size = self.patch_size
             d_min = torch.finfo(dtype).min
             num_patches = (image_size // patch_size)
-            image = torch.full((1, 3, image_size, image_size),
-                               fill_value=0,
-                               dtype=dtype,
-                               device="cuda")
-            attention_mask = torch.full((1, num_patches, num_patches),
-                                        fill_value=d_min,
-                                        dtype=dtype,
-                                        device="cuda")
-            h, w = pixel_values.shape[-2:]
-            image[..., :h, :w] = pixel_values
-            attention_mask[..., :h // patch_size, :w // patch_size] = 0
+            padded_image = torch.full(
+                (self.args.batch_size, 3, image_size, image_size),
+                fill_value=0,
+                dtype=dtype,
+                device="cuda")
+            padded_attention_mask = torch.full(
+                (self.args.batch_size, num_patches, num_patches),
+                fill_value=d_min,
+                dtype=dtype,
+                device="cuda")
+            h, w, input_ids = [], [], []
+            for img_idx in range(self.args.batch_size):
+                pixel_values = image["pixel_values"][img_idx]
+                img_h, img_w = pixel_values.shape[-2:]
+                padded_image[img_idx, :, :img_h, :img_w] = pixel_values
+                padded_attention_mask[img_idx, :img_h // patch_size,
+                                      :img_w // patch_size] = 0
+                input_ids.append(image["input_ids"][img_idx])
+                h.append(img_h)
+                w.append(img_w)
+
+            image = padded_image
             other_vision_inputs = {
-                "attention_mask": attention_mask,
+                "attention_mask": padded_attention_mask,
             }
         elif self.model_type == 'llava_next':
             input = image
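For reference, a minimal standalone sketch of the padding scheme this hunk introduces (all names, shapes, and sizes are illustrative, not the TensorRT-LLM API): each image is copied into the top-left corner of a fixed-size batch tensor, and patches in the padded region get an additive mask of dtype-min so attention softmax zeroes them out.

```python
import torch

batch_size, image_size, patch_size = 2, 1024, 16
dtype = torch.float16
d_min = torch.finfo(dtype).min
num_patches = image_size // patch_size

# Two fake "preprocessed" images with different spatial sizes.
pixel_values = [torch.randn(3, 512, 768, dtype=dtype),
                torch.randn(3, 1024, 256, dtype=dtype)]

padded = torch.zeros(batch_size, 3, image_size, image_size, dtype=dtype)
mask = torch.full((batch_size, num_patches, num_patches), d_min, dtype=dtype)
for i, pv in enumerate(pixel_values):
    h, w = pv.shape[-2:]
    padded[i, :, :h, :w] = pv               # real pixels, top-left aligned
    mask[i, :h // patch_size, :w // patch_size] = 0  # unmask real patches
```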
@@ -1150,12 +1158,29 @@ def preprocess(self, pre_prompt, post_prompt, image, other_vision_inputs,
         elif self.model_type == 'pixtral':
             relevant_patch_size = self.patch_size * self.spatial_merge_size
             output_img_size = self.image_size // relevant_patch_size
-            visual_features = visual_features.reshape(
-                output_img_size, output_img_size,
-                -1)[:h // relevant_patch_size, :w //
-                    relevant_patch_size].flatten(0, 1)
+            # Note: max_h * max_w serves as `tokens_per_task` in the ptuning prompt table.
+            max_h = max(h) // relevant_patch_size
+            max_w = max(w) // relevant_patch_size
+            visual_embed_dim = visual_features.shape[-1]
+            relevant_visual_features = torch.zeros(self.args.batch_size,
+                                                   max_h * max_w,
+                                                   visual_embed_dim)
+            for img_idx in range(self.args.batch_size):
+                complete_features = visual_features[img_idx].reshape(
+                    output_img_size, output_img_size, visual_embed_dim)
+                relevant_h = h[img_idx] // relevant_patch_size
+                relevant_w = w[img_idx] // relevant_patch_size
+                flattened_features = complete_features[:relevant_h,
+                                                       :relevant_w, :].flatten(0, 1)
+                relevant_visual_features[img_idx, :relevant_h *
+                                         relevant_w, :] = flattened_features
+            visual_features = relevant_visual_features
             input_ids = self.ptuning_setup_pixtral(input_ids=input_ids)
-            length = input_ids.shape[1]
+            # Note: `length` is not used downstream for the pixtral model, and a
+            # list of per-sample lengths causes errors downstream, so supply a
+            # scalar placeholder instead.
+            length = input_ids[0].shape[0]

         elif self.model_type == 'llava_next':
             visual_features = LlavaNextUtils.rearrange_image_features(
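A sketch of the feature-gathering step above, with assumed toy shapes rather than the real encoder output: each image's grid of patch features is cropped to the patches that came from real pixels, flattened row-major, and scattered into a zero-padded `[batch, max_tokens, dim]` tensor that later backs the prompt table.

```python
import torch

batch_size, grid, dim = 2, 64, 8   # grid = image_size // relevant_patch_size
h = [32, 64]                       # per-image valid patch rows
w = [48, 16]                       # per-image valid patch cols
features = torch.randn(batch_size, grid * grid, dim)

max_tokens = max(h) * max(w)       # plays the role of tokens_per_task
table = torch.zeros(batch_size, max_tokens, dim)
for i in range(batch_size):
    grid_feats = features[i].reshape(grid, grid, dim)
    valid = grid_feats[:h[i], :w[i], :].flatten(0, 1)  # [h*w, dim]
    table[i, :valid.shape[0]] = valid                  # rest stays zero-padded
```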
@@ -2027,16 +2052,19 @@ def ptuning_setup_fuyu(self, input_ids, image_patches_indices):
 
     def ptuning_setup_pixtral(self, input_ids):
         # input_ids obtained from the processor has token_ids for text as well as image tokens,
-        # where each image token is represented the same image_token_index (10 for this model).
+        # where each image token is represented by the same image_token_index.
         image_token_index = self.image_token_index
         vocab_size = self.vocab_size
         # Replace each image token with a unique token_id > text_vocab_size.
         # This shall be used to look up the prompt table.
-        replacer = vocab_size
-        for i in range(len(input_ids[0])):
-            if input_ids[0][i] == image_token_index:
-                input_ids[0][i] = replacer
-                replacer += 1
+        for img_idx in range(self.args.batch_size):
+            # Note: replacer is reset to vocab_size for each sample, as opposed to
+            # using `replacer = vocab_size + img_idx * tokens_per_task`; that part
+            # of the look-up arithmetic is handled by the `task_ids` input to
+            # PromptEmbedding's forward.
+            replacer = vocab_size
+            for token_idx in range(len(input_ids[img_idx])):
+                if input_ids[img_idx][token_idx] == image_token_index:
+                    input_ids[img_idx][token_idx] = replacer
+                    replacer += 1
         return input_ids
 
     def ptuning_setup_llava_next(self, visual_features, pre_prompt,
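The replacement loop above can be condensed into a vectorized one-liner per sample; here is a hedged, self-contained illustration (the vocab size and placeholder id are made up): every occurrence of the image placeholder id becomes `vocab_size`, `vocab_size + 1`, ..., so the engine reads those positions as rows 0, 1, ... of that sample's prompt table.

```python
import torch

vocab_size, image_token_index = 32000, 10
ids = torch.tensor([1, 10, 10, 10, 5, 7])     # one sample's input_ids
is_img = ids == image_token_index
ids[is_img] = vocab_size + torch.arange(int(is_img.sum()))
print(ids)  # tensor([    1, 32000, 32001, 32002,     5,     7])
```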
@@ -2166,7 +2194,24 @@ def load_images(image_paths):
             if isinstance(image_path, str):
                 image_path = image_path.split(self.args.path_sep)
             images = load_images(image_path)
-
+        elif "pixtral" in self.model_type:
+            if image_path is None:
+                image_urls = [
+                    "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png",
+                    "https://www.ilankelman.org/stopsigns/australia.jpg",
+                    "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.png",
+                    "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                ]
+                while len(image_urls) < self.args.batch_size:
+                    image_urls *= 2
+                image_urls = image_urls[:self.args.batch_size]
+                self.args.image_path = ",".join(image_urls)
+                images = load_images(image_urls)
+            else:
+                if isinstance(image_path, str):
+                    image_path = image_path.split(self.args.path_sep)
+                images = load_images(image_path)
+            images = [images] if not isinstance(images, list) else images
         elif "nougat" in self.model_type:
             filepath = hf_hub_download(
                 repo_id="hf-internal-testing/fixtures_docvqa",
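The default-URL branch simply repeats the four seed URLs until they cover the batch, then truncates. A compact equivalent, purely illustrative, using the standard library:

```python
from itertools import cycle, islice

seed_urls = ["a.png", "b.jpg", "c.png"]            # stand-ins for the real URLs
batch_size = 8
image_urls = list(islice(cycle(seed_urls), batch_size))
```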
@@ -2413,9 +2458,15 @@ def setup_inputs(self, input_text, raw_image, raw_audio=None):
             post_prompt = "[/INST]"
             prompt = pre_prompt + input_text + post_prompt
             dtype = str_dtype_to_torch(self.vision_precision)
-            image = self.processor(text=prompt,
-                                   images=[raw_image],
-                                   return_tensors="pt").to(dtype)
+            image = {'pixel_values': [], 'input_ids': []}
+            for img_idx in range(self.args.batch_size):
+                image_info = self.processor(text=prompt,
+                                            images=[raw_image[img_idx]],
+                                            return_tensors="pt").to(dtype)
+                image['pixel_values'].append(image_info['pixel_values'].to(
+                    self.device))
+                image['input_ids'].append(image_info['input_ids'][0].to(
+                    self.device))
 
         elif 'internvl' in self.model_type:
             pre_prompt = "<|system|>\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|end|><|user|>\n<image>\n"
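The dict-of-lists shape here exists because each processor call can return differently sized `pixel_values`, so outputs cannot be stacked into one tensor. A stubbed sketch of the pattern (the fake processor below only mimics the ragged-output behavior, it is not the real Hugging Face API):

```python
import torch

def fake_processor(text, images, return_tensors="pt"):
    # Stand-in for an HF processor: pixel_values shape depends on the image.
    h, w = images[0]
    return {"pixel_values": torch.randn(1, 3, h, w),
            "input_ids": torch.arange(len(text)).unsqueeze(0)}

raw_images = [(512, 768), (256, 1024)]   # stand-ins for PIL images of varied size
image = {"pixel_values": [], "input_ids": []}
for raw in raw_images:
    out = fake_processor(text="[INST] describe [/INST]", images=[raw])
    image["pixel_values"].append(out["pixel_values"])   # ragged; kept as a list
    image["input_ids"].append(out["input_ids"][0])
```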
@@ -2619,7 +2670,7 @@ def setup_inputs(self, input_text, raw_image, raw_audio=None):
             image = image.expand(
                 min(self.args.batch_size, len(input_text)), -1, -1,
                 -1).contiguous()
-        if image is not None:
+        if image is not None and isinstance(image, torch.Tensor):
             image = image.to(self.device)
         # Generate decoder_input_ids for enc-dec models
         # Custom prompts can be added as: