
Commit e26c4e2

[TRTLLM-6656][chore] Validate FP8 support for Gemma3
Signed-off-by: Balaram Buddharaju <[email protected]>
Parent: f923974

6 files changed: +39 −1 lines


tensorrt_llm/_torch/models/modeling_gemma3vl.py (6 additions, 1 deletion)

@@ -134,11 +134,16 @@ def get_sub_model_config(
         "text_config", "vision_config"
     ], f"Expected subconfig name to be either 'text_config' or 'vision_config'. Got {name} instead."
     pretrained_config = getattr(model_config.pretrained_config, name)
+    # ModelOpt currently doesn't quantize the vision part. Without setting quant config to None,
+    # weight loading fails for vision part.
+    quant_config = model_config.quant_config if name == "text_config" else None
+    # FlashInfer backend supports custom mask which is needed for bidirectional mask in decoder.
     preferred_backend = "FLASHINFER" if name == "text_config" else "TRTLLM"
     sub_model_config: ModelConfig[Gemma3Config] = dataclasses.replace(
         model_config,
         pretrained_config=pretrained_config,
-        attn_backend=preferred_backend)
+        attn_backend=preferred_backend,
+        quant_config=quant_config)
     # Make sure some fields that are not explicitly included in the sub config, but present
     # in the top-level config, are replicated.
     if (hasattr(sub_model_config.pretrained_config, "torch_dtype")

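For context, the effect of the get_sub_model_config change above can be sketched with a simplified stand-in for TensorRT-LLM's ModelConfig: only the text sub-config keeps the checkpoint's FP8 quant config, while the vision tower's sub-config drops it so that its unquantized weights still load. SimpleModelConfig and make_sub_config below are hypothetical names used purely for illustration, not the real classes.

# Minimal sketch, assuming a simplified stand-in for ModelConfig.
import dataclasses
from types import SimpleNamespace
from typing import Any, Optional

@dataclasses.dataclass
class SimpleModelConfig:
    pretrained_config: Any
    quant_config: Optional[str] = None
    attn_backend: str = "TRTLLM"

def make_sub_config(model_config: SimpleModelConfig, name: str) -> SimpleModelConfig:
    # Only the language model is quantized by ModelOpt, so the vision
    # sub-config drops the quant config to let its unquantized weights load.
    quant_config = model_config.quant_config if name == "text_config" else None
    # FlashInfer is preferred for the text decoder because it supports the
    # custom bidirectional mask Gemma3 needs.
    preferred_backend = "FLASHINFER" if name == "text_config" else "TRTLLM"
    return dataclasses.replace(model_config,
                               pretrained_config=getattr(
                                   model_config.pretrained_config, name),
                               quant_config=quant_config,
                               attn_backend=preferred_backend)

top_level = SimpleModelConfig(
    pretrained_config=SimpleNamespace(text_config="<text cfg>",
                                      vision_config="<vision cfg>"),
    quant_config="FP8")
assert make_sub_config(top_level, "text_config").quant_config == "FP8"
assert make_sub_config(top_level, "vision_config").quant_config is None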
tests/integration/defs/accuracy/references/cnn_dailymail.yaml (3 additions, 0 deletions)

@@ -5,6 +5,9 @@ google/gemma-3-1b-it:
     accuracy: 20.699
 google/gemma-3-27b-it:
   - accuracy: 28.90
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 27.90
 gpt2:
   - accuracy: 18.408
   - quant_algo: W8A16

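The reference files in this directory are keyed by Hugging Face model name, with one list entry per quantization variant. Purely as an illustration of how the FP8 entry added above can be picked out (the actual lookup lives in the accuracy harness and may differ), a PyYAML snippet:

# Illustrative only: read the reference file and select the FP8 variant.
import yaml

with open("tests/integration/defs/accuracy/references/cnn_dailymail.yaml") as f:
    refs = yaml.safe_load(f)

entries = refs["google/gemma-3-27b-it"]
fp8_entry = next(e for e in entries if e.get("quant_algo") == "FP8")
print(fp8_entry["kv_cache_quant_algo"], fp8_entry["accuracy"])  # FP8 27.9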
tests/integration/defs/accuracy/references/gsm8k.yaml (6 additions, 0 deletions)

@@ -141,8 +141,14 @@ speakleash/Bielik-11B-v2.2-Instruct:
     accuracy: 40.41
 google/gemma-3-1b-it:
   - accuracy: 25.52 # score getting from lm-eval with HF implementation
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 23.96
 google/gemma-3-27b-it:
   - accuracy: 91.66
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 90.66
 mistralai/Ministral-8B-Instruct-2410:
   - accuracy: 79.25
   - quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml (3 additions, 0 deletions)

@@ -114,6 +114,9 @@ google/gemma-3-1b-it:
     accuracy: 37.5
 google/gemma-3-27b-it:
   - accuracy: 77.80
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 76.80
 Qwen/Qwen2-0.5B-Instruct:
   - accuracy: 45.30
   - quant_algo: FP8

tests/integration/defs/accuracy/test_llm_api_pytorch.py (20 additions, 0 deletions)

@@ -751,6 +751,24 @@ def test_auto_dtype(self):
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)

+    def test_fp8_prequantized(self):
+        # Disabling kv cache reuse as a WAR to deal with gaps in kernel support for Gemma3's non-inclusive sliding window size.
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        enable_partial_reuse=False,
+                                        dtype="fp8")
+        prequantized_model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-fp8/"
+        with LLM(prequantized_model_path,
+                 kv_cache_config=kv_cache_config,
+                 attn_backend="FLASHINFER",
+                 cuda_graph_config=None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+

 class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-1b-it"

@@ -784,6 +802,8 @@ def test_fp8_prequantized(self):
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)

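Outside the test harness, the same prequantized checkpoint can be exercised directly through the LLM API, mirroring the new 27B test above. The sketch below assumes the usual TensorRT-LLM import locations and uses a placeholder checkpoint path; it is not part of this commit.

# Sketch of running the FP8-prequantized Gemma3 checkpoint directly.
# The checkpoint path is a placeholder; import paths follow the LLM API used
# in the test above and may vary across TensorRT-LLM versions.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

kv_cache_config = KvCacheConfig(
    enable_block_reuse=False,    # WAR for Gemma3's non-inclusive sliding window
    enable_partial_reuse=False,
    dtype="fp8")                 # FP8 KV cache, matching kv_cache_quant_algo: FP8

with LLM("/path/to/gemma-3-27b-it-fp8",      # placeholder path
         kv_cache_config=kv_cache_config,
         attn_backend="FLASHINFER",          # needed for the bidirectional decoder mask
         cuda_graph_config=None) as llm:
    outputs = llm.generate(["Summarize: FP8 support for Gemma3 was validated."])
    print(outputs[0].outputs[0].text)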
tests/integration/test_lists/test-db/l0_h100.yml (1 addition, 0 deletions)

@@ -203,6 +203,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
   - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
