Commit 609b545

DomBrown authored and dominicshanshan committed

[https://nvbugs/5461712] [fix] Disable deep_gemm for Qwen3 due to accuracy issues (#7170)

Signed-off-by: Dom Brown <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>

1 parent: 08faae4

File tree: 5 files changed, +30 −6 lines

tensorrt_llm/_torch/models/modeling_qwen3.py

Lines changed: 9 additions & 0 deletions

@@ -158,6 +158,9 @@ def __init__(
 
         self.fuse_qk_norm_rope = fuse_qk_norm_rope
 
+        # Qwen3 has accuracy issues with deep_gemm (see: https://nvbugspro.nvidia.com/bug/5461712)
+        disable_deep_gemm = True
+
         super().__init__(
             hidden_size=config.hidden_size,
             num_attention_heads=config.num_attention_heads,
@@ -171,6 +174,7 @@ def __init__(
             dtype=config.torch_dtype,
             dense_bias=config.attention_bias,
             config=model_config,
+            disable_deep_gemm=disable_deep_gemm,
         )
 
         self.q_norm = RMSNorm(hidden_size=self.head_dim,
@@ -244,13 +248,18 @@ def __init__(
             layer_idx=layer_idx,
         )
 
+        # Qwen3 has accuracy issues with deep_gemm (see: https://nvbugspro.nvidia.com/bug/5461712)
+        disable_deep_gemm = True
+
         self.mlp = GatedMLP(
             hidden_size=config.hidden_size,
             intermediate_size=config.intermediate_size,
             bias=config.mlp_bias if hasattr(config, "mlp_bias") else False,
             dtype=config.torch_dtype,
             config=model_config,
+            disable_deep_gemm=disable_deep_gemm,
         )
+
         self.input_layernorm = RMSNorm(hidden_size=config.hidden_size,
                                        eps=config.rms_norm_eps,
                                        dtype=config.torch_dtype)
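
A minimal, self-contained sketch of the plumbing pattern this file relies on: the Qwen3 attention and decoder-layer constructors hard-code a local disable_deep_gemm flag and pass it down through their submodule constructors until it reaches the linear layers that pick the GEMM kernel. The classes below are simplified stand-ins, not the real TensorRT-LLM modules.

    # Simplified stand-ins illustrating the constructor pass-through of the flag.
    class Linear:
        def __init__(self, in_features: int, out_features: int,
                     disable_deep_gemm: bool = False):
            # The leaf module only records the flag; the kernel choice happens later.
            self.disable_deep_gemm = disable_deep_gemm

    class GatedMLP:
        def __init__(self, hidden_size: int, intermediate_size: int,
                     disable_deep_gemm: bool = False):
            # Forward the flag to every Linear this MLP owns.
            self.gate_up_proj = Linear(hidden_size, 2 * intermediate_size,
                                       disable_deep_gemm=disable_deep_gemm)
            self.down_proj = Linear(intermediate_size, hidden_size,
                                    disable_deep_gemm=disable_deep_gemm)

    class Qwen3DecoderLayerSketch:
        def __init__(self, hidden_size: int, intermediate_size: int):
            # Qwen3 has accuracy issues with deep_gemm, so the opt-out is hard-coded here.
            disable_deep_gemm = True
            self.mlp = GatedMLP(hidden_size, intermediate_size,
                                disable_deep_gemm=disable_deep_gemm)

    layer = Qwen3DecoderLayerSketch(hidden_size=4096, intermediate_size=12288)
    assert layer.mlp.down_proj.disable_deep_gemm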

tensorrt_llm/_torch/modules/attention.py

Lines changed: 8 additions & 2 deletions

@@ -116,6 +116,7 @@ def __init__(
         config: Optional[ModelConfig] = None,
         q_scaling: float = 1.0,
         attention_chunk_size: Optional[int] = None,
+        disable_deep_gemm: bool = False,
     ):
         """
         Initialize the Attention module.
@@ -134,6 +135,7 @@ def __init__(
             config (Optional[ModelConfig]): The model configuration.
             q_scaling (float): The scaling factor for the qk_scale. The definition is $O = softmax(QK^T * qk_scale) * V, qk_scale = 1 / (sqrt(head_dim) * q_scaling)$. The default value is 1.0.
             attention_chunk_size (Optional[int]): See [Chunked Attention] below.
+            disable_deep_gemm (bool): Whether to disable deep_gemm for linear layers.
         """
         super().__init__()
         self.layer_idx = layer_idx
@@ -215,7 +217,9 @@ def __init__(
             quant_config=config.get_quant_config(),
             skip_create_weights_in_init=config.skip_create_weights_in_init,
             allreduce_strategy=config.allreduce_strategy,
-            force_dynamic_quantization=config.force_dynamic_quantization)
+            force_dynamic_quantization=config.force_dynamic_quantization,
+            disable_deep_gemm=disable_deep_gemm,
+        )
         self.o_lora = LoraLayer([LoraModuleType.ATTENTION_DENSE],
                                 [self.hidden_size])
 
@@ -230,7 +234,9 @@ def __init__(
             skip_create_weights_in_init=config.skip_create_weights_in_init,
             lora=self.o_lora,
             allreduce_strategy=config.allreduce_strategy,
-            force_dynamic_quantization=config.force_dynamic_quantization)
+            force_dynamic_quantization=config.force_dynamic_quantization,
+            disable_deep_gemm=disable_deep_gemm,
+        )
 
         self.quant_config = config.get_quant_config()
         self.attn_backend = config.attn_backend

tensorrt_llm/_torch/modules/gated_mlp.py

Lines changed: 9 additions & 3 deletions

@@ -29,7 +29,9 @@ def __init__(self,
                  overridden_tp_size: Optional[int] = None,
                  reduce_output: bool = True,
                  layer_idx: Optional[int] = None,
-                 use_cute_dsl_blockscaling_mm: bool = False):
+                 use_cute_dsl_blockscaling_mm: bool = False,
+                 disable_deep_gemm: bool = False):
+
         super().__init__()
         self.layer_idx = layer_idx
         self.hidden_size = hidden_size
@@ -67,7 +69,9 @@ def __init__(self,
             skip_create_weights_in_init=config.skip_create_weights_in_init,
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
-            use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm)
+            use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm,
+            disable_deep_gemm=disable_deep_gemm,
+        )
 
         self.down_lora = LoraLayer([LoraModuleType.MLP_4H_TO_H],
                                    [self.hidden_size])
@@ -85,7 +89,9 @@ def __init__(self,
             lora=self.down_lora,
             allreduce_strategy=config.allreduce_strategy,
             force_dynamic_quantization=config.force_dynamic_quantization,
-            use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm)
+            use_cute_dsl_blockscaling_mm=use_cute_dsl_blockscaling_mm,
+            disable_deep_gemm=disable_deep_gemm,
+        )
 
         # These two modules are mutually exclusive - either splitted_gate_up_lora or fused_gate_up_lora will be used,
         # but never both at the same time. splitted_gate_up_lora handles gate and up separately while fused_gate_up_lora

tensorrt_llm/_torch/modules/linear.py

Lines changed: 3 additions & 1 deletion

@@ -613,7 +613,7 @@ def apply(self, module: Linear, input: torch.Tensor,
             input = input.to(torch.bfloat16) * module.input_scale
         assert input.dtype == torch.bfloat16
 
-        if get_sm_version() == 100:
+        if get_sm_version() == 100 and not module.disable_deep_gemm:
             if module.use_cute_dsl_blockscaling_mm:
                 # TODO (@lmin): replace with cute_dsl gemm
                 act_input_fp8, act_input_sf = torch.ops.trtllm.fp8_quantize_1x128(
@@ -1595,6 +1595,7 @@ def __init__(
         allreduce_strategy: AllReduceStrategy = AllReduceStrategy.AUTO,
         force_dynamic_quantization: bool = False,
         use_cute_dsl_blockscaling_mm: bool = False,
+        disable_deep_gemm: bool = False,
     ):
         from ..distributed import AllReduce
 
@@ -1612,6 +1613,7 @@ def __init__(
         self.gather_output = gather_output
         self.force_dynamic_quantization = force_dynamic_quantization
         self.use_cute_dsl_blockscaling_mm = use_cute_dsl_blockscaling_mm
+        self.disable_deep_gemm = disable_deep_gemm
 
         local_in_features = in_features
         local_out_features = out_features
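
The functional change sits in the FP8 block-scales apply path of Linear: on SM 100 the module previously always took the deep_gemm / CuTe DSL route, and the new check lets a layer opt back into the default GEMM path. A rough sketch of that dispatch decision, where the function and backend names are hypothetical stand-ins for the real TensorRT-LLM helpers and kernels:

    # Sketch of the SM-100 dispatch added above; helper and backend names are hypothetical.
    def pick_fp8_block_scales_backend(sm_version: int,
                                      use_cute_dsl_blockscaling_mm: bool,
                                      disable_deep_gemm: bool) -> str:
        """Return which GEMM backend this sketch would use for FP8 block scales."""
        if sm_version == 100 and not disable_deep_gemm:
            # Blackwell fast path: CuTe DSL block-scaling MM or deep_gemm.
            return "cute_dsl" if use_cute_dsl_blockscaling_mm else "deep_gemm"
        # Per-layer opt-out (e.g. Qwen3) or other architectures fall back here.
        return "default_fp8_gemm"

    # With the Qwen3 opt-out, the Blackwell deep_gemm path is skipped:
    assert pick_fp8_block_scales_backend(100, False, disable_deep_gemm=True) == "default_fp8_gemm"
    assert pick_fp8_block_scales_backend(100, False, disable_deep_gemm=False) == "deep_gemm"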

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 0 deletions

@@ -37,6 +37,7 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] # To cover NVBUGS 5461712
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
