@@ -395,8 +395,7 @@ def split_kv_b_proj(kv_b_proj: torch.Tensor,
             p.data.copy_(module_weights[n][:])
 
         if self.model_config.quant_config.layer_quant_mode.has_fp8_block_scales(
-        ) and is_sm_100f() and hasattr(
-                module, "weight_scale"):
+        ) and is_sm_100f() and hasattr(module, "weight_scale"):
             weight, weight_scale = resmooth_to_fp8_e8m0(
                 module.weight, module.weight_scale)
             transfromed_scale = transform_sf_into_required_layout(
@@ -805,8 +804,9 @@ def __init__(self,
             for key in [EventType.Main, EventType.MoeShared]
         }
 
-    def _compute_shared_expert_tp_size(self, intermediate_size: int,
-                                       block_size: int) -> int:
+    def _compute_shared_expert_tp_size(
+            self, intermediate_size: int,
+            block_size: int) -> tuple[int, float | None]:
         """
         In the case of Deepseek-R1, the TP size of MLP is capped by intermediate_size // block_size.
         For example, when the intermediate_size is 2048 and block scaling size is 128,
@@ -818,7 +818,9 @@ def _compute_shared_expert_tp_size(self, intermediate_size: int,
         it's 128. For NVFP4, it's 16.
 
         Returns:
-            int: The computed tp_size.
+            tuple[int, float | None]: A tuple containing (shared_tp_size, shared_output_scale).
+                - shared_tp_size: The computed TP size.
+                - shared_output_scale: The output scale factor, or None if not needed.
         """
 
         assert intermediate_size % block_size == 0, "intermediate_size must be divisible by block_size."
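
The docstring above describes how the shared-expert TP size is capped by `intermediate_size // block_size` and why the new `shared_output_scale` return value exists. A minimal standalone sketch of that computation, assuming the cap is taken as the GCD of the mapping's TP size and the number of scaling blocks, and assuming the output scale simply compensates the post-allreduce sum when the TP size is reduced (the helper name, `tp_size` parameter, and exact scale formula here are illustrative assumptions, not necessarily this PR's implementation):

```python
import math


def compute_shared_expert_tp_size(intermediate_size: int, block_size: int,
                                  tp_size: int) -> tuple[int, float | None]:
    """Illustrative sketch: cap the shared-expert TP size by the number
    of quantization blocks, and emit an output scale when capped."""
    assert intermediate_size % block_size == 0, \
        "intermediate_size must be divisible by block_size."
    # The TP size cannot exceed the number of scaling blocks; taking the
    # GCD keeps the result a divisor of the original tp_size (assumption).
    shared_tp_size = math.gcd(intermediate_size // block_size, tp_size)
    if shared_tp_size == tp_size:
        # No capping happened, so no output rescaling is needed.
        return shared_tp_size, None
    # With fewer ranks splitting the weight, each rank produces a larger
    # partial output; scale the allreduced result back down to compensate
    # (assumed compensation formula).
    shared_output_scale = shared_tp_size / tp_size
    return shared_tp_size, shared_output_scale


# Example from the docstring: intermediate_size=2048 with block_size=128
# gives 2048 // 128 = 16 blocks, so tp_size=32 is capped to 16.
print(compute_shared_expert_tp_size(2048, 128, 32))  # (16, 0.5)
```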