
Commit 8151fd0

switch to GatedMLP

1 parent b754adf commit 8151fd0

File tree

2 files changed: +15 -30 lines changed

tensorrt_llm/_torch/models/modeling_gemma3.py

Lines changed: 12 additions & 29 deletions
@@ -4,7 +4,6 @@
 import torch
 from torch import nn
 from transformers import Gemma3TextConfig
-from transformers.activations import ACT2FN
 
 from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
     BaseWeightMapper
@@ -18,6 +17,7 @@
 from ..distributed import AllReduceParams
 from ..model_config import ModelConfig
 from ..modules.attention import Attention
+from ..modules.gated_mlp import GatedMLP
 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
 from ..modules.linear import Linear, TensorParallelMode
@@ -156,33 +156,10 @@ def apply_rope(self, q: torch.Tensor, k: Optional[torch.Tensor],
         return super().apply_rope(q, k, v, position_ids)
 
 
-class Gemma3MLP(nn.Module):
-
-    def __init__(self, config: Gemma3TextConfig):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
-        self.dtype = config.torch_dtype
-        self.gate_proj = Linear(self.hidden_size,
-                                self.intermediate_size,
-                                bias=False,
-                                dtype=self.dtype)
-        self.up_proj = Linear(self.hidden_size,
-                              self.intermediate_size,
-                              bias=False,
-                              dtype=self.dtype)
-        self.down_proj = Linear(self.intermediate_size,
-                                self.hidden_size,
-                                bias=False,
-                                dtype=self.dtype)
-        self.act_fn = ACT2FN[config.hidden_activation]
-
-    @torch.inference_mode()
-    def forward(self, x):
-        down_proj = self.down_proj(
-            self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
+# This function is written to be compatible with TRTLLM's GatedMLP class.
+def pytorch_gelu_tanh(gate_x: torch.Tensor) -> torch.Tensor:
+    gate, x = gate_x.chunk(2, dim=-1)
+    return nn.functional.gelu(gate, approximate="tanh") * x
 
 
 class Gemma3DecoderLayer(DecoderLayer):
@@ -202,7 +179,13 @@ def __init__(
             is_sliding=is_sliding,
         )
 
-        self.mlp = Gemma3MLP(config)
+        self.mlp = GatedMLP(hidden_size=config.hidden_size,
+                            intermediate_size=config.intermediate_size,
+                            bias=False,
+                            activation=pytorch_gelu_tanh,
+                            dtype=config.torch_dtype,
+                            config=model_config,
+                            layer_idx=layer_idx)
 
         self.input_layernorm = RMSNorm(hidden_size=config.hidden_size,
                                        eps=config.rms_norm_eps,
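
For context only (not part of the commit): a minimal sketch checking that the fused-activation path reproduces the removed Gemma3MLP math. GatedMLP is assumed to hand the activation a tensor with the gate and up projections concatenated on the last dimension, so chunking it and applying tanh-approximated GELU to the gate half should match act_fn(gate_proj(x)) * up_proj(x). The layer names below are illustrative stand-ins, not TRT-LLM modules.

import torch
from torch import nn

hidden_size, intermediate_size = 16, 32
x = torch.randn(2, hidden_size)

# Stand-ins for the separate gate/up projections of the removed Gemma3MLP.
gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)

# Old formulation: act_fn(gate_proj(x)) * up_proj(x) with tanh-approximated GELU.
old = nn.functional.gelu(gate_proj(x), approximate="tanh") * up_proj(x)

# New formulation: one fused [gate, up] tensor passed to the custom activation.
def pytorch_gelu_tanh(gate_x: torch.Tensor) -> torch.Tensor:
    gate, up = gate_x.chunk(2, dim=-1)
    return nn.functional.gelu(gate, approximate="tanh") * up

fused = torch.cat([gate_proj(x), up_proj(x)], dim=-1)
new = pytorch_gelu_tanh(fused)

torch.testing.assert_close(old, new)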

tensorrt_llm/_torch/modules/gated_mlp.py

Lines changed: 3 additions & 1 deletion
@@ -108,7 +108,9 @@ def __init__(self,
     def _apply_activation(self, x):
         if self.activation == F.silu:
            return swiglu(x)
-        elif self.activation == None:
+        elif callable(self.activation):
+            return self.activation(x)
+        elif self.activation is None:
            return x
        else:
            raise NotImplementedError(
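
As a usage note, the new callable branch means any function that accepts the concatenated [gate, up] tensor can be supplied as activation (pytorch_gelu_tanh above being the motivating case). A rough standalone sketch of the resulting dispatch, with the fused swiglu kernel approximated by its eager-mode equivalent; this is an illustration under those assumptions, not the module's actual code.

import torch
import torch.nn.functional as F

def apply_activation_sketch(x: torch.Tensor, activation) -> torch.Tensor:
    if activation == F.silu:
        # The real path calls a fused swiglu kernel; eager-mode equivalent:
        gate, up = x.chunk(2, dim=-1)
        return F.silu(gate) * up
    elif callable(activation):
        # New branch: custom callables receive the fused [gate, up] tensor.
        return activation(x)
    elif activation is None:
        return x
    else:
        raise NotImplementedError(f"Unsupported activation: {activation}")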
