Merged
25 commits
03d1966
add torch ref impl for FP8, add unit test
Fridah-nv Aug 12, 2025
dcd68df
add torch ref impl for FP4, add op map unit test
Fridah-nv Aug 14, 2025
f38101e
split linear and bmm quantization
Fridah-nv Aug 14, 2025
3932877
update quantize_linear_from_config to point to the custom op
Fridah-nv Aug 15, 2025
a19eeeb
separate custom op into two torch ops
Fridah-nv Aug 19, 2025
7c2c0d1
quantized fusion transforms, WIP for FP4
Fridah-nv Aug 19, 2025
d5cfe13
add QuantizationFusionMixin class
Fridah-nv Aug 21, 2025
437b942
quantized sharding class for FP8 and FP4
Fridah-nv Aug 22, 2025
5c3d7b4
remove QuantizationImpl in sharding, remove unused methods in Quantiz…
Fridah-nv Aug 23, 2025
aaa28d9
remove custom_quant_linear op
Fridah-nv Aug 23, 2025
3733366
rename custom quant ops
Fridah-nv Aug 23, 2025
e477434
WIP to map custom quant op to real implementation using pattern matcher
Fridah-nv Aug 23, 2025
7f8a8f8
fix unit tests
Fridah-nv Aug 26, 2025
10a4994
remove unused ENUM
Fridah-nv Aug 26, 2025
24885ea
minor updates: rabbit feedback, docstrings, code cleaning
Fridah-nv Aug 27, 2025
c8b21f9
clear unit tests on blackwell; address a few comments; rename FP ops …
Fridah-nv Sep 3, 2025
961c33f
remove include_quantization from is_linear_node
Fridah-nv Sep 4, 2025
19500d7
address few comments: remove pattern matcher fake mode patch; remove …
Fridah-nv Sep 5, 2025
d01af1e
update quantization transforms:Linear, BMM, MoE and MoE matching into…
Fridah-nv Sep 6, 2025
7d011b6
remove QuantizationImpl class; remove more reference of is_quantized_op
Fridah-nv Sep 6, 2025
f57fa57
fix test_quantization_utils.py
Fridah-nv Sep 6, 2025
75063f1
minor: address comments, uncomment skipped unit tests
Fridah-nv Sep 8, 2025
d9afed8
skip quant gemm fusion for perf
Fridah-nv Sep 9, 2025
30cef34
Merge branch 'main' into user/fridah/inherit-quant2
Fridah-nv Sep 9, 2025
5178d0f
Merge branch 'main' into user/fridah/inherit-quant2
Fridah-nv Sep 9, 2025
29 changes: 24 additions & 5 deletions tensorrt_llm/_torch/auto_deploy/config/default.yaml
@@ -45,11 +45,19 @@ transforms:
# see https://github.com/NVIDIA/TensorRT-LLM/pull/3668#discussion_r2052714528
optimize_rope:
stage: pattern_matcher
quantize_from_config:
quantize_fp8_linear_from_config:
stage: pattern_matcher
quantize_from_graph:
quantize_nvfp4_linear_from_config:
stage: pattern_matcher
quantize_moe:
quantize_fp8_bmm_from_config:
stage: pattern_matcher
quantize_fp8_from_graph:
stage: pattern_matcher
quantize_nvfp4_from_graph:
stage: pattern_matcher
quantize_fp8_moe:
stage: pattern_matcher
quantize_nvfp4_moe:
stage: pattern_matcher
# TODO: Infer sharding parameters (tp_size, row/column sharding) from the model config.
detect_sharding:
@@ -70,10 +78,21 @@ transforms:
# RUN POST-LOAD FUSION AND OPTIMIZATIONS
############################################################################################
# TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/4674 this is causing OOMs
# fuse_moe:
# stage: post_load_fusion
# fuse_gemms:
# stage: post_load_fusion
# fuse_fp4_gemms:
# stage: post_load_fusion
# fuse_fp8_gemms:
# stage: post_load_fusion
fuse_fp8_linear:
stage: post_load_fusion
backend: torch
fuse_nvfp4_linear:
stage: post_load_fusion
backend: trtllm
# TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/4674 this is causing OOMs
fuse_moe:
stage: post_load_fusion
fuse_allreduce_residual_rmsnorm:
stage: post_load_fusion
fuse_collectives:
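The reworked `transforms` section above splits the old `quantize_from_config` / `quantize_from_graph` / `quantize_moe` entries into per-format FP8 and NVFP4 transforms at the `pattern_matcher` stage and enables `fuse_fp8_linear` / `fuse_nvfp4_linear` at the `post_load_fusion` stage. As a minimal sketch of how such a stage-keyed section can be grouped for inspection — `load_transform_stages` and the inlined config snippet are hypothetical illustrations, not part of the TensorRT-LLM API:

```python
# Hypothetical helper: group a `transforms:` section like the one in
# default.yaml by pipeline stage. Illustration only, not TensorRT-LLM code.
from collections import defaultdict

import yaml  # PyYAML

_EXAMPLE_CONFIG = """
transforms:
  quantize_fp8_linear_from_config:
    stage: pattern_matcher
  quantize_nvfp4_linear_from_config:
    stage: pattern_matcher
  fuse_fp8_linear:
    stage: post_load_fusion
    backend: torch
  fuse_nvfp4_linear:
    stage: post_load_fusion
    backend: trtllm
"""

def load_transform_stages(config_text: str) -> dict[str, list[str]]:
    """Return {stage: [transform names]}, preserving file order."""
    cfg = yaml.safe_load(config_text)
    stages: dict[str, list[str]] = defaultdict(list)
    for name, opts in cfg["transforms"].items():
        stages[opts["stage"]].append(name)
    return dict(stages)

if __name__ == "__main__":
    for stage, names in load_transform_stages(_EXAMPLE_CONFIG).items():
        print(stage, "->", names)
```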
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/auto_deploy/custom_ops/README.md
@@ -25,7 +25,7 @@ The table below lists the operators ordered by their backend.
| `torch.ops.auto_deploy.torch_moe_fused` | Fused Mixture of Experts implementation |
| `torch.ops.auto_deploy.torch_quant_fn` | Generic quantization function that scales, rounds, and clamps input values |
| `torch.ops.auto_deploy.torch_quant_fused_fp8_linear_all_reduce` | Fused FP8 linear layer followed by all-reduce operation |
| `torch.ops.auto_deploy.torch_quant_fp4_linear` | FP4 quantized linear layer |
| `torch.ops.auto_deploy.torch_quant_nvfp4_linear` | FP4 quantized linear layer |
| `torch.ops.auto_deploy.torch_quant_fp8_linear` | FP8 quantized linear layer |
| `torch.ops.auto_deploy.torch_rope_with_complex_freqs` | RoPE with complex frequencies |
| `torch.ops.auto_deploy.torch_rope_with_explicit_cos_sin` | RoPE with explicit cosine/sine |
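For context on the `torch_quant_fn` row above ("scales, rounds, and clamps input values"), here is a minimal illustrative re-implementation assuming symmetric per-tensor FP8 (e4m3) quantization; the function name and signature are placeholders, not the library's actual `torch_quant_fn`:

```python
# Illustrative scale/round/clamp quant-dequant, assuming symmetric per-tensor
# scaling to float8_e4m3fn (max representable magnitude ~448). Not the actual
# torch_quant_fn from tensorrt_llm.
import torch

def quant_dequant_ref(x: torch.Tensor, scale: torch.Tensor,
                      qmin: float = -448.0, qmax: float = 448.0) -> torch.Tensor:
    """Scale, clamp to the e4m3 range, cast to FP8 (rounds), then dequantize."""
    q = torch.clamp(x / scale, qmin, qmax).to(torch.float8_e4m3fn)
    return q.to(x.dtype) * scale

if __name__ == "__main__":
    x = torch.randn(4, 8)
    scale = x.abs().amax() / 448.0  # per-tensor scale for e4m3
    print((quant_dequant_ref(x, scale) - x).abs().max())  # small quantization error
```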
1 change: 1 addition & 0 deletions tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py
@@ -11,6 +11,7 @@
from .torch_attention import *
from .torch_backend_attention import *
from .torch_moe import *
from .torch_quant import *
from .torch_rope import *
from .triton_attention import *
from .triton_rope import *
18 changes: 3 additions & 15 deletions tensorrt_llm/_torch/auto_deploy/custom_ops/quant.py
@@ -157,9 +157,9 @@ def forward(self, x):
)


@torch.library.custom_op("auto_deploy::torch_quant_fp4_linear", mutates_args=())
@torch.library.custom_op("auto_deploy::torch_quant_nvfp4_linear", mutates_args=())
@torch.compile(dynamic=True)
def fp4_linear(
def nvfp4_linear(
input: torch.Tensor,
weight_fp4: torch.Tensor,
bias: Optional[torch.Tensor] = None,
@@ -212,7 +212,7 @@ def fp4_linear(
return output.reshape(*input_shape[:-1], n)


@fp4_linear.register_fake
@nvfp4_linear.register_fake
def fp4_linear_fake(
input: torch.Tensor,
weight_fp4: torch.Tensor,
@@ -299,15 +299,3 @@ def fp8_bmm_fake(
"""Fake implementation of fp8_bmm for testing and tracing."""
# Use standard bmm
return torch.bmm(input.to(torch.float), mat2.to(torch.float)).to(input.dtype)


QUANT_LINEAR_OPS = [
torch.ops.auto_deploy.torch_quant_fp8_linear,
torch.ops.auto_deploy.torch_quant_fp4_linear,
]

QUANT_BMM_OPS = [
torch.ops.auto_deploy.torch_quant_fp8_bmm,
]

QUANT_OPS = QUANT_LINEAR_OPS + QUANT_BMM_OPS
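The rename from `torch_quant_fp4_linear` to `torch_quant_nvfp4_linear` keeps the existing `torch.library.custom_op` + `register_fake` structure, and the module-level `QUANT_LINEAR_OPS` / `QUANT_BMM_OPS` / `QUANT_OPS` lists are dropped. A toy sketch of that registration pattern, using a hypothetical `demo::scaled_linear` op (the real NVFP4 op additionally takes packed FP4 weights, input/weight scales, and `alpha`):

```python
# Toy custom op showing the custom_op + register_fake pattern used in quant.py.
# The "demo::scaled_linear" namespace and op are hypothetical.
from typing import Optional

import torch

@torch.library.custom_op("demo::scaled_linear", mutates_args=())
def scaled_linear(input: torch.Tensor, weight: torch.Tensor,
                  bias: Optional[torch.Tensor] = None,
                  scale: float = 1.0) -> torch.Tensor:
    """Eager implementation: a plain linear with an output scale."""
    return torch.nn.functional.linear(input, weight, bias) * scale

@scaled_linear.register_fake
def _(input, weight, bias=None, scale=1.0):
    """Shape/dtype-only implementation used during tracing and export."""
    n = weight.shape[0]
    return input.new_empty(*input.shape[:-1], n)

if __name__ == "__main__":
    x, w = torch.randn(2, 4), torch.randn(3, 4)
    print(torch.ops.demo.scaled_linear(x, w, None, 2.0).shape)  # torch.Size([2, 3])
```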
14 changes: 7 additions & 7 deletions tensorrt_llm/_torch/auto_deploy/custom_ops/torch_moe.py
@@ -235,8 +235,8 @@ def torch_quant_fp8_moe_fake(
return torch.empty_like(x)


@torch.library.custom_op("auto_deploy::torch_quant_fp4_moe", mutates_args=())
def torch_quant_fp4_moe(
@torch.library.custom_op("auto_deploy::torch_quant_nvfp4_moe", mutates_args=())
def torch_quant_nvfp4_moe(
x: torch.Tensor,
selected_experts: torch.Tensor,
routing_weights: torch.Tensor,
@@ -273,15 +273,15 @@ def make_fp4_mlp(i):
def mlp(inp):
if inp.shape[0] == 0:
return torch.zeros_like(inp)
gate_out = torch.ops.auto_deploy.torch_quant_fp4_linear(
gate_out = torch.ops.auto_deploy.torch_quant_nvfp4_linear(
inp,
w1_weight[i],
bias=None,
input_scale=w1_input_scale[i],
weight_scale=w1_weight_scale[i],
alpha=w1_alpha[i],
)
up_out = torch.ops.auto_deploy.torch_quant_fp4_linear(
up_out = torch.ops.auto_deploy.torch_quant_nvfp4_linear(
inp,
w3_weight[i],
bias=None,
@@ -290,7 +290,7 @@ def mlp(inp):
alpha=w3_alpha[i],
)
prod = F.silu(gate_out) * up_out
return torch.ops.auto_deploy.torch_quant_fp4_linear(
return torch.ops.auto_deploy.torch_quant_nvfp4_linear(
prod,
w2_weight[i],
bias=None,
@@ -305,8 +305,8 @@ def mlp(inp):
return _template_moe(x, selected_experts, routing_weights, mlps)


@torch_quant_fp4_moe.register_fake
def torch_quant_fp4_moe_fake(
@torch_quant_nvfp4_moe.register_fake
def torch_quant_nvfp4_moe_fake(
x: torch.Tensor,
selected_experts: torch.Tensor,
routing_weights: torch.Tensor,
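Each expert in `torch_quant_nvfp4_moe` is a SiLU-gated MLP built from three `torch_quant_nvfp4_linear` calls (`w1` gate, `w3` up, `w2` down projection). A minimal unquantized sketch of that per-expert computation, with plain `F.linear` standing in for the quantized op and random fp32 weights as stand-ins for the packed FP4 weights and scales:

```python
# Unquantized sketch of one expert's gated MLP in torch_quant_nvfp4_moe:
# out = w2 @ (silu(w1 @ x) * (w3 @ x)). Plain fp32 weights stand in for the
# packed FP4 weights, scales, and alpha used by the real custom op.
import torch
import torch.nn.functional as F

def gated_mlp_expert(inp: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
                     w3: torch.Tensor) -> torch.Tensor:
    if inp.shape[0] == 0:  # mirror the empty-token early return in the diff
        return torch.zeros_like(inp)
    gate_out = F.linear(inp, w1)  # stands in for torch_quant_nvfp4_linear(inp, w1_weight[i], ...)
    up_out = F.linear(inp, w3)
    return F.linear(F.silu(gate_out) * up_out, w2)

if __name__ == "__main__":
    hidden, inter = 16, 32
    x = torch.randn(5, hidden)
    w1, w3 = torch.randn(inter, hidden), torch.randn(inter, hidden)
    w2 = torch.randn(hidden, inter)
    print(gated_mlp_expert(x, w1, w2, w3).shape)  # torch.Size([5, 16])
```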