
Commit 225789c

jananisriram authored and cleonard530 committed
[Inductor][Triton][FP8] Add a Blackwell-specific scaled persistent + TMA template for GEMMs (pytorch#163147)
Summary:

X-link: meta-pytorch/tritonbench#432

Add a Blackwell-specific scaled persistent + TMA Triton template to Inductor. This diff builds on D82515450 by adding a new set of mixins which inherit the scaling epilogue and add scaled persistent + TMA kwargs to the template. This diff also adds a benchmark for the scaled Blackwell persistent + TMA template to TritonBench `fp8_gemm`.

Note that this diff is a minimal extension to the above diff; rather than adding a new kernel for the scaled version, we opted to simply extend the epilogue to account for scaling. This template is accurate for per-tensor and per-row scaling, but may require modifications for other scaling modes, such as deepseek-style scaling, which apply scaling prior to the GEMM computation. In addition, note that epilogue subtiling is currently unsupported for both the scaled and non-scaled Blackwell templates; that functionality will be added in a subsequent diff.

Test Plan: Verified that the scaled Blackwell template adds the scaling epilogue to the generated Triton kernel by inspecting the Inductor-generated Triton kernel.

Benchmarking command:

```
TRITON_PRINT_AUTOTUNING=1 TORCHINDUCTOR_CACHE_DIR=~/personal/cache_dir_inductor TRITON_CACHE_DIR=~/personal/cache_dir_triton TRITON_ALWAYS_COMPILE=1 TORCH_LOGS=+inductor TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 ENABLE_PERSISTENT_TMA_MATMUL=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM=1 buck2 run mode/{opt,inplace} pytorch/tritonbench:run -c fbcode.nvcc_arch=b200a -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -- --op fp8_gemm --only torch_fp8_gemm,blackwell_pt2_fp8_gemm --metrics tflops,accuracy --input-loader=/home/jananisriram/personal/fp8_shapes_testing.json --scaling_rowwise --output="/home/jananisriram/personal/fp8_shapes_testing_results.csv" --atol=1e-2 --rtol=0.5 2>&1 | tee ~/personal/fp8_shapes_testing.log
```

Rollback Plan:

Differential Revision: D82597111

Pull Request resolved: pytorch#163147

Approved by: https://github.com/njriasan
1 parent c6bb6b8 · commit 225789c
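For context on the epilogue-only approach described in the summary, below is a minimal sketch of the arithmetic the scaling epilogue performs for the two supported modes. This is not the Inductor template itself; the helper name, shapes, and values are illustrative assumptions. Because the scales are applied to the accumulator after the GEMM (mirroring `torch._scaled_mm` semantics), scaling modes that must be applied before the GEMM, such as deepseek-style scaling, would need a different structure.

```python
import torch

# Hypothetical helper (not part of this PR): sketches the epilogue arithmetic.
def scaled_mm_epilogue_sketch(a_fp8, b_fp8, scale_a, scale_b, out_dtype=torch.bfloat16):
    # Accumulate the FP8 GEMM in float32 (the template accumulates in higher precision).
    acc = a_fp8.to(torch.float32) @ b_fp8.to(torch.float32)
    # Epilogue: apply the scales to the accumulator, then downcast.
    # Per-tensor scaling: scale_a and scale_b are 0-d float32 tensors.
    # Per-row scaling: scale_a has shape (M, 1) and scale_b has shape (1, N),
    # so they broadcast across the (M, N) accumulator.
    return (acc * scale_a * scale_b).to(out_dtype)

# Tiny usage example with per-row scales (shapes only; values are arbitrary).
M, K, N = 16, 32, 8
a = torch.randn(M, K).to(torch.float8_e4m3fn)
b = torch.randn(K, N).to(torch.float8_e4m3fn)
out = scaled_mm_epilogue_sketch(a, b, torch.rand(M, 1), torch.rand(1, N))
print(out.shape)  # torch.Size([16, 8])
```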

File tree: 3 files changed (+176, −0 lines)


test/inductor/test_max_autotune.py

Lines changed: 126 additions & 0 deletions
```diff
@@ -439,6 +439,132 @@ def mm(a, b):
         with config.patch({"max_autotune": True}):
             torch.compile(mm, dynamic=dynamic)(a, b)
 
+    # NOTE: the current Inductor template verifies that the scaling mode is either per-tensor or per-row
+    # TODO: support additional scaling modes for Blackwell
+    @unittest.skipIf(
+        not has_datacenter_blackwell_tma_device(),
+        "Need Blackwell with device-side TMA support in Triton",
+    )
+    @parametrize("dynamic", (False, True))
+    @parametrize("tma_store", (False, True))
+    def test_blackwell_max_autotune_scaled_mm_per_tensor_persistent_tma(
+        self,
+        dynamic: bool,
+        tma_store: bool,
+    ):
+        def scaled_mm(a, b, scale_a, scale_b):
+            # NOTE: Inductor constrains a to be row_major and b to be col_major
+            return torch._scaled_mm(
+                a, b.t(), scale_a, scale_b, use_fast_accum=True, out_dtype=torch.float16
+            )
+
+        def get_scale_per_tensor(t):
+            scale = torch.finfo(torch.float8_e4m3fn).max / t.abs().max()
+            return scale.to(torch.float32)
+
+        # TMA requires 16-byte alignment: here we repeat the dims
+        # by a factor of 8, as float16 is 2-byte.
+        M, N, K = 32, 16, 48
+        a = (torch.randn((M, K)).to(torch.float16).to(GPU_TYPE)).repeat(8, 8)
+        b = (torch.randn((N, K)).to(torch.float16).to(GPU_TYPE)).repeat(8, 8)
+
+        scale_a = get_scale_per_tensor(a)
+        scale_b = get_scale_per_tensor(b)
+
+        a = a.to(torch.float8_e4m3fn)
+        b = b.to(torch.float8_e4m3fn)
+
+        with config.patch(
+            {
+                "max_autotune": True,
+                "triton.enable_persistent_tma_matmul": True,
+                "triton.enable_template_tma_store": tma_store,
+                "test_configs.autotune_choice_name_regex": "blackwell_ws_persistent_device_tma",
+            }
+        ):
+            c_actual, code = run_and_get_code(
+                torch.compile(scaled_mm, dynamic=dynamic), a, b, scale_a, scale_b
+            )
+            c_expected = scaled_mm(a, b, scale_a, scale_b)
+
+            torch.testing.assert_close(c_actual, c_expected, atol=1e-2, rtol=0.5)
+            if tma_store:
+                # Verify that we are using a TMA implementation
+                # Note: The tma_descriptor0 is generated by the kernel. If the
+                # code generation process changes this could change.
+                write_api = "tma_descriptor0.store"
+            else:
+                write_api = "tl.store"
+            FileCheck().check("triton_tem_fused__scaled_mm").check(
+                "triton.language.make_tensor_descriptor"
+            ).check("tl.load_tensor_descriptor").check(write_api).run(code[0])
+
+    @unittest.skipIf(
+        not has_datacenter_blackwell_tma_device(),
+        "Need Blackwell with device-side TMA support in Triton",
+    )
+    @parametrize("dynamic", (False, True))
+    @parametrize("tma_store", (False, True))
+    def test_blackwell_max_autotune_scaled_mm_per_row_persistent_tma(
+        self,
+        dynamic: bool,
+        tma_store: bool,
+    ):
+        def scaled_mm(a, b, scale_a, scale_b):
+            # NOTE: Inductor constrains a to be row_major and b to be col_major
+            return torch._scaled_mm(
+                a,
+                b.t(),
+                scale_a,
+                scale_b.t(),
+                use_fast_accum=True,
+                out_dtype=torch.bfloat16,
+            )
+
+        def get_scale_per_row(t):
+            scale = (
+                torch.finfo(torch.float8_e4m3fn).max
+                / t.abs().max(dim=1, keepdim=True).values
+            )
+            return scale.to(torch.float32)
+
+        # TMA requires 16-byte alignment: here we repeat the dims
+        # by a factor of 8, as bfloat16 is 2-byte.
+        M, N, K = 32, 16, 48
+        a = (torch.randn((M, K)).to(torch.bfloat16).to(GPU_TYPE)).repeat(8, 8)
+        b = (torch.randn((N, K)).to(torch.bfloat16).to(GPU_TYPE)).repeat(8, 8)
+
+        scale_a = get_scale_per_row(a)
+        scale_b = get_scale_per_row(b)
+
+        a = a.to(torch.float8_e4m3fn)
+        b = b.to(torch.float8_e4m3fn)
+
+        with config.patch(
+            {
+                "max_autotune": True,
+                "triton.enable_persistent_tma_matmul": True,
+                "triton.enable_template_tma_store": tma_store,
+                "test_configs.autotune_choice_name_regex": "blackwell_ws_persistent_device_tma",
+            }
+        ):
+            c_actual, code = run_and_get_code(
+                torch.compile(scaled_mm, dynamic=dynamic), a, b, scale_a, scale_b
+            )
+            c_expected = scaled_mm(a, b, scale_a, scale_b)
+
+            torch.testing.assert_close(c_actual, c_expected, atol=1e-2, rtol=0.5)
+            if tma_store:
+                # Verify that we are using a TMA implementation
+                # Note: The tma_descriptor0 is generated by the kernel. If the
+                # code generation process changes this could change.
+                write_api = "tma_descriptor0.store"
+            else:
+                write_api = "tl.store"
+            FileCheck().check("triton_tem_fused__scaled_mm").check(
+                "triton.language.make_tensor_descriptor"
+            ).check("tl.load_tensor_descriptor").check(write_api).run(code[0])
+
     @unittest.skipIf(
         not has_triton_tma_device(), "Need device-side TMA support in Triton"
     )
```

torch/_inductor/kernel/mm.py

Lines changed: 9 additions & 0 deletions
```diff
@@ -1271,6 +1271,15 @@ def tuned_scaled_mm(
         templates_to_use.append(scaled_mm_device_tma_template)
         kwarg_overrides[scaled_mm_device_tma_template.uid] = overriders
 
+    if (
+        use_triton_blackwell_tma_template(mat_a, mat_b, output_layout=layout)
+        and not bias
+    ):
+        templates_to_use.append(blackwell_ws_persistent_device_tma_mm_template)
+        kwarg_overrides[blackwell_ws_persistent_device_tma_mm_template.uid] = (
+            overriders
+        )
+
     templates_to_use.append(mm_template)
     kwarg_overrides[mm_template.uid] = overriders
```
torch/_inductor/template_heuristics/triton.py

Lines changed: 41 additions & 0 deletions
```diff
@@ -1944,6 +1944,30 @@ def _get_template_configs_impl(
             yield template_kwargs
 
 
+# Scaled Blackwell TMA-specific mixin for scaled MM templates with TMA
+class ScaledBlackwellTMAConfigMixin(
+    BlackwellTMATemplateConfigMixin, ScaledMMConfigMixin
+):
+    """
+    Scaled Blackwell TMA-specific mixin that extends ScaledMMConfigMixin with TMA functionality.
+    This is for scaled MM templates that use device TMA on Blackwell.
+    This inherits from ScaledMMConfigMixin, which provides the scale_mm_epilogue, and adds TMA-specific options.
+    """
+
+    def _filter_configs(self, configs: list[BaseConfig]) -> list[BaseConfig]:
+        """
+        Warp specialization-specific filtering (BlackwellTMATemplateConfigMixin)
+        (compilation issues occur in some versions of Triton):
+        - num_warps < 4 unsafe for warpspec
+        - num_stages < 2 unsafe for warpspec
+
+        TMA-specific filtering:
+        - block_k >= 32 required for TMA (requires inner-most dimension >= 32)
+        """
+        configs = [c for c in configs if c.block_k >= 32]
+        return super()._filter_configs(configs)
+
+
 # Template-specific heuristic classes using multiple inheritance
 
 
@@ -2078,6 +2102,23 @@ def __init__(self) -> None:
         self.mm_configs = self.scaled_persistent_mm_configs
 
 
+@register_template_heuristic(
+    blackwell_ws_persistent_device_tma_mm_template.uid,  # regular Blackwell MM template + scaling epilogue from ScaledMMConfigMixin
+    "cuda",
+    register=torch.version.hip is None,
+)
+class CUDAScaledBlackwellTMATemplateConfigHeuristic(
+    ScaledBlackwellTMAConfigMixin, CUDAConfigHeuristic
+):
+    """Scaled Blackwell TMA template heuristic for CUDA"""
+
+    def __init__(self) -> None:
+        super().__init__()
+        # Override mm_configs to use scaled_persistent_mm_configs for TMA
+        # TODO: Tune scaled_persistent_mm_configs for Blackwell
+        self.mm_configs = self.scaled_persistent_mm_configs
+
+
 @register_template_heuristic(
     mm_plus_mm_template.uid,
     "cuda",
```
