
Commit 6cf26bd

Fix Float8Tensor quantize op kernel preference dispatch
Summary:

Previously we didn't handle kernel_preference == "fbgemm" properly for the quantize op. This PR makes sure we dispatch to fbgemm kernels when kernel_preference is fbgemm.

This doesn't have much impact on BC: serialized checkpoints use AUTO, which is dispatched to the triton op for quantize. The only change is fixing the kernel choice for the fbgemm kernel preference, which is meant to be a developer-facing API (we expect most users to just use AUTO without worrying about the details).

Test Plan:
python test/quantization/quantize_/workflows/float8/test_float8_tensor.py -k test_kernel_preference_numerical_equivalence

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2883, branch: jerryzh168/stack/59
1 parent 9056c46 commit 6cf26bd
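
For context, here is a minimal usage sketch of the behavior this commit targets. The Float8Tensor.from_hp and KernelPreference names come from the diffs below, but the import paths are assumptions and the snippet is illustrative only (it assumes a CUDA SM 9.0+ GPU with fbgemm_gpu_genai installed); it is not code from this PR.

import torch
from torchao.quantization.granularity import PerRow
from torchao.quantization.quantize_.workflows.float8.float8_tensor import (
    Float8Tensor,  # assumed import location
    KernelPreference,  # assumed import location
)

x = torch.randn(16, 64, device="cuda", dtype=torch.bfloat16)

# AUTO: per-row e4m3 quantization keeps taking the triton fast path (unchanged by this commit).
w_auto = Float8Tensor.from_hp(
    x, granularity=PerRow(), kernel_preference=KernelPreference.AUTO
)

# FBGEMM: with this fix, the quantize op itself now dispatches to
# torch.ops.fbgemm.quantize_fp8_per_row / quantize_fp8_per_tensor.
w_fbgemm = Float8Tensor.from_hp(
    x, granularity=PerRow(), kernel_preference=KernelPreference.FBGEMM
)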

5 files changed: +47 -14 lines changed


test/dtypes/test_affine_quantized_float.py

Lines changed: 1 addition & 1 deletion
@@ -789,7 +789,7 @@ def test_expected_kernels_on_gpu(self, granularity, float8_config_version):
         # three triton kernels for quantizing the activation:
         # kernel 1: x_max_tmp = max(x, ...)
         # kernel 2: x_max = max(x_max_tmp)
-        # kernel 3: x_float8 = to_float8(x, x_max)
+        # kernel 3: x_float8 = Float8Tensor.from_hp(x, x_max)
         FileCheck().check("def call(").check_count(".run(", 3, exactly=True).run(
             code[0]
         )

test/quantization/test_qat.py

Lines changed: 1 addition & 1 deletion
@@ -1859,7 +1859,7 @@ def test_float8_fake_quantize(self, granularity: Granularity):
         torch.manual_seed(self.SEED)
         x = torch.randn(32, 64)
         out = fake_quantizer(x)
-        out_expected = Float8Tensor.to_float8(x, dtype, granularity).dequantize()
+        out_expected = Float8Tensor.from_hp(x, dtype, granularity).dequantize()
         sqnr = compute_error(out, out_expected)
         self.assertGreater(sqnr, 16)

torchao/quantization/quant_api.py

Lines changed: 2 additions & 2 deletions
@@ -1546,7 +1546,7 @@ def _float8_weight_only_quant_tensor(weight, config):
     else:
         assert config.version == 2, f"Unexpected version: {config.version}"
         weight_dtype = config.weight_dtype
-        new_weight = Float8Tensor.to_float8(
+        new_weight = Float8Tensor.from_hp(
             weight, float8_dtype=weight_dtype, granularity=PerRow()
         )
         return new_weight
@@ -1744,7 +1744,7 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
         kernel_preference=kernel_preference,
     )
 
-    quantized_weight = Float8Tensor.to_float8(
+    quantized_weight = Float8Tensor.from_hp(
         weight,
         float8_dtype=weight_dtype,
         granularity=weight_granularity,
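
Both code paths above now build the quantized weight with Float8Tensor.from_hp. A rough end-to-end sketch of how this is reached from the public API (illustrative only; that the config forwards its kernel preference, AUTO by default, down to the quantize call is inferred from the kernel_preference argument in the hunk above):

import torch
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, quantize_
from torchao.quantization.granularity import PerRow

model = torch.nn.Sequential(torch.nn.Linear(64, 128)).cuda().to(torch.bfloat16)

# The config's kernel preference (AUTO by default) ends up in Float8Tensor.from_hp,
# which is where this commit fixes the fbgemm dispatch.
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))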

torchao/quantization/quantize_/common/quantize_tensor_kwargs.py

Lines changed: 2 additions & 2 deletions
@@ -22,7 +22,7 @@ class QuantizeTensorKwargs(abc.ABC):
 
     class Float8Tensor(...)
         @classmethod
-        def to_float8(cls, tensor, quant_kwargs: QuantizeTensorKwargs)
+        def from_hp(cls, tensor, quant_kwargs: QuantizeTensorKwargs)
             ...
     """
 
@@ -43,7 +43,7 @@ def _choose_quant_func_and_quantize_tensor(
     )
 
     if isinstance(quant_kwargs, QuantizeTensorToFloat8Kwargs):
-        return Float8Tensor.to_float8(
+        return Float8Tensor.from_hp(
             tensor,
             quant_kwargs.float8_dtype,
             quant_kwargs.granularity,

torchao/quantization/quantize_/workflows/float8/float8_tensor.py

Lines changed: 41 additions & 8 deletions
@@ -22,7 +22,7 @@
     preprocess_data,
     preprocess_scale,
 )
-from torchao.quantization.granularity import PerRow
+from torchao.quantization.granularity import PerRow, PerTensor
 from torchao.quantization.observer import get_block_size
 from torchao.quantization.quant_primitives import (
     _choose_scale_float8,
@@ -163,7 +163,7 @@ def dequantize(self, output_dtype: Optional[torch.dtype] = None) -> torch.Tensor
         return _dequantize_affine_float8(qdata, scale, output_dtype)
 
     @classmethod
-    def to_float8(
+    def from_hp(
         cls,
         hp_tensor: torch.Tensor,
         float8_dtype: torch.dtype = torch.float8_e4m3fn,
@@ -177,18 +177,29 @@ def to_float8(
         block_size = get_block_size(hp_tensor.shape, granularity)
         block_size = list(block_size)
 
+        kernel_choice = None
         # for per row quantization and kernel_preference default setting, we'll use triton kernel for best performance
         if (
             kernel_preference == KernelPreference.AUTO
             and _is_fbgemm_genai_gpu_available()
-            and (
-                tuple(block_size)
-                == (1,) * (hp_tensor.ndim - 1) + (hp_tensor.shape[-1],)
-            )
+            and is_sm_at_least_90()
+            and isinstance(granularity, PerRow)
+            and float8_dtype == torch.float8_e4m3fn
+            and hp_value_lb is None
         ):
-            assert float8_dtype == torch.float8_e4m3fn, (
-                f"Only torch.float8_e4m3fn is supported, got: {float8_dtype}"
+            # optimized path for auto and per row quantization
+            kernel_choice = "triton"
+        elif kernel_preference == KernelPreference.FBGEMM and hp_value_lb is None:
+            assert _is_fbgemm_genai_gpu_available() and is_sm_at_least_90(), (
+                "Specified fbgemm but fbgemm_gpu_genai is not installed or hardware is not >= SM 9.0 (> H100)"
             )
+            kernel_choice = "fbgemm"
+        else:
+            # fallback path for everything else will be torch
+            kernel_choice = "torch"
+
+        if kernel_choice == "triton":
+            assert hp_value_lb is None, f"{hp_value_lb=} is not supported"
             if hp_value_ub is not None:
                 maybe_hp_value_ub_tensor = torch.tensor(
                     hp_value_ub, dtype=torch.float, device=hp_tensor.device
@@ -202,7 +213,29 @@ def to_float8(
             for i in range(hp_tensor.ndim):
                 scale_shape.append(hp_tensor.shape[i] // block_size[i])
             scale = scale.reshape(*scale_shape)
+        elif kernel_choice == "fbgemm":
+            assert hp_value_lb is None, f"{hp_value_lb=} is not supported"
+            if hp_value_ub is not None:
+                maybe_hp_value_ub_tensor = torch.tensor(
+                    hp_value_ub, dtype=torch.float, device=hp_tensor.device
+                )
+            else:
+                maybe_hp_value_ub_tensor = None
+            # not used
+            num_tokens = torch.empty([hp_tensor.size(0)], device=hp_tensor.device)
+            if isinstance(granularity, PerRow):
+                data, scale = torch.ops.fbgemm.quantize_fp8_per_row(
+                    hp_tensor, num_tokens, scale_ub=maybe_hp_value_ub_tensor
+                )
+            else:
+                assert isinstance(granularity, PerTensor), (
+                    f"Expected per tensor, got {granularity}"
+                )
+                data, scale = torch.ops.fbgemm.quantize_fp8_per_tensor(
+                    hp_tensor, num_tokens, scale_ub=maybe_hp_value_ub_tensor
+                )
         else:
+            assert kernel_choice == "torch", f"Expected torch, got {kernel_choice}"
             scale = _choose_scale_float8(
                 hp_tensor,
                 float8_dtype=float8_dtype,
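
The dispatch above now has three explicit branches (triton, fbgemm, torch), and the test in the Test Plan checks that they agree numerically. A hedged sketch of that kind of check follows; it is not the actual test, and it assumes the KernelPreference enum also exposes a TORCH member, the import paths shown, and a supported GPU/fbgemm setup.

import torch
from torchao.quantization.granularity import PerRow
from torchao.quantization.quantize_.workflows.float8.float8_tensor import (
    Float8Tensor,  # assumed import location
    KernelPreference,  # assumed import location
)

x = torch.randn(32, 64, device="cuda", dtype=torch.bfloat16)

# Each preference should pick a different quantize kernel but produce
# (near-)identical dequantized values.
dequants = [
    Float8Tensor.from_hp(x, granularity=PerRow(), kernel_preference=pref).dequantize()
    for pref in (KernelPreference.AUTO, KernelPreference.FBGEMM, KernelPreference.TORCH)
]
for d in dequants[1:]:
    assert torch.allclose(dequants[0], d, atol=1e-2, rtol=1e-2)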

0 commit comments
