@@ -59,16 +59,18 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
         return
 
     # inf-norm is equivalent to max(abs(w))
-    # keep consistent with float8_utils.amax_to_scale
-    # torch.compile and eager show different numerics for 1.0 / float32,
-    # upcast to float64 to ensure same numeric between compile and eager
     max_weights = torch._foreach_norm(weights, ord=math.inf)  # Partial
     amax_tensor = torch.stack(max_weights)  # Partial
     # clamp is dispatched through DTensor
     # it will issue a single all-reduce
     amax_tensor = torch.clamp(amax_tensor, EPS)  # Replicate
-    scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor.to(torch.float64)  # Replicate
-    if amax_tensor.dtype is torch.float16:
+    # keep consistent with float8_utils.amax_to_scale
+    # torch.compile and eager show different numerics for 1.0 / float32,
+    # upcast to float64 to ensure same numeric between compile and eager
+    origin_dtype = amax_tensor.dtype
+    amax_tensor = amax_tensor.to(torch.float64)
+    scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor  # Replicate
+    if origin_dtype is torch.float16:
         scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max)
     local_scale_tensor = scale_tensor.to_local().to(torch.float32)
     for i, float8_linear in enumerate(float8_linears):
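
For reference, a minimal standalone sketch of the amax-to-scale pattern this change adopts: upcast the amax to float64 before the division so torch.compile and eager produce the same scale, clamp the result for float16 inputs, and return the scale in float32. The function name, the `eps` default, and operating on a plain tensor (rather than a stacked DTensor under FSDP2) are illustrative assumptions, not the library's API.

```python
import torch

def amax_to_scale_sketch(amax: torch.Tensor, eps: float = 1e-12) -> torch.Tensor:
    # Hypothetical helper, for illustration only; the real code operates on a
    # stacked DTensor of per-parameter amax values.
    origin_dtype = amax.dtype          # remember the input dtype before upcasting
    amax = torch.clamp(amax, eps)      # avoid division by zero
    amax = amax.to(torch.float64)      # upcast so compile and eager agree on the division
    scale = torch.finfo(torch.float8_e4m3fn).max / amax
    if origin_dtype is torch.float16:  # fp16 amax can produce scales above fp16 max
        scale = torch.clamp(scale, max=torch.finfo(torch.float16).max)
    return scale.to(torch.float32)     # scales are kept in float32
```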