
Commit d5cc628

ngimel authored and timocafe committed
add reduce_scatter to symm mem ops (pytorch#150813)
+ a few small fixes (don't error out on 0-element tensors, a few more checks for contiguous outputs, more threads for better perf).

Pull Request resolved: pytorch#150813
Approved by: https://github.com/xw285cornell
1 parent 7879dda commit d5cc628
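
For orientation, the new op is driven from Python roughly as in the test added by this PR. The snippet below is a minimal sketch, not part of the commit: it assumes a CUDA process group is already initialized on every rank (e.g. 4 GPUs) and that the symmetric-memory module import matches the test file.

    import torch
    import torch.distributed as dist
    import torch.distributed._symmetric_memory as symm_mem

    group_name = dist.group.WORLD.group_name
    world_size = dist.get_world_size()

    # The input must live in a symmetric-memory buffer and be rendezvous'd first.
    inp = symm_mem.empty(8192, dtype=torch.bfloat16, device="cuda").normal_()
    symm_mem.rendezvous(inp, group=group_name)

    # Each rank receives numel // world_size reduced elements of the 1-D input.
    out = torch.empty(inp.numel() // world_size, dtype=inp.dtype, device=inp.device)
    torch.ops.symm_mem.reduce_scatter_out(inp, group_name, False, out)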

File tree

4 files changed (+276, -17 lines)

test/distributed/test_symmetric_memory.py

Lines changed: 74 additions & 2 deletions
@@ -771,7 +771,7 @@ def test_subgroup(self) -> None:
         self.assertTrue(buf.eq(peer_rank + world.size() // 2).all())


-@skipIfRocm
+# @skipIfRocm
 @instantiate_parametrized_tests
 @requires_cuda_p2p_access()
 class SymmMemCollectiveTest(MultiProcessTestCase):
@@ -912,7 +912,7 @@ def test_two_shot_all_reduce(self) -> None:
             shift = align_bytes // t.element_size()
             numel = size_bytes // t.element_size()
             res = t[shift : shift + numel]
-            res.normal_().fill_(1)
+            res.normal_()
             inp = res.clone()
             if not inplace:
                 out = torch.empty_like(inp)
@@ -940,6 +940,78 @@ def _verify_all_reduce_result(self, inp, res):
             gathered_inps.sum(dim=0), res, rtol=1e-01, atol=1e-01
         )

+    @skipIfRocm
+    @skip_if_lt_x_gpu(4)
+    def test_reduce_scatter(self) -> None:
+        self._init_process()
+        group_name = dist.group.WORLD.group_name
+
+        for dtype, size_bytes, align_bytes, split_last_dim in itertools.product(
+            [torch.float, torch.bfloat16],
+            [128, 8192, 36 * 1024 * 16],
+            [4, 8, 16],
+            [True, False],
+        ):
+            t = symm_mem.empty(36 * 1024 * 16, dtype=dtype, device=self.device).fill_(0)
+            symm_mem.rendezvous(t, group=group_name)
+
+            self.assertTrue(t.data_ptr() % 16 == 0)
+            self.assertTrue(align_bytes % t.element_size() == 0)
+            self.assertTrue(size_bytes % t.element_size() == 0)
+
+            shift = align_bytes // t.element_size()
+            numel = size_bytes // t.element_size()
+            res = t[shift : shift + numel].normal_()
+            if split_last_dim:
+                res = res.view(-1, 128 // t.element_size())
+            inp = res.clone()
+            out_size = list(inp.shape)
+            out_size[-1] = inp.shape[-1] // self.world_size
+            out = torch.empty(out_size, dtype=dtype, device=self.device)
+            torch.ops.symm_mem.reduce_scatter_out(res, group_name, split_last_dim, out)
+
+            # Head and tail should not be written
+            self.assertTrue(t[:shift].eq(0).all().item())
+            self.assertTrue(t[shift + numel :].eq(0).all().item())
+            self._verify_reduce_scatter_result(inp, out)
+
+        dist.destroy_process_group()
+
+    @skipIfRocm
+    @skip_if_lt_x_gpu(4)
+    def test_reduce_scatter_corner_cases(self) -> None:
+        dtype = torch.bfloat16
+        self._init_process()
+        group_name = dist.group.WORLD.group_name
+        t = symm_mem.empty(16384, dtype=dtype, device=self.device).fill_(0)
+        symm_mem.rendezvous(t, group=group_name)
+        res = t[:0]
+        out_size = res.shape[0] // self.world_size
+        out = torch.empty(out_size, dtype=dtype, device=self.device)
+        torch.ops.symm_mem.reduce_scatter_out(res, group_name, False, out)
+        res = t[:48]
+        out_size = res.shape[0] // self.world_size
+        out = torch.empty(out_size, dtype=dtype, device=self.device)
+        with self.assertRaisesRegex(RuntimeError, "divisible"):
+            torch.ops.symm_mem.reduce_scatter_out(res, group_name, False, out)
+        res = t[: 2 * 48].view(2, 48)
+        out = torch.empty(2, 48 // self.world_size, dtype=dtype, device=self.device)
+        with self.assertRaisesRegex(RuntimeError, "divisible"):
+            torch.ops.symm_mem.reduce_scatter_out(res, group_name, True, out)
+
+    def _verify_reduce_scatter_result(self, inp, res):
+        gathered_res = all_gather_tensor(res, 0, "0").view(self.world_size, *res.shape)
+        gathered_inps = all_gather_tensor(inp, 0, "0").view(self.world_size, *inp.shape)
+        sum_inps = gathered_inps.sum(0)
+        slice_width = sum_inps.shape[-1] // self.world_size
+        for i in range(self.world_size):
+            torch.testing.assert_close(
+                gathered_res[i],
+                sum_inps[..., i * slice_width : (i + 1) * slice_width],
+                rtol=1e-01,
+                atol=1e-01,
+            )
+
     @skip_if_lt_x_gpu(4)
     @parametrize("align_bytes", [4, 8, 16])
     def test_multimem_all_gather(self, align_bytes: int) -> None:
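
The `_verify_reduce_scatter_result` helper above checks the usual reduce-scatter contract: rank i's output equals the i-th last-dimension slice of the elementwise sum of all ranks' inputs. A single-process reference of that contract, useful when reasoning about the test (hypothetical helper, not part of the PR):

    import torch

    def reduce_scatter_reference(inputs):
        # inputs: one tensor per rank, all with identical shapes
        world_size = len(inputs)
        total = torch.stack(inputs).sum(0)         # elementwise sum across ranks
        width = total.shape[-1] // world_size      # each rank keeps one slice of the last dim
        return [total[..., r * width:(r + 1) * width] for r in range(world_size)]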

torch/csrc/distributed/c10d/CUDASymmetricMemory-inl.h

Lines changed: 1 addition & 1 deletion
@@ -314,7 +314,7 @@ __device__ __inline__ Vec<Alignment> ld_vec(const T* addr) {

 template <int Alignment, typename T>
 __device__ __inline__ void st_vec(T* addr, const Vec<Alignment>& vec) {
-#if defined(USE_ROCM) || !defined(NVCC_SUPPORTS_MULTICAST)
+#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800))
   CUDA_KERNEL_ASSERT(false);
 #else
   if constexpr (Alignment == 16) {

torch/csrc/distributed/c10d/CUDASymmetricMemoryOps.cu

Lines changed: 197 additions & 14 deletions
@@ -463,6 +463,10 @@ at::Tensor one_shot_all_reduce_out_impl(
         local_input->numel() <= input.numel(),
         "one_shot_all_reduce: local input size must be smaller than symm buffer size.");
   }
+  if (input.numel() == 0) {
+    TORCH_CHECK(input.scalar_type() == out.scalar_type());
+    return out;
+  }
   auto symm_mem = c10d::symmetric_memory::rendezvous(input, group_name);
   TORCH_CHECK(
       symm_mem != nullptr,
@@ -555,9 +559,14 @@ at::Tensor one_shot_all_reduce_copy(
 }

 constexpr size_t two_shot_all_reduce_max_num_blocks = 24;
-constexpr size_t two_shot_all_reduce_max_num_threads = 512;
-
-template <typename T, int alignment, int k_world_size>
+constexpr size_t two_shot_all_reduce_max_num_threads = 1024;
+
+template <
+    typename T,
+    int alignment,
+    int k_world_size,
+    bool reduce_scatter = false,
+    bool split_last_dim = false>
 static __launch_bounds__(two_shot_all_reduce_max_num_threads) __global__
     void two_shot_all_reduce_kernel(
         T** input_ptrs,
@@ -566,31 +575,48 @@ static __launch_bounds__(two_shot_all_reduce_max_num_threads) __global__
         size_t numel,
         uint32_t** signal_pads,
         size_t rank,
-        size_t world_size) {
+        size_t world_size,
+        size_t last_dim_size = 0) {
   static_assert(alignment % sizeof(T) == 0);
   constexpr size_t numel_per_thread = alignment / sizeof(T);
-
+  int32_t N_last_dim =
+      last_dim_size / world_size; // used only for split_last_dim reduce_scatter
   sync_remote_blocks<std::memory_order_acq_rel>(signal_pads, rank, world_size);
   __syncthreads();

   const size_t numel_per_rank =
-      at::round_up(numel, alignment * world_size) / world_size;
-  const size_t start = numel_per_rank * rank;
+      at::round_up(numel, numel_per_thread * world_size) / world_size;
+  const size_t start = split_last_dim ? last_dim_size / world_size * rank
+                                      : numel_per_rank * rank;

   auto offset = (blockDim.x * blockIdx.x + threadIdx.x) * numel_per_thread;
   auto stride = blockDim.x * gridDim.x * numel_per_thread;
   for (size_t i = offset; i < numel_per_rank; i += stride) {
-    if (start + i >= numel) {
-      continue;
+    if constexpr (!reduce_scatter) {
+      // we call reduce-scatter only with evenly divisible number of elements
+      if (start + i >= numel) {
+        continue;
+      }
+    }
+    size_t idx = i;
+    if constexpr (split_last_dim) {
+      idx = i / N_last_dim * last_dim_size + i % N_last_dim;
     }
     auto vec = load_and_reduce<T, alignment, k_world_size>(
-        input_ptrs, rank, world_size, input_offset + start + i);
-    // store to local buffer
-    st_vec<alignment>(input_ptrs[rank] + input_offset + start + i, vec);
+        input_ptrs, rank, world_size, input_offset + start + idx);
+    // store to local buffer or to output
+    if constexpr (reduce_scatter) {
+      st_vec<alignment>(output_ptr + i, vec);
+    } else {
+      st_vec<alignment>(input_ptrs[rank] + input_offset + start + i, vec);
+    }
   }

   __syncthreads();
   sync_remote_blocks<std::memory_order_acq_rel>(signal_pads, rank, world_size);
+  if constexpr (reduce_scatter) {
+    return;
+  }
   __syncthreads();
   for (size_t i = offset; i < numel_per_rank; i += stride) {
     Vec<alignment> tmp[k_world_size];
@@ -611,8 +637,7 @@ static __launch_bounds__(two_shot_all_reduce_max_num_threads) __global__
       if (remote_start + i >= numel) {
        continue;
      }
-      st_vec<alignment>(
-          output_ptr + remote_start + i, tmp[step]);
+      st_vec<alignment>(output_ptr + remote_start + i, tmp[step]);
     }
   }
   // need to make sure all blocks exit simultaneously so that the data
@@ -679,11 +704,28 @@ at::Tensor two_shot_all_reduce_impl(
       get_and_verify_alignment(input, "two_shot_all_reduce");

   if (output.has_value()) {
+    TORCH_CHECK(
+        output->is_contiguous(),
+        "two_shot_all_reduce: output must be contiguous.");
     const size_t output_alignment =
         get_and_verify_alignment(*output, "two_shot_all_reduce");
     TORCH_CHECK(
         alignment <= output_alignment,
         "two_shot_all_reduce: output alignment must be equal to or larger than input.");
+    TORCH_CHECK(
+        output->sizes() == input.sizes(),
+        "two_shot_all_reduce: input/output size mismatch, input.sizes(): ",
+        input.sizes(),
+        ", output.sizes(): ",
+        output->sizes());
+    if (input.numel() == 0) {
+      TORCH_CHECK(output->scalar_type() == input.scalar_type());
+      return *output;
+    }
+  } else {
+    if (input.numel() == 0) {
+      return input;
+    }
   }

   int num_blocks = 0, num_threads = 0;
@@ -764,6 +806,146 @@ at::Tensor two_shot_all_reduce_out(
     at::Tensor output) {
   return two_shot_all_reduce_impl(input, output, reduce_op, group_name);
 }
+
+at::Tensor reduce_scatter_out(
+    at::Tensor input,
+    std::string group_name,
+    bool split_last_dim,
+    at::Tensor output) {
+  TORCH_CHECK(
+      input.is_contiguous(), "reduce_scatter: input must be contiguous.");
+  TORCH_CHECK(
+      output.is_contiguous(), "reduce_scatter: output must be contiguous.");
+
+  auto symm_mem = c10d::symmetric_memory::rendezvous(input, group_name);
+  TORCH_CHECK(
+      symm_mem != nullptr,
+      "reduce_scatter: input must be allocated with empty_strided_p2p().");
+
+  const size_t alignment = get_and_verify_alignment(input, "reduce_scatter");
+
+  const size_t output_alignment =
+      get_and_verify_alignment(input, "reduce_scatter");
+
+  TORCH_CHECK(
+      input.numel() %
+              (symm_mem->get_world_size() *
+               (alignment / input.element_size())) ==
+          0,
+      "expected number of elements to be divisible by world_size * alignment, number of elements ",
+      input.numel(),
+      " world size ",
+      symm_mem->get_world_size(),
+      "alignment ",
+      alignment);
+
+  if (split_last_dim) {
+    TORCH_CHECK(input.dim() == output.dim());
+    bool are_equal_except_last = std::equal(
+        input.sizes().begin(), input.sizes().end() - 1, output.sizes().begin());
+    TORCH_CHECK(
+        are_equal_except_last,
+        "reduce_scatter expected input and output to have same sizes except in the last dimension");
+    TORCH_CHECK(
+        output.size(-1) == input.size(-1) / symm_mem->get_world_size(),
+        "reduce_scatter expected output last dim size to be input last dim size / world_size");
+
+    TORCH_CHECK(
+        input.size(-1) %
+                (symm_mem->get_world_size() *
+                 (alignment / input.element_size())) ==
+            0,
+        "expected last dimension to be divisible by world_size * alignment, last dimension ",
+        input.size(-1),
+        " world size ",
+        symm_mem->get_world_size(),
+        "alignment ",
+        alignment);
+  } else {
+    TORCH_CHECK(input.dim() == 1, "reduce_scatter expected 1D input");
+    TORCH_CHECK(output.dim() == 1, "reduce_scatter expected 1D output");
+    TORCH_CHECK(output.numel() == input.numel() / symm_mem->get_world_size());
+  }
+  if (input.numel() == 0) {
+    TORCH_CHECK(input.scalar_type() == output.scalar_type());
+    return output;
+  }
+
+  TORCH_CHECK(
+      output_alignment >= alignment,
+      "reduce_scatter: output alignment should be not smaller than input alignment");
+
+  int num_blocks = 0, num_threads = 0;
+  init_elementwise_launch_config(
+      input.numel(),
+      input.element_size(),
+      alignment,
+      symm_mem->get_world_size(),
+      two_shot_all_reduce_max_num_blocks,
+      two_shot_all_reduce_max_num_threads,
+      num_blocks,
+      num_threads);
+  if (split_last_dim) {
+    AT_DISPATCH_FLOAT_AND_BFLOAT16(
+        input.scalar_type(), "two_shot_all_reduce", [&]() {
+          DISPATCH_ALIGNMENTS_16_8_4(alignment, [&]() {
+            DISPATCH_WORLD_SIZES_NO_DEFAULT(symm_mem->get_world_size(), [&]() {
+              two_shot_all_reduce_kernel<
+                  scalar_t,
+                  k_alignment,
+                  k_world_size,
+                  true,
+                  true>
+                  <<<num_blocks,
+                     num_threads,
+                     0,
+                     at::cuda::getCurrentCUDAStream()>>>(
+                      reinterpret_cast<scalar_t**>(
+                          symm_mem->get_buffer_ptrs_dev()),
+                      output.data_ptr<scalar_t>(),
+                      input.storage_offset(),
+                      input.numel(),
+                      reinterpret_cast<uint32_t**>(
+                          symm_mem->get_signal_pad_ptrs_dev()),
+                      symm_mem->get_rank(),
+                      symm_mem->get_world_size(),
+                      input.size(-1));
+              C10_CUDA_KERNEL_LAUNCH_CHECK();
+            });
+          });
+        });
+  } else {
+    AT_DISPATCH_FLOAT_AND_BFLOAT16(
+        input.scalar_type(), "two_shot_all_reduce", [&]() {
+          DISPATCH_ALIGNMENTS_16_8_4(alignment, [&]() {
+            DISPATCH_WORLD_SIZES_NO_DEFAULT(symm_mem->get_world_size(), [&]() {
+              two_shot_all_reduce_kernel<
+                  scalar_t,
+                  k_alignment,
+                  k_world_size,
+                  true,
+                  false>
+                  <<<num_blocks,
+                     num_threads,
+                     0,
+                     at::cuda::getCurrentCUDAStream()>>>(
+                      reinterpret_cast<scalar_t**>(
+                          symm_mem->get_buffer_ptrs_dev()),
+                      output.data_ptr<scalar_t>(),
+                      input.storage_offset(),
+                      input.numel(),
+                      reinterpret_cast<uint32_t**>(
+                          symm_mem->get_signal_pad_ptrs_dev()),
+                      symm_mem->get_rank(),
+                      symm_mem->get_world_size(),
+                      input.size(-1));
+              C10_CUDA_KERNEL_LAUNCH_CHECK();
+            });
+          });
+        });
+  }
+  return output;
+}
 } // namespace
 #endif // #if defined(CUDART_VERSION) && CUDART_VERSION >= 12030

@@ -899,6 +1081,7 @@ TORCH_LIBRARY_IMPL(symm_mem, CUDA, m) {
   m.impl("one_shot_all_reduce_copy_out", ::one_shot_all_reduce_copy_out);
   m.impl("two_shot_all_reduce_", ::two_shot_all_reduce_);
   m.impl("two_shot_all_reduce_out", ::two_shot_all_reduce_out);
+  m.impl("reduce_scatter_out", ::reduce_scatter_out);

   m.impl("_async_input_mm", c10d::cuda::detail::async_input_mm);
 #endif
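
The least obvious part of the kernel change above is the split_last_dim index mapping (idx = i / N_last_dim * last_dim_size + i % N_last_dim): each rank writes a contiguous output, so output index i has to be translated back to an offset inside the full rows of the symmetric input buffer. A small Python sketch of that mapping, illustrative only, with names mirroring the kernel:

    # Assumes last_dim_size is divisible by world_size, as the host-side checks enforce.
    def input_offset_for_output_index(i, rank, last_dim_size, world_size):
        n_last_dim = last_dim_size // world_size   # output elements per row on this rank
        row, col = divmod(i, n_last_dim)           # row index and position within this rank's slice
        start = rank * n_last_dim                  # this rank's slice begins here in every row
        return start + row * last_dim_size + col   # == start + idx in the kernel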

torch/csrc/distributed/c10d/SymmetricMemory.cpp

Lines changed: 4 additions & 0 deletions
@@ -250,6 +250,10 @@ TORCH_LIBRARY_FRAGMENT(symm_mem, m) {
   m.def(
       "two_shot_all_reduce_out(Tensor(a!) input, str reduce_op, str group_name, Tensor(b!) output) -> Tensor(b!)");

+  // note this implementation also modified the input tensor
+  m.def(
+      "reduce_scatter_out(Tensor(a!) input, str group_name, bool split_last_dim, Tensor(b!) output) -> Tensor(b!)");
+
   // An mm that supports consuming asynchronous input. It guarantees the
   // following rasterization order, and that the corresponding signal arrives
   // before an input chunk is consumed.
