Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
400 changes: 400 additions & 0 deletions 3rdparty/tensorrt_llm/custom_allreduce_kernels.cu

Large diffs are not rendered by default.

48 changes: 48 additions & 0 deletions 3rdparty/tensorrt_llm/custom_allreduce_kernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once  // header guard: prevents redefinition of the constexpr constants on double inclusion

#include <cuda_fp16.h>
#include <cuda_runtime.h>   // cudaStream_t, used in the customAllReduce declaration
#include <dlpack/dlpack.h>  // DLDataType, used in the customAllReduce declaration
#include <stdint.h>

namespace tensorrt_llm {

/*! \brief Number of threads in a CUDA warp. */
constexpr size_t WARP_SIZE = 32;
/*! \brief Upper bound on the number of thread blocks used by the all-reduce kernels. */
constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24;
/*! \brief Maximum number of ranks (GPUs) participating in a single node. */
constexpr size_t MAX_RANKS_PER_NODE = 8;
/*! \brief Default number of threads per block for the all-reduce kernels. */
constexpr size_t DEFAULT_BLOCK_SIZE = 1024;

/*! \brief The kernel strategy for the custom all-reduce. */
enum class AllReduceStrategyType : int8_t {
  ONESHOT = 1,
  TWOSHOT = 2,
};

/*!
 * \brief Parameters consumed by the custom all-reduce kernels.
 * \note Field semantics follow the kernel implementation in
 * custom_allreduce_kernels.cu; the barrier pointer arrays are indexed by rank.
 */
struct AllReduceParams {
  /*! \brief Total number of elements to reduce. */
  size_t elts_total;
  /*! \brief Number of elements handled per rank (used by the two-shot strategy). */
  size_t elts_per_rank;
  /*! \brief Number of elements handled per thread block. */
  size_t elts_per_block;
  /*! \brief Element offset of this rank's partition. */
  size_t rank_offset;
  /*! \brief Node size, global rank and local (intra-node) rank. */
  size_t ranks_per_node, rank, local_rank;
  /*! \brief Monotonic flag value used to signal barrier arrival. */
  uint32_t barrier_flag;
  /*! \brief Per-rank pointers to the input barrier signals. */
  uint32_t* peer_barrier_ptrs_in[MAX_RANKS_PER_NODE];
  /*! \brief Per-rank pointers to the output barrier signals. */
  uint32_t* peer_barrier_ptrs_out[MAX_RANKS_PER_NODE];
  /*! \brief Per-rank pointers to the peer communication (input) buffers. */
  void* peer_comm_buffer_ptrs[MAX_RANKS_PER_NODE];
  /*! \brief Pointer to this rank's output buffer. */
  void* local_output_buffer_ptr;
};

/*!
 * \brief Launch the custom all-reduce on the given stream.
 * \param params The all-reduce parameters (peer buffers, barriers, rank info).
 * \param data The local input data pointer.
 * \param elts The number of elements to reduce.
 * \param dataType The element data type.
 * \param strat The strategy (one-shot or two-shot) to run.
 * \param stream The CUDA stream to launch on.
 */
void customAllReduce(AllReduceParams& params, void* data, size_t elts, DLDataType dataType,
                     AllReduceStrategyType strat, cudaStream_t stream);

}  // namespace tensorrt_llm
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ endif(USE_PROFILER)
if(USE_CUDA AND USE_NCCL)
message(STATUS "Build with NCCL...")
find_nccl(${USE_NCCL})
tvm_file_glob(GLOB RUNTIME_NCCL_SRC src/runtime/disco/nccl/*.cc)
tvm_file_glob(GLOB RUNTIME_NCCL_SRC src/runtime/disco/nccl/*.cc src/runtime/disco/cuda_ipc/*.cc 3rdparty/tensorrt_llm/*.cu)
set_source_files_properties(src/runtime/disco/nccl/nccl.cc PROPERTIES COMPILE_DEFINITIONS "TVM_NCCL_RCCL_SWITCH=0")
list(APPEND RUNTIME_SRCS ${RUNTIME_NCCL_SRC})
endif()
Expand Down
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ Apache Software Foundation License 2.0
3rdparty/mlperftiny
3rdparty/nvbench (with LLVM exception)
3rdparty/cutlass_fpA_intB_gemm
3rdparty/tensorrt_llm

BSD 2-clause License
--------------------
Expand Down
102 changes: 102 additions & 0 deletions include/tvm/runtime/disco/cuda_ipc_memory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_
#define TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_

#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/memory/memory_manager.h>
#include <tvm/runtime/object.h>

#include <vector>

namespace tvm {
namespace runtime {
namespace cuda_ipc {

/*!
 * \brief The CUDA IPC (interprocess communication) memory object,
 * which internally contains data pointers to CUDA IPC memory.
 * It is useful for efficient all-reduce implementation.
 * \note Right now the class members are closely tied with customized
 * all-reduce kernel. They may also be extended for other uses in
 * the future.
 */
class CUDAIPCMemoryObj : public Object {
 public:
  /*! \brief The number of GPU workers. */
  int num_workers;
  /*! \brief The worker id corresponding to this IPC memory object. */
  int worker_id;
  /*!
   * \brief The data pointers of all all-reduce inputs.
   * It has "num_workers" pointers. The i-th pointer is the data pointer on worker i.
   * If "i != worker_id", the pointer is an IPC data pointer.
   * Otherwise, the pointer is a local CUDA data pointer.
   */
  std::vector<void*> remote_data;

  // We introduce the barrier helper data below per CUDAIPCMemory object
  // so that they can be used by custom collective operations and allow
  // fine-grained synchronization on each buffer. These barriers have
  // low overhead, and can potentially enable concurrent execution of
  // kernels in future.
  /*!
   * \brief The pointers to input barrier signals of all workers for all-reduce.
   * It has "num_workers" pointers, and the pointer arrangement is the same as "remote_data".
   */
  std::vector<void*> barrier_in;
  /*!
   * \brief The pointers to output barrier signals of all workers for all-reduce.
   * It has "num_workers" pointers, and the pointer arrangement is the same as "remote_data".
   */
  std::vector<void*> barrier_out;
  /*! \brief The integer buffer flag for all-reduce. */
  int barrier_flag;

  // TVM object-system boilerplate: this object supports neither structural
  // equality comparison nor structural hashing.
  static constexpr const char* _type_key = "tvm.runtime.disco.cuda_ipc_memory";
  static constexpr const bool _type_has_method_sequal_reduce = false;
  static constexpr const bool _type_has_method_shash_reduce = false;
  TVM_DECLARE_BASE_OBJECT_INFO(CUDAIPCMemoryObj, Object);
};

/*!
 * \brief Managed reference to CUDAIPCMemoryObj.
 * \sa CUDAIPCMemoryObj
 */
class CUDAIPCMemory : public ObjectRef {
 public:
  /*! \brief Get the global singleton CUDAIPCMemory allocator. */
  TVM_DLL static memory::Allocator* GlobalAllocator();
  /*!
   * \brief Given a local CUDA data pointer, return the CUDAIPCMemory object of the pointer.
   * \param ptr The local CUDA device pointer to look up.
   * \return The CUDAIPCMemory object which owns the given pointer.
   * \note The pointer's CUDAIPCMemory is expected to have been allocated
   * through global function "cuda_ipc.alloc_storage". Or otherwise this
   * function will raise exception.
   */
  TVM_DLL static CUDAIPCMemory GetIPCMemoryFromDevicePtr(void* ptr);

  // Mutable reference methods: the underlying CUDAIPCMemoryObj may be modified
  // through this reference (e.g. updating the barrier flag).
  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(CUDAIPCMemory, ObjectRef, CUDAIPCMemoryObj);
};

} // namespace cuda_ipc
} // namespace runtime
} // namespace tvm

#endif // TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_
13 changes: 8 additions & 5 deletions include/tvm/runtime/memory/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ class Allocator {
*/
TVM_DLL virtual size_t UsedMemory() const = 0;

protected:
/*! \brief Check if the given memory scope is allowed to allocate by the allocator. */
TVM_DLL virtual bool AllowMemoryScope(const std::string& mem_scope) const;

private:
AllocatorType type_;
};
Expand Down Expand Up @@ -137,17 +141,16 @@ class StorageObj : public Object {
public:
/*! \brief The index into the VM function table. */
Buffer buffer;
/*! \brief The allocator where the storage buffer is allocated from. */
Allocator* allocator;

/*! \brief Allocate an NDArray from a given piece of storage. */
TVM_DLL NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype);

/*! \brief The deleter for an NDArray when allocated from underlying storage. */
static void Deleter(Object* ptr);

~StorageObj() {
auto alloc = MemoryManager::Global()->GetAllocator(buffer.device, buffer.alloc_type);
alloc->Free(buffer);
}
~StorageObj() { allocator->Free(buffer); }

static constexpr const uint32_t _type_index = TypeIndex::kDynamic;
static constexpr const char* _type_key = "vm.Storage";
Expand All @@ -157,7 +160,7 @@ class StorageObj : public Object {
/*! \brief reference to storage. */
class Storage : public ObjectRef {
public:
TVM_DLL explicit Storage(Buffer buffer);
TVM_DLL explicit Storage(Buffer buffer, Allocator* allocator);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this constructor actually used? In this PR I only see manual attachment like storage_obj->allocator = allocator;.

We started to see a segfault like free(): invalid pointer when a python process that runs a script exits, after we integrated recent upstream changes. I'm investigating if the segfault is really due to some recent upstream changes, and which one if so.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is used so that recycling goes back to the original allocator; in this case we need to recycle to IPCMemory in the special allocator

Copy link
Member

@masahi masahi Apr 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is used

Can you point out the code? In our fork, I don't see it used other than paged_kv_cache.cc.

Copy link
Contributor Author

@MasterJH5574 MasterJH5574 Apr 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are mixed usages of Storage/StorageObj in TVM. The only use of the constructor seems to be in paged_kv_cache.cc in TVM. All other constructions of Storage use make_object and directly set the allocator

auto storage_obj = runtime::SimpleObjAllocator().make_object<memory::StorageObj>();
nccl::CCLThreadLocalContext* nccl_ctx = nccl::CCLThreadLocalContext::Get();
Device device{DLDeviceType::kDLCUDA, nccl_ctx->device_id};
CUDAIPCMemoryAllocator* allocator = CUDAIPCMemoryAllocator::Global();
storage_obj->buffer = CUDAIPCMemoryAllocator::Global()->Alloc(
device, std::move(buffer_shape), dtype_hint, /*mem_scope=*/"ipc_memory");
storage_obj->allocator = allocator;

https://github.com/apache/tvm/blob/622bd150dd331780eb41a1c67c65aae802eb9b20/src/runtime/relax_vm/builtin.cc

tvm/src/runtime/vm/vm.cc

Lines 848 to 859 in 622bd15

storage_obj->buffer = allocator->Alloc(device, ShapeTuple(shape_),
instr.alloc_storage.dtype_hint, mem_scope);
storage_obj->allocator = allocator;
} else {
auto size = LoadScalarInt(instr.alloc_storage.allocation_size);
auto alignment = instr.alloc_storage.alignment;
VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment
<< ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint)
<< ", device_index=" << instr.alloc_storage.device_index;
storage_obj->buffer =
allocator->Alloc(device, size, alignment, instr.alloc_storage.dtype_hint);
storage_obj->allocator = allocator;

There is a use of the constructor on MLC LLM side https://github.com/mlc-ai/mlc-llm/blob/main/cpp/serve/model.cc#L68-L69, where we use the constructor so that we don't need to expand it into the make_object lines.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What worries me is that the code is doing allocator->Free(buffer); in the destructor. But how can you be sure that it's safe if there is no guarantee that the allocator is always attached to the storage when it is created?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Can we just check the allocator is defined by adding ICHECK in the destructor, or change all make_object to use the constructor and add ICHECK inside? If I remember correctly, I made a pass over all occurrences and attached the allocators.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just check the allocator is defined by adding ICHECK in the destructor

Yes, I tried that in our fork but the segfault is still there. So this PR is not a cause of our problem.

But still, this doesn't change my concern about this code. I'm sure that you checked all usages, but there is always a possibility to miss some. And you never know about the usages in new code or someone else's fork.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I agree with your concern. I can send a patch to add some checks for more safety later on.


TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Storage, ObjectRef, StorageObj);
};
Expand Down
Loading