Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
400 changes: 400 additions & 0 deletions 3rdparty/tensorrt_llm/custom_allreduce_kernels.cu

Large diffs are not rendered by default.

48 changes: 48 additions & 0 deletions 3rdparty/tensorrt_llm/custom_allreduce_kernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once  // header guard: prevents redefinition of the constexpr constants on double inclusion

#include <cuda_fp16.h>
#include <cuda_runtime.h>   // cudaStream_t, used in the customAllReduce declaration
#include <dlpack/dlpack.h>  // DLDataType, used in the customAllReduce declaration
#include <stdint.h>

namespace tensorrt_llm {

/*! \brief Number of threads in a CUDA warp. */
constexpr size_t WARP_SIZE = 32;
/*! \brief Upper bound on the number of thread blocks used by the all-reduce kernels. */
constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24;
/*! \brief Maximum number of ranks (GPUs) participating in a single node. */
constexpr size_t MAX_RANKS_PER_NODE = 8;
/*! \brief Default number of threads per block for the all-reduce kernels. */
constexpr size_t DEFAULT_BLOCK_SIZE = 1024;

/*! \brief The kernel strategy for the custom all-reduce. */
enum class AllReduceStrategyType : int8_t {
  ONESHOT = 1,
  TWOSHOT = 2,
};

/*!
 * \brief Parameters consumed by the custom all-reduce kernels.
 * \note Field semantics follow the kernel implementation in
 * custom_allreduce_kernels.cu; the barrier pointer arrays are indexed by rank.
 */
struct AllReduceParams {
  /*! \brief Total number of elements to reduce. */
  size_t elts_total;
  /*! \brief Number of elements handled per rank (used by the two-shot strategy). */
  size_t elts_per_rank;
  /*! \brief Number of elements handled per thread block. */
  size_t elts_per_block;
  /*! \brief Element offset of this rank's partition. */
  size_t rank_offset;
  /*! \brief Node size, global rank and local (intra-node) rank. */
  size_t ranks_per_node, rank, local_rank;
  /*! \brief Monotonic flag value used to signal barrier arrival. */
  uint32_t barrier_flag;
  /*! \brief Per-rank pointers to the input barrier signals. */
  uint32_t* peer_barrier_ptrs_in[MAX_RANKS_PER_NODE];
  /*! \brief Per-rank pointers to the output barrier signals. */
  uint32_t* peer_barrier_ptrs_out[MAX_RANKS_PER_NODE];
  /*! \brief Per-rank pointers to the peer communication (input) buffers. */
  void* peer_comm_buffer_ptrs[MAX_RANKS_PER_NODE];
  /*! \brief Pointer to this rank's output buffer. */
  void* local_output_buffer_ptr;
};

/*!
 * \brief Launch the custom all-reduce on the given stream.
 * \param params The all-reduce parameters (peer buffers, barriers, rank info).
 * \param data The local input data pointer.
 * \param elts The number of elements to reduce.
 * \param dataType The element data type.
 * \param strat The strategy (one-shot or two-shot) to run.
 * \param stream The CUDA stream to launch on.
 */
void customAllReduce(AllReduceParams& params, void* data, size_t elts, DLDataType dataType,
                     AllReduceStrategyType strat, cudaStream_t stream);

}  // namespace tensorrt_llm
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ endif(USE_PROFILER)
if(USE_CUDA AND USE_NCCL)
message(STATUS "Build with NCCL...")
find_nccl(${USE_NCCL})
tvm_file_glob(GLOB RUNTIME_NCCL_SRC src/runtime/disco/nccl/*.cc)
tvm_file_glob(GLOB RUNTIME_NCCL_SRC src/runtime/disco/nccl/*.cc src/runtime/disco/cuda_ipc/*.cc 3rdparty/tensorrt_llm/*.cu)
set_source_files_properties(src/runtime/disco/nccl/nccl.cc PROPERTIES COMPILE_DEFINITIONS "TVM_NCCL_RCCL_SWITCH=0")
list(APPEND RUNTIME_SRCS ${RUNTIME_NCCL_SRC})
endif()
Expand Down
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ Apache Software Foundation License 2.0
3rdparty/mlperftiny
3rdparty/nvbench (with LLVM exception)
3rdparty/cutlass_fpA_intB_gemm
3rdparty/tensorrt_llm

BSD 2-clause License
--------------------
Expand Down
102 changes: 102 additions & 0 deletions include/tvm/runtime/disco/cuda_ipc_memory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

#ifndef TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_
#define TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_

#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/memory/memory_manager.h>
#include <tvm/runtime/object.h>

#include <vector>

namespace tvm {
namespace runtime {
namespace cuda_ipc {

/*!
 * \brief The CUDA IPC (interprocess communication) memory object,
 * which internally contains data pointers to CUDA IPC memory.
 * It is useful for efficient all-reduce implementation.
 * \note Right now the class members are closely tied with customized
 * all-reduce kernel. They may also be extended for other uses in
 * the future.
 */
class CUDAIPCMemoryObj : public Object {
 public:
  /*! \brief The number of GPU workers. */
  int num_workers;
  /*! \brief The worker id corresponding to this IPC memory object. */
  int worker_id;
  /*!
   * \brief The data pointers of all all-reduce inputs.
   * It has "num_workers" pointers. The i-th pointer is the data pointer on worker i.
   * If "i != worker_id", the pointer is an IPC data pointer.
   * Otherwise, the pointer is a local CUDA data pointer.
   */
  std::vector<void*> remote_data;

  // We introduce the barrier helper data below per CUDAIPCMemory object
  // so that they can be used by custom collective operations and allow
  // fine-grained synchronization on each buffer. These barriers have
  // low overhead, and can potentially enable concurrent execution of
  // kernels in future.
  /*!
   * \brief The pointers to input barrier signals of all workers for all-reduce.
   * It has "num_workers" pointers, and the pointer arrangement is the same as "remote_data".
   */
  std::vector<void*> barrier_in;
  /*!
   * \brief The pointers to output barrier signals of all workers for all-reduce.
   * It has "num_workers" pointers, and the pointer arrangement is the same as "remote_data".
   */
  std::vector<void*> barrier_out;
  /*! \brief The integer buffer flag for all-reduce. */
  int barrier_flag;

  // TVM object-system boilerplate: this object supports neither structural
  // equality comparison nor structural hashing.
  static constexpr const char* _type_key = "tvm.runtime.disco.cuda_ipc_memory";
  static constexpr const bool _type_has_method_sequal_reduce = false;
  static constexpr const bool _type_has_method_shash_reduce = false;
  TVM_DECLARE_BASE_OBJECT_INFO(CUDAIPCMemoryObj, Object);
};

/*!
 * \brief Managed reference to CUDAIPCMemoryObj.
 * \sa CUDAIPCMemoryObj
 */
class CUDAIPCMemory : public ObjectRef {
 public:
  /*! \brief Get the global singleton CUDAIPCMemory allocator. */
  TVM_DLL static memory::Allocator* GlobalAllocator();
  /*!
   * \brief Given a local CUDA data pointer, return the CUDAIPCMemory object of the pointer.
   * \param ptr The local CUDA device pointer to look up.
   * \return The CUDAIPCMemory object which owns the given pointer.
   * \note The pointer's CUDAIPCMemory is expected to have been allocated
   * through global function "cuda_ipc.alloc_storage". Or otherwise this
   * function will raise exception.
   */
  TVM_DLL static CUDAIPCMemory GetIPCMemoryFromDevicePtr(void* ptr);

  // Mutable reference methods: the underlying CUDAIPCMemoryObj may be modified
  // through this reference (e.g. updating the barrier flag).
  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(CUDAIPCMemory, ObjectRef, CUDAIPCMemoryObj);
};

} // namespace cuda_ipc
} // namespace runtime
} // namespace tvm

#endif // TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_
13 changes: 8 additions & 5 deletions include/tvm/runtime/memory/memory_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ class Allocator {
*/
TVM_DLL virtual size_t UsedMemory() const = 0;

protected:
/*! \brief Check if the given memory scope is allowed to allocate by the allocator. */
TVM_DLL virtual bool AllowMemoryScope(const std::string& mem_scope) const;

private:
AllocatorType type_;
};
Expand Down Expand Up @@ -137,17 +141,16 @@ class StorageObj : public Object {
public:
/*! \brief The index into the VM function table. */
Buffer buffer;
/*! \brief The allocator where the storage buffer is allocated from. */
Allocator* allocator;

/*! \brief Allocate an NDArray from a given piece of storage. */
TVM_DLL NDArray AllocNDArray(int64_t offset, ShapeTuple shape, DLDataType dtype);

/*! \brief The deleter for an NDArray when allocated from underlying storage. */
static void Deleter(Object* ptr);

~StorageObj() {
auto alloc = MemoryManager::Global()->GetAllocator(buffer.device, buffer.alloc_type);
alloc->Free(buffer);
}
~StorageObj() { allocator->Free(buffer); }

static constexpr const uint32_t _type_index = TypeIndex::kDynamic;
static constexpr const char* _type_key = "vm.Storage";
Expand All @@ -157,7 +160,7 @@ class StorageObj : public Object {
/*! \brief reference to storage. */
class Storage : public ObjectRef {
public:
TVM_DLL explicit Storage(Buffer buffer);
TVM_DLL explicit Storage(Buffer buffer, Allocator* allocator);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this constructor actually used? In this PR I only see manual attachment like storage_obj->allocator = allocator;.

We started to see a segfault like free(): invalid pointer when a python process that runs a script exits, after we integrated recent upstream changes. I'm investigating if the segfault is really due to some recent upstream changes, and which one if so.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is used so that recycling goes back to the original allocator; in this case we need to recycle to IPCMemory in the special allocator

Copy link
Member

@masahi masahi Apr 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is used

Can you point out the code? In our fork, I don't see it used other than paged_kv_cache.cc.

Copy link
Contributor Author

@MasterJH5574 MasterJH5574 Apr 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are mixed usages of Storage/StorageObj in TVM. The only use of the constructor seems to be in paged_kv_cache.cc in TVM. All other constructions of Storage use make_object and directly set the allocator

auto storage_obj = runtime::SimpleObjAllocator().make_object<memory::StorageObj>();
nccl::CCLThreadLocalContext* nccl_ctx = nccl::CCLThreadLocalContext::Get();
Device device{DLDeviceType::kDLCUDA, nccl_ctx->device_id};
CUDAIPCMemoryAllocator* allocator = CUDAIPCMemoryAllocator::Global();
storage_obj->buffer = CUDAIPCMemoryAllocator::Global()->Alloc(
device, std::move(buffer_shape), dtype_hint, /*mem_scope=*/"ipc_memory");
storage_obj->allocator = allocator;

https://github.com/apache/tvm/blob/622bd150dd331780eb41a1c67c65aae802eb9b20/src/runtime/relax_vm/builtin.cc

tvm/src/runtime/vm/vm.cc

Lines 848 to 859 in 622bd15

storage_obj->buffer = allocator->Alloc(device, ShapeTuple(shape_),
instr.alloc_storage.dtype_hint, mem_scope);
storage_obj->allocator = allocator;
} else {
auto size = LoadScalarInt(instr.alloc_storage.allocation_size);
auto alignment = instr.alloc_storage.alignment;
VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment
<< ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint)
<< ", device_index=" << instr.alloc_storage.device_index;
storage_obj->buffer =
allocator->Alloc(device, size, alignment, instr.alloc_storage.dtype_hint);
storage_obj->allocator = allocator;

There is a use of the constructor on MLC LLM side https://github.com/mlc-ai/mlc-llm/blob/main/cpp/serve/model.cc#L68-L69, where we use the constructor so that we don't need to expand it into the make_object lines.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What worries me is that the code is doing allocator->Free(buffer); in the destructor. But how can you be sure that it's safe if there is no guarantee that the allocator is always attached to the storage when it is created?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Can we just check the allocator is defined by adding ICHECK in the destructor, or change all make_object to use the constructor and add ICHECK inside? If I remember correctly, I made a pass over all occurrences and attached the allocators.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just check the allocator is defined by adding ICHECK in the destructor

Yes, I tried that in our fork but the segfault is still there. So this PR is not a cause of our problem.

But still, this doesn't change my concern about this code. I'm sure that you checked all usages, but there is always a possibility to miss some. And you never know about the usages in new code or someone else's fork.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I agree with your concern. I can send a patch to add some checks for more safety later on.


TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Storage, ObjectRef, StorageObj);
};
Expand Down
Loading