From bb12940c5bf972bf0dc187ac8d51f6dd28eaf773 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 29 Jul 2025 16:14:54 -0700 Subject: [PATCH 01/13] CUDA13 fixes Signed-off-by: Boris Fomitchev --- 3rdparty/cutlass | 2 +- cpp/tensorrt_llm/CMakeLists.txt | 1 + .../fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh | 2 ++ .../decoderMaskedMultiheadAttentionTemplate.h | 2 +- cpp/tensorrt_llm/kernels/sageAttentionKernels.cu | 4 ++-- .../kernels/speculativeDecoding/eagleDecodingKernels.cu | 2 +- .../kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu | 4 ++-- cpp/tensorrt_llm/runtime/utils/debugUtils.cu | 2 +- requirements.txt | 4 ++-- 9 files changed, 13 insertions(+), 10 deletions(-) diff --git a/3rdparty/cutlass b/3rdparty/cutlass index dc4817921ed..664c4f7b3ed 160000 --- a/3rdparty/cutlass +++ b/3rdparty/cutlass @@ -1 +1 @@ -Subproject commit dc4817921edda44a549197ff3a9dcf5df0636e7b +Subproject commit 664c4f7b3ed1959414905025728eef5568209479 diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index f8e3aaad9e2..2ce51ba05ca 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -20,6 +20,7 @@ set(SHARED_TARGET set(API_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/cutlass_extensions/include + /usr/local/cuda-13.0/targets/x86_64-linux/include/cccl ${API_INCLUDE_DIR}) set(TARGET_ARCH "unknown") diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh index b105368af03..af603a0a3c6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh @@ -24,6 +24,8 @@ #include #include +#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 + namespace tensorrt_llm::kernels::fp8_blockscale_gemm { diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h index ccda8ce2042..43409dc4b4a 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h @@ -2597,7 +2597,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske __shared__ typename BlockReduce::TempStorage temp_storage; // Obtain a segment of consecutive items that are blocked across threads (final_max from above) // Compute the block-wide max for thread0 - final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cub::Max(), gridDim.z); + final_max = BlockReduce(temp_storage).Reduce(thread_partial_max, cuda::maximum(), gridDim.z); __shared__ float final_max_smem; if (tidx == 0) diff --git a/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu b/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu index 80a12b41ce5..e45a7bb97f9 100644 --- a/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu +++ b/cpp/tensorrt_llm/kernels/sageAttentionKernels.cu @@ -250,7 +250,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i // Compute the block-wide max for thread0 // cuda::maximum<>{} - float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{}); + float aggregate = 
BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{}); if (row_id == 0 && col_id == 0) s_block_amax = static_cast(aggregate); @@ -429,7 +429,7 @@ __global__ void sage_quant_kernel(void const* q, void const* k, void const* v, i // Compute the block-wide max for thread0 // cuda::maximum<>{} - float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cub::Max{}); + float aggregate = BlockReduce(temp_storage).Reduce(local_amax, cuda::maximum{}); if (row_id == 0 && col_id == 0) s_block_amax = static_cast(aggregate); diff --git a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu index b3a90bea5f8..e963033855b 100644 --- a/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu +++ b/cpp/tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.cu @@ -504,7 +504,7 @@ __global__ void prepareGenEagleNetInputsKernel(SizeType32* nextSequenceLengths, BlockScan(tempStorage.scan).ExclusiveSum(numNextLogits, outputLastIndicesBase); // Sync because tempStorage is reused. __syncthreads(); - auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cub::Max()); + auto const maxGenLength = BlockReduce(tempStorage.reduce).Reduce(nextDraftLen, cuda::maximum()); // Thread 0 has the result. if (bid == 0) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu index ad5cd15fdda..ba850c45a2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu @@ -154,7 +154,7 @@ __global__ void activationDeepSeekKernel(KernelParams params) float constexpr E4m3MaxVal{448.f}; // Compute the absolute max - float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cub::Max()); + float aMax = BlockReduce(temp_storage).Reduce(fabsf(out), cuda::maximum()); if (threadIdx.x == 0) { s_scaleOut = aMax / E4m3MaxVal; @@ -657,7 +657,7 @@ __global__ void finalizeDeepSeekKernel(KernelParams params) float constexpr E4m3MaxVal{448.f}; // Compute the absolute max - float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cub::Max()); + float aMax = BlockReduce(temp_storage).Reduce(fabsf(acc), cuda::maximum()); if (threadIdx.x == 0) { diff --git a/cpp/tensorrt_llm/runtime/utils/debugUtils.cu b/cpp/tensorrt_llm/runtime/utils/debugUtils.cu index 7f1c8d8dfc6..661dacd9a7a 100644 --- a/cpp/tensorrt_llm/runtime/utils/debugUtils.cu +++ b/cpp/tensorrt_llm/runtime/utils/debugUtils.cu @@ -54,7 +54,7 @@ __global__ void checkTensorInvalidKernel(T const* data, std::size_t size, int* f __shared__ typename BlockReduceT::TempStorage tempStorage; // Compute block-wide maximum - int blockFound = BlockReduceT(tempStorage).Reduce(found, cub::Max()); + int blockFound = BlockReduceT(tempStorage).Reduce(found, cuda::maximum()); // Have thread 0 write out block's result if (threadIdx.x == 0) diff --git a/requirements.txt b/requirements.txt index 16c1e4b5f8c..1d534fa6647 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,9 +21,9 @@ pandas h5py==3.12.1 StrEnum sentencepiece>=0.1.99 -tensorrt~=10.11.0 +tensorrt~=10.13.0 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05 uses 2.8.0a0. 
-torch>=2.7.1,<=2.8.0a0 +torch>=2.7.1,<=2.9.0a0 torchvision nvidia-modelopt[torch]~=0.33.0 nvidia-nccl-cu12 From b18be91ff37227f6c311efaa7e3000fd97b85384 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 11 Aug 2025 18:01:41 -0700 Subject: [PATCH 02/13] Stash Signed-off-by: Boris Fomitchev --- cpp/CMakeLists.txt | 1 + .../tensorrt_llm/deep_gemm/tma_utils.cuh | 6 ++-- .../fp8_blockscale_tma_utils.cuh | 6 ++-- requirements.txt | 34 ++++++++++--------- 4 files changed, 25 insertions(+), 22 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6732db6eaa7..fe873a6e7c8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -212,6 +212,7 @@ endif() include_directories( SYSTEM ${CUDAToolkit_INCLUDE_DIRS} + /usr/local/cuda-13.0/targets/x86_64-linux/include/cccl ${CUDNN_ROOT_DIR}/include $ ${3RDPARTY_DIR}/cutlass/include diff --git a/cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh b/cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh index 411d7447600..33ddfd31ec3 100644 --- a/cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh +++ b/cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh @@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType() { } } -PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() +PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() { // Get pointer to `cuTensorMapEncodeTiled` cudaDriverEntryPointQueryResult driver_status; @@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() if (driver_status != cudaDriverEntryPointSuccess) throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess"); - return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr); + return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr); } template <typename T> CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes, - uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr) + uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr) { CUtensorMap tensor_map{}; constexpr uint32_t rank = 2; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh index b105368af03..18911feb7c4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh @@ -84,7 +84,7 @@ inline CUtensorMapDataType get_CUtensorMapDataType() { } } -PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() +PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() { // Get pointer to cuTensorMapEncodeTiled cudaDriverEntryPointQueryResult driver_status; @@ -101,12 +101,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() if (driver_status != cudaDriverEntryPointSuccess) { throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess"); } - return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr); + return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr); } template <typename data_type> CUtensorMap make_2d_tma_copy_desc(data_type* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes, - uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr) + uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr) { CUtensorMap tensor_map{}; constexpr uint32_t rank = 2; diff --git a/requirements.txt b/requirements.txt index 
16c1e4b5f8c..3d90380972e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,9 @@ ---extra-index-url https://download.pytorch.org/whl/cu128 +--extra-index-url https://download.pytorch.org/whl/cu130 -c constraints.txt accelerate>=0.25.0 build colored -cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image. +cuda-python~=13.0.0 # Do not override the custom version of cuda-python installed in the NGC PyTorch image. diffusers>=0.27.0 lark mpi4py @@ -13,27 +13,29 @@ onnx_graphsurgeon>=0.5.2 openai polygraphy psutil -nvidia-ml-py>=12,<13 +nvidia-ml-py +# >=12,<13 # Just a wrapper since nvidia-modelopt requires pynvml -pynvml==12.0.0 +pynvml pulp pandas -h5py==3.12.1 +h5py>=3.12.1 StrEnum sentencepiece>=0.1.99 -tensorrt~=10.11.0 +tensorrt~=10.13.0 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05 uses 2.8.0a0. -torch>=2.7.1,<=2.8.0a0 +torch>=2.7.1,<=2.8.0 torchvision nvidia-modelopt[torch]~=0.33.0 -nvidia-nccl-cu12 -nvidia-cuda-nvrtc-cu12 -transformers==4.53.1 +nvidia-nccl-cu13 +nvidia-cuda-nvrtc-cu13 +transformers~=4.55.0 pydantic>=2.9.1 pydantic-settings[yaml] omegaconf -pillow==10.3.0 -wheel<=0.45.1 +pillow>=10.3.0 +wheel +#<=0.45.1 optimum # evaluate needs datasets>=2.0.0 which triggers datasets>3.1.0 which is not stable: https://github.com/huggingface/datasets/issues/7467 datasets==3.1.0 @@ -43,15 +45,15 @@ click click_option_group aenum pyzmq -fastapi==0.115.4 +fastapi>=0.115.4 uvicorn setuptools<80 ordered-set peft einops -flashinfer-python==0.2.5 +flashinfer-python>=0.2.5 opencv-python-headless -xgrammar==0.1.19 +xgrammar>=0.1.19 backoff nvtx matplotlib # FIXME: this is added to make nvtx happy @@ -59,5 +61,5 @@ meson ninja etcd3 blake3 -llguidance==0.7.29 +llguidance>=0.7.29 soundfile From c61e9ba5cb785616768c887aa8ae9fac7757ba82 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Mon, 11 Aug 2025 23:08:37 -0700 Subject: [PATCH 03/13] Trying to build with CUDA13 Signed-off-by: Boris Fomitchev --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 7 +++++++ cpp/tensorrt_llm/kernels/topkLastDim.cu | 8 ++++---- requirements.txt | 4 ---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 088391aef4f..da2e6726171 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -1,3 +1,10 @@ +enable_language(C CXX CUDA) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) + set(DEEP_EP_COMMIT edf3ea2b086a393d3163bf2773eab69d9191cc01) set(NVSHMEM_URL_HASH SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a) diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.cu b/cpp/tensorrt_llm/kernels/topkLastDim.cu index 3371ab4a0f2..938e7d3d642 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.cu +++ b/cpp/tensorrt_llm/kernels/topkLastDim.cu @@ -1221,8 +1221,8 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx IdxT* sort_in_idx = nullptr; air_topk_stable::ComputeOffset<IdxT> computeoffset(k); - cub::CountingInputIterator<IdxT> counting_iter(0); - cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>> + thrust::counting_iterator<IdxT> counting_iter(0); + thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> transform_iter(counting_iter, computeoffset); cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * 
batch_size, batch_size, transform_iter, transform_iter + 1, stream); @@ -1348,8 +1348,8 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons const IdxT buf_len = air_topk_stable::calc_buf_len(len); air_topk_stable::ComputeOffset<IdxT> computeoffset(k); - cub::CountingInputIterator<IdxT> counting_iter(0); - cub::TransformInputIterator<IdxT, air_topk_stable::ComputeOffset<IdxT>, cub::CountingInputIterator<IdxT>> + thrust::counting_iterator<IdxT> counting_iter(0); + thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> transform_iter(counting_iter, computeoffset); cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size, batch_size, transform_iter, transform_iter + 1, stream); diff --git a/requirements.txt b/requirements.txt index c552865bd9a..2fa873c6cf7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -24,11 +24,7 @@ StrEnum sentencepiece>=0.1.99 tensorrt~=10.13.0 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05 uses 2.8.0a0. -<<<<<<< HEAD torch>=2.7.1,<=2.9.0a0 -======= -torch>=2.7.1,<=2.8.0 ->>>>>>> cuda13 torchvision nvidia-modelopt[torch]~=0.33.0 nvidia-nccl-cu13 From c4c79584793a29ad11c4de132b3225aef7a0f042 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Tue, 12 Aug 2025 01:31:37 -0700 Subject: [PATCH 04/13] Fixing bogus C++17 error Signed-off-by: Boris Fomitchev --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index da2e6726171..6b924077d56 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -1,10 +1,3 @@ -enable_language(C CXX CUDA) - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD}) - set(DEEP_EP_COMMIT edf3ea2b086a393d3163bf2773eab69d9191cc01) set(NVSHMEM_URL_HASH SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a) @@ -144,6 +137,10 @@ ExternalProject_Add( -DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES} -DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER} -DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER} + -DCMAKE_CXX_STANDARD:STRING=${CMAKE_CXX_STANDARD} + -DCMAKE_CUDA_STANDARD:STRING=${CMAKE_CUDA_STANDARD} + -DCMAKE_CXX_STANDARD_REQUIRED:STRING=${CMAKE_CXX_STANDARD_REQUIRED} + -DCMAKE_CXX_FLAGS:STRING="${CMAKE_CXX_FLAGS} -DCCCL_IGNORE_DEPRECATED_CPP_DIALECT=1" 
-DNVSHMEM_BUILD_EXAMPLES:BOOL=0 -DNVSHMEM_BUILD_PACKAGES:BOOL=0 -DNVSHMEM_BUILD_TESTS:BOOL=0 diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.cu b/cpp/tensorrt_llm/kernels/topkLastDim.cu index 938e7d3d642..b71d8a158b9 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.cu +++ b/cpp/tensorrt_llm/kernels/topkLastDim.cu @@ -25,6 +25,7 @@ #include "topkLastDim.h" #include #include +#include namespace tensorrt_llm { diff --git a/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp b/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp index d41aa157c50..a384f845d6f 100644 --- a/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp +++ b/cpp/tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.cpp @@ -364,7 +364,8 @@ void* HostAccessibleDeviceAllocator::allocate(size_t memorySize) TLLM_CHECK_WITH_INFO( mAllowManagedFallback, "HostAccessibleDeviceAllocator is not supported on the current system."); TLLM_CUDA_CHECK(cudaMallocManaged(&devPtr, memorySize)); - TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, currentDevId)); + cudaMemLocation location {cudaMemLocationTypeDevice, currentDevId}; + TLLM_CUDA_CHECK(cudaMemAdvise(devPtr, memorySize, cudaMemAdviseSetPreferredLocation, location)); hostPtr = devPtr; } recordAllocation(devPtr, memorySize, hostPtr, memDesc); From c47c7b507bb260676de3f95711d3716e2a488142 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Wed, 13 Aug 2025 23:20:29 -0700 Subject: [PATCH 06/13] Trying to build with CUDA13 Signed-off-by: Boris Fomitchev --- cpp/CMakeLists.txt | 9 ++ cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 98 +++---------------- .../kernels/cutlass_kernels/CMakeLists.txt | 2 +- cpp/tensorrt_llm/kernels/topkLastDim.cu | 12 +-- requirements.txt | 2 + 5 files changed, 29 insertions(+), 94 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fe873a6e7c8..3800e04140c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -250,6 +250,14 @@ if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "12.8") ) endif() +if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") + message( + STATUS + "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or equal than 13.0, adding visibility flags" + ) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --device-entity-has-hidden-visibility=false -cudart=shared") +endif() + if(ENABLE_MULTI_DEVICE) # MPI MPI isn't used until tensorrt_llm/CMakeLists.txt is invoked. 
However, if # it's not called before "CMAKE_CXX_FLAGS" is set, it breaks on Windows for @@ -366,6 +374,7 @@ if(NVCC_TIMING) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --time ${CMAKE_CURRENT_BINARY_DIR}/nvcc-timing.csv") endif() + message("CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR}) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 483c04a6a5f..41e1e67637a 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -38,6 +38,13 @@ endif() # Ensure that dependent libraries are installed find_library(MLX5_lib NAMES mlx5 REQUIRED) +set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem") +# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}") + +find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include) +find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) +find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) + # Prepare files # ============= @@ -81,90 +88,6 @@ foreach(_f IN LISTS _files) PROPERTY CMAKE_CONFIGURE_DEPENDS ${_src}) endforeach() -# Delete stale nvshmem on patch update -set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) -file(SHA256 ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch NVSHMEM_PATCH_HASH) -file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch - NVSHMEM_PATCH_2_HASH) -set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}") -string(APPEND NVSHMEM_STAMP_CONTENT " PATCH_COMMAND v1") -string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_HASH}") -string(APPEND NVSHMEM_STAMP_CONTENT " 103") -string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_2_HASH}") -set(OLD_NVSHMEM_STAMP_CONTENT "") -if(EXISTS ${NVSHMEM_STAMP_FILE}) - file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT) -endif() -if(NOT OLD_NVSHMEM_STAMP_CONTENT STREQUAL NVSHMEM_STAMP_CONTENT) - file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix) - file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}") -endif() -set_property( - DIRECTORY APPEND - PROPERTY CMAKE_CONFIGURE_DEPENDS - ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch - ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch) - -# Add NVSHMEM -# =========== - -# NVSHMEM only works with GCC. Building NVSHMEM with Clang results in -# compilation errors. Using NVSHMEM with Clang results in slow builds and device -# link issues. 
-if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - set(CMAKE_C_COMPILER gcc) - set(CMAKE_CXX_COMPILER g++) - set(CMAKE_CUDA_HOST_COMPILER g++) -endif() - -# Add nvshmem external project -include(ExternalProject) -ExternalProject_Add( - nvshmem_project - URL file://${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_src_3.2.5-1.txz - URL_HASH ${NVSHMEM_URL_HASH} - PATCH_COMMAND patch -p1 --forward --batch -i - ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch - COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i - src/CMakeLists.txt - COMMAND patch -p1 --forward --batch -i - ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch - CMAKE_CACHE_ARGS - -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} - -DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER} - -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} - -DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER} - -DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES} - -DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER} - -DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER} - -DCMAKE_CXX_STANDARD:STRING=${CMAKE_CXX_STANDARD} - -DCMAKE_CUDA_STANDARD:STRING=${CMAKE_CUDA_STANDARD} - -DCMAKE_CXX_STANDARD_REQUIRED:STRING=${CMAKE_CXX_STANDARD_REQUIRED} - -DNVSHMEM_BUILD_EXAMPLES:BOOL=0 - -DNVSHMEM_BUILD_PACKAGES:BOOL=0 - -DNVSHMEM_BUILD_TESTS:BOOL=0 - -DNVSHMEM_IBGDA_SUPPORT:BOOL=1 - -DNVSHMEM_IBRC_SUPPORT:BOOL=0 - -DNVSHMEM_MPI_SUPPORT:BOOL=0 - -DNVSHMEM_PMIX_SUPPORT:BOOL=0 - -DNVSHMEM_SHMEM_SUPPORT:BOOL=0 - -DNVSHMEM_TIMEOUT_DEVICE_POLLING:BOOL=0 - -DNVSHMEM_UCX_SUPPORT:BOOL=0 - -DNVSHMEM_USE_GDRCOPY:BOOL=0 - -DNVSHMEM_USE_NCCL:BOOL=0 - INSTALL_COMMAND "" - BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build - BUILD_BYPRODUCTS - ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a) -add_library(nvshmem_project::nvshmem STATIC IMPORTED) -add_dependencies(nvshmem_project::nvshmem nvshmem_project) -file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) -set_target_properties( - nvshmem_project::nvshmem - PROPERTIES IMPORTED_LOCATION - ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a - INTERFACE_INCLUDE_DIRECTORIES - ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) # Add DeepEP cpp # ============== @@ -191,7 +114,7 @@ set_target_properties( CUDA_SEPARABLE_COMPILATION ON CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}" LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version - INSTALL_RPATH "$ORIGIN/libs/nvshmem;${TORCH_INSTALL_PREFIX}/lib" + INSTALL_RPATH "${TORCH_INSTALL_PREFIX}/lib" BUILD_WITH_INSTALL_RPATH TRUE) target_compile_options( deep_ep_cpp_tllm @@ -200,8 +123,9 @@ target_compile_options( target_compile_definitions( deep_ep_cpp_tllm PRIVATE DISABLE_AGGRESSIVE_PTX_INSTRS TORCH_EXTENSION_NAME=deep_ep_cpp_tllm) +target_include_directories(deep_ep_cpp_tllm PRIVATE ${NVSHMEM_INCLUDE_DIR}) target_link_libraries( - deep_ep_cpp_tllm PRIVATE nvshmem_project::nvshmem ${TORCH_LIBRARIES} + deep_ep_cpp_tllm PRIVATE ${NVSHMEM_DEVICE_LIBRARY} ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIB}) target_link_options( deep_ep_cpp_tllm PRIVATE @@ -210,4 +134,4 @@ target_link_options( # Set targets # =========== -add_dependencies(deep_ep deep_ep_cpp_tllm nvshmem_project) +add_dependencies(deep_ep deep_ep_cpp_tllm) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt index 7a02cdee73f..4a0e7d21c5a 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt +++ 
b/cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt @@ -190,7 +190,7 @@ set_cuda_architectures(fb_gemm_src 89 90 120f) # ${INSTANTIATION_GENERATION_DIR}/fp8_rowwise_gemm) add_library(fp8_blockscale_gemm_src STATIC ${FP8_BLOCKSCALE_GEMM_SRC_CU}) -set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f) +set_cuda_architectures(fp8_blockscale_gemm_src 90) set(GEMM_SWIGLU_SM90_SRC_CU ${CMAKE_CURRENT_SOURCE_DIR}/fused_gated_gemm/gemm_swiglu_e4m3.cu) diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.cu b/cpp/tensorrt_llm/kernels/topkLastDim.cu index b71d8a158b9..3006f0f6466 100644 --- a/cpp/tensorrt_llm/kernels/topkLastDim.cu +++ b/cpp/tensorrt_llm/kernels/topkLastDim.cu @@ -25,6 +25,7 @@ #include "topkLastDim.h" #include #include +#include #include namespace tensorrt_llm { @@ -1222,9 +1223,9 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx IdxT* sort_in_idx = nullptr; air_topk_stable::ComputeOffset<IdxT> computeoffset(k); - thrust::counting_iterator<IdxT> counting_iter(0); - thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> - transform_iter(counting_iter, computeoffset); + auto counting_iter = thrust::make_counting_iterator(0); + auto transform_iter = thrust::make_transform_iterator(counting_iter, computeoffset); + cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size, batch_size, transform_iter, transform_iter + 1, stream); if (sorted) @@ -1349,9 +1350,8 @@ void standalone_stable_radix_topk_one_block_(void* buf, size_t& buf_size, T cons const IdxT buf_len = air_topk_stable::calc_buf_len(len); air_topk_stable::ComputeOffset<IdxT> computeoffset(k); - thrust::counting_iterator<IdxT> counting_iter(0); - thrust::transform_iterator<air_topk_stable::ComputeOffset<IdxT>, thrust::counting_iterator<IdxT>> - transform_iter(counting_iter, computeoffset); + auto counting_iter = thrust::make_counting_iterator(0); + auto transform_iter = thrust::make_transform_iterator(counting_iter, computeoffset); cub::DeviceSegmentedSort::SortPairs(NULL, temp_storage_bytes, out_idx, out_idx, out, out, k * batch_size, batch_size, transform_iter, transform_iter + 1, stream); diff --git a/requirements.txt b/requirements.txt index 2fa873c6cf7..130a20755e8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,7 @@ torch>=2.7.1,<=2.9.0a0 torchvision nvidia-modelopt[torch]~=0.33.0 nvidia-nccl-cu13 +nvidia-nvshmem-cu13 nvidia-cuda-nvrtc-cu13 transformers~=4.55.0 pydantic>=2.9.1 @@ -63,3 +64,4 @@ etcd3 blake3 llguidance>=0.7.29 soundfile + From 0fdaec99d71928b218a3114ca5c2bd0e9cd5b417 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 14 Aug 2025 00:08:58 -0700 Subject: [PATCH 07/13] Fixed template symbol visibility issue Signed-off-by: Boris Fomitchev --- cpp/tensorrt_llm/kernels/beamSearchKernels.cu | 26 -------------- cpp/tensorrt_llm/kernels/beamSearchKernels.h | 35 ++++++++++++++++++- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchKernels.cu index 97c35478bca..d606dfea164 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels.cu @@ -134,32 +134,6 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& sync_check_cuda_error(stream); } -template <typename T> -__global__ void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, - FinishedState const* finished, int const* endIds, float const* diversityRates, - runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, 
size_t const nBMOut, size_t const nBM) -{ - int const bid = blockIdx.x; // Index of request in batch - runtime::SizeType32 const slot = batchSlots[bid]; - float const diversityRate{diversityRates[slot]}; - T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2; - - for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x) - { - int const iBMIn = i / (nBMOut * 2); - if (finished[slot * nBMIn + iBMIn].isFinished()) - { - pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f; - } - else - { - // nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer - pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn; - } - } - return; -} - template __global__ void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, FinishedState const* finished, int const* endIds, float const* diversityRates, runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels.h b/cpp/tensorrt_llm/kernels/beamSearchKernels.h index 10a285af900..58d0e89e6a6 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels.h +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels.h @@ -131,9 +131,42 @@ void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream); template <typename T> -__global__ void addCumLogProbs(T* __restrict pStage1Probs, float const* __restrict cumLogProbs, +__global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, + FinishedState const* finished, int const* endIds, float const* diversityRates, + runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM) +#ifdef __CUDACC__ +{ + int const bid = blockIdx.x; // Index of request in batch + runtime::SizeType32 const slot = batchSlots[bid]; + float const diversityRate{diversityRates[slot]}; + T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2; + + for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x) + { + int const iBMIn = i / (nBMOut * 2); + if (finished[slot * nBMIn + iBMIn].isFinished()) + { + pLocalLogProbs[i] += (i == endIds[slot]) ? 
1.0f : 0.0f; + } + else + { + // nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer + pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn; + } + } + return; +} +#else +; +extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, + FinishedState const* finished, int const* endIds, float const* diversityRates, + runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); + +extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, FinishedState const* finished, int const* endIds, float const* diversityRates, runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); +#endif + __global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nV); From bcca9841e5a38e252e661052811ec1495f1626e8 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 14 Aug 2025 00:36:31 -0700 Subject: [PATCH 08/13] Fixed compilation Signed-off-by: Boris Fomitchev --- cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 41e1e67637a..36b3a864ddf 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -38,12 +38,13 @@ endif() # Ensure that dependent libraries are installed find_library(MLX5_lib NAMES mlx5 REQUIRED) -set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/shmem") +set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/nvshmem") # message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}") find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include) -find_library(NVSHMEM_HOST_LIBRARY nvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) +# set(NVSHMEM_HOST_LIBRARY ${NVSHMEM_INSTALL_PREFIX}/lib/libnvshmem_host.so.3) find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) +find_library(NVSHMEM_HOST_LIBRARY libnvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) # Prepare files @@ -114,7 +115,7 @@ set_target_properties( CUDA_SEPARABLE_COMPILATION ON CUDA_ARCHITECTURES "${DEEP_EP_CUDA_ARCHITECTURES}" LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/deep_ep_cpp_tllm.version - INSTALL_RPATH "${TORCH_INSTALL_PREFIX}/lib" + INSTALL_RPATH "${TORCH_INSTALL_PREFIX}/lib;${NVSHMEM_INSTALL_PREFIX}/lib" BUILD_WITH_INSTALL_RPATH TRUE) target_compile_options( deep_ep_cpp_tllm @@ -125,7 +126,7 @@ target_compile_definitions( TORCH_EXTENSION_NAME=deep_ep_cpp_tllm) target_include_directories(deep_ep_cpp_tllm PRIVATE ${NVSHMEM_INCLUDE_DIR}) target_link_libraries( - deep_ep_cpp_tllm PRIVATE ${NVSHMEM_DEVICE_LIBRARY} ${TORCH_LIBRARIES} + deep_ep_cpp_tllm PRIVATE ${NVSHMEM_DEVICE_LIBRARY} ${NVSHMEM_HOST_LIBRARY} ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIB}) target_link_options( deep_ep_cpp_tllm PRIVATE From 4eb76186ce929c1fd00a24a76a3e482279bd01d3 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 14 Aug 2025 01:13:36 -0700 Subject: [PATCH 09/13] Addressing code review, cleanup Signed-off-by: Boris Fomitchev --- cpp/CMakeLists.txt | 1 + cpp/tensorrt_llm/kernels/beamSearchKernels.h | 11 +---------- 
.../fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh | 2 -- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3800e04140c..ec7c4ff9d34 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -256,6 +256,7 @@ if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or equal than 13.0, adding visibility flags" ) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --device-entity-has-hidden-visibility=false -cudart=shared") + ### set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) endif() diff --git a/cpp/tensorrt_llm/kernels/beamSearchKernels.h b/cpp/tensorrt_llm/kernels/beamSearchKernels.h index 58d0e89e6a6..c0ad49eb097 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchKernels.h +++ b/cpp/tensorrt_llm/kernels/beamSearchKernels.h @@ -130,11 +130,11 @@ void invokeTopkBeamSearch(T const* logProbs, T const* bias, void* workspace, Bea void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& bh, runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream); +#ifdef __CUDACC__ template <typename T> __global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, FinishedState const* finished, int const* endIds, float const* diversityRates, runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM) -#ifdef __CUDACC__ { int const bid = blockIdx.x; // Index of request in batch runtime::SizeType32 const slot = batchSlots[bid]; @@ -156,15 +156,6 @@ __global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restr } return; } -#else -; -extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs(float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, - FinishedState const* finished, int const* endIds, float const* diversityRates, - runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); - -extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs(half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, - FinishedState const* finished, int const* endIds, float const* diversityRates, - runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); #endif diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh index a9e19b822fa..18911feb7c4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh @@ -24,8 +24,6 @@ #include #include -#define PFN_cuTensorMapEncodeTiled PFN_cuTensorMapEncodeTiled_v12000 - namespace tensorrt_llm::kernels::fp8_blockscale_gemm { From 0d70377c0c636308f020c52b94eec6708ac8312e Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 14 Aug 2025 01:31:58 -0700 Subject: [PATCH 10/13] Removed flashinfer-python for now (brings in cuda12 libs) Signed-off-by: Boris Fomitchev --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 130a20755e8..40ad3d1e626 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -52,7 +52,7 @@ setuptools<80 ordered-set peft einops -flashinfer-python>=0.2.5 +# flashinfer-python>=0.2.5 opencv-python-headless xgrammar>=0.1.19 backoff From b5aa6762713be1666bf4dec62734721de01d07b2 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Thu, 14 Aug 2025 12:52:48 -0700 Subject: [PATCH 11/13] Cleanup Signed-off-by: Boris Fomitchev --- cpp/CMakeLists.txt | 10 +--------- requirements.txt | 5 ++--- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ec7c4ff9d34..50704817be5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -208,11 +208,11 @@ if(BINDING_TYPE STREQUAL "nanobind") ${CMAKE_CURRENT_BINARY_DIR}/nanobind) endif() + # include as system to suppress warnings include_directories( SYSTEM ${CUDAToolkit_INCLUDE_DIRS} - /usr/local/cuda-13.0/targets/x86_64-linux/include/cccl ${CUDNN_ROOT_DIR}/include $ ${3RDPARTY_DIR}/cutlass/include @@ -250,14 +250,6 @@ if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "12.8") ) endif() -if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") - message( - STATUS - "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or equal than 13.0, adding visibility flags" - ) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --device-entity-has-hidden-visibility=false -cudart=shared") - ### set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) -endif() if(ENABLE_MULTI_DEVICE) # MPI MPI isn't used until tensorrt_llm/CMakeLists.txt is invoked. However, if diff --git a/requirements.txt b/requirements.txt index 40ad3d1e626..c5db93a00f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -52,9 +52,9 @@ setuptools<80 ordered-set peft einops -# flashinfer-python>=0.2.5 +### flashinfer-python>=0.2.5 ### installs triton opencv-python-headless -xgrammar>=0.1.19 +### xgrammar>=0.1.19 ### installs triton backoff nvtx matplotlib # FIXME: this is added to make nvtx happy @@ -64,4 +64,3 @@ etcd3 blake3 llguidance>=0.7.29 soundfile - From 155f262d56d887a8d6bb7e57cca0c667416ff592 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sun, 17 Aug 2025 01:15:57 -0700 Subject: [PATCH 12/13] cleanup Signed-off-by: Boris Fomitchev --- cpp/CMakeLists.txt | 8 ++ cpp/tensorrt_llm/deep_ep/CMakeLists.txt | 100 ++++++++++++++++++++++-- 2 files changed, 100 insertions(+), 8 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 50704817be5..d89967516a8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -250,6 +250,14 @@ if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "12.8") ) endif() +if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") + message( + STATUS + "CUDAToolkit_VERSION ${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR} is greater or equal than 13.0, setting CMAKE_CUDA_RUNTIME_LIBRARY to Shared" + ) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --device-entity-has-hidden-visibility=false -cudart=shared") + set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) +endif() if(ENABLE_MULTI_DEVICE) # MPI MPI isn't used until tensorrt_llm/CMakeLists.txt is invoked. 
However, if diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt index 36b3a864ddf..ec90d7056f4 100644 --- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt @@ -36,15 +36,99 @@ if(NOT DEEP_EP_CUDA_ARCHITECTURES) return() endif() +# TODO: restore patched nvshmem for CUDA12 +if(${CUDAToolkit_VERSION} VERSION_GREATER_EQUAL "13.0") + set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/nvshmem") + find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include) + find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) + find_library(NVSHMEM_HOST_LIBRARY libnvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) +else() + set(NVSHMEM_INSTALL_PREFIX "$ORIGIN/libs/nvshmem") + # Delete stale nvshmem on patch update + set(NVSHMEM_STAMP_FILE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_stamp.txt) + file(SHA256 ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch NVSHMEM_PATCH_HASH) + file(SHA256 ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch + NVSHMEM_PATCH_2_HASH) + set(NVSHMEM_STAMP_CONTENT "${NVSHMEM_URL_HASH}") + string(APPEND NVSHMEM_STAMP_CONTENT " PATCH_COMMAND v1") + string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_HASH}") + string(APPEND NVSHMEM_STAMP_CONTENT " 103") + string(APPEND NVSHMEM_STAMP_CONTENT " ${NVSHMEM_PATCH_2_HASH}") + set(OLD_NVSHMEM_STAMP_CONTENT "") + if(EXISTS ${NVSHMEM_STAMP_FILE}) + file(READ ${NVSHMEM_STAMP_FILE} OLD_NVSHMEM_STAMP_CONTENT) + endif() + if(NOT OLD_NVSHMEM_STAMP_CONTENT STREQUAL NVSHMEM_STAMP_CONTENT) + file(REMOVE_RECURSE ${CMAKE_CURRENT_BINARY_DIR}/nvshmem_project-prefix) + file(WRITE ${NVSHMEM_STAMP_FILE} "${NVSHMEM_STAMP_CONTENT}") + endif() + set_property( + DIRECTORY APPEND + PROPERTY CMAKE_CONFIGURE_DEPENDS + ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch + ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch) + + # Add NVSHMEM + # =========== + + # NVSHMEM only works with GCC. Building NVSHMEM with Clang results in + # compilation errors. Using NVSHMEM with Clang results in slow builds and device + # link issues. 
+ if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + set(CMAKE_C_COMPILER gcc) + set(CMAKE_CXX_COMPILER g++) + set(CMAKE_CUDA_HOST_COMPILER g++) + endif() + + # Add nvshmem external project + include(ExternalProject) + ExternalProject_Add( + nvshmem_project + URL file://${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_src_3.2.5-1.txz + URL_HASH ${NVSHMEM_URL_HASH} + PATCH_COMMAND patch -p1 --forward --batch -i + ${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch + COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i + src/CMakeLists.txt + COMMAND patch -p1 --forward --batch -i + ${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch + CMAKE_CACHE_ARGS + -DCMAKE_C_COMPILER:STRING=${CMAKE_C_COMPILER} + -DCMAKE_C_COMPILER_LAUNCHER:STRING=${CMAKE_C_COMPILER_LAUNCHER} + -DCMAKE_CXX_COMPILER:STRING=${CMAKE_CXX_COMPILER} + -DCMAKE_CXX_COMPILER_LAUNCHER:STRING=${CMAKE_CXX_COMPILER_LAUNCHER} + -DCMAKE_CUDA_ARCHITECTURES:STRING=${DEEP_EP_CUDA_ARCHITECTURES} + -DCMAKE_CUDA_HOST_COMPILER:STRING=${CMAKE_CUDA_HOST_COMPILER} + -DCMAKE_CUDA_COMPILER_LAUNCHER:STRING=${CMAKE_CUDA_COMPILER_LAUNCHER} + -DNVSHMEM_BUILD_EXAMPLES:BOOL=0 + -DNVSHMEM_BUILD_PACKAGES:BOOL=0 + -DNVSHMEM_BUILD_TESTS:BOOL=0 + -DNVSHMEM_IBGDA_SUPPORT:BOOL=1 + -DNVSHMEM_IBRC_SUPPORT:BOOL=0 + -DNVSHMEM_MPI_SUPPORT:BOOL=0 + -DNVSHMEM_PMIX_SUPPORT:BOOL=0 + -DNVSHMEM_SHMEM_SUPPORT:BOOL=0 + -DNVSHMEM_TIMEOUT_DEVICE_POLLING:BOOL=0 + -DNVSHMEM_UCX_SUPPORT:BOOL=0 + -DNVSHMEM_USE_GDRCOPY:BOOL=0 + -DNVSHMEM_USE_NCCL:BOOL=0 + INSTALL_COMMAND "" + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build + BUILD_BYPRODUCTS + ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a) + add_library(nvshmem_project::nvshmem STATIC IMPORTED) + add_dependencies(nvshmem_project::nvshmem nvshmem_project) + file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) + set_target_properties( + nvshmem_project::nvshmem + PROPERTIES IMPORTED_LOCATION + ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/lib/libnvshmem.a + INTERFACE_INCLUDE_DIRECTORIES + ${CMAKE_CURRENT_BINARY_DIR}/nvshmem-build/src/include) +endif() + # Ensure that dependent libraries are installed -find_library(MLX5_lib NAMES mlx5 REQUIRED) -set(NVSHMEM_INSTALL_PREFIX "${TORCH_INSTALL_PREFIX}/../nvidia/nvshmem") -# message("Torch Install Prefix: ${TORCH_INSTALL_PREFIX}") - -find_path(NVSHMEM_INCLUDE_DIR nvshmem.h HINTS ${NVSHMEM_INSTALL_PREFIX}/include) -# set(NVSHMEM_HOST_LIBRARY ${NVSHMEM_INSTALL_PREFIX}/lib/libnvshmem_host.so.3) -find_library(NVSHMEM_DEVICE_LIBRARY nvshmem_device HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) -find_library(NVSHMEM_HOST_LIBRARY libnvshmem_host.so.3 HINTS ${NVSHMEM_INSTALL_PREFIX}/lib) +find_library(MLX5_lib NAMES mlx5 libmlx5.so.1 REQUIRED) # Prepare files From e331ef015c00e88423b033e6ec43dc800e2d6fe1 Mon Sep 17 00:00:00 2001 From: Boris Fomitchev Date: Sun, 17 Aug 2025 15:15:08 -0700 Subject: [PATCH 13/13] Fixing wheel build for nvshmem artifacts Signed-off-by: Boris Fomitchev --- scripts/build_wheel.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 3fdaa93febb..7c0fe3d1b01 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -669,18 +669,21 @@ def get_binding_lib(subdirectory, name): "deep_ep", deep_ep_dir, dirs_exist_ok=True) + (lib_dir / "nvshmem").mkdir(exist_ok=True) - install_file( - build_dir / "tensorrt_llm/deep_ep/nvshmem-build/License.txt", - lib_dir / "nvshmem") - install_file( - build_dir / - 
"tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_bootstrap_uid.so.3", - lib_dir / "nvshmem") - install_file( - build_dir / - "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103", - lib_dir / "nvshmem") + nvshmem_license = build_dir / "tensorrt_llm/deep_ep/nvshmem-build/License.txt" + if nvshmem_license.exists(): + install_file( + build_dir / "tensorrt_llm/deep_ep/nvshmem-build/License.txt", + lib_dir / "nvshmem") + install_file( + build_dir / + "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_bootstrap_uid.so.3", + lib_dir / "nvshmem") + install_file( + build_dir / + "tensorrt_llm/deep_ep/nvshmem-build/src/lib/nvshmem_transport_ibgda.so.103", + lib_dir / "nvshmem") if not skip_stubs: with working_directory(project_dir): if binding_type == "nanobind":