-
Notifications
You must be signed in to change notification settings - Fork 1.8k
[WIP][chore] : CUDA13 build #6890
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
bb12940
b18be91
e172f96
c61e9ba
c4c7958
32309ae
c47c7b5
0fdaec9
bcca984
4eb7618
0d70377
b5aa676
155f262
e331ef0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -95,7 +95,7 @@ constexpr CUtensorMapDataType get_CUtensorMapDataType() | |
} | ||
} | ||
|
||
PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() | ||
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() | ||
{ | ||
// Get pointer to `cuTensorMapEncodeTiled` | ||
cudaDriverEntryPointQueryResult driver_status; | ||
|
@@ -110,12 +110,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() | |
|
||
if (driver_status != cudaDriverEntryPointSuccess) | ||
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess"); | ||
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr); | ||
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr); | ||
} | ||
|
||
template <typename T> | ||
CUtensorMap make_2d_tma_copy_desc(T* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes, | ||
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr) | ||
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr) | ||
{ | ||
Comment on lines
116
to
119
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chainDefault parameter type should match the stable PFN type Align the encode_func param with the stable, unversioned type to avoid exposing versioned types. - uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+ uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr) Double-check the other call sites for signature consistency. 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== show file (if exists) ==="
FILE="cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh"
if [ -f "$FILE" ]; then
sed -n '1,240p' "$FILE" || true
else
echo "File not found: $FILE"
fi
echo
echo "=== rg: PFN_cuTensorMapEncodeTiled_v12000 ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled_v12000" || true
echo
echo "=== rg: PFN_cuTensorMapEncodeTiled ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled(?!_v12000)" || true
echo
echo "=== rg: make_2d_tma_copy_desc ==="
rg --hidden -n "make_2d_tma_copy_desc" || true
echo
echo "=== rg: cuTensorMapEncodeTiled occurrences ==="
rg --hidden -n "cuTensorMapEncodeTiled" || true
echo
echo "=== rg: cuTensorMap related typedefs/usages ==="
rg --hidden -n "cuTensorMap" || true Length of output: 25549 🏁 Script executed: #!/bin/bash
set -euo pipefail
echo "=== fp8_blockscale_tma_utils.cuh (head) ==="
sed -n '1,240p' cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh || true
echo
echo "=== cudaDriverWrapper.h (head) ==="
sed -n '1,320p' cpp/tensorrt_llm/common/cudaDriverWrapper.h || true
echo
echo "=== cudaDriverWrapper.cpp (head) ==="
sed -n '1,360p' cpp/tensorrt_llm/common/cudaDriverWrapper.cpp || true
echo
echo "=== tma_utils.cuh (relevant region) ==="
sed -n '1,220p' cpp/include/tensorrt_llm/deep_gemm/tma_utils.cuh || true
echo
echo "=== rg: where PFN_cuTensorMapEncodeTiled is defined/mentioned ==="
rg --hidden -n "PFN_cuTensorMapEncodeTiled" || true Length of output: 32409 Prefer the stable PFN_cuTensorMapEncodeTiled type for the default parameter Use the unversioned PFN to avoid leaking the versioned typedef; callers I checked do not pass encode_func explicitly. Files to update:
Call sites verified (no explicit encode_func argument): cpp/include/tensorrt_llm/deep_gemm/fp8_gemm.cuh, cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh, etc. Optional safety: add a local fallback in headers to map the unversioned name to the v12000 typedef if the unversioned PFN is not present: #ifndef PFN_cuTensorMapEncodeTiled 🤖 Prompt for AI Agents
|
||
CUtensorMap tensor_map{}; | ||
constexpr uint32_t rank = 2; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -130,10 +130,34 @@ void invokeTopkBeamSearch(T const* logProbs, T const* bias, void* workspace, Bea | |
void invokeUpdateCacheIndirection(int* tgtCI, int const* srcCI, BeamHypotheses& bh, | ||
runtime::SizeType32 const maxAttentionWindow, runtime::SizeType32 sinkTokenLength, cudaStream_t stream); | ||
|
||
#ifdef __CUDACC__ | ||
template <typename T> | ||
__global__ void addCumLogProbs(T* __restrict pStage1Probs, float const* __restrict cumLogProbs, | ||
__global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, | ||
FinishedState const* finished, int const* endIds, float const* diversityRates, | ||
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); | ||
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM) | ||
{ | ||
int const bid = blockIdx.x; // Index of request in batch | ||
runtime::SizeType32 const slot = batchSlots[bid]; | ||
float const diversityRate{diversityRates[slot]}; | ||
T* pLocalLogProbs = pStage1LogProbs + bid * nBMIn * nBMOut * 2; | ||
|
||
for (int i = threadIdx.x; i < nBMIn * nBMOut * 2; i += blockDim.x) | ||
{ | ||
int const iBMIn = i / (nBMOut * 2); | ||
if (finished[slot * nBMIn + iBMIn].isFinished()) | ||
{ | ||
pLocalLogProbs[i] += (i == endIds[slot]) ? 1.0f : 0.0f; | ||
} | ||
else | ||
Comment on lines
+147
to
+151
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chainEOS/endIds indexing appears incorrect and likely out-of-bounds for [BS, BM] layout endIds is documented as [BS, BM], but the code indexes it as endIds[slot] and compares it to i (a flattened [BM_in, BM_out, 2] index). Two issues:
This can cause incorrect scoring for finished beams and undefined behavior if endIds has only [BS, BM] elements. Please confirm shapes and intended semantics. If indeed per-beam EOS, a minimal indexing fix would be:
To help locate references and prior logic, run: 🏁 Script executed: #!/bin/bash
# Look for other usages of endIds indexing and EOS handling in kernels
rg -n -A3 -B3 $'endIds\\[' cpp | sed -n '1,200p'
# Find all write sites for pStage1LogProbs to infer its layout
rg -n -A3 -B3 'pStage1(Log)?Probs' cpp | sed -n '1,200p'
# Find all callers of addCumLogProbs and their launch grids to reason about mapping
rg -n -A3 -B3 'addCumLogProbs\\s*<|<<<' cpp | sed -n '1,200p' Length of output: 45150 🏁 Script executed: #!/bin/bash
sed -n '1,240p' cpp/tensorrt_llm/kernels/beamSearchKernels.h Length of output: 15146 🏁 Script executed: #!/bin/bash
sed -n '1,420p' cpp/tensorrt_llm/kernels/beamSearchKernels/beamSearchKernelsTemplate.h Length of output: 17803 Fix required — addCumLogProbs compares candidate index to EOS token and ignores beam dim Short summary: addCumLogProbs (cpp/tensorrt_llm/kernels/beamSearchKernels.h) iterates i over the flattened candidate grid (nBMIn * nBMOut * 2) and does (i == endIds[slot]). i is a candidate index, not a token id; also the code ignores the beam index in endIds. This will almost never match the EOS token and therefore fails to prefer EOS for finished beams (and is inconsistent with other kernels that treat endIds as per-slot token ids). Locations to fix:
Suggested minimal changes (apply and adjust to your code style):
Extra: clarify the declared shape of endIds in BeamHypotheses (cpp/tensorrt_llm/kernels/beamSearchKernels.h) — comments currently read [BS, BM] but most kernels use endIds[slot] (per-slot). Confirm whether endIds should be [BS] or [BS, BM] and update code/comments and indexing across kernels accordingly. Reason: The kernel must compare the candidate's token id (from stage-1 ids) to the EOS token id; comparing the flattened candidate index is incorrect and will break finished-beam handling. 🤖 Prompt for AI Agents
|
||
{ | ||
// nBM is used in VBWS since `cumLogProbs` is initialized with kMaxBeamWidth earlier than BeamSearchLayer | ||
pLocalLogProbs[i] += cumLogProbs[slot * nBM + iBMIn] + diversityRate * iBMIn; | ||
} | ||
} | ||
return; | ||
} | ||
#endif | ||
Comment on lines
+133
to
+159
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 💡 Verification agent 🧩 Analysis chainAvoid ODR/duplicate definitions: add extern templates or move the kernel out of the header Defining a templated
Minimal change (add extern templates under the CUDA guard): #ifdef __CUDACC__
template <typename T>
__global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
{
@@
}
-#endif
+// Prevent implicit instantiation; explicit instantiations live in the .cu
+extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs<float>(
+ float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, FinishedState const* finished,
+ int const* __restrict endIds, int const* __restrict pStage1Ids, float const* __restrict diversityRates,
+ runtime::SizeType32 const* __restrict batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+
+extern template __global__ __attribute__((visibility("default"))) void addCumLogProbs<half>(
+ half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs, FinishedState const* finished,
+ int const* __restrict endIds, int const* __restrict pStage1Ids, float const* __restrict diversityRates,
+ runtime::SizeType32 const* __restrict batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
+#endif And update the explicit instantiations in the .cu to match the new signature: // cpp/tensorrt_llm/kernels/beamSearchKernels.cu (adjust signature)
template __global__ void addCumLogProbs<float>(float* __restrict, float const* __restrict,
FinishedState const*, int const* __restrict, int const* __restrict, float const* __restrict,
runtime::SizeType32 const* __restrict, size_t const, size_t const, size_t const, size_t const);
template __global__ void addCumLogProbs<half>(half* __restrict, float const* __restrict,
FinishedState const*, int const* __restrict, int const* __restrict, float const* __restrict,
runtime::SizeType32 const* __restrict, size_t const, size_t const, size_t const, size_t const); Alternative: Move the kernel body to the .cu and leave a guarded declaration in the header to avoid any template instantiation from headers. To confirm there are explicit instantiations today and avoid duplicates, run: 🏁 Script executed: #!/bin/bash
rg -n -A2 -B2 $'template\\s+__global__\\s+void\\s+addCumLogProbs' cpp | sed -n '1,200p' Length of output: 1181 Move the CUDA kernel body out of the header (or add correct extern template specializations) — explicit instantiations exist in the .cu The header currently defines a templated global kernel while cpp/tensorrt_llm/kernels/beamSearchKernels.cu already contains explicit instantiations for float and half (rg found instantiations at lines ~137 and ~141). This risks ODR/duplicate-definition errors. Fix by removing the definition from the header and leaving only a declaration, or (less preferred) add matching extern template specializations. Files to change
Suggested minimal change (preferred: move body to .cu; in the header leave only the declaration): #ifdef __CUDACC__
-template <typename T>
-__global__ __attribute__((visibility("default"))) void addCumLogProbs(T* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
- FinishedState const* finished, int const* endIds, float const* diversityRates,
- runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM)
-{
- ...
-}
+#ifdef __CUDACC__
+template <typename T>
+__global__ __attribute__((visibility("default"))) void addCumLogProbs(
+ T* __restrict pStage1LogProbs,
+ float const* __restrict cumLogProbs,
+ FinishedState const* finished,
+ int const* endIds,
+ float const* diversityRates,
+ runtime::SizeType32 const* batchSlots,
+ size_t const nBS,
+ size_t const nBMIn,
+ size_t const nBMOut,
+ size_t const nBM);
#endif Optional alternative (if you must keep the body in the header): add extern template declarations that exactly match the explicit instantiations in the .cu (use the same parameter list and types). Example externs (place under the CUDA guard after the definition if you keep it, though moving the body is safer): extern template __global__ void addCumLogProbs<float>(
float* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM);
extern template __global__ void addCumLogProbs<half>(
half* __restrict pStage1LogProbs, float const* __restrict cumLogProbs,
FinishedState const* finished, int const* endIds, float const* diversityRates,
runtime::SizeType32 const* batchSlots, size_t const nBS, size_t const nBMIn, size_t const nBMOut, size_t const nBM); Action: prefer moving the kernel body to the .cu and keep the declaration in the header; that aligns with the existing explicit instantiations and avoids ODR issues. |
||
|
||
|
||
__global__ void gatherId(int const* __restrict pStage1Id, int* __restrict pStage2Id, size_t const nBS, | ||
size_t const nBMIn, size_t const nBMOut, size_t const nV); | ||
|
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -84,7 +84,7 @@ inline CUtensorMapDataType get_CUtensorMapDataType() | |||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() | ||||||||||
PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() | ||||||||||
{ | ||||||||||
// Get pointer to cuTensorMapEncodeTiled | ||||||||||
cudaDriverEntryPointQueryResult driver_status; | ||||||||||
|
@@ -101,12 +101,12 @@ PFN_cuTensorMapEncodeTiled get_cuTensorMapEncodeTiled() | |||||||||
throw std::runtime_error("driver_status != cudaDriverEntryPointSuccess"); | ||||||||||
} | ||||||||||
|
||||||||||
return reinterpret_cast<PFN_cuTensorMapEncodeTiled>(cuTensorMapEncodeTiled_ptr); | ||||||||||
return reinterpret_cast<PFN_cuTensorMapEncodeTiled_v12000>(cuTensorMapEncodeTiled_ptr); | ||||||||||
} | ||||||||||
|
||||||||||
template <typename data_type> | ||||||||||
CUtensorMap make_2d_tma_copy_desc(data_type* global_address, uint64_t gmem_dim[2], uint64_t stride_in_bytes, | ||||||||||
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr) | ||||||||||
uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr) | ||||||||||
{ | ||||||||||
Comment on lines
+109
to
110
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion Unify encode_func parameter type with stable PFN Avoid exposing the versioned PFN type in public signatures. - uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled_v12000 encode_func = nullptr)
+ uint32_t smem_dim[2], CUtensorMapSwizzle swizzle_type, PFN_cuTensorMapEncodeTiled encode_func = nullptr) 📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents
|
||||||||||
CUtensorMap tensor_map{}; | ||||||||||
constexpr uint32_t rank = 2; | ||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
💡 Verification agent
🧩 Analysis chain
Make CUDA runtime selection consistent and avoid redundant flags
You’re switching to Shared runtime for CUDA >= 13, but you also append “-cudart=shared” to CMAKE_CUDA_FLAGS. Prefer one source of truth (CMake’s CMAKE_CUDA_RUNTIME_LIBRARY) and avoid hard-coding
-cudart
in flags. Also, if you keep Shared for >=13, ensure consumers don’t still link againstCUDA::cudart_static
viaCUDA_RT_LIB
.Apply this minimal diff to let CMake own the runtime selection and align the helper variable:
To verify no accidental static cudart linkage remains after this change, run:
🏁 Script executed:
Length of output: 3327
Ensure consistent CUDA runtime selection across the entire CMake setup
We should centralize runtime choice via
CMAKE_CUDA_RUNTIME_LIBRARY
and theCUDA_RT_LIB
helper, and remove all hard-coded references tocudart_static
or-cudart=shared
. In practice that means:• In
cpp/CMakeLists.txt
’ssetup_cuda_architectures()
(around lines 153–165), replace the separate static/default logic with a singleif(VERSION ≥ 13)
/else()
that:– Sets
CMAKE_CUDA_RUNTIME_LIBRARY
to Shared for ≥13, Static otherwise.– Sets
CUDA_RT_LIB
toCUDA::cudart
for ≥13,CUDA::cudart_static
otherwise.– Drops the
-cudart=shared
flag entirely.• Remove all other direct invocations of
CUDA::cudart_static
or-cudart=…
and have consumers link only against${CUDA_RT_LIB}
.• Update every CMakeLists that currently does
to either:
– Remove
cudart_static
from thefind_package
call (letting CMake pick the right runtime), and– Link against
${CUDA_RT_LIB}
(or explicitlyCUDA::cudart
when you truly need the shared library).Key files requiring edits (non-exhaustive):
Example refactor in
setup_cuda_architectures()
:Then remove all other occurrences of
cudart_static
(e.g. in examples) and use${CUDA_RT_LIB}
for linking. This ensures a single source of truth and prevents stale static‐runtime references.🤖 Prompt for AI Agents