NVIDIA · lowsfer · Jun 26, 2025 · Jun 13, 2025
diff --git a/cpp/kernels/xqa/barriers.cuh b/cpp/kernels/xqa/barriers.cuh
@@ -68,7 +68,7 @@ public:
     template <Scope scope = defaultScope, ArriveOrder order = ArriveOrder::RELEASE>
     __device__ inline mha::conditional_t<scope == Scope::CTA, ArrivalToken, void> arrive(uint32_t update = 1)
     {
-        ArrivalToken token;
+        ArrivalToken token{};
 #if __CUDA_ARCH__ >= 900
         if constexpr (scope == Scope::CTA)
         {
@@ -128,9 +128,9 @@ public:
 
     __device__ inline bool isLocal() const
     {
-        uint32_t addrCtaRank;
+        uint32_t addrCtaRank{}; // value-initialized; overwritten by the getctarank asm below (output operand)
         asm("getctarank.u64 %0, %1;\n" : "=r"(addrCtaRank) : "l"(addr()));
-        uint32_t ctaRank;
+        uint32_t ctaRank{}; // value-initialized; overwritten by the %cluster_ctarank read below (output operand)
         asm("mov.u32 %0, %%cluster_ctarank;\n" : "=r"(ctaRank));
         return addrCtaRank == ctaRank;
     }
@@ -154,7 +154,7 @@ public:
 #if __CUDA_ARCH__ >= 900
         if constexpr (scope == Scope::CTA)
         {
-            ArrivalToken token;
+            ArrivalToken token{};
             asm volatile("mbarrier.arrive.expect_tx.relaxed.cta.b64 %0, [%1], %2;\n"
                          : "=l"(token)
                          : "l"(addr()), "r"(txCount)
@@ -181,7 +181,7 @@ public:
         {
             if constexpr (scope == Scope::CTA)
             {
-                ArrivalToken token;
+                ArrivalToken token{};
                 switch (order)
                 {
                 case ArriveOrder::RELEASE:
@@ -239,7 +239,7 @@ public:
     template <Scope scope = defaultScope>
     __device__ inline bool test_wait(ArrivalToken&& token)
     {
-        uint32_t ready;
+        uint32_t ready{};
         if constexpr (scope == Scope::CGA)
         {
             asm volatile(
@@ -271,7 +271,7 @@ public:
     template <Scope scope = defaultScope>
     __device__ inline bool test_wait_parity(bool parity)
     {
-        uint32_t ready;
+        uint32_t ready{};
         if constexpr (scope == Scope::CGA)
         {
             asm volatile(
@@ -303,7 +303,7 @@ public:
     template <Scope scope = defaultScope>
     __device__ inline bool try_wait(ArrivalToken&& token)
     {
-        uint32_t ready;
+        uint32_t ready{};
         if constexpr (scope == Scope::CGA)
         {
             asm volatile(
@@ -334,7 +334,7 @@ public:
     template <Scope scope = defaultScope>
     __device__ inline bool try_wait_parity(bool parity)
     {
-        uint32_t ready;
+        uint32_t ready{};
         if constexpr (scope == Scope::CGA)
         {
             asm volatile(

diff --git a/cpp/kernels/xqa/mha_components.cuh b/cpp/kernels/xqa/mha_components.cuh
@@ -59,7 +59,7 @@ template <uint32_t n>
 __device__ inline QuadRegRowMaxT<n * warp_size> replicateForQuad(Warp const& warp, Vec<float, n> const& src)
 {
     assertWarpConverged();
-    QuadRegRowMaxT<n * warp_size> dst;
+    QuadRegRowMaxT<n * warp_size> dst{};
 #pragma unroll
     for (uint32_t i = 0; i < src.size; i++)
     {
@@ -82,7 +82,7 @@ __device__ inline ThrdRegRowMaxT<warp_size * exactDiv(n, 4)> dedupFromQuad(Warp
         assert(src[i] == __shfl_sync(~0U, src[i], laneId() / 4 * 4));
     }
 #endif
-    ThrdRegRowMaxT<warp_size * exactDiv(n, 4)> dst;
+    ThrdRegRowMaxT<warp_size * exactDiv(n, 4)> dst{};
     uint32_t const lane = laneId();
     uint32_t const idxMat = lane / 8;
     uint32_t const idxRow = lane % 8;

diff --git a/cpp/kernels/xqa/mha_sm90.cu b/cpp/kernels/xqa/mha_sm90.cu
@@ -1616,7 +1616,7 @@ CUBIN_EXPORT __global__
                 {
                     if (warpElectSync())
                     {
-                        tma::load1DAsync(&smem.tokens[idxBuf], &scratchMem.tokens()[idxChunk],
+                        tma::loadLinearAsync(&smem.tokens[idxBuf], &scratchMem.tokens()[idxChunk],
                             sizeof(smem.tokens[idxBuf]), bar.produced);
                         arrive_tx(bar.produced, sizeof(smem.tokens[idxBuf]), 1);
                     }