Commit 48590c2

chore: modify by CR comments
Signed-off-by: Mingyang Jiang <[email protected]>
1 parent: 15f4c3b

File tree: 2 files changed (+7, -9 lines)

2 files changed

+7
-9
lines changed

cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp

Lines changed: 7 additions & 7 deletions
@@ -1252,8 +1252,8 @@ int main(int argc, char** argv)
 
     // Allocate the reference on the host.
     float* o_ref_h = (float*) malloc(o_size * sizeof(float));
-    float* softmax_sum_ref_h = (float*) malloc(2 * b * s * h * sizeof(float));
-    float* softmax_sum_h = (float*) malloc(2 * b * s * h * sizeof(float));
+    float* softmax_stats_ref_h = (float*) malloc(2 * b * s * h * sizeof(float));
+    float* softmax_stats_h = (float*) malloc(2 * b * s * h * sizeof(float));
 
     // The P matrix is stored as one big matrix of size S x B x H x S.
     const size_t p_size = s * b * h * s;
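
Note on the rename: the allocation of 2 * b * s * h floats implies two statistics per (batch, sequence position, head), which is why "stats" fits better than "sum". A minimal indexing sketch follows, assuming a [token][head][2] layout; the layout and the meaning of the two slots (commonly the per-row softmax max and the sum of exponentials) are assumptions, not shown in this diff.

#include <cstddef>

// Illustrative accessor only; layout and slot semantics are assumptions.
inline float softmax_stat(float const* stats, int token, int head, int num_heads, int slot /* 0 or 1 */)
{
    return stats[(static_cast<size_t>(token) * num_heads + head) * 2 + slot];
}
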
@@ -1947,7 +1947,7 @@ int main(int argc, char** argv)
 
         // Read the results.
         FMHA_CHECK_CUDA(cuda_memcpy_d2h(o_ref_h, o_d, o_size, data_type));
-        FMHA_CHECK_CUDA(cuda_memcpy_d2h(softmax_sum_ref_h, softmax_stats_d, 2 * b * s * h, DATA_TYPE_FP32));
+        FMHA_CHECK_CUDA(cuda_memcpy_d2h(softmax_stats_ref_h, softmax_stats_d, 2 * b * s * h, DATA_TYPE_FP32));
     }
 
     // Fill-in p/s/o with garbage data.
@@ -2033,7 +2033,7 @@ int main(int argc, char** argv)
     std::vector<float> o_ref_trans_h(o_size);
 
     FMHA_CHECK_CUDA(cuda_memcpy_d2h(o_h, o_d_view, o_view_size, output_dtype));
-    FMHA_CHECK_CUDA(cuda_memcpy_d2h(softmax_sum_h, softmax_stats_d, 2 * b * s * h, DATA_TYPE_FP32));
+    FMHA_CHECK_CUDA(cuda_memcpy_d2h(softmax_stats_h, softmax_stats_d, 2 * b * s * h, DATA_TYPE_FP32));
 
     if (interleaved)
     {
@@ -2053,7 +2053,7 @@ int main(int argc, char** argv)
             dv, epsilon, verbose, true);
         if (save_softmax)
         {
-            auto errors = check_softmax_results(softmax_sum_h, softmax_sum_ref_h, b, s, h, seqlens, cu_seqlens);
+            auto errors = check_softmax_results(softmax_stats_h, softmax_stats_ref_h, b, s, h, seqlens, cu_seqlens);
             status = status | ((errors.first + errors.second) > 0);
         }
     }
@@ -2149,8 +2149,8 @@ int main(int argc, char** argv)
     free(s_h);
     free(o_h);
     free(o_ref_h);
-    free(softmax_sum_h);
-    free(softmax_sum_ref_h);
+    free(softmax_stats_h);
+    free(softmax_stats_ref_h);
     free(contiguous_kv_h);
     free(kv_cache_ptrs_h);
     free(kv_cache_block_offsets_h);

cpp/tests/unit_tests/kernels/mlaChunkedPrefillTest.cu

Lines changed: 0 additions & 2 deletions
@@ -132,7 +132,6 @@ void selfAttentionRef(T* output, T* const Q, T* const KV, int batch_size, int nu
         int global_q_offset = cu_seq_q_len[b] * num_heads * head_size;
         int global_kv_offset = cu_seq_kv_len[b] * 2 * num_heads * head_size;
         int global_softmax_offset = cu_seq_q_len[b] * num_heads * 2;
-        float bmm1_scale = 1.F / std::sqrt(static_cast<float>(head_size));
         if (curr_q_len == 0 || curr_kv_len == 0)
         {
             continue; // skip empty sequences
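
The deleted line computed the standard 1/sqrt(d) scaling applied to Q·K^T before the softmax (BMM1). Its removal here presumably follows a review comment that the local variable was unused or redundant in this reference path; the math itself is unchanged. For context, a self-contained sketch of where such a scale normally enters (not the test's code):

#include <cmath>

// Context sketch only: scales one Q·K^T dot product before it enters the softmax.
inline float scaled_logit(float qk_dot, int head_size)
{
    float bmm1_scale = 1.0f / std::sqrt(static_cast<float>(head_size)); // e.g. ~0.0884f for head_size = 128
    return bmm1_scale * qk_dot;
}
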
@@ -750,7 +749,6 @@ protected:
         auto* d_cu_q_seq_lens_ptr = bufferCast<int64_t>(*(this->d_cu_q_seq_lens));
 
         int const loop_count = (this->mMaxSeqLen + this->mChunkSize - 1) / this->mChunkSize;
-        float bmm1_scale = 1.F / std::sqrt(static_cast<double>(this->mNopeSize + this->mRopeSize));
         // do not apply mask
         for (int _ = 0; _ < loop_count - 1; _++)
         {
