@@ -2288,7 +2288,6 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     CHECK_EQ(v_data->shape[2], v_head_dim_);
     CHECK_EQ(o_data->shape[2], v_head_dim_);
 
-
     // Part 2: Synchronize streams and update auxiliary data.
     ComputeStreamWaitForCopyStream();
     ICHECK(!dirty_aux_data_device_);
@@ -2303,20 +2302,20 @@ class PagedAttentionKVCacheObj : public AttentionKVCacheObj {
     // Here, we use f_mla_prefill_ragged_normal_, which is designed to work for both decode
     // and normal prefill cases. Optionally, you could check a flag like `use_decode_kernel_[0]`
     // to adjust parameters; here we assume the kernel internally supports both cases.
-    f_mla_prefill_ragged_normal_(q_data,
-                                 cur_append_length_indptr_view_,
-                                 k_data,
-                                 v_data,
-                                 cur_append_length_indptr_view_,
-                                 q_rope_position_map_view_,
-                                 k_ragged_rope_pos_offset_view_,
-                                 o_data,  // output tensor
-                                 merged_attn_scores_view_,
-                                 /*causal=*/1,
-                                 static_cast<int>(RoPEMode::kNone),  // Rope changes have already been applied before the kernel
-                                 0,  // Rope param, not important
-                                 0,  // Rope param, not important
-                                 attn_score_scaling_factor);
+    f_mla_prefill_ragged_normal_(q_data,
+                                 cur_append_length_indptr_view_,
+                                 k_data,
+                                 v_data,
+                                 cur_append_length_indptr_view_,
+                                 q_rope_position_map_view_,
+                                 k_ragged_rope_pos_offset_view_,
+                                 o_data,  // output tensor
+                                 merged_attn_scores_view_,
+                                 /*causal=*/1,
+                                 static_cast<int>(RoPEMode::kNone),
+                                 0,  // Rope param, not important
+                                 0,  // Rope param, not important
+                                 attn_score_scaling_factor);
 
     // Part 5: If appending is to occur after attention, call the append kernel.
     if (!append_before_attn_) {
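Note: the comment in the hunk above mentions that the call site could instead branch on a flag such as `use_decode_kernel_[0]` rather than relying on one kernel for both cases. The sketch below is illustrative only and not part of this commit; it reuses the exact argument list from the diff and assumes a hypothetical decode-specialized callable `f_mla_decode_`.

// Illustrative sketch (not part of the commit): explicit dispatch on the decode flag
// mentioned in the comment above. `f_mla_decode_` is a hypothetical decode-tuned kernel;
// the argument list mirrors the call in the diff.
if (use_decode_kernel_[0]) {
  // Decode: each sequence appends a single token, so a decode-specialized kernel could run.
  f_mla_decode_(q_data, cur_append_length_indptr_view_, k_data, v_data,
                cur_append_length_indptr_view_, q_rope_position_map_view_,
                k_ragged_rope_pos_offset_view_, o_data, merged_attn_scores_view_,
                /*causal=*/1, static_cast<int>(RoPEMode::kNone), 0, 0,
                attn_score_scaling_factor);
} else {
  // Prefill: ragged batches with arbitrary append lengths go through the general kernel.
  f_mla_prefill_ragged_normal_(q_data, cur_append_length_indptr_view_, k_data, v_data,
                               cur_append_length_indptr_view_, q_rope_position_map_view_,
                               k_ragged_rope_pos_offset_view_, o_data, merged_attn_scores_view_,
                               /*causal=*/1, static_cast<int>(RoPEMode::kNone), 0, 0,
                               attn_score_scaling_factor);
}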