Commit 4ffc1ff
DMMHA: add unit tests; fix CPU, CUDA kernel (microsoft#22567)
### Description

Fixes:
(1) CPU kernel: apply scale before the attention bias and mask, like the other MHA ops.
(2) CPU kernel: correct the offset used when appending past to present.
(3) CUDA kernel: apply the mask if provided; fix the output_qk offset.

Also adds DMMHA unit tests.
1 parent 2e4e221 commit 4ffc1ff
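
The ordering fix (1) is the subtle one. The snippet below is only a minimal illustration with hypothetical names, not the ONNX Runtime kernel itself: the point is that the raw Q·K dot product is scaled first, so the additive attention bias and the mask filter value are left unscaled, matching the other MHA ops.

```cpp
// Minimal illustration (hypothetical helper, not ORT code) of the fixed ordering:
// scale the raw dot product first, then add bias and mask terms unscaled.
float ScoreForKeyPosition(float qk_dot, float scale,
                          const float* attn_bias, int bias_offset,
                          bool is_masked, float mask_filter_value) {
  float score = qk_dot * scale;       // (1) scale the raw Q.K dot product
  if (attn_bias != nullptr) {
    score += attn_bias[bias_offset];  // additive bias, not scaled
  }
  if (is_masked) {
    score += mask_filter_value;       // large negative filler, not scaled
  }
  return score;
}
```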

7 files changed: 381 additions, 367 deletions

onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ class AttentionCPUBase : public AttentionBase {
     // Convert mask from boolean (0/1) to float (mask_filter_value/0.0f).
     // Merge padding mask with causal mask, and broadcast to 3D (BxSxT).
     PrepareMask(mask_index_data, mask_index_dims, static_cast<T*>(mask_data),
-                causal, batch_size, sequence_length, past_sequence_length, mask_filter_value_);
+                causal, batch_size, sequence_length, kv_sequence_length, past_sequence_length, mask_filter_value_);
     DUMP_CPU_TENSOR("Mask3D", static_cast<T*>(mask_data), batch_size, sequence_length, total_sequence_length);
   }

onnxruntime/contrib_ops/cpu/bert/attention_helper.h

Lines changed: 2 additions & 1 deletion
@@ -120,9 +120,10 @@ void PrepareMask(const int32_t* mask_index,
                  bool causal,
                  int batch_size,
                  int sequence_length,
+                 int kv_sequence_length,
                  int past_sequence_length,
                  float mask_filter_value) {
-  const int all_sequence_length = past_sequence_length + sequence_length;
+  const int all_sequence_length = past_sequence_length + kv_sequence_length;
 
   // mask_data has been filled with 0, and its shape is BxSxT
   T* p_mask = mask_data;
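
The two changes above make the merged mask track the key/value stream: its last dimension is now past_sequence_length + kv_sequence_length rather than past_sequence_length + sequence_length. A rough standalone sketch of that BxSxT layout (hypothetical function, not the ORT helper; it only converts a 0/1 padding mask to mask_filter_value/0.0f as the comments describe):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical sketch of the BxSxT float mask built from a 0/1 padding mask.
std::vector<float> BuildMergedMask(int batch_size, int sequence_length,
                                   int kv_sequence_length, int past_sequence_length,
                                   const std::vector<int32_t>& padding_mask,  // [batch, total] of 0/1
                                   float mask_filter_value) {
  const int total_sequence_length = past_sequence_length + kv_sequence_length;  // T (was past + sequence_length)
  std::vector<float> mask(static_cast<size_t>(batch_size) * sequence_length * total_sequence_length, 0.0f);
  for (int b = 0; b < batch_size; ++b) {
    for (int s = 0; s < sequence_length; ++s) {
      for (int t = 0; t < total_sequence_length; ++t) {
        if (padding_mask[static_cast<size_t>(b) * total_sequence_length + t] == 0) {
          mask[(static_cast<size_t>(b) * sequence_length + s) * total_sequence_length + t] = mask_filter_value;
        }
      }
    }
  }
  return mask;
}
```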

onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.cc

Lines changed: 6 additions & 4 deletions
@@ -339,6 +339,7 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeAttentionProbsWithBeams(
         T* attention_probs_ptr = reinterpret_cast<T*>(attention_probs) + last_offset;
         math::Dot<float, CPUMathUtil>(head_size, q_vec, K + i * head_size, attention_probs_ptr, nullptr);
 
+        *attention_probs_ptr *= scale;
         // Apply the attention bias and mask
         if (attn_bias_data != nullptr) {
           *attention_probs_ptr += attn_bias_data[attn_bias_base_offset + past_sequence_length];
@@ -348,7 +349,6 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeAttentionProbsWithBeams(
         if (is_masked) {
           *attention_probs_ptr += mask_filter_value_;
         }
-        *attention_probs_ptr *= scale;
       }
 
       {
@@ -362,6 +362,8 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeAttentionProbsWithBeams(
           const T* past_k_vec = past_key_data + beam_batch_offset + beam_offset + j * head_size;
           T* output = reinterpret_cast<T*>(attention_probs) + j + i * probs_matrix_size;
           math::Dot<float, CPUMathUtil>(head_size, q_vec, past_k_vec, output, nullptr);
+
+          *output *= scale;
           // Apply the attention bias and mask
           if (attn_bias_data != nullptr) {
             *output += attn_bias_data[attn_bias_base_offset + j];
@@ -371,11 +373,11 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeAttentionProbsWithBeams(
           if (is_masked) {
             *output += mask_filter_value_;
           }
-          *output *= scale;
         }
       }
       // Append current key to present key (past_present_share_buffer_ is true)
-      memcpy(present_key_data + i * max_sequence_length * head_size, K + i * head_size, head_size * sizeof(T));
+      memcpy(present_key_data + (i * max_sequence_length + past_sequence_length) * head_size,
+             K + i * head_size, head_size * sizeof(T));
     }
   });
 
@@ -460,7 +462,7 @@ void DecoderMaskedMultiHeadAttention<T>::ComputeVxAttentionScoreWithBeams(
         }
       }
       // Append current value to present value (past_present_share_buffer_ is true)
-      memcpy(present_value_data + i * max_sequence_length * v_head_size,
+      memcpy(present_value_data + (i * max_sequence_length + past_sequence_length) * v_head_size,
              V + i * v_head_size,
              v_head_size * sizeof(T));
     }
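
The two memcpy changes address fix (2): with past_present_share_buffer_ set, each row of the present K/V buffer holds max_sequence_length token slots, so the current token has to be written after the past_sequence_length entries that are already cached, not at slot 0. A small index sketch (hypothetical helper, assuming a [row, max_sequence_length, head_size] layout where the row index i is the kernel's loop index):

```cpp
#include <cstddef>

// Hypothetical sketch of the destination offset for appending the current token
// to the shared past/present buffer.
size_t PresentWriteOffset(size_t i, size_t max_sequence_length,
                          size_t past_sequence_length, size_t head_size) {
  // Before the fix: i * max_sequence_length * head_size, i.e. always slot 0 of row i,
  // which overwrites the first cached entry instead of appending.
  return (i * max_sequence_length + past_sequence_length) * head_size;
}
```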

onnxruntime/contrib_ops/cpu/bert/decoder_masked_multihead_attention.h

Lines changed: 2 additions & 2 deletions
@@ -33,7 +33,7 @@ class DecoderMaskedMultiHeadAttention final : public OpKernel, public AttentionC
                                const Tensor* cache_indir,
                                OpKernelContext* context,
                                int beam_width,
-                               Tensor* scaled_qk = nullptr) const;
+                               Tensor* output_qk = nullptr) const;
   void ComputeAttentionProbsWithBeams(T* attention_probs,
                                       const T* Q,
                                       const T* K,
@@ -50,7 +50,7 @@ class DecoderMaskedMultiHeadAttention final : public OpKernel, public AttentionC
                                       bool broadcast_attn_bias_dim_1,
                                       const int32_t* cache_indir_data,
                                       int beam_width,
-                                      T* scaled_qk_data = nullptr) const;
+                                      T* output_qk_data = nullptr) const;
   void ComputeVxAttentionScoreWithBeams(T* output,
                                         T* tmp_buffer,
                                         const T* attention_probs,

onnxruntime/contrib_ops/cuda/bert/fastertransformer_decoder_attention/decoder_masked_multihead_attention_impl.cu

Lines changed: 4 additions & 1 deletion
@@ -298,6 +298,9 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio
     if (params.attention_bias != nullptr) {
       qk = add_vec(qk, reinterpret_cast<T*>(params.attention_bias)[attn_bias_offset + tlength]);
     }
+    if (params.mask != nullptr && params.mask[bi_total_seq_length + params.past_sequence_length] == 0) {
+      qk += params.mask_filter_value;
+    }
     qk_max = qk;
     qk_smem[tlength] = qk;
   }
@@ -534,7 +537,7 @@ __global__ void masked_multihead_attention_kernel(DecoderMaskedMultiHeadAttentio
 
   if (params.out_qk != nullptr) {
     // store cross qk before softmax, out_qk has shape [B(batchxbeam), #Head, 1, total_sequence_length]
-    float* target = ((float*)params.out_qk) + ((int64_t)bhi * tlength);
+    float* target = (reinterpret_cast<float*>(params.out_qk)) + (static_cast<int64_t>(bhi) * (sum_tlength + 1));
     for (int ti = tidx; ti <= sum_tlength; ti += THREADS_PER_BLOCK) {
       target[ti] = (float)(qk_smem[ti]);
     }
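
The out_qk change is purely an indexing fix: per the comment in the kernel, out_qk has shape [batch x beam, #Head, 1, total_sequence_length], so each (batch*beam, head) index bhi owns a row of total_sequence_length floats, which in this kernel is sum_tlength + 1 (the loop writes indices 0..sum_tlength). The old stride of tlength does not match that row length. Roughly, under those assumptions:

```cpp
#include <cstdint>

// Hypothetical sketch of the start of the out_qk row owned by one (batch*beam, head) pair.
// total_sequence_length corresponds to sum_tlength + 1 in the kernel above.
int64_t OutQkRowStart(int64_t bhi, int64_t total_sequence_length) {
  // Previously bhi * tlength, which does not equal the number of scores written per row.
  return bhi * total_sequence_length;
}
```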

onnxruntime/core/graph/contrib_ops/bert_defs.cc

Lines changed: 0 additions & 1 deletion
@@ -908,7 +908,6 @@ ONNX_MS_OPERATOR_SET_SCHEMA(
               OpSchema::Optional)
        .Input(9,
               "cache_indirection",
-              // This input is useful for CUDA EP only.
               "A buffer of shape [batch_size, beam_width, max_output_length] where an `[i, j, k]` entry specifies "
               "which beam the `k`-th token came from for the `j`-th beam for batch `i` in the current iteration",
               "M",
