Commit 3bf3ae4

modify the kwargs inside _flash_attention_forward

1 parent ab4b7c1 · commit 3bf3ae4
File tree: 2 files changed, +3 −3 lines

src/transformers/integrations/flash_attention.py (1 addition, 1 deletion)

@@ -76,7 +76,7 @@ def flash_attention_forward(
         softcap=softcap,
         use_top_left_mask=_use_top_left_mask,
         target_dtype=target_dtype,
-        implementation=module.config._attn_implementation,
+        attn_implementation=module.config._attn_implementation,
         layer_idx=module.layer_idx if hasattr(module, "layer_idx") else None,
         **kwargs,
     )

src/transformers/modeling_flash_attention_utils.py (2 additions, 2 deletions)

@@ -545,7 +545,7 @@ def _flash_attention_forward(
     max_length_q: Optional[int] = None,
     max_length_k: Optional[int] = None,
     target_dtype: Optional[torch.dtype] = None,
-    implementation: Optional[str] = None,
+    attn_implementation: Optional[str] = None,
     **kwargs,
 ):
     """
@@ -568,7 +568,7 @@ def _flash_attention_forward(
         The attention implementation to use. If None, will default to the one based on the environment.
     """
     (flash_fn, flash_varlen_fn, pad_fn, unpad_fn), process_flash_kwargs_fn = lazy_import_flash_attention(
-        implementation
+        attn_implementation
     )

     # PEFT possibly silently casts tensors to fp32, this potentially reconverts to correct dtype or is a no op
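For callers that invoke _flash_attention_forward directly, the effect of this commit is that the implementation string is now passed as attn_implementation instead of implementation, and it is forwarded unchanged to lazy_import_flash_attention. The sketch below is illustrative only: it assumes flash-attn and a CUDA device are available, and the leading query/key/value, attention-mask, query-length, and is_causal arguments come from the library's public signature rather than from this diff, so the shapes and the "flash_attention_2" value are assumptions, not part of the change.

import torch
from transformers.modeling_flash_attention_utils import _flash_attention_forward

# Hypothetical shapes (batch, seq_len, num_heads, head_dim); fp16 on CUDA, as flash attention requires.
batch, seq_len, num_heads, head_dim = 2, 16, 8, 64
q = torch.randn(batch, seq_len, num_heads, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)

# Before this commit the keyword was `implementation`; after it, the same string
# (e.g. module.config._attn_implementation at the integration call site) is passed
# as `attn_implementation`.
out = _flash_attention_forward(
    q,
    k,
    v,
    attention_mask=None,
    query_length=seq_len,
    is_causal=True,
    attn_implementation="flash_attention_2",  # renamed keyword; value assumed for illustration
)
print(out.shape)  # expected (batch, seq_len, num_heads, head_dim)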
