4 changes: 2 additions & 2 deletions docs/source/en/model_doc/qwen2_5_omni.md
@@ -136,7 +136,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen2-5-OmniProcessor`
padding=True,
@@ -245,7 +245,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen2-5-OmniProcessor`
padding=True,
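The only functional change in this file is the video-sampling kwarg: `video_fps` becomes `fps` in both `apply_chat_template` snippets. A minimal sketch of the updated call, with an assumed checkpoint name and a placeholder conversation (neither is part of this diff):

```python
# Hedged sketch of the renamed kwarg: videos are now sampled with `fps=...`,
# not the old `video_fps=...`. Checkpoint and video URL are stand-ins.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")  # assumed checkpoint

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "https://example.com/clip.mp4"},  # replace with a real URL or local path
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    fps=1,  # renamed from `video_fps`
    padding=True,
)
```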
10 changes: 5 additions & 5 deletions docs/source/en/model_doc/qwen2_audio.md
@@ -54,7 +54,7 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_co
prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:"
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"
audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)
inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device)
inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device)

generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
@@ -63,7 +63,7 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_

# We can also omit the audio_bos and audio_eos tokens
prompt = "<|AUDIO|>Generate the caption in English:"
inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device)
inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device)

generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
@@ -106,7 +106,7 @@ for message in conversation:
sr=processor.feature_extractor.sampling_rate)[0]
)

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to(model.device)

generate_ids = model.generate(**inputs, max_length=256)
@@ -156,7 +156,7 @@ for message in conversation:
sr=processor.feature_extractor.sampling_rate)[0]
)

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to(model.device)

generate_ids = model.generate(**inputs, max_length=256)
@@ -213,7 +213,7 @@ for conversation in conversations:
sr=processor.feature_extractor.sampling_rate)[0]
)

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
inputs['input_ids'] = inputs['input_ids'].to(model.device)
inputs.input_ids = inputs.input_ids.to(model.device)

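All five edits in this file are the same rename: the processor's audio argument is now `audio` instead of the removed `audios`. A sketch of the first (captioning) example under the new kwarg, assuming the model is loaded as in the surrounding doc:

```python
# Hedged sketch: pass the waveform as `audio=`; `audios=` is no longer accepted.
from io import BytesIO
from urllib.request import urlopen

import librosa
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B", device_map="auto")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B")

prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:"
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"
audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)

inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device)
generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(response)
```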
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/qwen3_omni_moe.md
@@ -80,7 +80,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen3OmniMoeProcessor`
padding=True,
@@ -136,7 +136,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen3OmniMoeProcessor`
padding=True,
@@ -245,7 +245,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen3OmniMoeProcessor`
padding=True,
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/seamless_m4t.md
@@ -61,7 +61,7 @@ Here is how to use the processor to process text and audio:
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it
>>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
>>> audio_inputs = processor(audio=audio_sample["array"], return_tensors="pt")

>>> # now, process some English text as well
>>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/seamless_m4t_v2.md
@@ -61,7 +61,7 @@ Here is how to use the processor to process text and audio:
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it
>>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
>>> audio_inputs = processor(audio=audio_sample["array"], return_tensors="pt")

>>> # now, process some English text as well
>>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
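Both SeamlessM4T docs get the same one-line rename: the processor now takes `audio=` rather than `audios=`. A short sketch with an assumed checkpoint and a dummy waveform standing in for the dataset sample:

```python
# Hedged sketch of the renamed kwarg for the SeamlessM4T processors.
import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")  # assumed checkpoint
waveform = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz as a stand-in

audio_inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
text_inputs = processor(text="Hello, my dog is cute", src_lang="eng", return_tensors="pt")
```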
19 changes: 0 additions & 19 deletions examples/pytorch/audio-classification/run_audio_classification.py
@@ -27,7 +27,6 @@
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from random import randint
from typing import Optional
@@ -180,29 +179,11 @@ class ModelArguments:
)
},
)
freeze_feature_extractor: Optional[bool] = field(
default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)
ignore_mismatched_sizes: bool = field(
default=False,
metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
)

def __post_init__(self):
if not self.freeze_feature_extractor and self.freeze_feature_encoder:
warnings.warn(
"The argument `--freeze_feature_extractor` is deprecated and "
"will be removed in a future version. Use `--freeze_feature_encoder` "
"instead. Setting `freeze_feature_encoder==True`.",
FutureWarning,
)
if self.freeze_feature_extractor and not self.freeze_feature_encoder:
raise ValueError(
"The argument `--freeze_feature_extractor` is deprecated and "
"should not be used in combination with `--freeze_feature_encoder`. "
"Only make use of `--freeze_feature_encoder`."
)


def main():
# See all possible arguments in src/transformers/training_args.py
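With the shim deleted, the script simply no longer knows about `--freeze_feature_extractor`; callers must pass `--freeze_feature_encoder`. A minimal sketch of how the surviving flag parses (field name taken from the removed warning text; the default shown here is an assumption, not necessarily the script's):

```python
# Hedged sketch: only `freeze_feature_encoder` is parsed now; the deprecated
# `freeze_feature_extractor` alias and its __post_init__ mapping are gone.
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class FreezeArgs:
    freeze_feature_encoder: bool = field(
        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
    )


parser = HfArgumentParser(FreezeArgs)
(args,) = parser.parse_args_into_dataclasses(args=["--freeze_feature_encoder", "False"])
print(args.freeze_feature_encoder)  # False
```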
3 changes: 0 additions & 3 deletions src/transformers/models/aimv2/modeling_aimv2.py
@@ -37,7 +37,6 @@
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig

@@ -445,13 +444,11 @@ def __init__(self, config: Aimv2VisionConfig):
def get_input_embeddings(self) -> nn.Module:
return self.embeddings.patch_embed

@deprecate_kwarg("attention_mask", version="v4.58.0")
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values,
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
r"""
3 changes: 0 additions & 3 deletions src/transformers/models/aimv2/modular_aimv2.py
@@ -32,7 +32,6 @@
auto_docstring,
can_return_tuple,
)
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm
from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm
@@ -488,13 +487,11 @@ def __init__(self, config: Aimv2VisionConfig):
def get_input_embeddings(self) -> nn.Module:
return self.embeddings.patch_embed

@deprecate_kwarg("attention_mask", version="v4.58.0")
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values,
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
r"""
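In both the modeling and modular files, the vision tower's `forward` drops the previously deprecated `attention_mask` parameter along with its `@deprecate_kwarg` decorator, leaving `pixel_values` plus generic kwargs. A hedged sketch of a vision-only forward pass under that signature (the `Aimv2VisionModel` class name is inferred from the config names in this diff, and the checkpoint is an assumption):

```python
# Hedged sketch: call the AIMv2 vision tower with pixel_values only; `attention_mask`
# is no longer part of this forward's signature.
import torch
from transformers import Aimv2VisionModel  # class name inferred from Aimv2VisionConfig above

model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-224")  # assumed checkpoint
pixel_values = torch.randn(1, 3, 224, 224)  # stand-in for a preprocessed image

with torch.inference_mode():
    outputs = model(pixel_values=pixel_values)
print(outputs.last_hidden_state.shape)
```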
2 changes: 0 additions & 2 deletions src/transformers/models/altclip/processing_altclip.py
@@ -17,7 +17,6 @@
"""

from ...processing_utils import ProcessorMixin
from ...utils.deprecation import deprecate_kwarg


class AltCLIPProcessor(ProcessorMixin):
@@ -39,7 +38,6 @@ class AltCLIPProcessor(ProcessorMixin):
image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

@deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
def __init__(self, image_processor=None, tokenizer=None):
super().__init__(image_processor, tokenizer)

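The `@deprecate_kwarg` that used to map `feature_extractor=` onto `image_processor=` is gone, so `AltCLIPProcessor` must now be constructed with `image_processor`. A short sketch (checkpoint name assumed):

```python
# Hedged sketch: build the processor with `image_processor=`; the old
# `feature_extractor` alias is no longer accepted by __init__.
from transformers import AltCLIPProcessor, AutoImageProcessor, AutoTokenizer

image_processor = AutoImageProcessor.from_pretrained("BAAI/AltCLIP")  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained("BAAI/AltCLIP")
processor = AltCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)
```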
22 changes: 2 additions & 20 deletions src/transformers/models/beit/modeling_beit.py
@@ -16,7 +16,6 @@

import collections.abc
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Union

@@ -163,14 +162,7 @@ def forward(
self,
pixel_values: torch.Tensor,
bool_masked_pos: Optional[torch.BoolTensor] = None,
interpolate_pos_encoding: Optional[bool] = None,
) -> torch.Tensor:
if self.position_embeddings is not None and interpolate_pos_encoding is not None:
warnings.warn(
"`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always "
"interpolated to the input image size. The argument will be removed in transformers v4.51.0."
)

_, _, height, width = pixel_values.shape
embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
batch_size, seq_len, _ = embeddings.size()
@@ -325,19 +317,9 @@ def forward(
) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
if output_attentions:
logger.warning_once(
"`BeitSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not "
"support `output_attentions=True`. Falling back to the manual attention implementation, "
"but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. "
'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
output_attentions=output_attentions,
relative_position_bias=relative_position_bias,
interpolate_pos_encoding=interpolate_pos_encoding,
resolution=resolution,
f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will "
"be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model."
)

batch_size, seq_length, _ = hidden_states.shape
query_layer = (
self.query(hidden_states)
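Two behavioral notes fall out of these hunks: `interpolate_pos_encoding` is no longer an argument of `BeitEmbeddings.forward` (position embeddings are always interpolated), and the SDPA attention path no longer falls back to eager when `output_attentions=True` — it logs a warning and returns `None` attentions instead. A hedged sketch of requesting attention weights under the new behavior (checkpoint name assumed):

```python
# Hedged sketch: to get real attention weights from BEiT, load the model with the
# eager attention implementation; under SDPA the returned attentions are None.
import torch
from transformers import BeitModel

model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="eager")
pixel_values = torch.randn(1, 3, 224, 224)  # stand-in for a preprocessed image

with torch.inference_mode():
    outputs = model(pixel_values, output_attentions=True)
print(len(outputs.attentions), outputs.attentions[0].shape)
```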
52 changes: 9 additions & 43 deletions src/transformers/models/blip_2/modeling_blip_2.py
@@ -15,7 +15,6 @@
"""PyTorch BLIP-2 model."""

import math
import warnings
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Optional, Union
@@ -1090,7 +1089,6 @@ def get_text_features(
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
legacy_output: bool = True,
) -> Union[torch.FloatTensor, CausalLMOutputWithPast]:
r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1109,12 +1107,10 @@
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
legacy_output (`bool`, *optional*, defaults to `True`):
Whether to return a model output object or a tensor of features.

Returns:
text_outputs (`CausalLMOutputWithPast` or `torch.FloatTensor`):
The language model outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`.
text_outputs (`torch.FloatTensor`):
The language model's output logits.

Examples:
```python
@@ -1129,13 +1125,6 @@
... text_features = model.get_text_features(**inputs)
```"""

if legacy_output:
warnings.warn(
"Deprecation notice: In Transformers v4.59, the default return value of `get_text_features` will change. "
"Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. "
"To opt in to the new behavior now, set `legacy_output=False`."
)

if self.config.use_decoder_only_language_model:
text_outputs: CausalLMOutputWithPast = self.language_model(
input_ids=input_ids,
@@ -1153,23 +1142,19 @@
return_dict=True,
)

return text_outputs if legacy_output else text_outputs.logits
return text_outputs.logits

@filter_out_non_signature_kwargs()
@auto_docstring
def get_image_features(
self,
pixel_values: torch.FloatTensor,
interpolate_pos_encoding: bool = False,
legacy_output: bool = True,
) -> Union[torch.FloatTensor, CausalLMOutputWithPast]:
r"""
legacy_output (`bool`, *optional*, defaults to `True`):
Whether to return a model output object or a tensor of features.

Returns:
vision_outputs (`BaseModelOutputWithPooling` or `torch.FloatTensor`):
The vision model outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`.
vision_outputs (`torch.FloatTensor`):
The vision model's pooled output from its last layer.

Examples:
```python
@@ -1187,36 +1172,25 @@
>>> with torch.inference_mode():
... image_outputs = model.get_image_features(**inputs)
```"""
if legacy_output:
warnings.warn(
"Deprecation notice: In Transformers v4.59, the default return value of `get_text_features` will change. "
"Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. "
"To opt in to the new behavior now, set `legacy_output=False`."
)

vision_outputs = self.vision_model(
pixel_values=pixel_values,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=True,
)

return vision_outputs if legacy_output else vision_outputs.pooler_output
return vision_outputs.pooler_output

@filter_out_non_signature_kwargs()
@auto_docstring
def get_qformer_features(
self,
pixel_values: torch.FloatTensor,
interpolate_pos_encoding: bool = False,
legacy_output: bool = True,
) -> Union[torch.FloatTensor, BaseModelOutputWithPooling]:
r"""
legacy_output (`bool`, *optional*, defaults to `True`):
Whether to return a model output object or a tensor of features.

Returns:
qformer_outputs (`BaseModelOutputWithPooling` or `torch.FloatTensor`):
The Q-Former outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`.
qformer_outputs (`torch.FloatTensor`):
The Q-Former model's last layer hidden states.

Examples:

@@ -1235,14 +1209,6 @@
>>> with torch.inference_mode():
... qformer_outputs = model.get_qformer_features(**inputs)
```"""

if legacy_output:
warnings.warn(
"Deprecation notice: In Transformers v4.59, the default return value of `get_qformer_features` will change. "
"Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. "
"To opt in to the new behavior now, set `legacy_output=False`."
)

vision_outputs: BaseModelOutputWithPooling = self.vision_model(
pixel_values=pixel_values,
interpolate_pos_encoding=interpolate_pos_encoding,
Expand All @@ -1262,7 +1228,7 @@ def get_qformer_features(
return_dict=True,
)

return query_outputs if legacy_output else query_outputs.last_hidden_state
return query_outputs.last_hidden_state

def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
"""
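With `legacy_output` gone, the three BLIP-2 feature helpers return plain tensors unconditionally: `get_text_features` returns the language model's logits, `get_image_features` the vision model's pooled output, and `get_qformer_features` the Q-Former's last hidden state. A hedged sketch of the tensor-returning calls (checkpoint name and the blank image are assumptions):

```python
# Hedged sketch: the feature helpers now return torch.Tensor directly; there is no
# `legacy_output` switch and no ModelOutput wrapper to unpack.
import torch
from PIL import Image
from transformers import AutoProcessor, Blip2Model

model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")  # assumed checkpoint
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

image = Image.new("RGB", (224, 224))  # blank stand-in image
text_inputs = processor(text="a photo of a cat", return_tensors="pt")
image_inputs = processor(images=image, return_tensors="pt")

with torch.inference_mode():
    text_feats = model.get_text_features(**text_inputs)         # logits tensor
    image_feats = model.get_image_features(**image_inputs)      # pooled vision output
    qformer_feats = model.get_qformer_features(**image_inputs)  # Q-Former last hidden state

print(text_feats.shape, image_feats.shape, qformer_feats.shape)
```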