4 changes: 2 additions & 2 deletions docs/source/en/model_doc/qwen2_5_omni.md
@@ -136,7 +136,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen2-5-OmniProcessor`
padding=True,
@@ -245,7 +245,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen2-5-OmniProcessor`
padding=True,
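The only functional change in this file is the video-sampling kwarg: `video_fps` becomes `fps` in both `apply_chat_template` snippets. A minimal sketch of the updated call, with an assumed checkpoint name and a placeholder conversation (neither is part of this diff):

```python
# Hedged sketch of the renamed kwarg: videos are now sampled with `fps=...`,
# not the old `video_fps=...`. Checkpoint and video URL are stand-ins.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")  # assumed checkpoint

conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video", "video": "https://example.com/clip.mp4"},  # replace with a real URL or local path
            {"type": "text", "text": "Describe this video."},
        ],
    }
]

inputs = processor.apply_chat_template(
    conversation,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
    fps=1,  # renamed from `video_fps`
    padding=True,
)
```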
10 changes: 5 additions & 5 deletions docs/source/en/model_doc/qwen2_audio.md
@@ -54,7 +54,7 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_co
prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:"
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"
audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)
inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device)
inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device)

generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
@@ -63,7 +63,7 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_

# We can also omit the audio_bos and audio_eos tokens
prompt = "<|AUDIO|>Generate the caption in English:"
inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device)
inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device)

generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
@@ -106,7 +106,7 @@ for message in conversation:
sr=processor.feature_extractor.sampling_rate)[0]
)

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to(model.device)

generate_ids = model.generate(**inputs, max_length=256)
@@ -156,7 +156,7 @@ for message in conversation:
sr=processor.feature_extractor.sampling_rate)[0]
)

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to(model.device)

generate_ids = model.generate(**inputs, max_length=256)
@@ -213,7 +213,7 @@ for conversation in conversations:
sr=processor.feature_extractor.sampling_rate)[0]
)

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
inputs['input_ids'] = inputs['input_ids'].to(model.device)
inputs.input_ids = inputs.input_ids.to(model.device)

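All five edits in this file are the same rename: the processor's audio argument is now `audio` instead of the removed `audios`. A sketch of the first (captioning) example under the new kwarg, assuming the model is loaded as in the surrounding doc:

```python
# Hedged sketch: pass the waveform as `audio=`; `audios=` is no longer accepted.
from io import BytesIO
from urllib.request import urlopen

import librosa
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration

model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B", device_map="auto")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B")

prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:"
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"
audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)

inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device)
generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(response)
```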
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/qwen3_omni_moe.md
@@ -80,7 +80,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen3OmniMoeProcessor`
padding=True,
@@ -136,7 +136,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen3OmniMoeProcessor`
padding=True,
@@ -245,7 +245,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
return_tensors="pt",
video_fps=1,
fps=1,

# kwargs to be passed to `Qwen3OmniMoeProcessor`
padding=True,
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/seamless_m4t.md
@@ -61,7 +61,7 @@ Here is how to use the processor to process text and audio:
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it
>>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
>>> audio_inputs = processor(audio=audio_sample["array"], return_tensors="pt")

>>> # now, process some English text as well
>>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/seamless_m4t_v2.md
@@ -61,7 +61,7 @@ Here is how to use the processor to process text and audio:
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it
>>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
>>> audio_inputs = processor(audio=audio_sample["array"], return_tensors="pt")

>>> # now, process some English text as well
>>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
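Both SeamlessM4T docs get the same one-line rename: the processor now takes `audio=` rather than `audios=`. A short sketch with an assumed checkpoint and a dummy waveform standing in for the dataset sample:

```python
# Hedged sketch of the renamed kwarg for the SeamlessM4T processors.
import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")  # assumed checkpoint
waveform = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz as a stand-in

audio_inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
text_inputs = processor(text="Hello, my dog is cute", src_lang="eng", return_tensors="pt")
```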
19 changes: 0 additions & 19 deletions examples/pytorch/audio-classification/run_audio_classification.py
@@ -27,7 +27,6 @@
import logging
import os
import sys
import warnings
from dataclasses import dataclass, field
from random import randint
from typing import Optional
@@ -180,29 +179,11 @@ class ModelArguments:
)
},
)
freeze_feature_extractor: Optional[bool] = field(
default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)
ignore_mismatched_sizes: bool = field(
default=False,
metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
)

def __post_init__(self):
if not self.freeze_feature_extractor and self.freeze_feature_encoder:
warnings.warn(
"The argument `--freeze_feature_extractor` is deprecated and "
"will be removed in a future version. Use `--freeze_feature_encoder` "
"instead. Setting `freeze_feature_encoder==True`.",
FutureWarning,
)
if self.freeze_feature_extractor and not self.freeze_feature_encoder:
raise ValueError(
"The argument `--freeze_feature_extractor` is deprecated and "
"should not be used in combination with `--freeze_feature_encoder`. "
"Only make use of `--freeze_feature_encoder`."
)


def main():
# See all possible arguments in src/transformers/training_args.py
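With the shim deleted, the script simply no longer knows about `--freeze_feature_extractor`; callers must pass `--freeze_feature_encoder`. A minimal sketch of how the surviving flag parses (field name taken from the removed warning text; the default shown here is an assumption, not necessarily the script's):

```python
# Hedged sketch: only `freeze_feature_encoder` is parsed now; the deprecated
# `freeze_feature_extractor` alias and its __post_init__ mapping are gone.
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class FreezeArgs:
    freeze_feature_encoder: bool = field(
        default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
    )


parser = HfArgumentParser(FreezeArgs)
(args,) = parser.parse_args_into_dataclasses(args=["--freeze_feature_encoder", "False"])
print(args.freeze_feature_encoder)  # False
```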
3 changes: 0 additions & 3 deletions src/transformers/models/aimv2/modeling_aimv2.py
@@ -37,7 +37,6 @@
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig

@@ -445,13 +444,11 @@ def __init__(self, config: Aimv2VisionConfig):
def get_input_embeddings(self) -> nn.Module:
return self.embeddings.patch_embed

@deprecate_kwarg("attention_mask", version="v4.58.0")
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values,
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
r"""
3 changes: 0 additions & 3 deletions src/transformers/models/aimv2/modular_aimv2.py
@@ -32,7 +32,6 @@
auto_docstring,
can_return_tuple,
)
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm
from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm
@@ -488,13 +487,11 @@ def __init__(self, config: Aimv2VisionConfig):
def get_input_embeddings(self) -> nn.Module:
return self.embeddings.patch_embed

@deprecate_kwarg("attention_mask", version="v4.58.0")
@check_model_inputs(tie_last_hidden_states=False)
@auto_docstring
def forward(
self,
pixel_values,
attention_mask: Optional[torch.Tensor] = None,
**kwargs: Unpack[TransformersKwargs],
) -> BaseModelOutputWithPooling:
r"""
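In both the modeling and modular files, the vision tower's `forward` drops the previously deprecated `attention_mask` parameter along with its `@deprecate_kwarg` decorator, leaving `pixel_values` plus generic kwargs. A hedged sketch of a vision-only forward pass under that signature (the `Aimv2VisionModel` class name is inferred from the config names in this diff, and the checkpoint is an assumption):

```python
# Hedged sketch: call the AIMv2 vision tower with pixel_values only; `attention_mask`
# is no longer part of this forward's signature.
import torch
from transformers import Aimv2VisionModel  # class name inferred from Aimv2VisionConfig above

model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-224")  # assumed checkpoint
pixel_values = torch.randn(1, 3, 224, 224)  # stand-in for a preprocessed image

with torch.inference_mode():
    outputs = model(pixel_values=pixel_values)
print(outputs.last_hidden_state.shape)
```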
2 changes: 0 additions & 2 deletions src/transformers/models/altclip/processing_altclip.py
@@ -17,7 +17,6 @@
"""

from ...processing_utils import ProcessorMixin
from ...utils.deprecation import deprecate_kwarg


class AltCLIPProcessor(ProcessorMixin):
@@ -39,7 +38,6 @@ class AltCLIPProcessor(ProcessorMixin):
image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast")
tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

@deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor")
def __init__(self, image_processor=None, tokenizer=None):
super().__init__(image_processor, tokenizer)

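The `@deprecate_kwarg` that used to map `feature_extractor=` onto `image_processor=` is gone, so `AltCLIPProcessor` must now be constructed with `image_processor`. A short sketch (checkpoint name assumed):

```python
# Hedged sketch: build the processor with `image_processor=`; the old
# `feature_extractor` alias is no longer accepted by __init__.
from transformers import AltCLIPProcessor, AutoImageProcessor, AutoTokenizer

image_processor = AutoImageProcessor.from_pretrained("BAAI/AltCLIP")  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained("BAAI/AltCLIP")
processor = AltCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)
```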
22 changes: 2 additions & 20 deletions src/transformers/models/beit/modeling_beit.py
@@ -16,7 +16,6 @@

import collections.abc
import math
import warnings
from dataclasses import dataclass
from typing import Optional, Union

@@ -163,14 +162,7 @@ def forward(
self,
pixel_values: torch.Tensor,
bool_masked_pos: Optional[torch.BoolTensor] = None,
interpolate_pos_encoding: Optional[bool] = None,
) -> torch.Tensor:
if self.position_embeddings is not None and interpolate_pos_encoding is not None:
warnings.warn(
"`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always "
"interpolated to the input image size. The argument will be removed in transformers v4.51.0."
)

_, _, height, width = pixel_values.shape
embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
batch_size, seq_len, _ = embeddings.size()
@@ -325,19 +317,9 @@ def forward(
) -> Union[tuple[torch.Tensor], tuple[torch.Tensor, torch.Tensor]]:
if output_attentions:
logger.warning_once(
"`BeitSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not "
"support `output_attentions=True`. Falling back to the manual attention implementation, "
"but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. "
'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
output_attentions=output_attentions,
relative_position_bias=relative_position_bias,
interpolate_pos_encoding=interpolate_pos_encoding,
resolution=resolution,
f"{self.__class__.__name__} does not support `output_attentions=True`. The returned attention weights will "
"be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model."
)

batch_size, seq_length, _ = hidden_states.shape
query_layer = (
self.query(hidden_states)
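Two behavioral notes fall out of these hunks: `interpolate_pos_encoding` is no longer an argument of `BeitEmbeddings.forward` (position embeddings are always interpolated), and the SDPA attention path no longer falls back to eager when `output_attentions=True` — it logs a warning and returns `None` attentions instead. A hedged sketch of requesting attention weights under the new behavior (checkpoint name assumed):

```python
# Hedged sketch: to get real attention weights from BEiT, load the model with the
# eager attention implementation; under SDPA the returned attentions are None.
import torch
from transformers import BeitModel

model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="eager")
pixel_values = torch.randn(1, 3, 224, 224)  # stand-in for a preprocessed image

with torch.inference_mode():
    outputs = model(pixel_values, output_attentions=True)
print(len(outputs.attentions), outputs.attentions[0].shape)
```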
52 changes: 9 additions & 43 deletions src/transformers/models/blip_2/modeling_blip_2.py
@@ -15,7 +15,6 @@
"""PyTorch BLIP-2 model."""

import math
import warnings
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any, Optional, Union
@@ -1090,7 +1089,6 @@ def get_text_features(
decoder_input_ids: Optional[torch.Tensor] = None,
decoder_attention_mask: Optional[torch.Tensor] = None,
labels: Optional[torch.Tensor] = None,
legacy_output: bool = True,
) -> Union[torch.FloatTensor, CausalLMOutputWithPast]:
r"""
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
@@ -1109,12 +1107,10 @@
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
legacy_output (`bool`, *optional*, defaults to `True`):
Whether to return a model output object or a tensor of features.

Returns:
text_outputs (`CausalLMOutputWithPast` or `torch.FloatTensor`):
The language model outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`.
text_outputs (`torch.FloatTensor`):
The language model's output logits.

Examples:
```python
@@ -1129,13 +1125,6 @@
... text_features = model.get_text_features(**inputs)
```"""

if legacy_output:
warnings.warn(
"Deprecation notice: In Transformers v4.59, the default return value of `get_text_features` will change. "
"Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. "
"To opt in to the new behavior now, set `legacy_output=False`."
)

if self.config.use_decoder_only_language_model:
text_outputs: CausalLMOutputWithPast = self.language_model(
input_ids=input_ids,
@@ -1153,23 +1142,19 @@
return_dict=True,
)

return text_outputs if legacy_output else text_outputs.logits
return text_outputs.logits

@filter_out_non_signature_kwargs()
@auto_docstring
def get_image_features(
self,
pixel_values: torch.FloatTensor,
interpolate_pos_encoding: bool = False,
legacy_output: bool = True,
) -> Union[torch.FloatTensor, CausalLMOutputWithPast]:
r"""
legacy_output (`bool`, *optional*, defaults to `True`):
Whether to return a model output object or a tensor of features.

Returns:
vision_outputs (`BaseModelOutputWithPooling` or `torch.FloatTensor`):
The vision model outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`.
vision_outputs (`torch.FloatTensor`):
The vision model's pooled output from its last layer.

Examples:
```python
@@ -1187,36 +1172,25 @@
>>> with torch.inference_mode():
... image_outputs = model.get_image_features(**inputs)
```"""
if legacy_output:
warnings.warn(
"Deprecation notice: In Transformers v4.59, the default return value of `get_text_features` will change. "
"Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. "
"To opt in to the new behavior now, set `legacy_output=False`."
)

vision_outputs = self.vision_model(
pixel_values=pixel_values,
interpolate_pos_encoding=interpolate_pos_encoding,
return_dict=True,
)

return vision_outputs if legacy_output else vision_outputs.pooler_output
return vision_outputs.pooler_output

@filter_out_non_signature_kwargs()
@auto_docstring
def get_qformer_features(
self,
pixel_values: torch.FloatTensor,
interpolate_pos_encoding: bool = False,
legacy_output: bool = True,
) -> Union[torch.FloatTensor, BaseModelOutputWithPooling]:
r"""
legacy_output (`bool`, *optional*, defaults to `True`):
Whether to return a model output object or a tensor of features.

Returns:
qformer_outputs (`BaseModelOutputWithPooling` or `torch.FloatTensor`):
The Q-Former outputs. If `legacy_output=False`, the output is a `torch.FloatTensor`.
qformer_outputs (`torch.FloatTensor`):
The Q-Former model's last layer hidden states.

Examples:

@@ -1235,14 +1209,6 @@
>>> with torch.inference_mode():
... qformer_outputs = model.get_qformer_features(**inputs)
```"""

if legacy_output:
warnings.warn(
"Deprecation notice: In Transformers v4.59, the default return value of `get_qformer_features` will change. "
"Currently, this method returns a model output object, but starting in v4.59, it will return a tensor instead. "
"To opt in to the new behavior now, set `legacy_output=False`."
)

vision_outputs: BaseModelOutputWithPooling = self.vision_model(
pixel_values=pixel_values,
interpolate_pos_encoding=interpolate_pos_encoding,
Expand All @@ -1262,7 +1228,7 @@ def get_qformer_features(
return_dict=True,
)

return query_outputs if legacy_output else query_outputs.last_hidden_state
return query_outputs.last_hidden_state

def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
"""
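With `legacy_output` gone, the three BLIP-2 feature helpers return plain tensors unconditionally: `get_text_features` returns the language model's logits, `get_image_features` the vision model's pooled output, and `get_qformer_features` the Q-Former's last hidden state. A hedged sketch of the tensor-returning calls (checkpoint name and the blank image are assumptions):

```python
# Hedged sketch: the feature helpers now return torch.Tensor directly; there is no
# `legacy_output` switch and no ModelOutput wrapper to unpack.
import torch
from PIL import Image
from transformers import AutoProcessor, Blip2Model

model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b")  # assumed checkpoint
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")

image = Image.new("RGB", (224, 224))  # blank stand-in image
text_inputs = processor(text="a photo of a cat", return_tensors="pt")
image_inputs = processor(images=image, return_tensors="pt")

with torch.inference_mode():
    text_feats = model.get_text_features(**text_inputs)         # logits tensor
    image_feats = model.get_image_features(**image_inputs)      # pooled vision output
    qformer_feats = model.get_qformer_features(**image_inputs)  # Q-Former last hidden state

print(text_feats.shape, image_feats.shape, qformer_feats.shape)
```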