diff --git a/docs/source/en/model_doc/align.md b/docs/source/en/model_doc/align.md index 275b510ccd5c..dbb11ae0ab36 100644 --- a/docs/source/en/model_doc/align.md +++ b/docs/source/en/model_doc/align.md @@ -154,7 +154,6 @@ for label, score in zip(candidate_labels, probs): ## AlignConfig [[autodoc]] AlignConfig - - from_text_vision_configs ## AlignTextConfig diff --git a/docs/source/en/model_doc/blip-2.md b/docs/source/en/model_doc/blip-2.md index faaaee7b0840..5f5d5efd7a15 100644 --- a/docs/source/en/model_doc/blip-2.md +++ b/docs/source/en/model_doc/blip-2.md @@ -60,7 +60,6 @@ If you're interested in submitting a resource to be included here, please feel f ## Blip2Config [[autodoc]] Blip2Config - - from_vision_qformer_text_configs ## Blip2VisionConfig diff --git a/docs/source/en/model_doc/blip.md b/docs/source/en/model_doc/blip.md index 5e727050f6ee..9c30c29ee5a1 100644 --- a/docs/source/en/model_doc/blip.md +++ b/docs/source/en/model_doc/blip.md @@ -87,7 +87,6 @@ Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/exam ## BlipConfig [[autodoc]] BlipConfig - - from_text_vision_configs ## BlipTextConfig diff --git a/docs/source/en/model_doc/chinese_clip.md b/docs/source/en/model_doc/chinese_clip.md index 859534a7a577..c804ce3f04d7 100644 --- a/docs/source/en/model_doc/chinese_clip.md +++ b/docs/source/en/model_doc/chinese_clip.md @@ -76,7 +76,6 @@ Currently, following scales of pretrained Chinese-CLIP models are available on ## ChineseCLIPConfig [[autodoc]] ChineseCLIPConfig - - from_text_vision_configs ## ChineseCLIPTextConfig diff --git a/docs/source/en/model_doc/clap.md b/docs/source/en/model_doc/clap.md index ff8428141c4a..a1fe7753feb2 100644 --- a/docs/source/en/model_doc/clap.md +++ b/docs/source/en/model_doc/clap.md @@ -63,7 +63,6 @@ print(f"Text embeddings: {text_features}") ## ClapConfig [[autodoc]] ClapConfig - - from_text_audio_configs ## ClapTextConfig diff --git a/docs/source/en/model_doc/clip.md b/docs/source/en/model_doc/clip.md index 6320e0f39853..529194d32a37 100644 --- a/docs/source/en/model_doc/clip.md +++ b/docs/source/en/model_doc/clip.md @@ -87,7 +87,6 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_ ## CLIPConfig [[autodoc]] CLIPConfig - - from_text_vision_configs ## CLIPTextConfig diff --git a/docs/source/en/model_doc/clipseg.md b/docs/source/en/model_doc/clipseg.md index 099fd4fb1bac..6af0bb754de4 100644 --- a/docs/source/en/model_doc/clipseg.md +++ b/docs/source/en/model_doc/clipseg.md @@ -72,7 +72,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h ## CLIPSegConfig [[autodoc]] CLIPSegConfig - - from_text_vision_configs ## CLIPSegTextConfig diff --git a/docs/source/en/model_doc/clvp.md b/docs/source/en/model_doc/clvp.md index eead4a546435..ad3bc51161cd 100644 --- a/docs/source/en/model_doc/clvp.md +++ b/docs/source/en/model_doc/clvp.md @@ -73,7 +73,6 @@ Example : ## ClvpConfig [[autodoc]] ClvpConfig - - from_sub_model_configs ## ClvpEncoderConfig diff --git a/docs/source/en/model_doc/groupvit.md b/docs/source/en/model_doc/groupvit.md index f5569d72398f..646da0fa4ab7 100644 --- a/docs/source/en/model_doc/groupvit.md +++ b/docs/source/en/model_doc/groupvit.md @@ -46,7 +46,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h ## GroupViTConfig [[autodoc]] GroupViTConfig - - from_text_vision_configs ## GroupViTTextConfig diff --git a/docs/source/en/model_doc/instructblip.md b/docs/source/en/model_doc/instructblip.md index 
ac84a71d887e..7cbab82b287e 100644 --- a/docs/source/en/model_doc/instructblip.md +++ b/docs/source/en/model_doc/instructblip.md @@ -45,7 +45,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok ## InstructBlipConfig [[autodoc]] InstructBlipConfig - - from_vision_qformer_text_configs ## InstructBlipVisionConfig diff --git a/docs/source/en/model_doc/instructblipvideo.md b/docs/source/en/model_doc/instructblipvideo.md index d4d868b7f90e..3a6ba29d243f 100644 --- a/docs/source/en/model_doc/instructblipvideo.md +++ b/docs/source/en/model_doc/instructblipvideo.md @@ -45,7 +45,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok ## InstructBlipVideoConfig [[autodoc]] InstructBlipVideoConfig - - from_vision_qformer_text_configs ## InstructBlipVideoVisionConfig diff --git a/docs/source/en/model_doc/metaclip_2.md b/docs/source/en/model_doc/metaclip_2.md index b9fbba090f0a..ce17459b8d85 100644 --- a/docs/source/en/model_doc/metaclip_2.md +++ b/docs/source/en/model_doc/metaclip_2.md @@ -88,7 +88,6 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_ ## MetaClip2Config [[autodoc]] MetaClip2Config - - from_text_vision_configs ## MetaClip2TextConfig diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md index 675dc1c9c0d5..88ca524dd4c4 100644 --- a/docs/source/en/model_doc/owlv2.md +++ b/docs/source/en/model_doc/owlv2.md @@ -90,7 +90,6 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce ## Owlv2Config [[autodoc]] Owlv2Config - - from_text_vision_configs ## Owlv2TextConfig diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md index ceae23b4cf9f..06d88fdf98b5 100644 --- a/docs/source/en/model_doc/owlvit.md +++ b/docs/source/en/model_doc/owlvit.md @@ -81,7 +81,6 @@ A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object de ## OwlViTConfig [[autodoc]] OwlViTConfig - - from_text_vision_configs ## OwlViTTextConfig diff --git a/docs/source/en/model_doc/pix2struct.md b/docs/source/en/model_doc/pix2struct.md index 412d2c2fef95..6894ba7bb593 100644 --- a/docs/source/en/model_doc/pix2struct.md +++ b/docs/source/en/model_doc/pix2struct.md @@ -47,7 +47,6 @@ The original code can be found [here](https://github.com/google-research/pix2str ## Pix2StructConfig [[autodoc]] Pix2StructConfig - - from_text_vision_configs ## Pix2StructTextConfig diff --git a/docs/source/en/model_doc/siglip.md b/docs/source/en/model_doc/siglip.md index bf9c0a460348..28def85a8b03 100644 --- a/docs/source/en/model_doc/siglip.md +++ b/docs/source/en/model_doc/siglip.md @@ -130,7 +130,6 @@ print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'") ## SiglipConfig [[autodoc]] SiglipConfig - - from_text_vision_configs ## SiglipTextConfig diff --git a/docs/source/en/model_doc/xclip.md b/docs/source/en/model_doc/xclip.md index e3219f6d8f30..529879c7bcb3 100644 --- a/docs/source/en/model_doc/xclip.md +++ b/docs/source/en/model_doc/xclip.md @@ -57,7 +57,6 @@ If you're interested in submitting a resource to be included here, please feel f ## XCLIPConfig [[autodoc]] XCLIPConfig - - from_text_vision_configs ## XCLIPTextConfig diff --git a/docs/source/ja/model_doc/align.md b/docs/source/ja/model_doc/align.md index d1ff4d918a64..efa380dcc087 100644 --- a/docs/source/ja/model_doc/align.md +++ b/docs/source/ja/model_doc/align.md @@ -72,7 +72,6 @@ ALIGNの使用を開始するのに役立つ公式のHugging Faceとコミュニ ## AlignConfig [[autodoc]] AlignConfig - - 
from_text_vision_configs ## AlignTextConfig diff --git a/docs/source/ja/model_doc/altclip.md b/docs/source/ja/model_doc/altclip.md index fe721d29bfe5..108fc55c955a 100644 --- a/docs/source/ja/model_doc/altclip.md +++ b/docs/source/ja/model_doc/altclip.md @@ -65,7 +65,6 @@ Transformerエンコーダーに画像を与えるには、各画像を固定サ ## AltCLIPConfig [[autodoc]] AltCLIPConfig - - from_text_vision_configs ## AltCLIPTextConfig diff --git a/docs/source/ja/model_doc/blip-2.md b/docs/source/ja/model_doc/blip-2.md index 52a092ac9ae6..594631b3e233 100644 --- a/docs/source/ja/model_doc/blip-2.md +++ b/docs/source/ja/model_doc/blip-2.md @@ -51,7 +51,6 @@ BLIP-2 の使用を開始するのに役立つ公式 Hugging Face およびコ ## Blip2Config [[autodoc]] Blip2Config - - from_vision_qformer_text_configs ## Blip2VisionConfig diff --git a/docs/source/ja/model_doc/blip.md b/docs/source/ja/model_doc/blip.md index bda95695923f..f55d4edf17e0 100644 --- a/docs/source/ja/model_doc/blip.md +++ b/docs/source/ja/model_doc/blip.md @@ -42,7 +42,6 @@ BLIP は、次のようなさまざまなマルチモーダル タスクを実 ## BlipConfig [[autodoc]] BlipConfig - - from_text_vision_configs ## BlipTextConfig diff --git a/docs/source/ja/model_doc/chinese_clip.md b/docs/source/ja/model_doc/chinese_clip.md index c5a258c4962c..979c2b798458 100644 --- a/docs/source/ja/model_doc/chinese_clip.md +++ b/docs/source/ja/model_doc/chinese_clip.md @@ -71,7 +71,6 @@ Chinese-CLIP モデルは、[OFA-Sys](https://huggingface.co/OFA-Sys) によっ ## ChineseCLIPConfig [[autodoc]] ChineseCLIPConfig - - from_text_vision_configs ## ChineseCLIPTextConfig diff --git a/docs/source/ja/model_doc/clap.md b/docs/source/ja/model_doc/clap.md index 1a5f2b5dfef1..ee8befe3e28e 100644 --- a/docs/source/ja/model_doc/clap.md +++ b/docs/source/ja/model_doc/clap.md @@ -33,7 +33,6 @@ CLAP (Contrastive Language-Audio Pretraining) は、さまざまな (音声、 ## ClapConfig [[autodoc]] ClapConfig - - from_text_audio_configs ## ClapTextConfig diff --git a/docs/source/ja/model_doc/clip.md b/docs/source/ja/model_doc/clip.md index acfab70139f1..3594fbd8b216 100644 --- a/docs/source/ja/model_doc/clip.md +++ b/docs/source/ja/model_doc/clip.md @@ -106,7 +106,6 @@ CLIP を使い始めるのに役立つ公式 Hugging Face およびコミュニ ## CLIPConfig [[autodoc]] CLIPConfig - - from_text_vision_configs ## CLIPTextConfig diff --git a/docs/source/ja/model_doc/clipseg.md b/docs/source/ja/model_doc/clipseg.md index 8853565fac0c..3ad91c8d7ad4 100644 --- a/docs/source/ja/model_doc/clipseg.md +++ b/docs/source/ja/model_doc/clipseg.md @@ -67,7 +67,6 @@ CLIPSeg の使用を開始するのに役立つ、公式 Hugging Face および ## CLIPSegConfig [[autodoc]] CLIPSegConfig - - from_text_vision_configs ## CLIPSegTextConfig diff --git a/docs/source/ja/model_doc/clvp.md b/docs/source/ja/model_doc/clvp.md index 874e0779c7c3..654addc180dd 100644 --- a/docs/source/ja/model_doc/clvp.md +++ b/docs/source/ja/model_doc/clvp.md @@ -70,7 +70,6 @@ CLVP (Contrastive Language-Voice Pretrained Transformer) モデルは、James Be ## ClvpConfig [[autodoc]] ClvpConfig - - from_sub_model_configs ## ClvpEncoderConfig diff --git a/docs/source/ko/model_doc/altclip.md b/docs/source/ko/model_doc/altclip.md index f736ab9c5c94..117a18c76774 100644 --- a/docs/source/ko/model_doc/altclip.md +++ b/docs/source/ko/model_doc/altclip.md @@ -46,7 +46,6 @@ AltCLIP은 멀티모달 비전 및 언어 모델입니다. 
이미지와 텍스 ## AltCLIPConfig [[autodoc]] AltCLIPConfig - - from_text_vision_configs ## AltCLIPTextConfig diff --git a/docs/source/ko/model_doc/blip-2.md b/docs/source/ko/model_doc/blip-2.md index c9d7b99f81d3..648a20b16c95 100644 --- a/docs/source/ko/model_doc/blip-2.md +++ b/docs/source/ko/model_doc/blip-2.md @@ -46,7 +46,6 @@ BLIP-2를 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티 ## Blip2Config[[transformers.Blip2Config]] [[autodoc]] Blip2Config - - from_vision_qformer_text_configs ## Blip2VisionConfig[[transformers.Blip2VisionConfig]] diff --git a/docs/source/ko/model_doc/blip.md b/docs/source/ko/model_doc/blip.md index 4aa81c0b9cd3..3342decf902e 100644 --- a/docs/source/ko/model_doc/blip.md +++ b/docs/source/ko/model_doc/blip.md @@ -42,7 +42,6 @@ BLIP은 여러 멀티모달 작업을 수행할 수 있는 모델입니다: ## BlipConfig[[transformers.BlipConfig]] [[autodoc]] BlipConfig - - from_text_vision_configs ## BlipTextConfig[[transformers.BlipTextConfig]] diff --git a/docs/source/ko/model_doc/clip.md b/docs/source/ko/model_doc/clip.md index 4929408e6e13..62df5ed03bf8 100644 --- a/docs/source/ko/model_doc/clip.md +++ b/docs/source/ko/model_doc/clip.md @@ -199,7 +199,6 @@ CLIP을 시작하는 데 도움이 되는 Hugging Face와 community 자료 목 ## CLIPConfig[[transformers.CLIPConfig]] [[autodoc]] CLIPConfig - - from_text_vision_configs ## CLIPTextConfig[[transformers.CLIPTextConfig]] diff --git a/docs/source/ko/model_doc/clipseg.md b/docs/source/ko/model_doc/clipseg.md index a2a28ca64a69..12846635b055 100644 --- a/docs/source/ko/model_doc/clipseg.md +++ b/docs/source/ko/model_doc/clipseg.md @@ -55,7 +55,6 @@ CLIPSeg를 시작하는 데 도움이 될 Hugging Face 공식 자료와 커뮤 ## CLIPSegConfig[[transformers.CLIPSegConfig]] [[autodoc]] CLIPSegConfig - - from_text_vision_configs ## CLIPSegTextConfig[[transformers.CLIPSegTextConfig]] diff --git a/docs/source/ko/model_doc/siglip.md b/docs/source/ko/model_doc/siglip.md index c25823848cd5..f1221be16d97 100644 --- a/docs/source/ko/model_doc/siglip.md +++ b/docs/source/ko/model_doc/siglip.md @@ -197,7 +197,6 @@ PyTorch는 `torch.nn.functional`의 일부로 스케일된 점곱 어텐션(SDPA ## SiglipConfig [[autodoc]] SiglipConfig - - from_text_vision_configs ## SiglipTextConfig diff --git a/docs/source/ko/model_doc/xclip.md b/docs/source/ko/model_doc/xclip.md index 62cc5964249b..52882b25c33c 100644 --- a/docs/source/ko/model_doc/xclip.md +++ b/docs/source/ko/model_doc/xclip.md @@ -56,7 +56,6 @@ X-CLIP을 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티 ## XCLIPConfig[[xclipconfig]] [[autodoc]] XCLIPConfig - - from_text_vision_configs ## XCLIPTextConfig[[xcliptextconfig]] diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index d00289ebe8c5..7f186a32437b 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -1131,11 +1131,11 @@ def _get_non_default_generation_parameters(self) -> dict[str, Any]: non_default_generation_parameters = {} decoder_attribute_name = None - # Composite models don't have a default config, use their decoder config as a fallback for default values + # Some composite models don't have a default config, use their decoder config as a fallback for default values # If no known pattern is matched, then `default_config = None` -> check against the global generation defaults - try: + if not self.has_no_defaults_at_init: default_config = self.__class__() - except ValueError: + else: decoder_config = self.get_text_config(decoder=True) if decoder_config is not self: default_config = decoder_config.__class__() @@ -1246,42 +1246,6 @@ def get_text_config(self, decoder=None, encoder=None) 
-> "PreTrainedConfig": return config_to_return - @classmethod - def from_text_vision_configs(cls, text_config, vision_config, **kwargs): - r""" - Instantiate a model config (or a derived class) from text model configuration and vision model - configuration. - - Returns: - [`PreTrainedConfig`]: An instance of a configuration object - """ - - warnings.warn( - "The `from_text_vision_configs` method is deprecated and will be removed in v4.60 of Transformers. Please instantiate " - "the config class directly with `MyConfig(text_config=text_config, vision_config=vision_config, **kwargs)` instead.", - FutureWarning, - ) - - return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) - - @classmethod - def from_text_audio_configs(cls, text_config, audio_config, **kwargs): - r""" - Instantiate a model config (or a derived class) from text model configuration and audio model - configuration. - - Returns: - [`PreTrainedConfig`]: An instance of a configuration object - """ - - warnings.warn( - "The `from_text_audio_configs` method is deprecated and will be removed in v4.60 of Transformers. Please instantiate " - "the config class directly with `MyConfig(text_config=text_config, audio_config=audio_config, **kwargs)` instead.", - FutureWarning, - ) - - return cls(text_config=text_config.to_dict(), audio_config=audio_config.to_dict(), **kwargs) - def get_configuration_file(configuration_files: list[str]) -> str: """ diff --git a/src/transformers/models/aimv2/configuration_aimv2.py b/src/transformers/models/aimv2/configuration_aimv2.py index ee2e1f2052b9..49abf63e6c14 100644 --- a/src/transformers/models/aimv2/configuration_aimv2.py +++ b/src/transformers/models/aimv2/configuration_aimv2.py @@ -264,21 +264,25 @@ class Aimv2Config(PreTrainedConfig): def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs ): - super().__init__(**kwargs) - + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.max_logit_scale = 100.0 if text_config is None: - text_config = {} + text_config = Aimv2TextConfig() logger.info("`text_config` is `None`. Initializing the `Aimv2TextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = Aimv2TextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = Aimv2VisionConfig() logger.info("`vision_config` is `None`. 
initializing the `Aimv2VisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = Aimv2VisionConfig(**vision_config) - self.text_config = Aimv2TextConfig(**text_config) - self.vision_config = Aimv2VisionConfig(**vision_config) - self.projection_dim = projection_dim - self.logit_scale_init_value = logit_scale_init_value - self.max_logit_scale = 100.0 + self.text_config = text_config + self.vision_config = vision_config + + super().__init__(**kwargs) __all__ = ["Aimv2Config", "Aimv2VisionConfig", "Aimv2TextConfig"] diff --git a/src/transformers/models/aimv2/modular_aimv2.py b/src/transformers/models/aimv2/modular_aimv2.py index 18ef50e5bcc1..1320ac8c64ac 100644 --- a/src/transformers/models/aimv2/modular_aimv2.py +++ b/src/transformers/models/aimv2/modular_aimv2.py @@ -280,10 +280,10 @@ class Aimv2Config(SiglipConfig): def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs ): - super().__init__(text_config, vision_config, **kwargs) self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.max_logit_scale = 100.0 + super().__init__(text_config, vision_config, **kwargs) del self.initializer_factor diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 2802c3baa6b4..570fb8346b37 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -287,7 +287,7 @@ class AlignConfig(PreTrainedConfig): >>> config_text = AlignTextConfig() >>> config_vision = AlignVisionConfig() - >>> config = AlignConfig.from_text_vision_configs(config_text, config_vision) + >>> config = AlignConfig(text_config=config_text, vision_config=config_vision) ```""" model_type = "align" @@ -302,22 +302,25 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the AlignTextConfig with default values.") + text_config = AlignTextConfig() + logger.info("`text_config` is `None`. Initializing the `AlignTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = AlignTextConfig(**text_config) if vision_config is None: - vision_config = {} - logger.info("vision_config is None. Initializing the AlignVisionConfig with default values.") + vision_config = AlignVisionConfig() + logger.info("`vision_config` is `None`. 
initializing the `AlignVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = AlignVisionConfig(**vision_config) - self.text_config = AlignTextConfig(**text_config) - self.vision_config = AlignVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.temperature_init_value = temperature_init_value self.initializer_range = initializer_range + super().__init__(**kwargs) __all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"] diff --git a/src/transformers/models/align/convert_align_tf_to_hf.py b/src/transformers/models/align/convert_align_tf_to_hf.py index 74309a0d7076..874befda5277 100644 --- a/src/transformers/models/align/convert_align_tf_to_hf.py +++ b/src/transformers/models/align/convert_align_tf_to_hf.py @@ -56,9 +56,7 @@ def get_align_config(): vision_config.depthwise_padding = [] text_config = BertConfig() - config = AlignConfig.from_text_vision_configs( - text_config=text_config, vision_config=vision_config, projection_dim=640 - ) + config = AlignConfig(text_config=text_config, vision_config=vision_config, projection_dim=640) return config diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 88e49d06ca5c..7d950d0cf5de 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -266,7 +266,7 @@ class AltCLIPConfig(PreTrainedConfig): >>> config_text = AltCLIPTextConfig() >>> config_vision = AltCLIPVisionConfig() - >>> config = AltCLIPConfig.from_text_vision_configs(config_text, config_vision) + >>> config = AltCLIPConfig(text_config=config_text, vision_config=config_vision) ```""" model_type = "altclip" @@ -281,8 +281,6 @@ def __init__( text_config_dict = kwargs.pop("text_config_dict", None) vision_config_dict = kwargs.pop("vision_config_dict", None) - super().__init__(**kwargs) - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. @@ -346,19 +344,24 @@ def __init__( vision_config.update(_vision_config_dict) if text_config is None: - text_config = {} + text_config = AltCLIPTextConfig() logger.info("`text_config` is `None`. Initializing the `AltCLIPTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = AltCLIPTextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = AltCLIPVisionConfig() logger.info("`vision_config` is `None`. 
initializing the `AltCLIPVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = AltCLIPVisionConfig(**vision_config) - self.text_config = AltCLIPTextConfig(**text_config) - self.vision_config = AltCLIPVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = 1.0 + super().__init__(**kwargs) __all__ = ["AltCLIPTextConfig", "AltCLIPVisionConfig", "AltCLIPConfig"] diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 451acad65200..8aed0d4a812d 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -180,13 +180,10 @@ def __init__( moe_num_shared_experts: int = 2, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + self.intermediate_size = intermediate_size + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -215,9 +212,14 @@ def __init__( if self.rope_scaling is not None and "type" in self.rope_scaling: self.rope_scaling["rope_type"] = self.rope_scaling["type"] rope_config_validation(self) - self.moe_num_experts = moe_num_experts - self.moe_topk = moe_topk - self.moe_num_shared_experts = moe_num_shared_experts + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class AriaConfig(PreTrainedConfig): diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index 46e35911c1f1..4853ef361eb8 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -214,11 +214,11 @@ def __init__( pad_token_id=2, **super_kwargs, ): - super().__init__(pad_token_id=pad_token_id, **super_kwargs) self.intermediate_size = intermediate_size self.moe_num_experts = moe_num_experts self.moe_topk = moe_topk self.moe_num_shared_experts = moe_num_shared_experts + super().__init__(pad_token_id=pad_token_id, **super_kwargs) class AriaConfig(PreTrainedConfig): diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index d5ec180c459b..7355356b90db 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -221,7 +221,7 @@ class BarkConfig(PreTrainedConfig): >>> # Initializing a Bark module style configuration - >>> configuration = BarkConfig.from_sub_model_configs( + >>> configuration = BarkConfig( ... semantic_config, coarse_acoustics_config, fine_acoustics_config, codec_config ... ) @@ -251,53 +251,40 @@ def __init__( **kwargs, ): if semantic_config is None: - semantic_config = {} - logger.info("semantic_config is None. initializing the semantic model with default values.") + semantic_config = BarkSemanticConfig() + logger.info("`semantic_config` is `None`. 
Initializing the `BarkSemanticConfig` with default values.") + elif isinstance(semantic_config, dict): + semantic_config = BarkSemanticConfig(**semantic_config) if coarse_acoustics_config is None: - coarse_acoustics_config = {} - logger.info("coarse_acoustics_config is None. initializing the coarse model with default values.") + coarse_acoustics_config = BarkCoarseConfig() + logger.info( + "`coarse_acoustics_config` is `None`. Initializing the `BarkCoarseConfig` with default values." + ) + elif isinstance(coarse_acoustics_config, dict): + coarse_acoustics_config = BarkCoarseConfig(**coarse_acoustics_config) if fine_acoustics_config is None: - fine_acoustics_config = {} - logger.info("fine_acoustics_config is None. initializing the fine model with default values.") + fine_acoustics_config = BarkFineConfig() + logger.info("`fine_acoustics_config` is `None`. Initializing the `BarkFineConfig` with default values.") + elif isinstance(fine_acoustics_config, dict): + fine_acoustics_config = BarkFineConfig(**fine_acoustics_config) if codec_config is None: - codec_config = {} - logger.info("codec_config is None. initializing the codec model with default values.") + codec_config = CONFIG_MAPPING["encodec"]() + logger.info("`codec_config` is `None`. Initializing the `codec_config` with default values.") + elif isinstance(codec_config, dict): + codec_model_type = codec_config.get("model_type", "encodec") + codec_config = CONFIG_MAPPING[codec_model_type](**codec_config) - self.semantic_config = BarkSemanticConfig(**semantic_config) - self.coarse_acoustics_config = BarkCoarseConfig(**coarse_acoustics_config) - self.fine_acoustics_config = BarkFineConfig(**fine_acoustics_config) - codec_model_type = codec_config.get("model_type", "encodec") - self.codec_config = CONFIG_MAPPING[codec_model_type](**codec_config) + self.semantic_config = semantic_config + self.coarse_acoustics_config = coarse_acoustics_config + self.fine_acoustics_config = fine_acoustics_config + self.codec_config = codec_config self.initializer_range = initializer_range super().__init__(**kwargs) - @classmethod - def from_sub_model_configs( - cls, - semantic_config: BarkSemanticConfig, - coarse_acoustics_config: BarkCoarseConfig, - fine_acoustics_config: BarkFineConfig, - codec_config: PreTrainedConfig, - **kwargs, - ): - r""" - Instantiate a [`BarkConfig`] (or a derived class) from bark sub-models configuration. 
- - Returns: - [`BarkConfig`]: An instance of a configuration object - """ - return cls( - semantic_config=semantic_config.to_dict(), - coarse_acoustics_config=coarse_acoustics_config.to_dict(), - fine_acoustics_config=fine_acoustics_config.to_dict(), - codec_config=codec_config.to_dict(), - **kwargs, - ) - __all__ = ["BarkCoarseConfig", "BarkConfig", "BarkFineConfig", "BarkSemanticConfig"] diff --git a/src/transformers/models/bark/convert_suno_to_hf.py b/src/transformers/models/bark/convert_suno_to_hf.py index af2c4f3e8d73..d1c2f85a3c7b 100644 --- a/src/transformers/models/bark/convert_suno_to_hf.py +++ b/src/transformers/models/bark/convert_suno_to_hf.py @@ -229,11 +229,9 @@ def load_whole_bark_model( fineAcoustic = BarkFineModel.from_pretrained(fine_path) codec = EncodecModel.from_pretrained("facebook/encodec_24khz") - bark_config = BarkConfig.from_sub_model_configs( - semanticConfig, coarseAcousticConfig, fineAcousticConfig, codecConfig - ) + bark_config = BarkConfig(semanticConfig, coarseAcousticConfig, fineAcousticConfig, codecConfig) - bark_generation_config = BarkGenerationConfig.from_sub_model_configs( + bark_generation_config = BarkGenerationConfig( semantic.generation_config, coarseAcoustic.generation_config, fineAcoustic.generation_config ) diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 66e918499d55..e03c0fe8dbd4 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -275,7 +275,7 @@ class BlipConfig(PreTrainedConfig): >>> config_text = BlipTextConfig() >>> config_vision = BlipVisionConfig() - >>> config = BlipConfig.from_text_vision_configs(config_text, config_vision) + >>> config = BlipConfig(text_config=config_text, vision_config=config_vision) ```""" model_type = "blip" @@ -291,18 +291,20 @@ def __init__( label_smoothing=0.0, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: - text_config = {} + text_config = BlipTextConfig() logger.info("`text_config` is `None`. Initializing the `BlipTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = BlipTextConfig(**text_config) if vision_config is None: - vision_config = {} - logger.info("`vision_config` is `None`. Initializing the `BlipVisionConfig` with default values.") + vision_config = BlipVisionConfig() + logger.info("`vision_config` is `None`. initializing the `BlipVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = BlipVisionConfig(**vision_config) - self.text_config = BlipTextConfig(**text_config) - self.vision_config = BlipVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.text_config.encoder_hidden_size = self.vision_config.hidden_size @@ -312,6 +314,7 @@ def __init__( self.initializer_range = 0.02 self.image_text_hidden_size = image_text_hidden_size self.label_smoothing = label_smoothing + super().__init__(**kwargs) __all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"] diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 8e5cda91b65c..2694fdeb1085 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -14,8 +14,6 @@ # limitations under the License. 
"""BLIP-2 model configuration""" -from typing import Optional - from ...configuration_utils import PreTrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging @@ -261,7 +259,7 @@ class Blip2Config(PreTrainedConfig): >>> qformer_config = Blip2QFormerConfig() >>> text_config = OPTConfig() - >>> config = Blip2Config.from_text_vision_configs(vision_config, qformer_config, text_config) + >>> config = Blip2Config(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config) ```""" model_type = "blip-2" @@ -280,64 +278,39 @@ def __init__( image_token_index=None, **kwargs, ): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.") + if text_config is None: + text_config = CONFIG_MAPPING["opt"]() + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + elif isinstance(text_config, dict): + text_model_type = text_config.get("model_type", "opt") + text_config = CONFIG_MAPPING[text_model_type](**text_config) if qformer_config is None: - qformer_config = {} + qformer_config = Blip2QFormerConfig() logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.") + elif isinstance(qformer_config, dict): + qformer_config = Blip2QFormerConfig(**qformer_config) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + if vision_config is None: + vision_config = Blip2VisionConfig() + logger.info("`vision_config` is `None`. initializing the `Blip2VisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = Blip2VisionConfig(**vision_config) - self.vision_config = Blip2VisionConfig(**vision_config) - self.qformer_config = Blip2QFormerConfig(**qformer_config) - text_model_type = text_config.get("model_type", "opt") - self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + self.text_config = text_config + self.vision_config = vision_config + self.qformer_config = qformer_config self.num_query_tokens = num_query_tokens self.image_text_hidden_size = image_text_hidden_size self.image_token_index = image_token_index self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES - self.is_encoder_decoder = self.text_config.is_encoder_decoder self.initializer_factor = 1.0 self.initializer_range = 0.02 - @classmethod - def from_vision_qformer_text_configs( - cls, - vision_config: Blip2VisionConfig, - qformer_config: Blip2QFormerConfig, - text_config: Optional[PreTrainedConfig] = None, - **kwargs, - ): - r""" - Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model - configurations. - - Args: - vision_config (`dict`): - Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. - qformer_config (`dict`): - Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. - text_config (`dict`, *optional*): - Dictionary of configuration options used to initialize any [`PreTrainedConfig`]. 
- - Returns: - [`Blip2Config`]: An instance of a configuration object - """ - - return cls( - vision_config=vision_config.to_dict(), - qformer_config=qformer_config.to_dict(), - text_config=text_config.to_dict() if text_config is not None else None, - **kwargs, - ) + kwargs["is_encoder_decoder"] = self.text_config.is_encoder_decoder + super().__init__(**kwargs) __all__ = ["Blip2Config", "Blip2QFormerConfig", "Blip2VisionConfig"] diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index e53a7a9bc8a5..7a0dcf754711 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -272,7 +272,6 @@ def __init__( _ = kwargs.pop("text_config_dict", None) _ = kwargs.pop("vision_config_dict", None) - super().__init__(**kwargs) self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers self.hidden_act = hidden_act self.hidden_size = hidden_size @@ -286,15 +285,20 @@ def __init__( self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `BridgeTowerTextConfig` with default values.") + text_config = BridgeTowerTextConfig() + logger.info("`text_config` is `None`. initializing the `BridgeTowerTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = BridgeTowerTextConfig(**text_config) if vision_config is None: - vision_config = {} - logger.info("`vision_config` is `None`. Initializing the `BridgeTowerVisionConfig` with default values.") + vision_config = BridgeTowerVisionConfig() + logger.info("`vision_config` is `None`. initializing the `BridgeTowerVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = BridgeTowerVisionConfig(**vision_config) - self.text_config = BridgeTowerTextConfig(**text_config) - self.vision_config = BridgeTowerVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config + super().__init__(**kwargs) __all__ = ["BridgeTowerConfig", "BridgeTowerTextConfig", "BridgeTowerVisionConfig"] diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 73c7dd2c6596..72f37d5f58a8 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -269,7 +269,7 @@ class ChineseCLIPConfig(PreTrainedConfig): >>> config_text = ChineseCLIPTextConfig() >>> config_vision = ChineseCLIPVisionConfig() - >>> config = ChineseCLIPConfig.from_text_vision_configs(config_text, config_vision) + >>> config = ChineseCLIPConfig(text_config=config_text, vision_config=config_vision) ```""" model_type = "chinese_clip" @@ -284,8 +284,6 @@ def __init__( text_config_dict = kwargs.pop("text_config_dict", None) vision_config_dict = kwargs.pop("vision_config_dict", None) - super().__init__(**kwargs) - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. 
@@ -349,20 +347,25 @@ def __init__( vision_config.update(_vision_config_dict) if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `ChineseCLIPTextConfig` with default values.") + text_config = ChineseCLIPTextConfig() + logger.info("`text_config` is `None`. initializing the `ChineseCLIPTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = ChineseCLIPTextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = ChineseCLIPVisionConfig() logger.info("`vision_config` is `None`. initializing the `ChineseCLIPVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = ChineseCLIPVisionConfig(**vision_config) - self.text_config = ChineseCLIPTextConfig(**text_config) - self.vision_config = ChineseCLIPVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = 1.0 self.initializer_range = 0.02 + super().__init__(**kwargs) class ChineseCLIPOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 69a10f0fd5d9..0e45b6e4e244 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -328,7 +328,7 @@ class ClapConfig(PreTrainedConfig): >>> config_text = ClapTextConfig() >>> config_audio = ClapAudioConfig() - >>> config = ClapConfig.from_text_audio_configs(config_text, config_audio) + >>> config = ClapConfig(text_config=config_text, audio_config=config_audio) ```""" model_type = "clap" @@ -344,18 +344,21 @@ def __init__( initializer_factor=1.0, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the ClapTextConfig with default values.") + text_config = ClapTextConfig() + logger.info("`text_config` is `None`. initializing the `ClapTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = ClapTextConfig(**text_config) if audio_config is None: - audio_config = {} - logger.info("audio_config is None. initializing the ClapAudioConfig with default values.") + audio_config = ClapAudioConfig() + logger.info("`audio_config` is `None`. 
initializing the `ClapAudioConfig` with default values.") + elif isinstance(audio_config, dict): + audio_config = ClapAudioConfig(**audio_config) + + self.text_config = text_config + self.audio_config = audio_config - self.text_config = ClapTextConfig(**text_config) - self.audio_config = ClapAudioConfig(**audio_config) self.text_config.projection_dim = projection_dim self.audio_config.projection_dim = projection_dim @@ -369,6 +372,7 @@ def __init__( self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = initializer_factor self.num_hidden_layers = self.text_config.num_hidden_layers + len(self.audio_config.depths) + super().__init__(**kwargs) __all__ = ["ClapAudioConfig", "ClapConfig", "ClapTextConfig"] diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 84dbbb6ac7a0..3f4681482cdd 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -266,7 +266,7 @@ class CLIPConfig(PreTrainedConfig): >>> config_text = CLIPTextConfig() >>> config_vision = CLIPVisionConfig() - >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision) + >>> config = CLIPConfig(text_config=config_text, vision_config=config_vision) ```""" model_type = "clip" @@ -281,8 +281,6 @@ def __init__( text_config_dict = kwargs.pop("text_config_dict", None) vision_config_dict = kwargs.pop("vision_config_dict", None) - super().__init__(**kwargs) - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. @@ -346,19 +344,24 @@ def __init__( vision_config.update(_vision_config_dict) if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.") + text_config = CLIPTextConfig() + logger.info("`text_config` is `None`. initializing the `CLIPTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = CLIPTextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = CLIPVisionConfig() logger.info("`vision_config` is `None`. 
initializing the `CLIPVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = CLIPVisionConfig(**vision_config) - self.text_config = CLIPTextConfig(**text_config) - self.vision_config = CLIPVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = 1.0 + super().__init__(**kwargs) class CLIPOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 74345502e419..e8f0057912a4 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -265,7 +265,7 @@ class CLIPSegConfig(PreTrainedConfig): >>> config_text = CLIPSegTextConfig() >>> config_vision = CLIPSegVisionConfig() - >>> config = CLIPSegConfig.from_text_vision_configs(config_text, config_vision) + >>> config = CLIPSegConfig(text_config=config_text, vision_config=config_vision) ```""" model_type = "clipseg" @@ -293,8 +293,6 @@ def __init__( text_config_dict = kwargs.pop("text_config_dict", None) vision_config_dict = kwargs.pop("vision_config_dict", None) - super().__init__(**kwargs) - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. @@ -358,15 +356,19 @@ def __init__( vision_config.update(_vision_config_dict) if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `CLIPSegTextConfig` with default values.") + text_config = CLIPSegTextConfig() + logger.info("`text_config` is `None`. initializing the `CLIPSegTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = CLIPSegTextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = CLIPSegVisionConfig() logger.info("`vision_config` is `None`. 
initializing the `CLIPSegVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = CLIPSegVisionConfig(**vision_config) - self.text_config = CLIPSegTextConfig(**text_config) - self.vision_config = CLIPSegVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value @@ -379,6 +381,7 @@ def __init__( self.conditional_layer = conditional_layer self.initializer_factor = 1.0 self.use_complex_transposed_convolution = use_complex_transposed_convolution + super().__init__(**kwargs) __all__ = ["CLIPSegConfig", "CLIPSegTextConfig", "CLIPSegVisionConfig"] diff --git a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py index 7ea82bce515c..01f6d8e472fa 100644 --- a/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py +++ b/src/transformers/models/clipseg/convert_clipseg_original_pytorch_to_hf.py @@ -39,9 +39,9 @@ def get_clipseg_config(model_name): use_complex_transposed_convolution = "refined" in model_name reduce_dim = 16 if "rd16" in model_name else 64 - config = CLIPSegConfig.from_text_vision_configs( - text_config, - vision_config, + config = CLIPSegConfig( + text_config=text_config, + vision_config=vision_config, use_complex_transposed_convolution=use_complex_transposed_convolution, reduce_dim=reduce_dim, ) diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index 6fac97a40122..a3b5633fb588 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -362,7 +362,7 @@ class ClvpConfig(PreTrainedConfig): >>> config_speech = ClvpEncoderConfig() >>> decoder_config = ClvpDecoderConfig() - >>> config = ClvpConfig.from_sub_model_configs(config_text, config_speech, decoder_config) + >>> config = ClvpConfig(config_text, config_speech, decoder_config) ```""" model_type = "clvp" @@ -382,58 +382,32 @@ def __init__( initializer_factor=1.0, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `ClvpEncoderConfig` with default values.") + text_config = ClvpEncoderConfig() + logger.info("`text_config` is `None`. initializing the `ClvpEncoderConfig` with default values.") + elif isinstance(text_config, dict): + text_config = ClvpEncoderConfig(**text_config) if speech_config is None: - speech_config = {} + speech_config = ClvpEncoderConfig() logger.info("`speech_config` is `None`. initializing the `ClvpEncoderConfig` with default values.") + elif isinstance(speech_config, dict): + speech_config = ClvpEncoderConfig(**speech_config) if decoder_config is None: - decoder_config = {} - logger.info("`decoder_config` is `None`. initializing the `ClvpDecoderConfig` with default values.") + decoder_config = ClvpDecoderConfig() + logger.info("`decoder_config` is `None`. 
initializing the `ClvpDecoderConfig` with default values.") + elif isinstance(decoder_config, dict): + decoder_config = ClvpDecoderConfig(**decoder_config) - self.text_config = ClvpEncoderConfig(**text_config) - self.speech_config = ClvpEncoderConfig(**speech_config) - self.decoder_config = ClvpDecoderConfig(**decoder_config) + self.text_config = text_config + self.speech_config = speech_config + self.decoder_config = decoder_config self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = initializer_factor - - @classmethod - def from_sub_model_configs( - cls, - text_config: ClvpEncoderConfig, - speech_config: ClvpEncoderConfig, - decoder_config: ClvpDecoderConfig, - **kwargs, - ): - r""" - Instantiate a [`ClvpConfig`] (or a derived class) from CLVP text model configuration, CLVP speech model - configuration and CLVP decoder model configuration. - - Args: - text_config (`ClvpEncoderConfig`): - Text model configuration of type [`ClvpEncoderConfig`]. - speech_config (`ClvpEncoderConfig`): - Speech model configuration of type [`ClvpEncoderConfig`]. - decoder_config (`ClvpDecoderConfig`): - Decoder model configuration of type [`ClvpDecoderConfig`]. - - Returns: - [`ClvpConfig`]: An instance of a configuration object - """ - - return cls( - text_config=text_config.to_dict(), - speech_config=speech_config.to_dict(), - decoder_config=decoder_config.to_dict(), - **kwargs, - ) + super().__init__(**kwargs) __all__ = ["ClvpConfig", "ClvpDecoderConfig", "ClvpEncoderConfig"] diff --git a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py index 688da89165c9..911a25266105 100644 --- a/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/configuration_cohere2_vision.py @@ -51,7 +51,6 @@ def __init__( alignment_intermediate_size=36864, **kwargs, ): - super().__init__(**kwargs) self.downsample_factor = downsample_factor self.image_token_id = image_token_id self.alignment_intermediate_size = alignment_intermediate_size @@ -77,6 +76,7 @@ def __init__( text_config = CONFIG_MAPPING["cohere2"](tie_word_embeddings=True) self.text_config = text_config + super().__init__(**kwargs) __all__ = ["Cohere2VisionConfig"] diff --git a/src/transformers/models/colpali/modular_colpali.py b/src/transformers/models/colpali/modular_colpali.py index 176b3e6a15ee..fa0198ef4962 100644 --- a/src/transformers/models/colpali/modular_colpali.py +++ b/src/transformers/models/colpali/modular_colpali.py @@ -73,9 +73,9 @@ def __init__( visual_prompt_prefix: str = "Describe the image.", query_prefix: str = "Question: ", ): - super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template) self.visual_prompt_prefix = visual_prompt_prefix self.query_prefix = query_prefix + super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template) @property def query_augmentation_token(self) -> str: diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index 032cc70d4482..cd33607a35fd 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -105,7 +105,8 @@ def __init__( visual_prompt_prefix: str = "Describe the image.", query_prefix: str = "Question: ", ): - super().__init__(image_processor, tokenizer, chat_template=chat_template) + 
self.visual_prompt_prefix = visual_prompt_prefix + self.query_prefix = query_prefix if not hasattr(image_processor, "image_seq_length"): raise ValueError("Image processor is missing an `image_seq_length` attribute.") @@ -124,8 +125,8 @@ def __init__( tokenizer.add_tokens(EXTRA_TOKENS) tokenizer.add_bos_token = False tokenizer.add_eos_token = False - self.visual_prompt_prefix = visual_prompt_prefix - self.query_prefix = query_prefix + + super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( self, diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index 5771fa57314c..8604951436c2 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -373,14 +373,6 @@ def __init__( if kwargs.pop("tie_word_embeddings", False): raise ValueError("`tie_word_embeddings=True` is not supported for CsmConfig") - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=False, - **kwargs, - ) - if depth_decoder_config is None: self.depth_decoder_config = CsmDepthDecoderConfig() logger.info("depth_decoder_config is None, using default depth decoder config.") @@ -433,6 +425,14 @@ def __init__( self.rope_scaling["rope_type"] = self.rope_scaling["type"] rope_config_validation(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=False, + **kwargs, + ) + __all__ = [ "CsmDepthDecoderConfig", diff --git a/src/transformers/models/d_fine/configuration_d_fine.py b/src/transformers/models/d_fine/configuration_d_fine.py index 9a7464042dee..722888d5022f 100644 --- a/src/transformers/models/d_fine/configuration_d_fine.py +++ b/src/transformers/models/d_fine/configuration_d_fine.py @@ -397,22 +397,5 @@ def __init__( ) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - @classmethod - def from_backbone_configs(cls, backbone_config: PreTrainedConfig, **kwargs): - """Instantiate a [`DFineConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model - configuration. - - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. - - Returns: - [`DFineConfig`]: An instance of a configuration object - """ - return cls( - backbone_config=backbone_config, - **kwargs, - ) - __all__ = ["DFineConfig"] diff --git a/src/transformers/models/d_fine/modular_d_fine.py b/src/transformers/models/d_fine/modular_d_fine.py index 93505d8deaaf..01d59e238acb 100644 --- a/src/transformers/models/d_fine/modular_d_fine.py +++ b/src/transformers/models/d_fine/modular_d_fine.py @@ -416,23 +416,6 @@ def __init__( ) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - @classmethod - def from_backbone_configs(cls, backbone_config: PreTrainedConfig, **kwargs): - """Instantiate a [`DFineConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model - configuration. - - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. 
- - Returns: - [`DFineConfig`]: An instance of a configuration object - """ - return cls( - backbone_config=backbone_config, - **kwargs, - ) - class DFineMultiscaleDeformableAttention(nn.Module): def __init__(self, config: DFineConfig): diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 32401bcb8a6c..1d803891d128 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -174,13 +174,20 @@ def __init__( moe_intermediate_size=1407, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + self.first_k_dense_replace = first_k_dense_replace + self.kv_lora_rank = kv_lora_rank + self.q_lora_rank = q_lora_rank + self.n_group = n_group + self.n_routed_experts = n_routed_experts + self.n_shared_experts = n_shared_experts + self.qk_nope_head_dim = qk_nope_head_dim + self.qk_rope_head_dim = qk_rope_head_dim + self.routed_scaling_factor = routed_scaling_factor + self.topk_group = topk_group + self.topk_method = topk_method + self.v_head_dim = v_head_dim + self.num_experts_per_tok = num_experts_per_tok + self.moe_intermediate_size = moe_intermediate_size self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -202,26 +209,21 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias + self.head_dim = qk_rope_head_dim # Validate the correctness of rotary position embeddings parameters # BC: if there is a 'type' field, copy it it to 'rope_type'. 
if self.rope_scaling is not None and "type" in self.rope_scaling: self.rope_scaling["rope_type"] = self.rope_scaling["type"] rope_config_validation(self) - self.first_k_dense_replace = first_k_dense_replace - self.kv_lora_rank = kv_lora_rank - self.q_lora_rank = q_lora_rank - self.n_group = n_group - self.n_routed_experts = n_routed_experts - self.n_shared_experts = n_shared_experts - self.qk_nope_head_dim = qk_nope_head_dim - self.qk_rope_head_dim = qk_rope_head_dim - self.routed_scaling_factor = routed_scaling_factor - self.topk_group = topk_group - self.topk_method = topk_method - self.v_head_dim = v_head_dim - self.num_experts_per_tok = num_experts_per_tok - self.moe_intermediate_size = moe_intermediate_size + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["DeepseekV2Config"] diff --git a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py index e8be9d72f36a..8bfe11345393 100644 --- a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py @@ -188,9 +188,6 @@ def __init__( moe_intermediate_size=1407, **kwargs, ): - super().__init__(**kwargs) - - del self.pretraining_tp self.first_k_dense_replace = first_k_dense_replace self.kv_lora_rank = kv_lora_rank self.q_lora_rank = q_lora_rank @@ -205,7 +202,11 @@ def __init__( self.v_head_dim = v_head_dim self.num_experts_per_tok = num_experts_per_tok self.moe_intermediate_size = moe_intermediate_size + + super().__init__(**kwargs) + self.head_dim = qk_rope_head_dim + del self.pretraining_tp def apply_rotary_emb( diff --git a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py index 4fe3a5e4d82c..64dcf84d4a35 100644 --- a/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/configuration_deepseek_vl.py @@ -71,8 +71,6 @@ def __init__( image_token_id: int = 100015, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: text_config = {} logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") @@ -92,6 +90,7 @@ def __init__( self.text_config = text_config self.vision_config = vision_config self.image_token_id = image_token_id + super().__init__(**kwargs) __all__ = ["DeepseekVLConfig"] diff --git a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py index ed5f7d655e34..36c5aee2569f 100644 --- a/src/transformers/models/deepseek_vl/modular_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/modular_deepseek_vl.py @@ -82,8 +82,6 @@ def __init__( image_token_id: int = 100015, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: text_config = {} logger.info("`text_config` is `None`. 
Initializing the `LlamaConfig` with default values.") @@ -103,6 +101,7 @@ def __init__( self.text_config = text_config self.vision_config = vision_config self.image_token_id = image_token_id + super().__init__(**kwargs) class DeepseekVLBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast): diff --git a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py index 6a99cc4dab97..cbda990152e7 100644 --- a/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py @@ -74,8 +74,15 @@ def __init__( image_token_id: int = 100015, **kwargs, ): - super().__init__(**kwargs) + if high_res_vision_config is None: + high_res_vision_config = {} + logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.") + + if isinstance(high_res_vision_config, dict): + high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model") + high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config) + self.high_res_vision_config = high_res_vision_config if text_config is None: text_config = {} logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.") @@ -95,16 +102,7 @@ def __init__( self.text_config = text_config self.vision_config = vision_config self.image_token_id = image_token_id - - if high_res_vision_config is None: - high_res_vision_config = {} - logger.info("`high_res_vision_config` is `None`. Initializing the `SamVisionConfig` with default values.") - - if isinstance(high_res_vision_config, dict): - high_res_vision_config["model_type"] = high_res_vision_config.get("model_type", "sam_vision_model") - high_res_vision_config = CONFIG_MAPPING[high_res_vision_config["model_type"]](**high_res_vision_config) - - self.high_res_vision_config = high_res_vision_config + super().__init__(**kwargs) __all__ = ["DeepseekVLHybridConfig"] diff --git a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py index 43af7d43dfb3..27062cfd06b2 100644 --- a/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py @@ -125,13 +125,6 @@ def __init__( image_token_id: int = 100015, **kwargs, ): - super().__init__( - text_config=text_config, - vision_config=vision_config, - image_token_id=image_token_id, - **kwargs, - ) - if high_res_vision_config is None: high_res_vision_config = {} logger.info("`high_res_vision_config` is `None`. 
Initializing the `SamVisionConfig` with default values.") @@ -142,6 +135,13 @@ def __init__( self.high_res_vision_config = high_res_vision_config + super().__init__( + text_config=text_config, + vision_config=vision_config, + image_token_id=image_token_id, + **kwargs, + ) + class DeepseekVLHybridBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast): pass diff --git a/src/transformers/models/deprecated/jukebox/configuration_jukebox.py b/src/transformers/models/deprecated/jukebox/configuration_jukebox.py index 1c9b44cebe4e..a4ce05d67d3c 100644 --- a/src/transformers/models/deprecated/jukebox/configuration_jukebox.py +++ b/src/transformers/models/deprecated/jukebox/configuration_jukebox.py @@ -559,14 +559,16 @@ def __init__( **kwargs, ): if vqvae_config is None: - vqvae_config = {} + vqvae_config = JukeboxVQVAEConfig() logger.info("vqvae_config is None. initializing the JukeboxVQVAE with default values.") - - self.vqvae_config = JukeboxVQVAEConfig(**vqvae_config) - if prior_config_list is not None: - self.prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list] - else: - self.prior_configs = [] + elif isinstance(vqvae_config, dict): + vqvae_config = JukeboxVQVAEConfig(**vqvae_config) + self.vqvae_config = vqvae_config + + if prior_config_list is not None and isinstance(prior_config_list[0], dict): + prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list] + elif prior_config_list is None: + prior_configs = [] for prior_idx in range(nb_priors): prior_config = kwargs.pop(f"prior_{prior_idx}", None) if prior_config is None: @@ -575,10 +577,10 @@ def __init__( f"prior_{prior_idx}'s config is None. Initializing the JukeboxPriorConfig list with default" " values." ) - self.prior_configs.append(JukeboxPriorConfig(**prior_config)) + prior_configs.append(JukeboxPriorConfig(**prior_config)) + self.prior_configs = prior_configs self.hop_fraction = self.vqvae_config.hop_fraction - self.nb_priors = nb_priors # Metadata conditioning @@ -591,18 +593,6 @@ def __init__( super().__init__(**kwargs) - @classmethod - def from_configs(cls, prior_configs: list[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs): - r""" - Instantiate a [`JukeboxConfig`] (or a derived class) from clip text model configuration and clip vision model - configuration. - - Returns: - [`JukeboxConfig`]: An instance of a configuration object - """ - prior_config_list = [config.to_dict() for config in prior_configs] - return cls(prior_config_list=prior_config_list, vqvae_config_dict=vqvae_config.to_dict(), **kwargs) - def to_dict(self): # Override the default to_dict to apply to_dict to the list of prior configs. result = super().to_dict() diff --git a/src/transformers/models/depth_anything/configuration_depth_anything.py b/src/transformers/models/depth_anything/configuration_depth_anything.py index 9e263bb6406a..4ffd9c4c58bc 100644 --- a/src/transformers/models/depth_anything/configuration_depth_anything.py +++ b/src/transformers/models/depth_anything/configuration_depth_anything.py @@ -108,7 +108,6 @@ def __init__( max_depth=None, **kwargs, ): - super().__init__(**kwargs) if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `Dinov2` backbone.") backbone_config = CONFIG_MAPPING["dinov2"]( @@ -150,5 +149,7 @@ def __init__( self.depth_estimation_type = depth_estimation_type self.max_depth = max_depth if max_depth else 1 + super().__init__(**kwargs) + __all__ = ["DepthAnythingConfig"] diff --git a/src/transformers/models/depth_pro/configuration_depth_pro.py b/src/transformers/models/depth_pro/configuration_depth_pro.py index 8817420dffed..0740ec1769b5 100644 --- a/src/transformers/models/depth_pro/configuration_depth_pro.py +++ b/src/transformers/models/depth_pro/configuration_depth_pro.py @@ -109,8 +109,6 @@ def __init__( fov_model_config=None, **kwargs, ): - super().__init__(**kwargs) - # scaled_images_ratios is sorted if scaled_images_ratios != sorted(scaled_images_ratios): raise ValueError( @@ -200,5 +198,7 @@ def __init__( setattr(self, sub_config_key, sub_config) + super().__init__(**kwargs) + __all__ = ["DepthProConfig"] diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py index 7c69f06318a1..5536734d2cb0 100644 --- a/src/transformers/models/detr/configuration_detr.py +++ b/src/transformers/models/detr/configuration_detr.py @@ -245,18 +245,6 @@ def __init__( self.eos_coefficient = eos_coefficient super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - @classmethod - def from_backbone_config(cls, backbone_config: PreTrainedConfig, **kwargs): - """Instantiate a [`DetrConfig`] (or a derived class) from a pre-trained backbone model configuration. - - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. - Returns: - [`DetrConfig`]: An instance of a configuration object - """ - return cls(backbone_config=backbone_config, **kwargs) - class DetrOnnxConfig(OnnxConfig): torch_onnx_minimum_version = version.parse("1.11") diff --git a/src/transformers/models/dpt/configuration_dpt.py b/src/transformers/models/dpt/configuration_dpt.py index 99277ab87368..509c21464598 100644 --- a/src/transformers/models/dpt/configuration_dpt.py +++ b/src/transformers/models/dpt/configuration_dpt.py @@ -180,8 +180,6 @@ def __init__( pooler_act="tanh", **kwargs, ): - super().__init__(**kwargs) - self.hidden_size = hidden_size self.is_hybrid = is_hybrid @@ -273,6 +271,7 @@ def __init__( self.semantic_classifier_dropout = semantic_classifier_dropout self.pooler_output_size = pooler_output_size if pooler_output_size else hidden_size self.pooler_act = pooler_act + super().__init__(**kwargs) __all__ = ["DPTConfig"] diff --git a/src/transformers/models/edgetam/configuration_edgetam.py b/src/transformers/models/edgetam/configuration_edgetam.py index a260b279bacd..2c4ef6e1d433 100644 --- a/src/transformers/models/edgetam/configuration_edgetam.py +++ b/src/transformers/models/edgetam/configuration_edgetam.py @@ -84,8 +84,6 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) - backbone_channel_list = [384, 192, 96, 48] if backbone_channel_list is None else backbone_channel_list backbone_feature_sizes = ( [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes @@ -118,6 +116,7 @@ def __init__( self.hidden_act = hidden_act self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range + super().__init__(**kwargs) class EdgeTamPromptEncoderConfig(PreTrainedConfig): @@ -309,7 +308,6 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) vision_config = vision_config if vision_config is not None 
else {} prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} @@ -327,6 +325,7 @@ def __init__( self.mask_decoder_config = EdgeTamMaskDecoderConfig(**mask_decoder_config) self.initializer_range = initializer_range + super().__init__(**kwargs) __all__ = ["EdgeTamConfig", "EdgeTamVisionConfig", "EdgeTamPromptEncoderConfig", "EdgeTamMaskDecoderConfig"] diff --git a/src/transformers/models/edgetam/modular_edgetam.py b/src/transformers/models/edgetam/modular_edgetam.py index 5ed9841aad69..b32d71ec0a5a 100644 --- a/src/transformers/models/edgetam/modular_edgetam.py +++ b/src/transformers/models/edgetam/modular_edgetam.py @@ -107,8 +107,6 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) - backbone_channel_list = [384, 192, 96, 48] if backbone_channel_list is None else backbone_channel_list backbone_feature_sizes = ( [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes @@ -141,6 +139,7 @@ def __init__( self.hidden_act = hidden_act self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range + super().__init__(**kwargs) class EdgeTamPromptEncoderConfig(Sam2PromptEncoderConfig): diff --git a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py index 93730940c9af..face8009a686 100644 --- a/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py +++ b/src/transformers/models/falcon_mamba/configuration_falcon_mamba.py @@ -133,7 +133,6 @@ def __init__( mixer_rms_eps=1e-6, **kwargs, ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) self.vocab_size = vocab_size self.hidden_size = hidden_size self.state_size = state_size @@ -164,6 +163,8 @@ def __init__( self.residual_in_fp32 = residual_in_fp32 self.use_cache = use_cache self.use_falcon_mambapy = use_falcon_mambapy + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) self.mixer_rms_eps = mixer_rms_eps diff --git a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py b/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py index 6f840438dcae..b3818246dce0 100644 --- a/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py +++ b/src/transformers/models/fastspeech2_conformer/convert_model_with_hifigan.py @@ -63,7 +63,7 @@ def convert_FastSpeech2ConformerWithHifiGan_checkpoint( load_weights(espnet_checkpoint, vocoder, vocoder_config) # Prepare the model + vocoder - config = FastSpeech2ConformerWithHifiGanConfig.from_sub_model_configs(model_config, vocoder_config) + config = FastSpeech2ConformerWithHifiGanConfig(model_config, vocoder_config) with_hifigan_model = FastSpeech2ConformerWithHifiGan(config) with_hifigan_model.model = model with_hifigan_model.vocoder = vocoder diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index 2997d11bd4d4..75dcae413c21 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -494,8 +494,6 @@ def __init__( multimodal_config_dict = kwargs.pop("multimodal_config_dict", None) image_codebook_config_dict = kwargs.pop("image_codebook_config_dict", None) - super().__init__(**kwargs) - # Instead of simply 
assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. @@ -619,28 +617,35 @@ def __init__( # Update all values in `image_codebook_config` with the ones in `_image_codebook_config_dict`. image_codebook_config.update(_image_codebook_config_dict) + if text_config is None: + text_config = FlavaTextConfig() + logger.info("`text_config` is `None`. initializing the `FlavaTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = FlavaTextConfig(**text_config) + if image_config is None: - image_config = {} + image_config = FlavaImageConfig() logger.info("`image_config` is `None`. initializing the `FlavaImageConfig` with default values.") - - if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `FlavaTextConfig` with default values.") + elif isinstance(image_config, dict): + image_config = FlavaImageConfig(**image_config) if multimodal_config is None: - multimodal_config = {} - logger.info("`multimodal_config` is `None`. initializing the `FlavaMultimodalConfig` with default values.") + multimodal_config = FlavaMultimodalConfig() + logger.info("`multimodal_config` is `None`. initializing the `FlavaMultimodalConfig` with default values.") + elif isinstance(multimodal_config, dict): + multimodal_config = FlavaMultimodalConfig(**multimodal_config) if image_codebook_config is None: - image_codebook_config = {} - logger.info( - "`image_codebook_config` is `None`. initializing the `FlavaImageCodebookConfig` with default values." - ) - - self.image_config = FlavaImageConfig(**image_config) - self.text_config = FlavaTextConfig(**text_config) - self.multimodal_config = FlavaMultimodalConfig(**multimodal_config) - self.image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config) + image_codebook_config = FlavaImageCodebookConfig() + logger.info("`image_codebook_config` is `None`. initializing the `FlavaImageCodebookConfig` with default values.") + elif isinstance(image_codebook_config, dict): + image_codebook_config = FlavaImageCodebookConfig(**image_codebook_config) + + self.text_config = text_config + self.image_config = image_config + self.multimodal_config = multimodal_config + self.image_codebook_config = image_codebook_config + self.projection_dim = projection_dim self.init_codebook = init_codebook @@ -659,31 +664,7 @@ def __init__( self.global_backprop_contrastive = global_backprop_contrastive self.skip_unmasked_multimodal_encoder = skip_unmasked_multimodal_encoder self.return_loss = return_loss - - @classmethod - def from_configs( - cls, - image_config: FlavaImageConfig, - text_config: FlavaTextConfig, - multimodal_config: FlavaMultimodalConfig, - image_codebook_config: FlavaImageCodebookConfig, - **kwargs, - ): - r""" - Instantiate a [`FlavaConfig`] (or a derived class) from flava text model configuration, flava image model - configuration, flava multimodal model and flava codebook model configuration. 
- - Returns: - [`FlavaConfig`]: An instance of a configuration object - """ - - return cls( - image_config=image_config.to_dict(), - text_config=text_config.to_dict(), - multimodal_config=multimodal_config.to_dict(), - image_codebook_config=image_codebook_config.to_dict(), - **kwargs, - ) + super().__init__(**kwargs) __all__ = ["FlavaConfig", "FlavaImageCodebookConfig", "FlavaImageConfig", "FlavaMultimodalConfig", "FlavaTextConfig"] diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 838baa0b496a..e3b36bab6128 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -499,15 +499,18 @@ def __init__( model_args: Optional[dict] = None, **kwargs, ): - super().__init__(**kwargs) self.architecture = architecture self.initializer_range = initializer_range self.do_pooling = do_pooling - self.model_args = model_args # named "model_args" for BC with timm self.hidden_size = hidden_size self.vocab_size = vocab_size self.vocab_offset = vocab_offset self.rms_norm_eps = rms_norm_eps + self.architecture = architecture + self.initializer_range = initializer_range + self.do_pooling = do_pooling + self.model_args = model_args # named "model_args" for BC with timm + super().__init__(**kwargs) @classmethod def from_dict(cls, config_dict: dict[str, Any], **kwargs): diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 507c6498315e..edf5fd4e3db4 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -511,7 +511,6 @@ def __init__( model_args: Optional[dict] = None, **kwargs, ): - super().__init__(**kwargs) self.architecture = architecture self.initializer_range = initializer_range self.do_pooling = do_pooling @@ -519,6 +518,7 @@ def __init__( self.vocab_size = vocab_size self.vocab_offset = vocab_offset self.rms_norm_eps = rms_norm_eps + super().__init__(**kwargs) class Gemma3nConfig(PreTrainedConfig): diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index e8277d773bc1..f22fd8e8d547 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -185,8 +185,6 @@ def __init__( num_image_with_embedding=None, **kwargs, ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs) - if vision_config is None: vision_config = {} logger.info("vision_config is None. 
initializing the GitVisionConfig with default values.") @@ -204,11 +202,15 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.tie_word_embeddings = tie_word_embeddings self.num_image_with_embedding = num_image_with_embedding - self.bos_token_id = bos_token_id - self.eos_token_id = eos_token_id + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["GitConfig", "GitVisionConfig"] diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index ad97a10efd73..e8f9c948c66d 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -66,7 +66,6 @@ class Glm4vProcessor(ProcessorMixin): tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast") def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): - super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token self.image_token_id = ( @@ -79,6 +78,7 @@ def __init__(self, image_processor=None, tokenizer=None, video_processor=None, c if getattr(tokenizer, "video_token_id", None) else tokenizer.convert_tokens_to_ids(self.video_token) ) + super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) def __call__( self, diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 51ba96d96beb..1c495b403d48 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -365,7 +365,6 @@ def __init__( video_end_token_id=151342, **kwargs, ): - super().__init__(**kwargs) if isinstance(vision_config, dict): self.vision_config = self.sub_configs["vision_config"](**vision_config) elif vision_config is None: @@ -383,5 +382,7 @@ def __init__( self.image_start_token_id = image_start_token_id self.image_end_token_id = image_end_token_id + super().__init__(**kwargs) + __all__ = ["Glm4vMoeConfig", "Glm4vMoeTextConfig"] diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index 0f432e621b5c..45adf3606578 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -274,8 +274,6 @@ def __init__( text_config_dict = kwargs.pop("text_config_dict", None) vision_config_dict = kwargs.pop("vision_config_dict", None) - super().__init__(**kwargs) - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. @@ -339,15 +337,19 @@ def __init__( vision_config.update(_vision_config_dict) if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. 
Initializing the `GroupViTTextConfig` with default values.") + text_config = GroupViTTextConfig() + logger.info("`text_config` is `None`. initializing the `GroupViTTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = GroupViTTextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = GroupViTVisionConfig() logger.info("`vision_config` is `None`. initializing the `GroupViTVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = GroupViTVisionConfig(**vision_config) - self.text_config = GroupViTTextConfig(**text_config) - self.vision_config = GroupViTVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.projection_intermediate_dim = projection_intermediate_dim @@ -355,6 +357,7 @@ def __init__( self.initializer_range = 0.02 self.initializer_factor = 1.0 self.output_segmentation = False + super().__init__(**kwargs) class GroupViTOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index e912c3d3d83c..c8ba3203ac6f 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -256,7 +256,7 @@ class InstructBlipConfig(PreTrainedConfig): >>> qformer_config = InstructBlipQFormerConfig() >>> text_config = OPTConfig() - >>> config = InstructBlipConfig.from_text_vision_configs(vision_config, qformer_config, text_config) + >>> config = InstructBlipConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config) ```""" model_type = "instructblip" @@ -278,24 +278,28 @@ def __init__( image_token_index=None, **kwargs, ): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - logger.info("vision_config is None. initializing the InstructBlipVisionConfig with default values.") + if text_config is None: + text_config = CONFIG_MAPPING["opt"]() + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + elif isinstance(text_config, dict): + text_model_type = text_config.get("model_type", "opt") + text_config = CONFIG_MAPPING[text_model_type](**text_config) if qformer_config is None: - qformer_config = {} + qformer_config = InstructBlipQFormerConfig() logger.info("qformer_config is None. Initializing the InstructBlipQFormerConfig with default values.") + elif isinstance(qformer_config, dict): + qformer_config = InstructBlipQFormerConfig(**qformer_config) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + if vision_config is None: + vision_config = InstructBlipVisionConfig() + logger.info("`vision_config` is `None`. 
initializing the `InstructBlipVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = InstructBlipVisionConfig(**vision_config) - self.vision_config = InstructBlipVisionConfig(**vision_config) - self.qformer_config = InstructBlipQFormerConfig(**qformer_config) - text_model_type = text_config.get("model_type", "opt") - self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + self.text_config = text_config + self.vision_config = vision_config + self.qformer_config = qformer_config self.num_query_tokens = num_query_tokens self.image_token_index = image_token_index @@ -303,29 +307,7 @@ def __init__( self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES self.initializer_factor = 1.0 self.initializer_range = 0.02 - - @classmethod - def from_vision_qformer_text_configs( - cls, - vision_config: InstructBlipVisionConfig, - qformer_config: InstructBlipQFormerConfig, - text_config: PreTrainedConfig, - **kwargs, - ): - r""" - Instantiate a [`InstructBlipConfig`] (or a derived class) from a InstructBLIP vision model, Q-Former and - language model configurations. - - Returns: - [`InstructBlipConfig`]: An instance of a configuration object - """ - - return cls( - vision_config=vision_config.to_dict(), - qformer_config=qformer_config.to_dict(), - text_config=text_config.to_dict(), - **kwargs, - ) + super().__init__(**kwargs) __all__ = ["InstructBlipConfig", "InstructBlipQFormerConfig", "InstructBlipVisionConfig"] diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index f8d00a9a19c4..f39d23f23860 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -262,7 +262,7 @@ class InstructBlipVideoConfig(PreTrainedConfig): >>> qformer_config = InstructBlipVideoQFormerConfig() >>> text_config = OPTConfig() - >>> config = InstructBlipVideoConfig.from_text_vision_configs(vision_config, qformer_config, text_config) + >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config) ```""" model_type = "instructblipvideo" @@ -284,24 +284,30 @@ def __init__( video_token_index=None, **kwargs, ): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.") + if text_config is None: + text_config = CONFIG_MAPPING["opt"]() + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + elif isinstance(text_config, dict): + text_model_type = text_config.get("model_type", "opt") + text_config = CONFIG_MAPPING[text_model_type](**text_config) if qformer_config is None: - qformer_config = {} + qformer_config = InstructBlipVideoQFormerConfig() logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.") + elif isinstance(qformer_config, dict): + qformer_config = InstructBlipVideoQFormerConfig(**qformer_config) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + if vision_config is None: + vision_config = InstructBlipVideoVisionConfig() + logger.info( + "`vision_config` is `None`. 
initializing the `InstructBlipVideoVisionConfig` with default values." + ) + elif isinstance(vision_config, dict): + vision_config = InstructBlipVideoVisionConfig(**vision_config) - self.vision_config = InstructBlipVideoVisionConfig(**vision_config) - self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config) - text_model_type = text_config.get("model_type", "opt") - self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + self.text_config = text_config + self.vision_config = vision_config + self.qformer_config = qformer_config self.num_query_tokens = num_query_tokens self.video_token_index = video_token_index @@ -309,29 +315,7 @@ def __init__( self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES self.initializer_factor = 1.0 self.initializer_range = 0.02 - - @classmethod - def from_vision_qformer_text_configs( - cls, - vision_config: InstructBlipVideoVisionConfig, - qformer_config: InstructBlipVideoQFormerConfig, - text_config: PreTrainedConfig, - **kwargs, - ): - r""" - Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from a InstructBlipVideo vision model, Q-Former and - language model configurations. - - Returns: - [`InstructBlipVideoConfig`]: An instance of a configuration object - """ - - return cls( - vision_config=vision_config.to_dict(), - qformer_config=qformer_config.to_dict(), - text_config=text_config.to_dict(), - **kwargs, - ) + super().__init__(**kwargs) __all__ = ["InstructBlipVideoConfig", "InstructBlipVideoQFormerConfig", "InstructBlipVideoVisionConfig"] diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 3f96eb3f88af..f03df415e467 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -103,7 +103,7 @@ class InstructBlipVideoConfig(PreTrainedConfig): >>> qformer_config = InstructBlipVideoQFormerConfig() >>> text_config = OPTConfig() - >>> config = InstructBlipVideoConfig.from_text_vision_configs(vision_config, qformer_config, text_config) + >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config) ```""" model_type = "instructblipvideo" @@ -125,24 +125,30 @@ def __init__( video_token_index=None, **kwargs, ): - super().__init__(**kwargs) - - if vision_config is None: - vision_config = {} - logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.") + if text_config is None: + text_config = CONFIG_MAPPING["opt"]() + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + elif isinstance(text_config, dict): + text_model_type = text_config.get("model_type", "opt") + text_config = CONFIG_MAPPING[text_model_type](**text_config) if qformer_config is None: - qformer_config = {} + qformer_config = InstructBlipVideoQFormerConfig() logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.") + elif isinstance(qformer_config, dict): + qformer_config = InstructBlipVideoQFormerConfig(**qformer_config) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + if vision_config is None: + vision_config = InstructBlipVideoVisionConfig() + logger.info( + "`vision_config` is `None`. 
initializing the `InstructBlipVideoVisionConfig` with default values." + ) + elif isinstance(vision_config, dict): + vision_config = InstructBlipVideoVisionConfig(**vision_config) - self.vision_config = InstructBlipVideoVisionConfig(**vision_config) - self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config) - text_model_type = text_config.get("model_type", "opt") - self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + self.text_config = text_config + self.vision_config = vision_config + self.qformer_config = qformer_config self.num_query_tokens = num_query_tokens self.video_token_index = video_token_index @@ -150,29 +156,7 @@ def __init__( self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES self.initializer_factor = 1.0 self.initializer_range = 0.02 - - @classmethod - def from_vision_qformer_text_configs( - cls, - vision_config: InstructBlipVideoVisionConfig, - qformer_config: InstructBlipVideoQFormerConfig, - text_config: PreTrainedConfig, - **kwargs, - ): - r""" - Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from a InstructBlipVideo vision model, Q-Former and - language model configurations. - - Returns: - [`InstructBlipVideoConfig`]: An instance of a configuration object - """ - - return cls( - vision_config=vision_config.to_dict(), - qformer_config=qformer_config.to_dict(), - text_config=text_config.to_dict(), - **kwargs, - ) + super().__init__(**kwargs) class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel): diff --git a/src/transformers/models/janus/image_processing_janus_fast.py b/src/transformers/models/janus/image_processing_janus_fast.py index 4de23e80e63a..b8e032786bf4 100644 --- a/src/transformers/models/janus/image_processing_janus_fast.py +++ b/src/transformers/models/janus/image_processing_janus_fast.py @@ -54,11 +54,11 @@ class JanusImageProcessorFast(BaseImageProcessorFast): valid_kwargs = JanusImageProcessorKwargs def __init__(self, **kwargs: Unpack[JanusImageProcessorKwargs]): + super().__init__(**kwargs) if kwargs.get("image_mean") is None: background_color = (127, 127, 127) else: background_color = tuple(int(x * 255) for x in kwargs.get("image_mean")) - super().__init__(**kwargs) self.background_color = tuple(background_color) def resize( diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index 1ad8b133f021..1b9dff5aabf9 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -245,20 +245,22 @@ def __init__( latent_query_num=64, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `Kosmos2TextConfig` with default values.") + text_config = Kosmos2TextConfig() + logger.info("`text_config` is `None`. initializing the `Kosmos2TextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = Kosmos2TextConfig(**text_config) if vision_config is None: - vision_config = {} - logger.info("`vision_config` is `None`. Initializing the `Kosmos2VisionConfig` with default values.") - - self.text_config = Kosmos2TextConfig(**text_config) - self.vision_config = Kosmos2VisionConfig(**vision_config) + vision_config = Kosmos2VisionConfig() + logger.info("`vision_config` is `None`. 
initializing the `Kosmos2VisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = Kosmos2VisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.latent_query_num = latent_query_num + super().__init__(**kwargs) __all__ = ["Kosmos2Config"] diff --git a/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py b/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py index d3044eb4cb26..b36619419496 100644 --- a/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py +++ b/src/transformers/models/kosmos2_5/configuration_kosmos2_5.py @@ -237,18 +237,22 @@ def __init__( latent_query_num=2048, **kwargs, ): - super().__init__(**kwargs) if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the Kosmos2_5TextConfig with default values.") - if vision_config is None: - vision_config = {} - logger.info("vision_config is None. Initializing the Kosmos2_5VisionConfig with default values.") + text_config = Kosmos2_5TextConfig() + logger.info("`text_config` is `None`. initializing the `Kosmos2_5TextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = Kosmos2_5TextConfig(**text_config) - self.text_config = Kosmos2_5TextConfig(**text_config) - self.vision_config = Kosmos2_5VisionConfig(**vision_config) + if vision_config is None: + vision_config = Kosmos2_5VisionConfig() + logger.info("`vision_config` is `None`. initializing the `Kosmos2_5VisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = Kosmos2_5VisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.latent_query_num = latent_query_num + super().__init__(**kwargs) __all__ = ["Kosmos2_5Config"] diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index 940986c407de..6618b8573aad 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -142,10 +142,6 @@ def __init__( codec_config=None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs - ) - if codec_config is None: self.codec_config = AutoConfig.for_model("mimi") logger.info("codec_config is None, using default audio encoder config.") @@ -184,5 +180,9 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.sliding_window = sliding_window + super().__init__( + pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs + ) + __all__ = ["KyutaiSpeechToTextConfig"] diff --git a/src/transformers/models/mask2former/configuration_mask2former.py b/src/transformers/models/mask2former/configuration_mask2former.py index 22f2f7034aa7..0b39d0a01a43 100644 --- a/src/transformers/models/mask2former/configuration_mask2former.py +++ b/src/transformers/models/mask2former/configuration_mask2former.py @@ -237,21 +237,5 @@ def __init__( super().__init__(**kwargs) - @classmethod - def from_backbone_config(cls, backbone_config: PreTrainedConfig, **kwargs): - """Instantiate a [`Mask2FormerConfig`] (or a derived class) from a pre-trained backbone model configuration. 
- - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. - - Returns: - [`Mask2FormerConfig`]: An instance of a configuration object - """ - return cls( - backbone_config=backbone_config, - **kwargs, - ) - __all__ = ["Mask2FormerConfig"] diff --git a/src/transformers/models/maskformer/configuration_maskformer.py b/src/transformers/models/maskformer/configuration_maskformer.py index 6d16780818c1..cd6d08a7c003 100644 --- a/src/transformers/models/maskformer/configuration_maskformer.py +++ b/src/transformers/models/maskformer/configuration_maskformer.py @@ -201,27 +201,5 @@ def __init__( self.backbone_kwargs = backbone_kwargs super().__init__(**kwargs) - @classmethod - def from_backbone_and_decoder_configs( - cls, backbone_config: PreTrainedConfig, decoder_config: PreTrainedConfig, **kwargs - ): - """Instantiate a [`MaskFormerConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model - configuration. - - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. - decoder_config ([`PreTrainedConfig`]): - The transformer decoder configuration to use. - - Returns: - [`MaskFormerConfig`]: An instance of a configuration object - """ - return cls( - backbone_config=backbone_config, - decoder_config=decoder_config, - **kwargs, - ) - __all__ = ["MaskFormerConfig"] diff --git a/src/transformers/models/metaclip_2/configuration_metaclip_2.py b/src/transformers/models/metaclip_2/configuration_metaclip_2.py index ecd2d245df64..1bf50e8bd1c1 100644 --- a/src/transformers/models/metaclip_2/configuration_metaclip_2.py +++ b/src/transformers/models/metaclip_2/configuration_metaclip_2.py @@ -248,7 +248,7 @@ class MetaClip2Config(PreTrainedConfig): >>> config_text = MetaClip2TextConfig() >>> config_vision = MetaClip2VisionConfig() - >>> config = MetaClip2Config.from_text_vision_configs(config_text, config_vision) + >>> config = MetaClip2Config(text_config=config_text, vision_config=config_vision) ```""" model_type = "metaclip_2" @@ -263,8 +263,6 @@ def __init__( text_config_dict = kwargs.pop("text_config_dict", None) vision_config_dict = kwargs.pop("vision_config_dict", None) - super().__init__(**kwargs) - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. @@ -328,19 +326,24 @@ def __init__( vision_config.update(_vision_config_dict) if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `MetaClip2TextConfig` with default values.") + text_config = MetaClip2TextConfig() + logger.info("`text_config` is `None`. initializing the `MetaClip2TextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = MetaClip2TextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = MetaClip2VisionConfig() logger.info("`vision_config` is `None`. 
initializing the `MetaClip2VisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = MetaClip2VisionConfig(**vision_config) - self.text_config = MetaClip2TextConfig(**text_config) - self.vision_config = MetaClip2VisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = 1.0 + super().__init__(**kwargs) __all__ = ["MetaClip2Config", "MetaClip2TextConfig", "MetaClip2VisionConfig"] diff --git a/src/transformers/models/metaclip_2/modular_metaclip_2.py b/src/transformers/models/metaclip_2/modular_metaclip_2.py index cd42344bd406..98252c7fbc68 100644 --- a/src/transformers/models/metaclip_2/modular_metaclip_2.py +++ b/src/transformers/models/metaclip_2/modular_metaclip_2.py @@ -197,7 +197,7 @@ class MetaClip2Config(CLIPConfig): >>> config_text = MetaClip2TextConfig() >>> config_vision = MetaClip2VisionConfig() - >>> config = MetaClip2Config.from_text_vision_configs(config_text, config_vision) + >>> config = MetaClip2Config(text_config=config_text, vision_config=config_vision) ```""" pass diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 5eb3045a4523..d12264e2ae49 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -181,13 +181,14 @@ def __init__( mlp_beta_factor=1, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + self.layer_types = layer_types + self.block_size = block_size + self.full_attn_alpha_factor = full_attn_alpha_factor + self.full_attn_beta_factor = full_attn_beta_factor + self.linear_attn_alpha_factor = linear_attn_alpha_factor + self.linear_attn_beta_factor = linear_attn_beta_factor + self.mlp_alpha_factor = mlp_alpha_factor + self.mlp_beta_factor = mlp_beta_factor self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -214,15 +215,13 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise - self.layer_types = layer_types - self.block_size = block_size - self.full_attn_alpha_factor = full_attn_alpha_factor - self.full_attn_beta_factor = full_attn_beta_factor - self.linear_attn_alpha_factor = linear_attn_alpha_factor - self.linear_attn_beta_factor = linear_attn_beta_factor - self.mlp_alpha_factor = mlp_alpha_factor - self.mlp_beta_factor = mlp_beta_factor - + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) if self.layer_types is None: self.layer_types = [ "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers) diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 2504b0048789..4afe2b57bf83 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -163,7 +163,6 @@ def __init__( mlp_beta_factor=1, **super_kwargs, ): - super().__init__(**super_kwargs) self.layer_types = layer_types self.block_size = block_size self.full_attn_alpha_factor = 
full_attn_alpha_factor @@ -173,6 +172,7 @@ def __init__( self.mlp_alpha_factor = mlp_alpha_factor self.mlp_beta_factor = mlp_beta_factor + super().__init__(**super_kwargs) if self.layer_types is None: self.layer_types = [ "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers) diff --git a/src/transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py b/src/transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py index 8ee2e1ce3c13..7a257591b514 100644 --- a/src/transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +++ b/src/transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py @@ -198,7 +198,6 @@ def __init__( layer_norm_eps=1e-5, **kwargs, ): - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.") backbone_config = CONFIG_MAPPING["swin"]( @@ -281,5 +280,7 @@ def __init__( self.init_std = init_std self.layer_norm_eps = layer_norm_eps + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + __all__ = ["MMGroundingDinoConfig"] diff --git a/src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py b/src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py index 15d2484cc9b3..4aed0c1a9b64 100644 --- a/src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +++ b/src/transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py @@ -20,9 +20,8 @@ from ...configuration_utils import PreTrainedConfig from ...utils import logging from ...utils.backbone_utils import verify_backbone_config_arguments -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig from ..auto.modeling_auto import AutoModel -from ..grounding_dino.configuration_grounding_dino import GroundingDinoConfig from ..grounding_dino.modeling_grounding_dino import ( GroundingDinoContrastiveEmbedding, GroundingDinoConvEncoder, @@ -40,7 +39,7 @@ logger = logging.get_logger(__name__) -class MMGroundingDinoConfig(GroundingDinoConfig, PreTrainedConfig): +class MMGroundingDinoConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MMGroundingDinoModel`]. It is used to instantiate a MM Grounding DINO model according to the specified arguments, defining the model architecture. Instantiating a @@ -158,6 +157,11 @@ class MMGroundingDinoConfig(GroundingDinoConfig, PreTrainedConfig): ```""" model_type = "mm-grounding-dino" + sub_configs = {"backbone_config": AutoConfig, "text_config": AutoConfig} + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } def __init__( self, @@ -205,7 +209,6 @@ def __init__( layer_norm_eps=1e-5, **kwargs, ): - PreTrainedConfig.__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `Swin` backbone.") backbone_config = CONFIG_MAPPING["swin"]( @@ -288,6 +291,8 @@ def __init__( self.init_std = init_std self.layer_norm_eps = layer_norm_eps + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + class MMGroundingDinoContrastiveEmbedding(GroundingDinoContrastiveEmbedding): def __init__(self, config): diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index f6223287a45c..76c951668f46 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -142,15 +142,12 @@ class MusicgenConfig(PreTrainedConfig): documentation from [`PreTrainedConfig`] for more information. Args: - kwargs (*optional*): - Dictionary of keyword arguments. Notably: - - - **text_encoder** ([`PreTrainedConfig`], *optional*) -- An instance of a configuration object that - defines the text encoder config. - - **audio_encoder** ([`PreTrainedConfig`], *optional*) -- An instance of a configuration object that - defines the audio encoder config. - - **decoder** ([`PreTrainedConfig`], *optional*) -- An instance of a configuration object that defines - the decoder config. + text_encoder (`Union[dict, `PretrainedConfig`]`): + An instance of a configuration object that defines the text encoder config. + audio_encoder (`Union[dict, `PretrainedConfig`]`): + An instance of a configuration object that defines the audio encoder config. + decoder (`Union[dict, `PretrainedConfig`]`): + An instance of a configuration object that defines the decoder config. Example: @@ -168,8 +165,10 @@ class MusicgenConfig(PreTrainedConfig): >>> audio_encoder_config = EncodecConfig() >>> decoder_config = MusicgenDecoderConfig() - >>> configuration = MusicgenConfig.from_sub_models_config( - ... text_encoder_config, audio_encoder_config, decoder_config + >>> configuration = MusicgenConfig( + ... text_encoder=text_encoder_config, + ... audio_encoder=audio_encoder_config, + ... decoder=decoder_config, ... 
) >>> # Initializing a MusicgenForConditionalGeneration (with random weights) from the facebook/musicgen-small style configuration @@ -197,47 +196,25 @@ class MusicgenConfig(PreTrainedConfig): } has_no_defaults_at_init = True - def __init__(self, **kwargs): - super().__init__(**kwargs) - if "text_encoder" not in kwargs or "audio_encoder" not in kwargs or "decoder" not in kwargs: - raise ValueError("Config has to be initialized with text_encoder, audio_encoder and decoder config") - - text_encoder_config = kwargs.pop("text_encoder") - text_encoder_model_type = text_encoder_config.pop("model_type") + def __init__(self, text_encoder, audio_encoder, decoder, **kwargs): + if isinstance(text_encoder, dict): + text_encoder_model_type = text_encoder.pop("model_type") + text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder) - audio_encoder_config = kwargs.pop("audio_encoder") - audio_encoder_model_type = audio_encoder_config.pop("model_type") + if isinstance(audio_encoder, dict): + audio_encoder_model_type = audio_encoder.pop("model_type") + audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder) - decoder_config = kwargs.pop("decoder") + if isinstance(decoder, dict): + decoder = MusicgenDecoderConfig(**decoder) - self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder_config) - self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config) - self.decoder = MusicgenDecoderConfig(**decoder_config) - self.is_encoder_decoder = True + self.text_encoder = text_encoder + self.audio_encoder = audio_encoder + self.decoder = decoder self.initializer_factor = self.decoder.initializer_factor - @classmethod - def from_sub_models_config( - cls, - text_encoder_config: PreTrainedConfig, - audio_encoder_config: PreTrainedConfig, - decoder_config: MusicgenDecoderConfig, - **kwargs, - ): - r""" - Instantiate a [`MusicgenConfig`] (or a derived class) from text encoder, audio encoder and decoder - configurations. - - Returns: - [`MusicgenConfig`]: An instance of a configuration object - """ - - return cls( - text_encoder=text_encoder_config.to_dict(), - audio_encoder=audio_encoder_config.to_dict(), - decoder=decoder_config.to_dict(), - **kwargs, - ) + kwargs["is_encoder_decoder"] = True + super().__init__(**kwargs) @property # This is a property because you might want to change the codec model on the fly diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 75c8c660c130..77c2a4ee1fb3 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1308,7 +1308,9 @@ def __init__( "Either a configuration has to be provided, or all three of text encoder, audio encoder and MusicGen decoder." 
) if config is None: - config = MusicgenConfig.from_sub_models_config(text_encoder.config, audio_encoder.config, decoder.config) + config = MusicgenConfig( + text_encoder=text_encoder.config, audio_encoder=audio_encoder.config, decoder=decoder.config + ) else: if not isinstance(config, self.config_class): raise ValueError(f"Config: {config} has to be of type {self.config_class}") @@ -1616,8 +1618,8 @@ def from_sub_models_pretrained( decoder = MusicgenForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) # instantiate config with corresponding kwargs - config = MusicgenConfig.from_sub_models_config( - text_encoder.config, audio_encoder.config, decoder.config, **kwargs + config = MusicgenConfig( + text_encoder=text_encoder.config, audio_encoder=audio_encoder.config, decoder=decoder.config, **kwargs ) return cls(text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder, config=config) diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 59190444990b..a4ec8528590a 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -145,18 +145,16 @@ class MusicgenMelodyConfig(PreTrainedConfig): documentation from [`PreTrainedConfig`] for more information. Args: - num_chroma (`int`, *optional*, defaults to 12): Number of chroma bins to use. + text_encoder (`Union[dict, `PretrainedConfig`]`): + An instance of a configuration object that defines the text encoder config. + audio_encoder (`Union[dict, `PretrainedConfig`]`): + An instance of a configuration object that defines the audio encoder config. + decoder (`Union[dict, `PretrainedConfig`]`): + An instance of a configuration object that defines the decoder config. + num_chroma (`int`, *optional*, defaults to 12): + Number of chroma bins to use. chroma_length (`int`, *optional*, defaults to 235): Maximum chroma duration if audio is used to condition the model. Corresponds to the maximum duration used during training. - kwargs (*optional*): - Dictionary of keyword arguments. Notably: - - - **text_encoder** ([`PreTrainedConfig`], *optional*) -- An instance of a configuration object that - defines the text encoder config. - - **audio_encoder** ([`PreTrainedConfig`], *optional*) -- An instance of a configuration object that - defines the audio encoder config. - - **decoder** ([`PreTrainedConfig`], *optional*) -- An instance of a configuration object that defines - the decoder config. Example: @@ -174,8 +172,8 @@ class MusicgenMelodyConfig(PreTrainedConfig): >>> audio_encoder_config = EncodecConfig() >>> decoder_config = MusicgenMelodyDecoderConfig() - >>> configuration = MusicgenMelodyConfig.from_sub_models_config( - ... text_encoder_config, audio_encoder_config, decoder_config + >>> configuration = MusicgenMelodyConfig( + ... text_encoder=text_encoder_config, audio_encoder=audio_encoder_config, decoder=decoder_config ... 
) >>> # Initializing a MusicgenMelodyForConditionalGeneration (with random weights) from the facebook/musicgen-melody style configuration @@ -205,52 +203,31 @@ class MusicgenMelodyConfig(PreTrainedConfig): def __init__( self, + text_encoder, + audio_encoder, + decoder, num_chroma=12, chroma_length=235, **kwargs, ): - super().__init__(**kwargs) - if "text_encoder" not in kwargs or "audio_encoder" not in kwargs or "decoder" not in kwargs: - raise ValueError("Config has to be initialized with text_encoder, audio_encoder and decoder config") - - text_encoder_config = kwargs.pop("text_encoder") - text_encoder_model_type = text_encoder_config.pop("model_type") + if isinstance(text_encoder, dict): + text_encoder_model_type = text_encoder.pop("model_type") + text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder) - audio_encoder_config = kwargs.pop("audio_encoder") - audio_encoder_model_type = audio_encoder_config.pop("model_type") + if isinstance(audio_encoder, dict): + audio_encoder_model_type = audio_encoder.pop("model_type") + audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder) - decoder_config = kwargs.pop("decoder") - - self.text_encoder = AutoConfig.for_model(text_encoder_model_type, **text_encoder_config) - self.audio_encoder = AutoConfig.for_model(audio_encoder_model_type, **audio_encoder_config) - self.decoder = MusicgenMelodyDecoderConfig(**decoder_config) - self.is_encoder_decoder = False + if isinstance(decoder, dict): + decoder = MusicgenMelodyDecoderConfig(**decoder) + self.text_encoder = text_encoder + self.audio_encoder = audio_encoder + self.decoder = decoder self.num_chroma = num_chroma self.chroma_length = chroma_length - - @classmethod - def from_sub_models_config( - cls, - text_encoder_config: PreTrainedConfig, - audio_encoder_config: PreTrainedConfig, - decoder_config: MusicgenMelodyDecoderConfig, - **kwargs, - ): - r""" - Instantiate a [`MusicgenMelodyConfig`] (or a derived class) from text encoder, audio encoder and decoder - configurations. - - Returns: - [`MusicgenMelodyConfig`]: An instance of a configuration object - """ - - return cls( - text_encoder=text_encoder_config.to_dict(), - audio_encoder=audio_encoder_config.to_dict(), - decoder=decoder_config.to_dict(), - **kwargs, - ) + kwargs["is_encoder_decoder"] = False + super().__init__(**kwargs) @property # This is a property because you might want to change the codec model on the fly diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 2f9bcd419ed9..a918c847e106 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1253,8 +1253,8 @@ def __init__( "Either a configuration has to be provided, or all three of text encoder, audio encoder and Musicgen Melody decoder." 
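To make the migration concrete, here is a minimal sketch of how a Musicgen-style composite config is built once `from_sub_models_config` is gone: the sub-configs (or their dict form) are passed straight to the constructor, as the updated docstring example above shows. The `T5Config` text encoder is only an illustrative choice; any text encoder config handled by `AutoConfig` works.

```python
from transformers import (
    EncodecConfig,
    MusicgenMelodyConfig,
    MusicgenMelodyDecoderConfig,
    T5Config,
)

# Build the three sub-configurations explicitly.
text_encoder_config = T5Config()                # illustrative text encoder
audio_encoder_config = EncodecConfig()          # audio codec
decoder_config = MusicgenMelodyDecoderConfig()  # MusicGen Melody decoder

# New style: pass the sub-configs (config objects or plain dicts) to __init__.
# Dicts are rebuilt via AutoConfig.for_model / MusicgenMelodyDecoderConfig internally.
configuration = MusicgenMelodyConfig(
    text_encoder=text_encoder_config,
    audio_encoder=audio_encoder_config,
    decoder=decoder_config,
)

# MusicgenConfig follows the same pattern with text_encoder / audio_encoder / decoder.
```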
) if config is None: - config = MusicgenMelodyConfig.from_sub_models_config( - text_encoder.config, audio_encoder.config, decoder.config + config = MusicgenMelodyConfig( + text_encoder=text_encoder.config, audio_encoder=audio_encoder.config, decoder=decoder.config ) else: if not isinstance(config, self.config_class): @@ -1537,8 +1537,8 @@ def from_sub_models_pretrained( ) # instantiate config with corresponding kwargs - config = MusicgenMelodyConfig.from_sub_models_config( - text_encoder.config, audio_encoder.config, decoder.config, **kwargs + config = MusicgenMelodyConfig( + text_encoder=text_encoder.config, audio_encoder=audio_encoder.config, decoder=decoder.config, **kwargs ) return cls(text_encoder=text_encoder, audio_encoder=audio_encoder, decoder=decoder, config=config) diff --git a/src/transformers/models/owlv2/configuration_owlv2.py b/src/transformers/models/owlv2/configuration_owlv2.py index 9126429dfe39..7b2190d6bae4 100644 --- a/src/transformers/models/owlv2/configuration_owlv2.py +++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -246,38 +246,26 @@ def __init__( return_dict=True, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the Owlv2TextConfig with default values.") + text_config = Owlv2TextConfig() + logger.info("`text_config` is `None`. initializing the `Owlv2TextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = Owlv2TextConfig(**text_config) if vision_config is None: - vision_config = {} - logger.info("vision_config is None. initializing the Owlv2VisionConfig with default values.") + vision_config = Owlv2VisionConfig() + logger.info("`vision_config` is `None`. initializing the `Owlv2VisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = Owlv2VisionConfig(**vision_config) - self.text_config = Owlv2TextConfig(**text_config) - self.vision_config = Owlv2VisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.return_dict = return_dict self.initializer_factor = 1.0 - - @classmethod - def from_text_vision_configs(cls, text_config: dict, vision_config: dict, **kwargs): - r""" - Instantiate a [`Owlv2Config`] (or a derived class) from owlv2 text model configuration and owlv2 vision - model configuration. - - Returns: - [`Owlv2Config`]: An instance of a configuration object - """ - config_dict = {} - config_dict["text_config"] = text_config - config_dict["vision_config"] = vision_config - - return cls.from_dict(config_dict, **kwargs) + super().__init__(**kwargs) __all__ = ["Owlv2Config", "Owlv2TextConfig", "Owlv2VisionConfig"] diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index 95f31363488d..851dc077a47c 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -252,38 +252,26 @@ def __init__( return_dict=True, **kwargs, ): - super().__init__(**kwargs) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the OwlViTTextConfig with default values.") + text_config = OwlViTTextConfig() + logger.info("`text_config` is `None`. 
initializing the `OwlViTTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = OwlViTTextConfig(**text_config) if vision_config is None: - vision_config = {} - logger.info("vision_config is None. initializing the OwlViTVisionConfig with default values.") + vision_config = OwlViTVisionConfig() + logger.info("`vision_config` is `None`. initializing the `OwlViTVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = OwlViTVisionConfig(**vision_config) - self.text_config = OwlViTTextConfig(**text_config) - self.vision_config = OwlViTVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.logit_scale_init_value = logit_scale_init_value self.return_dict = return_dict self.initializer_factor = 1.0 - - @classmethod - def from_text_vision_configs(cls, text_config: dict, vision_config: dict, **kwargs): - r""" - Instantiate a [`OwlViTConfig`] (or a derived class) from owlvit text model configuration and owlvit vision - model configuration. - - Returns: - [`OwlViTConfig`]: An instance of a configuration object - """ - config_dict = {} - config_dict["text_config"] = text_config - config_dict["vision_config"] = vision_config - - return cls.from_dict(config_dict, **kwargs) + super().__init__(**kwargs) class OwlViTOnnxConfig(OnnxConfig): diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 3c4efd4b5a36..bed51d1639fc 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -379,13 +379,17 @@ def __init__( audio_config=None, **kwargs, ): - super().__init__( - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) + if isinstance(vision_config, dict): + vision_config = Phi4MultimodalVisionConfig(**vision_config) + elif vision_config is None: + Phi4MultimodalVisionConfig() + self.vision_config = vision_config + + if isinstance(audio_config, dict): + audio_config = Phi4MultimodalAudioConfig(**audio_config) + elif vision_config is None: + audio_config = Phi4MultimodalAudioConfig() + self.audio_config = audio_config self.vocab_size = vocab_size self.hidden_size = hidden_size self.intermediate_size = intermediate_size @@ -412,17 +416,13 @@ def __init__( self._rope_scaling_validation() self.sliding_window = sliding_window - if isinstance(vision_config, dict): - vision_config = Phi4MultimodalVisionConfig(**vision_config) - elif vision_config is None: - Phi4MultimodalVisionConfig() - self.vision_config = vision_config - - if isinstance(audio_config, dict): - audio_config = Phi4MultimodalAudioConfig(**audio_config) - elif vision_config is None: - audio_config = Phi4MultimodalAudioConfig() - self.audio_config = audio_config + super().__init__( + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) def _rope_scaling_adjustment(self): """ diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index 486d701a4311..6b132126bceb 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py 
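The same constructor-first pattern applies to the text/vision composites above: `Owlv2Config` and `OwlViTConfig` now accept `None`, a plain dict, or an already-built sub-config for each component, so the removed `from_text_vision_configs` helper reduces to the constructor. A minimal sketch; the overridden parameter values in the dict form are arbitrary examples.

```python
from transformers import OwlViTConfig, OwlViTTextConfig, OwlViTVisionConfig

# Both sub-configs omitted: defaults are created and an info message is logged.
config_default = OwlViTConfig()

# Sub-configs passed as config objects.
config_from_objects = OwlViTConfig(
    text_config=OwlViTTextConfig(),
    vision_config=OwlViTVisionConfig(),
)

# Sub-configs passed as dicts; they are converted to the config classes in __init__.
config_from_dicts = OwlViTConfig(
    text_config={"max_position_embeddings": 16},
    vision_config={"image_size": 768},
)
```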
@@ -402,6 +402,18 @@ def __init__( audio_config=None, **kwargs, ): + if isinstance(vision_config, dict): + vision_config = Phi4MultimodalVisionConfig(**vision_config) + elif vision_config is None: + Phi4MultimodalVisionConfig() + self.vision_config = vision_config + + if isinstance(audio_config, dict): + audio_config = Phi4MultimodalAudioConfig(**audio_config) + elif vision_config is None: + audio_config = Phi4MultimodalAudioConfig() + self.audio_config = audio_config + super().__init__( vocab_size=vocab_size, hidden_size=hidden_size, @@ -429,18 +441,6 @@ def __init__( **kwargs, ) - if isinstance(vision_config, dict): - vision_config = Phi4MultimodalVisionConfig(**vision_config) - elif vision_config is None: - Phi4MultimodalVisionConfig() - self.vision_config = vision_config - - if isinstance(audio_config, dict): - audio_config = Phi4MultimodalAudioConfig(**audio_config) - elif vision_config is None: - audio_config = Phi4MultimodalAudioConfig() - self.audio_config = audio_config - class Phi4MultimodalVisionMLP(SiglipMLP): pass diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index 8caf5f006670..a31f5751ad5d 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -289,7 +289,7 @@ class Pix2StructConfig(PreTrainedConfig): >>> config_text = Pix2StructTextConfig() >>> config_vision = Pix2StructVisionConfig() - >>> config = Pix2StructConfig.from_text_vision_configs(config_text, config_vision) + >>> config = Pix2StructConfig(text_config=config_text, vision_config=config_vision) ```""" model_type = "pix2struct" @@ -306,20 +306,24 @@ def __init__( is_encoder_decoder=True, **kwargs, ): - super().__init__(tie_word_embeddings=tie_word_embeddings, is_encoder_decoder=is_encoder_decoder, **kwargs) - if text_config is None: - text_config = {} - logger.info("text_config is None. Initializing the Pix2StructTextConfig with default values.") + text_config = Pix2StructTextConfig( + is_encoder_decoder=is_encoder_decoder, tie_word_embeddings=tie_word_embeddings + ) + logger.info("`text_config` is `None`. initializing the `Pix2StructTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config["is_encoder_decoder"] = is_encoder_decoder + text_config["tie_word_embeddings"] = tie_word_embeddings + text_config = Pix2StructTextConfig(**text_config) if vision_config is None: - vision_config = {} - logger.info("vision_config is None. Initializing the Pix2StructVisionConfig with default values.") + vision_config = Pix2StructVisionConfig() + logger.info("`vision_config` is `None`.
initializing the `Pix2StructVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = Pix2StructVisionConfig(**vision_config) - text_config["is_encoder_decoder"] = is_encoder_decoder - text_config["tie_word_embeddings"] = tie_word_embeddings - self.text_config = Pix2StructTextConfig(**text_config) - self.vision_config = Pix2StructVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.decoder_start_token_id = self.text_config.decoder_start_token_id self.pad_token_id = self.text_config.pad_token_id @@ -332,6 +336,7 @@ def __init__( self.vision_config.initializer_range = self.initializer_range self.is_vqa = is_vqa + super().__init__(tie_word_embeddings=tie_word_embeddings, is_encoder_decoder=is_encoder_decoder, **kwargs) __all__ = ["Pix2StructConfig", "Pix2StructTextConfig", "Pix2StructVisionConfig"] diff --git a/src/transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py b/src/transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py index 22983bcccd1f..53653abd9521 100644 --- a/src/transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +++ b/src/transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py @@ -111,7 +111,6 @@ def __init__( max_depth=None, **kwargs, ): - super().__init__(**kwargs) if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `Dinov2` backbone.") backbone_config = CONFIG_MAPPING["dinov2"]( @@ -153,5 +152,7 @@ def __init__( self.depth_estimation_type = depth_estimation_type self.max_depth = max_depth if max_depth else 1 + super().__init__(**kwargs) + __all__ = ["PromptDepthAnythingConfig"] diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 3cf0ee47d2c3..74281ab88f97 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -1027,7 +1027,7 @@ class Qwen2_5OmniConfig(PreTrainedConfig): >>> # Initializing a module style configuration - >>> configuration = Qwen2_5OmniConfig.from_sub_model_configs( + >>> configuration = Qwen2_5OmniConfig( ... thinker_config, talker_config, token2wav_config ... ) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 6620765bc83a..d4aa1af34486 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -1061,7 +1061,7 @@ class Qwen2_5OmniConfig(PreTrainedConfig): >>> # Initializing a module style configuration - >>> configuration = Qwen2_5OmniConfig.from_sub_model_configs( + >>> configuration = Qwen2_5OmniConfig( ... thinker_config, talker_config, token2wav_config ... 
) diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 7acafe963be8..a96b7699c7b5 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -343,10 +343,6 @@ def __init__( mlp_only_layers=None, **kwargs, ): - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -380,6 +376,11 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + class Qwen3OmniMoeThinkerConfig(PreTrainedConfig): r""" @@ -453,7 +454,6 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) self.user_token_id = user_token_id self.position_id_per_seconds = position_id_per_seconds self.audio_start_token_id = audio_start_token_id @@ -476,6 +476,8 @@ def __init__( elif text_config is None: text_config = Qwen3OmniMoeTextConfig() self.text_config = text_config + + super().__init__(**kwargs) self.audio_token_id = audio_token_id self.image_token_id = image_token_id self.video_token_id = video_token_id @@ -635,10 +637,6 @@ def __init__( num_code_groups=32, **kwargs, ): - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -676,6 +674,11 @@ def __init__( for i in range(self.num_hidden_layers) ] layer_type_validation(self.layer_types, self.num_hidden_layers) + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) self.num_code_groups = num_code_groups @@ -853,10 +856,6 @@ def __init__( mlp_only_layers=None, **kwargs, ): - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -890,6 +889,11 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + class Qwen3OmniMoeTalkerConfig(PreTrainedConfig): r""" @@ -991,7 +995,6 @@ def __init__( speaker_id=None, **kwargs, ): - super().__init__(**kwargs) if code_predictor_config is None: code_predictor_config = {} self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig() @@ -1025,6 +1028,7 @@ def __init__( self.audio_start_token_id = audio_start_token_id self.vision_start_token_id = vision_start_token_id self.speaker_id = speaker_id + super().__init__(**kwargs) class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): @@ -1203,7 +1207,6 @@ def __init__( assistant_token_id=77091, **kwargs, ): - super().__init__(**kwargs) if thinker_config is None: thinker_config = {} logger.info("thinker_config is None. 
Initializing thinker model with default values") @@ -1228,6 +1231,7 @@ def __init__( self.system_token_id = system_token_id self.user_token_id = user_token_id self.assistant_token_id = assistant_token_id + super().__init__(**kwargs) def get_text_config(self, decoder=False) -> "PreTrainedConfig": """ diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 504cbb2f3689..a3d3f5ecb9ae 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -526,7 +526,6 @@ def __init__( speaker_id=None, **kwargs, ): - super().__init__(**kwargs) if code_predictor_config is None: code_predictor_config = {} self.code_predictor_config = Qwen3OmniMoeTalkerCodePredictorConfig() @@ -560,6 +559,7 @@ def __init__( self.audio_start_token_id = audio_start_token_id self.vision_start_token_id = vision_start_token_id self.speaker_id = speaker_id + super().__init__(**kwargs) class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): @@ -738,7 +738,6 @@ def __init__( assistant_token_id=77091, **kwargs, ): - super().__init__(**kwargs) if thinker_config is None: thinker_config = {} logger.info("thinker_config is None. Initializing thinker model with default values") @@ -763,6 +762,7 @@ def __init__( self.system_token_id = system_token_id self.user_token_id = user_token_id self.assistant_token_id = assistant_token_id + super().__init__(**kwargs) def get_text_config(self, decoder=False) -> "PreTrainedConfig": """ diff --git a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py index 49199f0c3ecc..d367a184f01f 100644 --- a/src/transformers/models/qwen3_vl/processing_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/processing_qwen3_vl.py @@ -67,7 +67,6 @@ class Qwen3VLProcessor(ProcessorMixin): tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): - super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token self.image_token_id = ( @@ -80,6 +79,7 @@ def __init__(self, image_processor=None, tokenizer=None, video_processor=None, c if getattr(tokenizer, "video_token_id", None) else tokenizer.convert_tokens_to_ids(self.video_token) ) + super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) self.vision_start_token = ( "<|vision_start|>" if not hasattr(tokenizer, "vision_start_token") else tokenizer.vision_start_token ) diff --git a/src/transformers/models/rt_detr/configuration_rt_detr.py b/src/transformers/models/rt_detr/configuration_rt_detr.py index f176390fd7b5..565a6e18091b 100644 --- a/src/transformers/models/rt_detr/configuration_rt_detr.py +++ b/src/transformers/models/rt_detr/configuration_rt_detr.py @@ -336,22 +336,5 @@ def __init__( self.eos_coefficient = eos_coefficient super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) - @classmethod - def from_backbone_configs(cls, backbone_config: PreTrainedConfig, **kwargs): - """Instantiate a [`RTDetrConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model - configuration. 
- - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. - - Returns: - [`RTDetrConfig`]: An instance of a configuration object - """ - return cls( - backbone_config=backbone_config, - **kwargs, - ) - __all__ = ["RTDetrConfig"] diff --git a/src/transformers/models/rt_detr_v2/configuration_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/configuration_rt_detr_v2.py index a711f6a4e6fe..b40ee12ea43a 100644 --- a/src/transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +++ b/src/transformers/models/rt_detr_v2/configuration_rt_detr_v2.py @@ -258,7 +258,6 @@ def __init__( decoder_method="default", **kwargs, ): - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) self.initializer_range = initializer_range self.initializer_bias_prior_prob = initializer_bias_prior_prob self.layer_norm_eps = layer_norm_eps @@ -358,23 +357,7 @@ def __init__( self.decoder_n_levels = decoder_n_levels self.decoder_offset_scale = decoder_offset_scale self.decoder_method = decoder_method - - @classmethod - def from_backbone_configs(cls, backbone_config: PreTrainedConfig, **kwargs): - """Instantiate a [`RTDetrV2Config`] (or a derived class) from a pre-trained backbone model configuration and DETR model - configuration. - - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. - - Returns: - [`RTDetrV2Config`]: An instance of a configuration object - """ - return cls( - backbone_config=backbone_config, - **kwargs, - ) + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) __all__ = ["RTDetrV2Config"] diff --git a/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py b/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py index b96b8b494d64..e5e243e1e7f8 100644 --- a/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py +++ b/src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py @@ -269,7 +269,6 @@ def __init__( decoder_method="default", **kwargs, ): - super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) self.initializer_range = initializer_range self.initializer_bias_prior_prob = initializer_bias_prior_prob self.layer_norm_eps = layer_norm_eps @@ -369,23 +368,7 @@ def __init__( self.decoder_n_levels = decoder_n_levels self.decoder_offset_scale = decoder_offset_scale self.decoder_method = decoder_method - - @classmethod - def from_backbone_configs(cls, backbone_config: PreTrainedConfig, **kwargs): - """Instantiate a [`RTDetrV2Config`] (or a derived class) from a pre-trained backbone model configuration and DETR model - configuration. - - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. 
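For the backbone-based detectors, the removed `from_backbone_configs` helpers shown in these hunks reduce to passing `backbone_config` directly, as the updated RT-DETR tests further down illustrate. A minimal sketch; `RTDetrResNetConfig` is assumed here purely as an example backbone class.

```python
from transformers import RTDetrConfig, RTDetrResNetConfig

# Any backbone configuration object can be supplied; RTDetrResNetConfig is one example.
backbone_config = RTDetrResNetConfig(
    out_features=["stage2", "stage3", "stage4"],
    out_indices=[2, 3, 4],
)

# Previously: RTDetrConfig.from_backbone_configs(backbone_config, ...)
config = RTDetrConfig(backbone_config=backbone_config)
```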
- - Returns: - [`RTDetrV2Config`]: An instance of a configuration object - """ - return cls( - backbone_config=backbone_config, - **kwargs, - ) + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) def multi_scale_deformable_attention_v2( diff --git a/src/transformers/models/sam/configuration_sam.py b/src/transformers/models/sam/configuration_sam.py index 4d4a4761ad1b..0229cf40d8cb 100644 --- a/src/transformers/models/sam/configuration_sam.py +++ b/src/transformers/models/sam/configuration_sam.py @@ -316,7 +316,6 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) vision_config = vision_config if vision_config is not None else {} prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} @@ -332,6 +331,7 @@ def __init__( self.prompt_encoder_config = SamPromptEncoderConfig(**prompt_encoder_config) self.mask_decoder_config = SamMaskDecoderConfig(**mask_decoder_config) self.initializer_range = initializer_range + super().__init__(**kwargs) __all__ = ["SamConfig", "SamMaskDecoderConfig", "SamPromptEncoderConfig", "SamVisionConfig"] diff --git a/src/transformers/models/sam2/configuration_sam2.py b/src/transformers/models/sam2/configuration_sam2.py index 8734e22a9eab..a0aa5e97e96a 100644 --- a/src/transformers/models/sam2/configuration_sam2.py +++ b/src/transformers/models/sam2/configuration_sam2.py @@ -202,8 +202,6 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) - backbone_channel_list = [768, 384, 192, 96] if backbone_channel_list is None else backbone_channel_list backbone_feature_sizes = ( [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes @@ -233,6 +231,7 @@ def __init__( self.hidden_act = hidden_act self.layer_norm_eps = layer_norm_eps self.initializer_range = initializer_range + super().__init__(**kwargs) class Sam2PromptEncoderConfig(PreTrainedConfig): @@ -424,7 +423,6 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) vision_config = vision_config if vision_config is not None else {} prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} @@ -442,6 +440,7 @@ def __init__( self.mask_decoder_config = Sam2MaskDecoderConfig(**mask_decoder_config) self.initializer_range = initializer_range + super().__init__(**kwargs) __all__ = [ diff --git a/src/transformers/models/sam_hq/configuration_sam_hq.py b/src/transformers/models/sam_hq/configuration_sam_hq.py index 68e2fc8d9b9c..f80d10704fcf 100644 --- a/src/transformers/models/sam_hq/configuration_sam_hq.py +++ b/src/transformers/models/sam_hq/configuration_sam_hq.py @@ -292,7 +292,6 @@ def __init__( initializer_range=0.02, **kwargs, ): - super().__init__(**kwargs) vision_config = vision_config if vision_config is not None else {} prompt_encoder_config = prompt_encoder_config if prompt_encoder_config is not None else {} mask_decoder_config = mask_decoder_config if mask_decoder_config is not None else {} @@ -308,6 +307,7 @@ def __init__( self.prompt_encoder_config = SamHQPromptEncoderConfig(**prompt_encoder_config) self.mask_decoder_config = SamHQMaskDecoderConfig(**mask_decoder_config) self.initializer_range = initializer_range + super().__init__(**kwargs) __all__ = ["SamHQVisionConfig", "SamHQMaskDecoderConfig", "SamHQPromptEncoderConfig", "SamHQConfig"] diff 
--git a/src/transformers/models/siglip/configuration_siglip.py b/src/transformers/models/siglip/configuration_siglip.py index 64637cd26bcc..102d63512f54 100644 --- a/src/transformers/models/siglip/configuration_siglip.py +++ b/src/transformers/models/siglip/configuration_siglip.py @@ -231,27 +231,30 @@ class SiglipConfig(PreTrainedConfig): >>> config_text = SiglipTextConfig() >>> config_vision = SiglipVisionConfig() - >>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision) + >>> config = SiglipConfig(text_config=config_text, vision_config=config_vision) ```""" model_type = "siglip" sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig} def __init__(self, text_config=None, vision_config=None, **kwargs): - super().__init__(**kwargs) - if text_config is None: - text_config = {} + text_config = SiglipTextConfig() logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = SiglipTextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = SiglipVisionConfig() logger.info("`vision_config` is `None`. initializing the `SiglipVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = SiglipVisionConfig(**vision_config) - self.text_config = SiglipTextConfig(**text_config) - self.vision_config = SiglipVisionConfig(**vision_config) - + self.text_config = text_config + self.vision_config = vision_config self.initializer_factor = 1.0 + super().__init__(**kwargs) + __all__ = ["SiglipConfig", "SiglipTextConfig", "SiglipVisionConfig"] diff --git a/src/transformers/models/siglip2/configuration_siglip2.py b/src/transformers/models/siglip2/configuration_siglip2.py index d1980d8c3377..8d29bea4e73a 100644 --- a/src/transformers/models/siglip2/configuration_siglip2.py +++ b/src/transformers/models/siglip2/configuration_siglip2.py @@ -239,27 +239,30 @@ class Siglip2Config(PreTrainedConfig): >>> config_text = Siglip2TextConfig() >>> config_vision = Siglip2VisionConfig() - >>> config = Siglip2Config.from_text_vision_configs(config_text, config_vision) + >>> config = Siglip2Config(text_config=config_text, vision_config=config_vision) ```""" model_type = "siglip2" sub_configs = {"text_config": Siglip2TextConfig, "vision_config": Siglip2VisionConfig} def __init__(self, text_config=None, vision_config=None, **kwargs): - super().__init__(**kwargs) - if text_config is None: - text_config = {} + text_config = Siglip2TextConfig() logger.info("`text_config` is `None`. Initializing the `Siglip2TextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = Siglip2TextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = Siglip2VisionConfig() logger.info("`vision_config` is `None`. 
initializing the `Siglip2VisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = Siglip2VisionConfig(**vision_config) - self.text_config = Siglip2TextConfig(**text_config) - self.vision_config = Siglip2VisionConfig(**vision_config) - + self.text_config = text_config + self.vision_config = vision_config self.initializer_factor = 1.0 + super().__init__(**kwargs) + __all__ = ["Siglip2Config", "Siglip2TextConfig", "Siglip2VisionConfig"] diff --git a/src/transformers/models/timesfm/configuration_timesfm.py b/src/transformers/models/timesfm/configuration_timesfm.py index 3c073fdff06c..83ea976296cd 100644 --- a/src/transformers/models/timesfm/configuration_timesfm.py +++ b/src/transformers/models/timesfm/configuration_timesfm.py @@ -118,10 +118,8 @@ def __init__( self.min_timescale = min_timescale self.max_timescale = max_timescale - super().__init__( - is_encoder_decoder=self.is_encoder_decoder, - **kwargs, - ) + kwargs["is_encoder_decoder"] = self.is_encoder_decoder + super().__init__(**kwargs) __all__ = ["TimesFmConfig"] diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py index 7d4081b59c8b..eed6a435d940 100644 --- a/src/transformers/models/tvp/configuration_tvp.py +++ b/src/transformers/models/tvp/configuration_tvp.py @@ -128,7 +128,6 @@ def __init__( attention_probs_dropout_prob=0.1, **kwargs, ): - super().__init__(**kwargs) if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"]) @@ -171,17 +170,7 @@ def __init__( self.initializer_range = initializer_range self.attention_probs_dropout_prob = attention_probs_dropout_prob - @classmethod - def from_backbone_config(cls, backbone_config: PreTrainedConfig, **kwargs): - """Instantiate a [`TvpConfig`] (or a derived class) from a pre-trained backbone model configuration. - - Args: - backbone_config ([`PreTrainedConfig`]): - The backbone configuration. - Returns: - [`TvpConfig`]: An instance of a configuration object - """ - return cls(backbone_config=backbone_config, **kwargs) + super().__init__(**kwargs) __all__ = ["TvpConfig"] diff --git a/src/transformers/models/upernet/configuration_upernet.py b/src/transformers/models/upernet/configuration_upernet.py index 27b1c38bc52f..ec7a564ef55a 100644 --- a/src/transformers/models/upernet/configuration_upernet.py +++ b/src/transformers/models/upernet/configuration_upernet.py @@ -104,7 +104,6 @@ def __init__( loss_ignore_index=255, **kwargs, ): - super().__init__(**kwargs) if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. 
Initializing the config with the default `ResNet` backbone.") backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage1", "stage2", "stage3", "stage4"]) @@ -137,5 +136,7 @@ def __init__( self.auxiliary_concat_input = auxiliary_concat_input self.loss_ignore_index = loss_ignore_index + super().__init__(**kwargs) + __all__ = ["UperNetConfig"] diff --git a/src/transformers/models/vitmatte/configuration_vitmatte.py b/src/transformers/models/vitmatte/configuration_vitmatte.py index 1fdeb03af759..ae465f53c22b 100644 --- a/src/transformers/models/vitmatte/configuration_vitmatte.py +++ b/src/transformers/models/vitmatte/configuration_vitmatte.py @@ -93,8 +93,6 @@ def __init__( fusion_hidden_sizes: list[int] = [256, 128, 64, 32], **kwargs, ): - super().__init__(**kwargs) - if backbone_config is None and backbone is None: logger.info("`backbone_config` is `None`. Initializing the config with the default `VitDet` backbone.") backbone_config = CONFIG_MAPPING["vitdet"](out_features=["stage4"]) @@ -122,5 +120,7 @@ def __init__( self.convstream_hidden_sizes = convstream_hidden_sizes self.fusion_hidden_sizes = fusion_hidden_sizes + super().__init__(**kwargs) + __all__ = ["VitMatteConfig"] diff --git a/src/transformers/models/vitpose/configuration_vitpose.py b/src/transformers/models/vitpose/configuration_vitpose.py index e9ae2813f9d8..f2a50d561153 100644 --- a/src/transformers/models/vitpose/configuration_vitpose.py +++ b/src/transformers/models/vitpose/configuration_vitpose.py @@ -88,8 +88,6 @@ def __init__( use_simple_decoder: bool = True, **kwargs, ): - super().__init__(**kwargs) - if use_pretrained_backbone: logger.info( "`use_pretrained_backbone` is `True`. For the pure inference purpose of VitPose weight do not set this value." @@ -123,5 +121,7 @@ def __init__( self.scale_factor = scale_factor self.use_simple_decoder = use_simple_decoder + super().__init__(**kwargs) + __all__ = ["VitPoseConfig"] diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index a41a8d183805..25dcc37e3de8 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -280,8 +280,6 @@ def __init__( text_config_dict = kwargs.pop("text_config_dict", None) vision_config_dict = kwargs.pop("vision_config_dict", None) - super().__init__(**kwargs) - # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. @@ -345,15 +343,19 @@ def __init__( vision_config.update(_vision_config_dict) if text_config is None: - text_config = {} - logger.info("`text_config` is `None`. Initializing the `XCLIPTextConfig` with default values.") + text_config = XCLIPTextConfig() + logger.info("`text_config` is `None`. initializing the `XCLIPTextConfig` with default values.") + elif isinstance(text_config, dict): + text_config = XCLIPTextConfig(**text_config) if vision_config is None: - vision_config = {} + vision_config = XCLIPVisionConfig() logger.info("`vision_config` is `None`. 
initializing the `XCLIPVisionConfig` with default values.") + elif isinstance(vision_config, dict): + vision_config = XCLIPVisionConfig(**vision_config) - self.text_config = XCLIPTextConfig(**text_config) - self.vision_config = XCLIPVisionConfig(**vision_config) + self.text_config = text_config + self.vision_config = vision_config self.projection_dim = projection_dim self.prompt_layers = prompt_layers @@ -365,5 +367,7 @@ def __init__( self.logit_scale_init_value = logit_scale_init_value self.initializer_factor = 1.0 + super().__init__(**kwargs) + __all__ = ["XCLIPConfig", "XCLIPTextConfig", "XCLIPVisionConfig"] diff --git a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py index fbd2762cef85..99890fca81c5 100644 --- a/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py +++ b/src/transformers/models/x_clip/convert_x_clip_original_pytorch_to_hf.py @@ -55,7 +55,7 @@ def get_xclip_config(model_name, num_frames): if model_name == "xclip-large-patch14-16-frames": vision_config.image_size = 336 - config = XCLIPConfig.from_text_vision_configs(text_config, vision_config) + config = XCLIPConfig(text_config=text_config, vision_config=vision_config) if "large" in model_name: config.projection_dim = 768 diff --git a/src/transformers/models/xcodec/configuration_xcodec.py b/src/transformers/models/xcodec/configuration_xcodec.py index 7b9dcf8b4884..bf91c02912ca 100644 --- a/src/transformers/models/xcodec/configuration_xcodec.py +++ b/src/transformers/models/xcodec/configuration_xcodec.py @@ -102,8 +102,6 @@ def __init__( semantic_model_config: Union[dict, HubertConfig] = None, **kwargs, ): - super().__init__(**kwargs) - if acoustic_model_config is None: self.acoustic_model_config = DacConfig( encoder_hidden_size=64, @@ -158,6 +156,8 @@ def __init__( codebook_dim = self.acoustic_model_config.hidden_size + self.semantic_model_config.hidden_size self.codebook_dim = codebook_dim + super().__init__(**kwargs) + @property def frame_rate(self) -> int: return math.ceil(self.sample_rate / self.hop_length) diff --git a/src/transformers/models/zoedepth/configuration_zoedepth.py b/src/transformers/models/zoedepth/configuration_zoedepth.py index 9f82523d5caa..c193ad5310d8 100644 --- a/src/transformers/models/zoedepth/configuration_zoedepth.py +++ b/src/transformers/models/zoedepth/configuration_zoedepth.py @@ -169,8 +169,6 @@ def __init__( patch_transformer_num_attention_heads=None, **kwargs, ): - super().__init__(**kwargs) - if readout_type not in ["ignore", "add", "project"]: raise ValueError("Readout_type must be one of ['ignore', 'add', 'project']") @@ -234,5 +232,7 @@ def __init__( self.patch_transformer_intermediate_size = patch_transformer_intermediate_size self.patch_transformer_num_attention_heads = patch_transformer_num_attention_heads + super().__init__(**kwargs) + __all__ = ["ZOEDEPTH_PRETRAINED_CONFIG_ARCHIVE_MAP", "ZoeDepthConfig"] diff --git a/tests/models/aimv2/test_modeling_aimv2.py b/tests/models/aimv2/test_modeling_aimv2.py index 5ec0a9795fc4..25d0ac4a5ea3 100644 --- a/tests/models/aimv2/test_modeling_aimv2.py +++ b/tests/models/aimv2/test_modeling_aimv2.py @@ -352,8 +352,10 @@ def prepare_config_and_inputs(self): return config, input_ids, attention_mask, pixel_values def get_config(self): - return Aimv2Config.from_text_vision_configs( - self.text_model_tester.get_config(), self.vision_model_tester.get_config(), projection_dim=64 + return Aimv2Config( + 
text_config=self.text_model_tester.get_config(), + vision_config=self.vision_model_tester.get_config(), + projection_dim=64, ) def create_and_check_model(self, config, input_ids, attention_mask, pixel_values): diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py index 1ef08b74d396..909c9b22208e 100644 --- a/tests/models/bark/test_modeling_bark.py +++ b/tests/models/bark/test_modeling_bark.py @@ -490,7 +490,7 @@ def __init__( self.is_training = is_training def get_config(self): - return BarkConfig.from_sub_model_configs( + return BarkConfig( self.semantic_model_tester.get_config(), self.coarse_acoustics_model_tester.get_config(), self.fine_acoustics_model_tester.get_config(), diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 8bf897eeca4c..73715e160022 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -427,7 +427,7 @@ def prepare_config_and_inputs(self): return config, input_ids, attention_mask, pixel_values def get_config(self): - return Blip2Config.from_vision_qformer_text_configs( + return Blip2Config( vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), @@ -733,7 +733,7 @@ def prepare_config_and_inputs(self): return config, input_ids, attention_mask, pixel_values, decoder_input_ids, decoder_attention_mask, lm_labels def get_config(self): - return Blip2Config.from_vision_qformer_text_configs( + return Blip2Config( vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), @@ -1010,7 +1010,7 @@ def __init__(self, parent, vision_kwargs=None, qformer_kwargs=None, is_training= self.batch_size = self.vision_model_tester.batch_size # need bs for batching_equivalence test def get_config(self): - return Blip2Config.from_vision_qformer_text_configs( + return Blip2Config( vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), ) @@ -1168,7 +1168,7 @@ def __init__(self, parent, vision_kwargs=None, qformer_kwargs=None, is_training= self.batch_size = self.vision_model_tester.batch_size # need bs for batching_equivalence test def get_config(self): - return Blip2Config.from_vision_qformer_text_configs( + return Blip2Config( vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), ) @@ -1330,7 +1330,7 @@ def __init__(self, parent, vision_kwargs=None, qformer_kwargs=None, is_training= self.batch_size = self.vision_model_tester.batch_size # need bs for batching_equivalence test def get_config(self): - return Blip2Config.from_vision_qformer_text_configs( + return Blip2Config( vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), ) diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index e627a530f724..9f635afbfc75 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -357,10 +357,10 @@ def get_config(self): speech_config = self.clvp_encoder_tester.get_config() speech_config.vocab_size = 300 - return ClvpConfig.from_sub_model_configs( - text_config, - speech_config, - decoder_config, + return ClvpConfig( + text_config=text_config, + speech_config=speech_config, + decoder_config=decoder_config, 
projection_dim=16, ) diff --git a/tests/models/d_fine/test_modeling_d_fine.py b/tests/models/d_fine/test_modeling_d_fine.py index 040ca327396a..ee554a0d6c80 100644 --- a/tests/models/d_fine/test_modeling_d_fine.py +++ b/tests/models/d_fine/test_modeling_d_fine.py @@ -206,7 +206,7 @@ def get_config(self): stem_channels=[3, 16, 16], use_lab=True, ) - return DFineConfig.from_backbone_configs( + return DFineConfig( backbone_config=backbone_config, encoder_hidden_dim=self.encoder_hidden_dim, encoder_in_channels=self.encoder_in_channels, diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index 7f393cb1f3cc..2c9656771a13 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -126,12 +126,6 @@ def test_model_rope_scaling_frequencies(self): def test_model_rope_scaling_from_config(self, scaling_type): pass - # - # @unittest.skip(reason="Not that big not that slow offload") - # def test_model_is_small(self): - # pass - # - @require_torch class DbrxModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index a517b6f236b2..8042b917ed6f 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -814,11 +814,11 @@ def prepare_config_and_inputs_for_common(self): } def get_config(self): - return FlavaConfig.from_configs( - self.image_model_tester.get_config(), - self.text_model_tester.get_config(), - self.multimodal_model_tester.get_config(), - self.image_codebook_tester.get_config(), + return FlavaConfig( + image_config=self.image_model_tester.get_config(), + text_config=self.text_model_tester.get_config(), + multimodal_config=self.multimodal_model_tester.get_config(), + image_codebook_config=self.image_codebook_tester.get_config(), hidden_size=self.hidden_size, projection_dim=self.projection_dim, initializer_range=self.initializer_range, diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index b694d1a084a8..ee06e2e85da6 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -422,7 +422,7 @@ def prepare_config_and_inputs(self): return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values def get_config(self): - return InstructBlipConfig.from_vision_qformer_text_configs( + return InstructBlipConfig( vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index c411fe8c874a..b7acc2a5389f 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -439,7 +439,7 @@ def prepare_config_and_inputs(self): return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values def get_config(self): - return InstructBlipVideoConfig.from_vision_qformer_text_configs( + return InstructBlipVideoConfig( vision_config=self.vision_model_tester.get_config(), qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), diff --git a/tests/models/maskformer/test_modeling_maskformer.py 
b/tests/models/maskformer/test_modeling_maskformer.py index 6d1058fa03bc..f1d11545dfe2 100644 --- a/tests/models/maskformer/test_modeling_maskformer.py +++ b/tests/models/maskformer/test_modeling_maskformer.py @@ -97,7 +97,7 @@ def prepare_config_and_inputs(self): return config, pixel_values, pixel_mask, mask_labels, class_labels def get_config(self): - return MaskFormerConfig.from_backbone_and_decoder_configs( + return MaskFormerConfig( backbone_config=SwinConfig( depths=[1, 1, 1, 1], embed_dim=16, diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 32d1913387db..d0bf11c33132 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -555,7 +555,7 @@ def get_config(self): tie_word_embeddings=False, audio_channels=self.audio_channels, ) - config = MusicgenConfig.from_sub_models_config(text_encoder_config, audio_encoder_config, decoder_config) + config = MusicgenConfig(text_encoder_config, audio_encoder_config, decoder_config) return config def prepare_config_and_inputs_for_common(self): diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 0e7e1c2866af..12ba4c7d1345 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -573,7 +573,7 @@ def get_config(self): tie_word_embeddings=False, audio_channels=self.audio_channels, ) - config = MusicgenMelodyConfig.from_sub_models_config( + config = MusicgenMelodyConfig( text_encoder_config, audio_encoder_config, decoder_config, chroma_length=self.chroma_length ) return config diff --git a/tests/models/rt_detr/test_modeling_rt_detr.py b/tests/models/rt_detr/test_modeling_rt_detr.py index 77bfec414adb..bb66366f789d 100644 --- a/tests/models/rt_detr/test_modeling_rt_detr.py +++ b/tests/models/rt_detr/test_modeling_rt_detr.py @@ -180,7 +180,7 @@ def get_config(self): out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) - return RTDetrConfig.from_backbone_configs( + return RTDetrConfig( backbone_config=backbone_config, encoder_hidden_dim=self.encoder_hidden_dim, encoder_in_channels=hidden_sizes[1:], diff --git a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py index 5a35e9804ceb..5aeb3d6043bd 100644 --- a/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py +++ b/tests/models/rt_detr_v2/test_modeling_rt_detr_v2.py @@ -183,7 +183,7 @@ def get_config(self): out_features=["stage2", "stage3", "stage4"], out_indices=[2, 3, 4], ) - return RTDetrV2Config.from_backbone_configs( + return RTDetrV2Config( backbone_config=backbone_config, encoder_hidden_dim=self.encoder_hidden_dim, encoder_in_channels=hidden_sizes[1:], diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 3c27829e5166..e0e8f0275961 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -4103,12 +4103,21 @@ def check_attn_implementation_setter(config: PreTrainedConfig, attn_implementati if isinstance(attribute_value, PreTrainedConfig): check_attn_implementation_setter(attribute_value, attn_implementation) - config._attn_implementation = "eager" + # Check that attention implementation can be passed with init args + config_dict = config.to_diff_dict() + config_dict.pop("_attn_implementation_internal", None) + config_dict.pop("_attn_implementation", None) + 
config_dict["attn_implementation"] = "eager" + config = type(config)(**config_dict) check_attn_implementation_setter(config, "eager") + # Check that attention implementation can be set to different value config._attn_implementation = "sdpa" check_attn_implementation_setter(config, "sdpa") + config._attn_implementation = "eager" + check_attn_implementation_setter(config, "eager") + def test_internal_model_config_and_subconfig_are_same(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() subconfig_keys = list(config.sub_configs.keys()) diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index d32e1d2dbe7c..e56afce2184a 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -312,8 +312,23 @@ def _fix_post_init_location(self, new_body: list[cst.CSTNode]): break return new_body - def _fix_init_location(self, new_body): - """Fix the location of the `super().__init__()` in the new body, if we had new statements before it.""" + def _fix_init_location(self, new_body, original_body): + """ + Fix the location of the `super().__init__()` in the new body, if we had new statements before it. + If the original class' `super().__init__()` is not in the beginning, do not fix it and leave where it is. + In some cases we do not want to call super() at the very beginning. + """ + start_index = 0 + for i, node in enumerate(original_body): + if m.matches(node, DOCSTRING_NODE) and i == start_index: + start_index += 1 + continue + code = self.python_module.code_for_node(node) + comment_less_code = re.sub(r"#.*", "", code).strip() + comment_less_code = re.sub(r"\ *\n", "\n", comment_less_code).strip() + if "super().__init__" in comment_less_code and i > start_index: + return new_body + start_index = 0 for i, node in enumerate(new_body): if m.matches(node, DOCSTRING_NODE) and i == start_index: @@ -344,7 +359,7 @@ def leave_FunctionDef(self, original_node: cst.FunctionDef, updated_node: cst.Fu if self.is_call_to_super(base_statement_node, func_name): original_modeling_method_body = self.original_modeling_methods[func_name].body.body new_body.extend(self.update_body(original_modeling_method_body, actual_body[i + 1 :])) - new_body = self._fix_init_location(new_body) + new_body = self._fix_init_location(new_body, original_modeling_method_body) # Break here as all future statement were already accounted for in `update_body` break # If not a call to super, this will replace all calls of the form `module.Class.func(...)` by a @@ -1039,6 +1054,7 @@ def replace_class_node( # Recreate the whole new class body new_class_body = new_class_docstring + new_class_attributes + new_class_methods + # if renamed_super_class == "Aimv2Config": # Replace the calls to `super()` of the redefined modular methods with the unrolled code result_node = original_modeling_node.with_changes(body=cst.IndentedBlock(body=new_class_body)) temp_module = cst.Module(body=[result_node])
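As a closing illustration of what the updated check in `tests/test_modeling_common.py` (earlier in this patch) exercises: because the composite configs now assign their sub-configs before calling `super().__init__(**kwargs)`, an `attn_implementation` passed as an ordinary init kwarg is expected to reach the sub-configs as well. The class choice below is arbitrary and the propagation details are only sketched; the authoritative check remains `check_attn_implementation_setter` in the test.

```python
from transformers import Blip2Config

# Start from a composite config (vision / qformer / text sub-configs).
config = Blip2Config()

# Round-trip through the serialized form, requesting eager attention at init time,
# mirroring the updated common test.
config_dict = config.to_diff_dict()
config_dict.pop("_attn_implementation_internal", None)
config_dict.pop("_attn_implementation", None)
config_dict["attn_implementation"] = "eager"

config = Blip2Config(**config_dict)
print(config._attn_implementation)                # expected: "eager"
print(config.vision_config._attn_implementation)  # expected to match, per the test above
```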