diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 2f4e755131e5..c6a8661af415 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -320,8 +320,9 @@ def shard_checkpoint(

         weight_size = weight.numel() * dtype_byte_size(weight.dtype)

-        # If this weight is going to tip up over the maximal size, we split.
-        if last_block_size + weight_size > max_shard_size:
+        # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one
+        # weight in the current shard.
+        if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
             sharded_state_dicts.append({})
             last_block_size = 0

@@ -3044,15 +3045,30 @@ def _fix_key(key):
             expected_keys = [".".join([prefix, s]) for s in expected_keys]

         missing_keys = list(set(expected_keys) - set(loaded_keys))
-        unexpected_keys = list(set(loaded_keys) - set(expected_keys))
+        unexpected_keys = set(loaded_keys) - set(expected_keys)
+        # Remove nonpersistent buffers from unexpected keys: they are not in the state dict but will be in the model
+        # buffers
+        model_buffers = {n for n, _ in model.named_buffers()}
+        if remove_prefix_from_model:
+            model_buffers = {key[len(_prefix) :] if key.startswith(_prefix) else key for key in model_buffers}
+        elif add_prefix_to_model:
+            model_buffers = {".".join([prefix, key]) for key in model_buffers}
+        unexpected_keys = list(unexpected_keys - model_buffers)

-        if is_accelerate_available():
-            model.tie_weights()
-            tied_params = find_tied_parameters(model)
-        else:
-            tied_params = []
+        model.tie_weights()
+        ptrs = collections.defaultdict(list)
+        for name, tensor in model.state_dict().items():
+            id_tensor = id_tensor_storage(tensor) if tensor.device != torch.device("meta") else id(tensor)
+            ptrs[id_tensor].append(name)
+
+        # These are all the pointers of shared tensors.
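(Editor's aside, not part of the diff: a self-contained sketch of the storage-pointer grouping used just above, with a hypothetical `TinyLM` module whose `lm_head.weight` is tied to its embedding. Grouping `state_dict()` entries by data pointer is a simplified stand-in for `id_tensor_storage` and recovers the tied group without any per-model ignore list.)

    import collections

    import torch
    from torch import nn

    class TinyLM(nn.Module):
        def __init__(self, vocab_size=10, hidden_size=4):
            super().__init__()
            self.embed = nn.Embedding(vocab_size, hidden_size)
            self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
            self.lm_head.weight = self.embed.weight  # tie the two parameters

    model = TinyLM()
    ptrs = collections.defaultdict(list)
    for name, tensor in model.state_dict().items():
        # state_dict() returns detached tensors, so compare data pointers rather than Python ids
        ptrs[tensor.data_ptr()].append(name)

    tied_params = [names for names in ptrs.values() if len(names) > 1]
    print(tied_params)  # [['embed.weight', 'lm_head.weight']]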
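(Likewise for the first hunk above, `shard_checkpoint`: a toy sketch of why the added `len(sharded_state_dicts[-1]) > 0` guard matters. The sizes are invented stand-ins for per-weight byte counts; a weight larger than `max_shard_size` on its own no longer leaves an empty leading shard behind.)

    # Toy illustration of the sharding guard; sizes are made up.
    max_shard_size = 100
    weights = {"a": 150, "b": 60, "c": 30}  # "a" alone already exceeds max_shard_size

    sharded_state_dicts = [{}]
    last_block_size = 0
    for key, weight_size in weights.items():
        # Split only if the current shard already holds at least one weight.
        if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
            sharded_state_dicts.append({})
            last_block_size = 0
        sharded_state_dicts[-1][key] = weight_size
        last_block_size += weight_size

    print([list(shard) for shard in sharded_state_dicts])
    # New rule: [['a'], ['b', 'c']]; the old rule would have produced an empty first shard.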
+ tied_params = [names for _, names in ptrs.items() if len(names) > 1] for group in tied_params: + if remove_prefix_from_model: + group = [key[len(_prefix) :] if key.startswith(_prefix) else key for key in group] + elif add_prefix_to_model: + group = [".".join([prefix, key]) for key in group] missing_in_group = [k for k in missing_keys if k in group] if len(missing_in_group) > 0 and len(missing_in_group) < len(group): missing_keys = [k for k in missing_keys if k not in missing_in_group] diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 2a2f6d6ef539..7196e14be291 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -208,7 +208,9 @@ def __init__(self, config: AlbertConfig): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False @@ -507,7 +509,6 @@ class AlbertPreTrainedModel(PreTrainedModel): config_class = AlbertConfig load_tf_weights = load_tf_weights_in_albert base_model_prefix = "albert" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights.""" @@ -760,11 +761,6 @@ def forward( ) class AlbertForPreTraining(AlbertPreTrainedModel): _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"] - _keys_to_ignore_on_load_missing = [ - "predictions.decoder.weight", - "predictions.decoder.bias", - "embeddings.position_ids", - ] def __init__(self, config: AlbertConfig): super().__init__(config) @@ -912,13 +908,7 @@ def forward(self, pooled_output: torch.Tensor) -> torch.Tensor: ALBERT_START_DOCSTRING, ) class AlbertForMaskedLM(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"] - _keys_to_ignore_on_load_missing = [ - "predictions.decoder.weight", - "predictions.decoder.bias", - "embeddings.position_ids", - ] def __init__(self, config): super().__init__(config) @@ -1133,8 +1123,6 @@ def forward( ALBERT_START_DOCSTRING, ) class AlbertForTokenClassification(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config: AlbertConfig): super().__init__(config) self.num_labels = config.num_labels @@ -1218,8 +1206,6 @@ def forward( ALBERT_START_DOCSTRING, ) class AlbertForQuestionAnswering(AlbertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config: AlbertConfig): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 09ee6eca6265..a7d31775cf40 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -687,7 +687,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = 
getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1176,7 +1178,6 @@ class AlignPreTrainedModel(PreTrainedModel): config_class = AlignConfig base_model_prefix = "align" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index 26b3f5928081..fe2754cac808 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -216,7 +216,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1016,7 +1018,7 @@ def __init__(self, config: AltCLIPVisionConfig): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -1038,7 +1040,6 @@ class AltCLIPPreTrainedModel(PreTrainedModel): config_class = AltCLIPConfig base_model_prefix = "altclip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index f426956594d6..ad4d4ab9c988 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -506,7 +506,7 @@ class BartPretrainedModel(PreTrainedModel): config_class = BartConfig base_model_prefix = "model" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"] + _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"] _no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"] _skip_keys_device_placement = "past_key_values" @@ -1170,7 +1170,6 @@ def custom_forward(*inputs): BART_START_DOCSTRING, ) class BartModel(BartPretrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: BartConfig): @@ -1300,12 +1299,7 @@ def forward( class BartForConditionalGeneration(BartPretrainedModel): base_model_prefix = "model" _tied_weights_keys = 
["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - _keys_to_ignore_on_load_missing = [ - "final_logits_bias", - "lm_head.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] def __init__(self, config: BartConfig): super().__init__(config) @@ -1478,7 +1472,6 @@ def _reorder_cache(past_key_values, beam_idx): BART_START_DOCSTRING, ) class BartForSequenceClassification(BartPretrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: BartConfig, **kwargs): @@ -1609,7 +1602,6 @@ def forward( BART_START_DOCSTRING, ) class BartForQuestionAnswering(BartPretrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config): @@ -1748,7 +1740,6 @@ def forward(self, *args, **kwargs): BART_START_DOCSTRING, ) class BartForCausalLM(BartPretrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index b17721fb2bcd..d698cff88b14 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -459,7 +459,7 @@ def __init__(self, config: BeitConfig, window_size: tuple) -> None: relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 - self.register_buffer("relative_position_index", relative_position_index) + self.register_buffer("relative_position_index", relative_position_index, persistent=False) def forward(self) -> torch.Tensor: relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index fb92a0e84cc4..17667e8443dd 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -192,7 +192,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -743,7 +745,6 @@ class BertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_bert base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1053,7 +1054,6 @@ def forward( BERT_START_DOCSTRING, ) class BertForPreTraining(BertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] def 
__init__(self, config): @@ -1160,8 +1160,6 @@ def forward( """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING ) class BertLMHeadModel(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -1301,8 +1299,6 @@ def _reorder_cache(self, past_key_values, beam_idx): @add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", r"cls.predictions.decoder.weight"] _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -1715,8 +1711,6 @@ def forward( BERT_START_DOCSTRING, ) class BertForTokenClassification(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1800,8 +1794,6 @@ def forward( BERT_START_DOCSTRING, ) class BertForQuestionAnswering(BertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index f92b7a0633e8..3f4a26da4594 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -556,7 +556,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0): if input_ids is not None: @@ -588,7 +590,6 @@ class BertGenerationPreTrainedModel(PreTrainedModel): config_class = BertGenerationConfig base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -860,7 +861,6 @@ def _tie_weights(self): BERT_GENERATION_START_DOCSTRING, ) class BertGenerationDecoder(BertGenerationPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.decoder.weight", "lm_head.decoder.bias", "embeddings.position_ids"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index e1346a23c9db..a2db2e2638f5 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -257,7 +257,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, 
"position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1765,7 +1767,6 @@ class BigBirdPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_big_bird base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -2261,7 +2262,6 @@ def _pad_to_block_size( class BigBirdForPreTraining(BigBirdPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -2368,7 +2368,6 @@ def forward( @add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING) class BigBirdForMaskedLM(BigBirdPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -2513,12 +2512,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ """BigBird Model with a `language modeling` head on top for CLM fine-tuning.""", BIG_BIRD_START_DOCSTRING ) class BigBirdForCausalLM(BigBirdPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"position_ids", - r"predictions.decoder.bias", - "cls.predictions.decoder.weight", - "cls.predictions.decoder.bias", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index d7683d6fcf8f..fe43c1e68e25 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -2358,7 +2358,6 @@ def custom_forward(*inputs): BIGBIRD_PEGASUS_START_DOCSTRING, ) class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: BigBirdPegasusConfig): @@ -2491,12 +2490,7 @@ def forward( class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel): base_model_prefix = "model" _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] - _keys_to_ignore_on_load_missing = [ - "final_logits_bias", - "lm_head.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] def __init__(self, config: BigBirdPegasusConfig): super().__init__(config) @@ -2669,7 +2663,6 @@ def _reorder_cache(past_key_values, beam_idx): BIGBIRD_PEGASUS_START_DOCSTRING, ) class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", 
"decoder.embed_tokens.weight"] def __init__(self, config: BigBirdPegasusConfig, **kwargs): @@ -2799,7 +2792,6 @@ def forward( BIGBIRD_PEGASUS_START_DOCSTRING, ) class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config): @@ -2932,7 +2924,6 @@ def forward(self, *args, **kwargs): class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/biogpt/modeling_biogpt.py b/src/transformers/models/biogpt/modeling_biogpt.py index 3e925917cffb..7f6d44502c01 100755 --- a/src/transformers/models/biogpt/modeling_biogpt.py +++ b/src/transformers/models/biogpt/modeling_biogpt.py @@ -646,7 +646,6 @@ def custom_forward(*inputs): """BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING ) class BioGptForCausalLM(BioGptPreTrainedModel): - _keys_to_ignore_on_load_missing = ["output_projection.weight"] _tied_weights_keys = ["output_projection.weight"] def __init__(self, config): diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py index 8e582c4fa33a..a3aaf6b4a812 100755 --- a/src/transformers/models/blenderbot/modeling_blenderbot.py +++ b/src/transformers/models/blenderbot/modeling_blenderbot.py @@ -1102,7 +1102,6 @@ def custom_forward(*inputs): BLENDERBOT_START_DOCSTRING, ) class BlenderbotModel(BlenderbotPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config: BlenderbotConfig): @@ -1244,14 +1243,7 @@ def forward( ) class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: BlenderbotConfig): @@ -1441,7 +1433,6 @@ def forward(self, *args, **kwargs): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Blenderbot, facebook/bart-base->facebook/blenderbot-400M-distill class BlenderbotForCausalLM(BlenderbotPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py index 890b47373e7e..70794e80a43b 100755 --- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py +++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -1096,7 +1096,6 @@ def custom_forward(*inputs): BLENDERBOT_SMALL_START_DOCSTRING, ) class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", 
"encoder.embed_tokens.weight"] def __init__(self, config: BlenderbotSmallConfig): @@ -1226,14 +1225,7 @@ def forward( ) class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: BlenderbotSmallConfig): @@ -1408,7 +1400,6 @@ def forward(self, *args, **kwargs): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->BlenderbotSmall, facebook/bart-base->facebook/blenderbot_small-90M class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py index 0e70333c3404..115aa14e83fa 100644 --- a/src/transformers/models/blip/modeling_blip.py +++ b/src/transformers/models/blip/modeling_blip.py @@ -255,7 +255,9 @@ def __init__(self, config: BlipTextConfig): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -419,7 +421,6 @@ class BlipPreTrainedModel(PreTrainedModel): config_class = BlipConfig base_model_prefix = "blip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -927,7 +928,6 @@ def forward( ) class BlipForConditionalGeneration(BlipPreTrainedModel): config_class = BlipConfig - _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] main_input_name = "pixel_values" @@ -1100,7 +1100,6 @@ def generate( ) class BlipForQuestionAnswering(BlipPreTrainedModel): config_class = BlipConfig - _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] def __init__(self, config: BlipConfig): diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 1f269cf852ee..444a7a22b6b0 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -56,7 +56,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.config = config @@ -552,7 +554,6 @@ class BlipTextPreTrainedModel(PreTrainedModel): config_class = BlipTextConfig base_model_prefix = 
"bert" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -808,9 +809,6 @@ def forward( # Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 class BlipTextLMHeadModel(BlipTextPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index b52a58d97f4a..5856df2c2572 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -273,12 +273,6 @@ class Blip2PreTrainedModel(PreTrainedModel): config_class = Blip2Config base_model_prefix = "blip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [ - r"position_ids", - r"language_model.encoder.embed_tokens.weight", - r"language_model.decoder.embed_tokens.weight", - r"language_model.lm_head.weight", - ] _no_split_modules = ["Blip2Attention", "T5Block", "OPTDecoderLayer"] _skip_keys_device_placement = "past_key_values" _keep_in_fp32_modules = ["wo"] diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py index 4f6de49a1447..d37972a429f1 100644 --- a/src/transformers/models/bloom/modeling_bloom.py +++ b/src/transformers/models/bloom/modeling_bloom.py @@ -471,12 +471,6 @@ def forward( class BloomPreTrainedModel(PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. 
- """ - config_class = BloomConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True @@ -826,7 +820,6 @@ def custom_forward(*inputs): BLOOM_START_DOCSTRING, ) class BloomForCausalLM(BloomPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: BloomConfig): @@ -995,8 +988,6 @@ def _reorder_cache( BLOOM_START_DOCSTRING, ) class BloomForSequenceClassification(BloomPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - def __init__(self, config: BloomConfig): super().__init__(config) self.num_labels = config.num_labels @@ -1123,8 +1114,6 @@ def forward( BLOOM_START_DOCSTRING, ) class BloomForTokenClassification(BloomPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - def __init__(self, config: BloomConfig): super().__init__(config) self.num_labels = config.num_labels @@ -1226,8 +1215,6 @@ def forward( BLOOM_START_DOCSTRING, ) class BloomForQuestionAnswering(BloomPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h.*.self_attention.scale_mask_softmax.causal_mask", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.transformer = BloomModel(config) diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index 4290241fbc09..1fb3cc131bc8 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -280,7 +280,7 @@ def __init__(self, config: BridgeTowerVisionConfig): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -880,7 +880,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1038,8 +1040,6 @@ class BridgeTowerTextModel(BridgeTowerPreTrainedModel): config_class = BridgeTowerTextConfig - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index e98840fbc6d2..ed3afab11aa4 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -94,7 +94,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # 
position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -627,15 +629,6 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, CamembertEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - CAMEMBERT_INPUTS_DOCSTRING = r""" Args: @@ -762,7 +755,6 @@ class CamembertModel(CamembertPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] _no_split_modules = [] # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Camembert @@ -935,9 +927,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForMaskedLM(CamembertPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -952,9 +941,6 @@ def __init__(self, config): self.roberta = CamembertModel(config, add_pooling_layer=False) self.lm_head = CamembertLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1042,8 +1028,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForSequenceClassification(CamembertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1144,8 +1128,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForMultipleChoice(CamembertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1241,9 +1223,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForTokenClassification(CamembertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1330,9 +1309,6 @@ def forward( ) # Copied from 
transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT class CamembertForQuestionAnswering(CamembertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1431,9 +1407,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, roberta-base->camembert-base class CamembertForCausalLM(CamembertPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1445,9 +1418,6 @@ def __init__(self, config): self.roberta = CamembertModel(config, add_pooling_layer=False) self.lm_head = CamembertLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index a91d42f0395e..b863e294bdd2 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -216,7 +216,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int): @@ -900,7 +902,6 @@ class CaninePreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_canine base_model_prefix = "canine" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index 0adf5cfdcb18..86da1c7b6a8b 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -121,7 +121,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -190,7 +192,7 @@ def __init__(self, config: ChineseCLIPVisionConfig): self.num_patches = (self.image_size // self.patch_size) ** 2 
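(Editor's aside, a minimal sketch in plain PyTorch with a hypothetical `ToyEmbeddings` module, of the behaviour the repeated `persistent=False` changes rely on: the buffer still exists in `named_buffers()` at runtime but is no longer serialized, so it can never be reported as a missing key, and the new loading code filters it out of the unexpected keys.)

    import torch
    from torch import nn

    class ToyEmbeddings(nn.Module):
        def __init__(self, max_position_embeddings=512):
            super().__init__()
            # Non-persistent: available at runtime, absent from the saved state dict.
            self.register_buffer(
                "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
            )

    module = ToyEmbeddings()
    print("position_ids" in module.state_dict())           # False -> never a missing key on load
    print("position_ids" in dict(module.named_buffers()))  # True  -> filtered from unexpected keys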
self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -689,7 +691,6 @@ class ChineseCLIPPreTrainedModel(PreTrainedModel): config_class = ChineseCLIPConfig base_model_prefix = "chinese_clip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index c4dbcb03f34d..0f3986ada0ce 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -1166,7 +1166,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=True + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=True ) @@ -1677,7 +1679,6 @@ class ClapPreTrainedModel(PreTrainedModel): config_class = ClapConfig base_model_prefix = "clap" supports_gradient_checkpointing = False - _keys_to_ignore_on_load_missing = [r"position_ids", r"logit_scale_a", r"logit_scale_t"] def _init_weights(self, module): """Initialize the weights""" @@ -1781,7 +1782,6 @@ class ClapTextModel(ClapPreTrainedModel): """ config_class = ClapTextConfig - _keys_to_ignore_on_load_missing = [r"position_ids"] # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->ClapText def __init__(self, config, add_pooling_layer=True): @@ -1936,7 +1936,6 @@ def forward( @add_start_docstrings(CLAP_START_DOCSTRING) class ClapModel(ClapPreTrainedModel): config_class = ClapConfig - _keys_to_ignore_on_load_missing = [r"position_ids"] def __init__(self, config: ClapConfig): super().__init__(config) diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py index ee9d660ef713..487f756d3ff0 100644 --- a/src/transformers/models/clip/modeling_clip.py +++ b/src/transformers/models/clip/modeling_clip.py @@ -188,7 +188,7 @@ def __init__(self, config: CLIPVisionConfig): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -210,7 +210,9 @@ def __init__(self, config: CLIPTextConfig): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", 
torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -410,7 +412,6 @@ class CLIPPreTrainedModel(PreTrainedModel): config_class = CLIPConfig base_model_prefix = "clip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/clipseg/modeling_clipseg.py b/src/transformers/models/clipseg/modeling_clipseg.py index 85b119653068..b1d120e365a8 100644 --- a/src/transformers/models/clipseg/modeling_clipseg.py +++ b/src/transformers/models/clipseg/modeling_clipseg.py @@ -181,7 +181,7 @@ def __init__(self, config: CLIPSegVisionConfig): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def interpolate_position_embeddings(self, new_size): if len(new_size) != 2: @@ -230,7 +230,9 @@ def __init__(self, config: CLIPSegTextConfig): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -433,7 +435,6 @@ class CLIPSegPreTrainedModel(PreTrainedModel): config_class = CLIPSegConfig base_model_prefix = "clip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/codegen/modeling_codegen.py b/src/transformers/models/codegen/modeling_codegen.py index 8b1d34f59e7b..4b87800cb1be 100644 --- a/src/transformers/models/codegen/modeling_codegen.py +++ b/src/transformers/models/codegen/modeling_codegen.py @@ -83,6 +83,7 @@ def __init__(self, config): torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), + persistent=False, ) self.attn_dropout = nn.Dropout(config.attn_pdrop) @@ -600,7 +601,6 @@ def custom_forward(*inputs): CODEGEN_START_DOCSTRING, ) class CodeGenForCausalLM(CodeGenPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.causal_mask"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/convbert/modeling_convbert.py b/src/transformers/models/convbert/modeling_convbert.py index bbdba210c233..a3910e20dbef 100755 --- a/src/transformers/models/convbert/modeling_convbert.py +++ b/src/transformers/models/convbert/modeling_convbert.py @@ -191,7 +191,9 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", 
torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -245,8 +247,6 @@ class ConvBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_convbert base_model_prefix = "convbert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"convbert.embeddings_project.weight", r"convbert.embeddings_project.bias"] def _init_weights(self, module): """Initialize the weights""" @@ -765,8 +765,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: CONVBERT_START_DOCSTRING, ) class ConvBertModel(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) self.embeddings = ConvBertEmbeddings(config) @@ -880,7 +878,6 @@ def forward(self, generator_hidden_states: torch.FloatTensor) -> torch.FloatTens @add_start_docstrings("""ConvBERT Model with a `language modeling` head on top.""", CONVBERT_START_DOCSTRING) class ConvBertForMaskedLM(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids", "generator.lm_head.weight"] _tied_weights_keys = ["generator.lm_head.weight"] def __init__(self, config): @@ -992,8 +989,6 @@ def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor: CONVBERT_START_DOCSTRING, ) class ConvBertForSequenceClassification(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1089,8 +1084,6 @@ def forward( CONVBERT_START_DOCSTRING, ) class ConvBertForMultipleChoice(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) @@ -1184,8 +1177,6 @@ def forward( CONVBERT_START_DOCSTRING, ) class ConvBertForTokenClassification(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1267,8 +1258,6 @@ def forward( CONVBERT_START_DOCSTRING, ) class ConvBertForQuestionAnswering(ConvBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["embeddings.position_ids"] - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/cpmant/modeling_cpmant.py b/src/transformers/models/cpmant/modeling_cpmant.py index 33ead6a10464..808a341ac998 100755 --- a/src/transformers/models/cpmant/modeling_cpmant.py +++ b/src/transformers/models/cpmant/modeling_cpmant.py @@ -537,7 +537,6 @@ class CpmAntPreTrainedModel(PreTrainedModel): config_class = CpmAntConfig base_model_prefix = "cpmant" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -749,7 +748,6 @@ def forward( CPMANT_START_DOCSTRING, ) class CpmAntForCausalLM(CpmAntPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: CpmAntConfig): diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index dadcbb494cf9..7cf5168e74b9 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -509,7 +509,6 @@ def forward( 
CTRL_START_DOCSTRING, ) class CTRLLMHeadModel(CTRLPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/data2vec/modeling_data2vec_audio.py b/src/transformers/models/data2vec/modeling_data2vec_audio.py index 76b6b4d485f0..a42fb5eb0678 100755 --- a/src/transformers/models/data2vec/modeling_data2vec_audio.py +++ b/src/transformers/models/data2vec/modeling_data2vec_audio.py @@ -689,7 +689,6 @@ class Data2VecAudioPreTrainedModel(PreTrainedModel): config_class = Data2VecAudioConfig base_model_prefix = "data2vec_audio" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 206fe1603b00..4c07acd11072 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -80,7 +80,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -615,15 +617,6 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, Data2VecTextEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! 
- self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - DATA2VECTEXT_START_DOCSTRING = r""" Data2VecText was proposed in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and @@ -714,8 +707,6 @@ class Data2VecTextModel(Data2VecTextPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -883,9 +874,6 @@ def forward( """Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.""", DATA2VECTEXT_START_DOCSTRING ) class Data2VecTextForCausalLM(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -897,9 +885,6 @@ def __init__(self, config): self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecTextLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1038,9 +1023,6 @@ def _reorder_cache(self, past_key_values, beam_idx): @add_start_docstrings("""data2vec Model with a `language modeling` head on top.""", DATA2VECTEXT_START_DOCSTRING) class Data2VecTextForMaskedLM(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1055,9 +1037,6 @@ def __init__(self, config): self.data2vec_text = Data2VecTextModel(config, add_pooling_layer=False) self.lm_head = Data2VecTextLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1174,8 +1153,6 @@ def _tie_weights(self): DATA2VECTEXT_START_DOCSTRING, ) class Data2VecTextForSequenceClassification(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1273,8 +1250,6 @@ def forward( DATA2VECTEXT_START_DOCSTRING, ) class Data2VecTextForMultipleChoice(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1369,9 +1344,6 @@ def forward( DATA2VECTEXT_START_DOCSTRING, ) class Data2VecTextForTokenClassification(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1478,9 +1450,6 @@ def forward(self, features, **kwargs): DATA2VECTEXT_START_DOCSTRING, ) class 
Data2VecTextForQuestionAnswering(Data2VecTextPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py index 77b424354892..f8fe59587af0 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_vision.py +++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py @@ -470,7 +470,7 @@ def __init__(self, config: Data2VecVisionConfig, window_size: tuple) -> None: relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 - self.register_buffer("relative_position_index", relative_position_index) + self.register_buffer("relative_position_index", relative_position_index, persistent=False) def forward(self) -> torch.Tensor: relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 9a0d43db3a0a..c946592730e4 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -764,7 +764,9 @@ def __init__(self, config): self.config = config # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): if input_ids is not None: @@ -821,7 +823,6 @@ class DebertaPreTrainedModel(PreTrainedModel): config_class = DebertaConfig base_model_prefix = "deberta" - _keys_to_ignore_on_load_missing = ["position_ids"] _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True @@ -1020,8 +1021,6 @@ def forward( @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) class DebertaForMaskedLM(DebertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1277,8 +1276,6 @@ def forward( DEBERTA_START_DOCSTRING, ) class DebertaForTokenClassification(DebertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1352,8 +1349,6 @@ def forward( DEBERTA_START_DOCSTRING, ) class DebertaForQuestionAnswering(DebertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 1596ad4ffad4..608bca009580 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -862,7 +862,9 @@ def __init__(self, config): self.config = config # 
position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, mask=None, inputs_embeds=None): if input_ids is not None: @@ -920,7 +922,6 @@ class DebertaV2PreTrainedModel(PreTrainedModel): config_class = DebertaV2Config base_model_prefix = "deberta" - _keys_to_ignore_on_load_missing = ["position_ids"] _keys_to_ignore_on_load_unexpected = ["position_embeddings"] supports_gradient_checkpointing = True @@ -1120,8 +1121,6 @@ def forward( @add_start_docstrings("""DeBERTa Model with a `language modeling` head on top.""", DEBERTA_START_DOCSTRING) class DebertaV2ForMaskedLM(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1380,8 +1379,6 @@ def forward( ) # Copied from transformers.models.deberta.modeling_deberta.DebertaForTokenClassification with Deberta->DebertaV2 class DebertaV2ForTokenClassification(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1455,8 +1452,6 @@ def forward( DEBERTA_START_DOCSTRING, ) class DebertaV2ForQuestionAnswering(DebertaV2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/decision_transformer/modeling_decision_transformer.py b/src/transformers/models/decision_transformer/modeling_decision_transformer.py index 926947b1617d..064b3cb0ad72 100755 --- a/src/transformers/models/decision_transformer/modeling_decision_transformer.py +++ b/src/transformers/models/decision_transformer/modeling_decision_transformer.py @@ -476,8 +476,6 @@ def _set_gradient_checkpointing(self, module, value=False): class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - def __init__(self, config): super().__init__(config) @@ -747,8 +745,6 @@ class DecisionTransformerPreTrainedModel(PreTrainedModel): base_model_prefix = "decision_transformer" main_input_name = "states" supports_gradient_checkpointing = False - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 6469cf7a65df..cdeb3c796225 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -1823,7 +1823,6 @@ def forward( ) class DeformableDetrForObjectDetection(DeformableDetrPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", 
r"class_embed\.[1-9]\d*"] def __init__(self, config: DeformableDetrConfig): diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index bee84a5bf72f..b5fe0ea8a8e5 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -1775,7 +1775,6 @@ def forward( ) class DetaForObjectDetection(DetaPreTrainedModel): # When using clones, all layers > 0 will be clones, but layer 0 *is* required - _keys_to_ignore_on_load_missing = [r"bbox_embed\.[1-9]\d*", r"class_embed\.[1-9]\d*"] _tied_weights_keys = [r"bbox_embed\.\d+"] # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrForObjectDetection.__init__ with DeformableDetr->Deta diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index 8b71c086bbcc..97300dec2d64 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -595,7 +595,6 @@ def forward( DISTILBERT_START_DOCSTRING, ) class DistilBertForMaskedLM(DistilBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["vocab_projector.weight"] _tied_weights_keys = ["vocab_projector.weight"] def __init__(self, config: PretrainedConfig): diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py index a551e507300b..588440d4a6c5 100644 --- a/src/transformers/models/dpr/modeling_dpr.py +++ b/src/transformers/models/dpr/modeling_dpr.py @@ -296,8 +296,6 @@ class DPRPretrainedContextEncoder(DPRPreTrainedModel): config_class = DPRConfig load_tf_weights = None base_model_prefix = "ctx_encoder" - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] class DPRPretrainedQuestionEncoder(DPRPreTrainedModel): @@ -309,8 +307,6 @@ class DPRPretrainedQuestionEncoder(DPRPreTrainedModel): config_class = DPRConfig load_tf_weights = None base_model_prefix = "question_encoder" - _keys_to_ignore_on_load_missing = [r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] class DPRPretrainedReader(DPRPreTrainedModel): @@ -322,7 +318,6 @@ class DPRPretrainedReader(DPRPreTrainedModel): config_class = DPRConfig load_tf_weights = None base_model_prefix = "span_predictor" - _keys_to_ignore_on_load_missing = [r"position_ids"] ############### diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index a7ee4ec93202..23ca78e8e064 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -161,7 +161,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False @@ -672,8 +674,6 @@ class ElectraPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_electra base_model_prefix = "electra" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = 
[r"position_ids"] - _keys_to_ignore_on_load_unexpected = [r"electra.embeddings_project.weight", r"electra.embeddings_project.bias"] # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): @@ -1166,7 +1166,6 @@ def forward( ELECTRA_START_DOCSTRING, ) class ElectraForMaskedLM(ElectraPreTrainedModel): - _keys_to_ignore_on_load_missing = ["generator_lm_head.weight"] _tied_weights_keys = ["generator_lm_head.weight"] def __init__(self, config): @@ -1534,7 +1533,6 @@ def forward( """ELECTRA Model with a `language modeling` head on top for CLM fine-tuning.""", ELECTRA_START_DOCSTRING ) class ElectraForCausalLM(ElectraPreTrainedModel): - _keys_to_ignore_on_load_missing = ["generator_lm_head.weight"] _tied_weights_keys = ["generator_lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index b8df1b2d5035..79b3c00280b7 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -89,7 +89,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -661,7 +663,6 @@ class ErniePreTrainedModel(PreTrainedModel): config_class = ErnieConfig base_model_prefix = "ernie" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -983,7 +984,6 @@ def forward( ERNIE_START_DOCSTRING, ) class ErnieForPreTraining(ErniePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] # Copied from transformers.models.bert.modeling_bert.BertForPreTraining.__init__ with Bert->Ernie,bert->ernie @@ -1095,8 +1095,6 @@ def forward( """Ernie Model with a `language modeling` head on top for CLM fine-tuning.""", ERNIE_START_DOCSTRING ) class ErnieForCausalLM(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->ErnieForCausalLM,Bert->Ernie,bert->ernie @@ -1243,8 +1241,6 @@ def _reorder_cache(self, past_key_values, beam_idx): @add_start_docstrings("""Ernie Model with a `language modeling` head on top.""", ERNIE_START_DOCSTRING) class ErnieForMaskedLM(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with 
Bert->Ernie,bert->ernie @@ -1665,8 +1661,6 @@ def forward( ERNIE_START_DOCSTRING, ) class ErnieForTokenClassification(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) @@ -1746,8 +1740,6 @@ def forward( ERNIE_START_DOCSTRING, ) class ErnieForQuestionAnswering(ErniePreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->Ernie,bert->ernie def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/ernie_m/modeling_ernie_m.py b/src/transformers/models/ernie_m/modeling_ernie_m.py index 6d995cf84cb0..82e402394913 100755 --- a/src/transformers/models/ernie_m/modeling_ernie_m.py +++ b/src/transformers/models/ernie_m/modeling_ernie_m.py @@ -412,7 +412,6 @@ class ErnieMPreTrainedModel(PreTrainedModel): config_class = ErnieMConfig base_model_prefix = "ernie_m" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index e0b26e0f7812..43ff7d7b52b5 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -96,7 +96,7 @@ def __init__(self, dim: int): # Generate and save the inverse frequency buffer (non trainable) inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) inv_freq = inv_freq - self.register_buffer("inv_freq", inv_freq) + self.register_buffer("inv_freq", inv_freq, persistent=False) self._seq_len_cached = None self._cos_cached = None @@ -178,7 +178,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( @@ -783,7 +785,6 @@ class EsmModel(EsmPreTrainedModel): `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. 
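For context on the `persistent=False` changes: a non-persistent buffer stays on the module and follows `.to()` / `.cuda()`, but it is excluded from `state_dict()`, which is why deterministic tensors such as `position_ids` or the rotary `inv_freq` above no longer need any load-time special-casing. A minimal sketch with toy sizes (not the library implementation):

import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    def __init__(self, vocab_size=100, hidden_size=16, max_position_embeddings=512):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        # Deterministic index tensor, rebuilt identically in every __init__:
        # registering it as non-persistent keeps it out of the checkpoint.
        self.register_buffer(
            "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(self, input_ids):
        seq_length = input_ids.shape[1]
        position_ids = self.position_ids[:, :seq_length]
        return self.word_embeddings(input_ids) + self.position_embeddings(position_ids)


module = ToyEmbeddings()
print("position_ids" in dict(module.named_buffers()))  # True: still a buffer at runtime
print("position_ids" in module.state_dict())           # False: not serialized, so never missing/unexpected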
""" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = False def __init__(self, config, add_pooling_layer=True): @@ -960,8 +961,6 @@ def predict_contacts(self, tokens, attention_mask): @add_start_docstrings("""ESM Model with a `language modeling` head on top.""", ESM_START_DOCSTRING) class EsmForMaskedLM(EsmPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", "lm_head.decoder.weight"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight"] def __init__(self, config): @@ -1081,8 +1080,6 @@ def forward(self, features, **kwargs): ESM_START_DOCSTRING, ) class EsmForSequenceClassification(EsmPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1177,9 +1174,6 @@ def forward( ESM_START_DOCSTRING, ) class EsmForTokenClassification(EsmPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/flaubert/modeling_flaubert.py b/src/transformers/models/flaubert/modeling_flaubert.py index 1b04da241036..318e9bfd471c 100644 --- a/src/transformers/models/flaubert/modeling_flaubert.py +++ b/src/transformers/models/flaubert/modeling_flaubert.py @@ -378,8 +378,6 @@ def _init_weights(self, module): class FlaubertModel(FlaubertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): # , dico, is_encoder, with_output): super().__init__(config) @@ -448,7 +446,6 @@ def __init__(self, config): # , dico, is_encoder, with_output): # Initialize weights and apply final processing self.post_init() - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) self.layerdrop = getattr(config, "layerdrop", 0.0) self.pre_norm = getattr(config, "pre_norm", False) @@ -654,7 +651,6 @@ def forward( ) # Copied transformers.models.xlm.modeling_xlm.XLMWithLMHeadModel with XLM_INPUTS->FLAUBERT_INPUTS,XLM->Flaubert class FlaubertWithLMHeadModel(FlaubertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["pred_layer.proj.weight"] _tied_weights_keys = ["pred_layer.proj.weight"] def __init__(self, config): diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index 5d49197f8ca5..d986a17b7503 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -387,7 +387,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -1724,12 +1726,6 @@ def forward(self, image_embeddings, text_embeddings, logit_scale): ) class FlavaForPreTraining(FlavaPreTrainedModel): # Those are linked to xxx.bias - _keys_to_ignore_on_load_missing = [ - "mmm_text_head.decoder.bias", - "mmm_image_head.decoder.bias", - 
"mlm_head.decoder.bias", - "mim_head.decoder.bias", - ] _tied_weights_keys = [ "mmm_text_head.decoder.bias", "mmm_image_head.decoder.bias", diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index 6bc526eeebcb..45042147761d 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -114,7 +114,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False @@ -411,7 +413,6 @@ class FNetPreTrainedModel(PreTrainedModel): config_class = FNetConfig base_model_prefix = "fnet" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -621,7 +622,6 @@ def forward( FNET_START_DOCSTRING, ) class FNetForPreTraining(FNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -716,7 +716,6 @@ def forward( @add_start_docstrings("""FNet Model with a `language modeling` head on top.""", FNET_START_DOCSTRING) class FNetForMaskedLM(FNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/fsmt/modeling_fsmt.py b/src/transformers/models/fsmt/modeling_fsmt.py index 255cf91df76c..608efabf7885 100644 --- a/src/transformers/models/fsmt/modeling_fsmt.py +++ b/src/transformers/models/fsmt/modeling_fsmt.py @@ -1034,7 +1034,6 @@ def _get_shape(t): FSMT_START_DOCSTRING, ) class FSMTModel(PretrainedFSMTModel): - _keys_to_ignore_on_load_missing = ["decoder.output_projection.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight"] def __init__(self, config: FSMTConfig): @@ -1172,15 +1171,6 @@ def set_output_embeddings(self, value): ) class FSMTForConditionalGeneration(PretrainedFSMTModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - "decoder.output_projection.weight", - ] - _keys_to_ignore_on_save = [ - "model.encoder.embed_positions.weight", - "model.decoder.embed_positions.weight", - ] _tied_weights_keys = ["model.decoder.embed_tokens.weight"] def __init__(self, config: FSMTConfig): diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py index 805b651f2126..0ee9ed587ed9 100644 --- a/src/transformers/models/funnel/modeling_funnel.py +++ b/src/transformers/models/funnel/modeling_funnel.py @@ -1190,7 +1190,6 @@ def forward( @add_start_docstrings("""Funnel Transformer Model with a `language modeling` head on top.""", FUNNEL_START_DOCSTRING) class FunnelForMaskedLM(FunnelPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, 
config: FunnelConfig) -> None: diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 23ae6d64962f..89696694ff4e 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -109,7 +109,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -510,7 +512,6 @@ class GitPreTrainedModel(PreTrainedModel): config_class = GitConfig base_model_prefix = "git" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -623,7 +624,7 @@ def __init__(self, config: GitVisionConfig): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index b9a8568f00e7..58b419897a7c 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -668,9 +668,6 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): GPT2_START_DOCSTRING, ) class GPT2Model(GPT2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] - def __init__(self, config): super().__init__(config) @@ -957,8 +954,6 @@ def custom_forward(*inputs): GPT2_START_DOCSTRING, ) class GPT2LMHeadModel(GPT2PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -1151,8 +1146,6 @@ def _reorder_cache( GPT2_START_DOCSTRING, ) class GPT2DoubleHeadsModel(GPT2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -1381,9 +1374,6 @@ def _reorder_cache( GPT2_START_DOCSTRING, ) class GPT2ForSequenceClassification(GPT2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1605,9 +1595,6 @@ def forward( GPT2_START_DOCSTRING, ) class GPT2ForQuestionAnswering(GPT2PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.bias", r"h\.\d+\.attn\.masked_bias"] - 
_keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 705d07b1da25..a45b9bd4b261 100644 --- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -500,8 +500,6 @@ def _set_gradient_checkpointing(self, module, value=False): GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeModel(GPTBigCodePreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - def __init__(self, config): super().__init__(config) self.multi_query = config.multi_query @@ -722,7 +720,6 @@ def custom_forward(*inputs): GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeForCausalLM(GPTBigCodePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -876,8 +873,6 @@ def _reorder_cache( GPT_BIGCODE_START_DOCSTRING, ) class GPTBigCodeForSequenceClassification(GPTBigCodePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index b67f4ddbfaca..66471b6eac27 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -145,8 +145,8 @@ def __init__(self, config, attention_type): if attention_type == "local": bias = torch.bitwise_xor(bias, torch.tril(bias, -config.window_size)) - self.register_buffer("bias", bias) - self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.register_buffer("bias", bias, persistent=False) + self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.attn_dropout = nn.Dropout(float(config.attention_dropout)) self.resid_dropout = nn.Dropout(float(config.resid_dropout)) @@ -663,12 +663,6 @@ def custom_forward(*inputs): GPT_NEO_START_DOCSTRING, ) class GPTNeoForCausalLM(GPTNeoPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"h\.\d+\.attn\.masked_bias", - r"lm_head.weight", - r"h\.\d+\.attn\.attention\.bias", - ] - _keys_to_ignore_on_save = [r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -820,8 +814,6 @@ def _reorder_cache( GPT_NEO_START_DOCSTRING, ) class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1025,8 +1017,6 @@ def forward( GPT_NEO_START_DOCSTRING, ) class GPTNeoForQuestionAnswering(GPTNeoPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index 7c3bfd1035f9..841cbe1aa8f2 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -100,8 +100,9 @@ def __init__(self, config): 
torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), + persistent=False, ) - self.register_buffer("masked_bias", torch.tensor(-1e9)) + self.register_buffer("masked_bias", torch.tensor(-1e9), persistent=False) self.rotary_emb = RotaryEmbedding( self.rotary_ndims, config.max_position_embeddings, base=config.rotary_emb_base ) @@ -600,7 +601,6 @@ def custom_forward(*inputs): """GPTNeoX Model with a `language modeling` head on top for CLM fine-tuning.""", GPT_NEOX_START_DOCSTRING ) class GPTNeoXForCausalLM(GPTNeoXPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] _tied_weights_keys = ["embed_out.weight"] def __init__(self, config): @@ -775,8 +775,6 @@ def _reorder_cache(self, past_key_values, beam_idx): GPT_NEOX_START_DOCSTRING, ) class GPTNeoXForSequenceClassification(GPTNeoXPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -971,8 +969,6 @@ def forward( GPT_NEOX_START_DOCSTRING, ) class GPTNeoXForQuestionAnswering(GPTNeoXPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index 1671e5916ef7..e7cb510e6222 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -591,7 +591,6 @@ def forward( GPT_NEOX_JAPANESE_START_DOCSTRING, ) class GPTNeoXJapaneseForCausalLM(GPTNeoXJapanesePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "embed_out.weight"] _tied_weights_keys = ["embed_out.weight"] def __init__(self, config): diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index de120167989d..e9a9045a6d2f 100644 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -734,7 +734,6 @@ def custom_forward(*inputs): GPTJ_START_DOCSTRING, ) class GPTJForCausalLM(GPTJPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -933,8 +932,6 @@ def _reorder_cache( GPTJ_START_DOCSTRING, ) class GPTJForSequenceClassification(GPTJPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1059,8 +1056,6 @@ def forward( GPTJ_START_DOCSTRING, ) class GPTJForQuestionAnswering(GPTJPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"h\.\d+\.attn\.bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py index 8c1cdd0b1a55..f02aa2dc839c 100644 --- a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py +++ 
b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py @@ -1111,7 +1111,6 @@ def forward( GPTSAN_JAPANESE_START_DOCSTRING, ) class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: GPTSanJapaneseConfig): diff --git a/src/transformers/models/graphormer/modeling_graphormer.py b/src/transformers/models/graphormer/modeling_graphormer.py index 2dd86b7b55ff..82ffd4b1637d 100755 --- a/src/transformers/models/graphormer/modeling_graphormer.py +++ b/src/transformers/models/graphormer/modeling_graphormer.py @@ -714,7 +714,6 @@ class GraphormerPreTrainedModel(PreTrainedModel): config_class = GraphormerConfig base_model_prefix = "graphormer" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] main_input_name_nodes = "input_nodes" main_input_name_edges = "input_edges" diff --git a/src/transformers/models/groupvit/modeling_groupvit.py b/src/transformers/models/groupvit/modeling_groupvit.py index c19ebd13b91d..9c312c0ff811 100644 --- a/src/transformers/models/groupvit/modeling_groupvit.py +++ b/src/transformers/models/groupvit/modeling_groupvit.py @@ -450,7 +450,9 @@ def __init__(self, config: GroupViTTextConfig): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -767,7 +769,6 @@ class GroupViTPreTrainedModel(PreTrainedModel): config_class = GroupViTConfig base_model_prefix = "groupvit" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index af3d4e2d0aca..8228520dfd5e 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -869,7 +869,6 @@ class HubertPreTrainedModel(PreTrainedModel): base_model_prefix = "hubert" main_input_name = "input_values" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 7f300e01ae4e..6cf484d96f78 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -80,7 +80,9 @@ def __init__(self, config): ) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # End copy @@ -740,8 +742,6 @@ class IBertModel(IBertPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -854,8 +854,6 @@ def forward( 
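The `_tied_weights_keys` attributes kept in these classes name the state-dict entries that are aliases of another parameter, typically an LM head tied to the input embeddings. Tied parameters are the same tensor, which is easy to verify directly; a toy sketch with made-up sizes:

import torch
from torch import nn


class TinyLM(nn.Module):
    def __init__(self, vocab_size=100, hidden_size=16):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, hidden_size)
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        # Tie the output projection to the input embeddings.
        self.lm_head.weight = self.embeddings.weight


model = TinyLM()
state_dict = model.state_dict()
# Both names exist in the state dict, but they point at the same storage,
# which is what lets "lm_head.weight" be treated as an alias rather than
# an independent (and potentially missing) weight.
print(state_dict["embeddings.weight"].data_ptr() == state_dict["lm_head.weight"].data_ptr())  # True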
@add_start_docstrings("""I-BERT Model with a `language modeling` head on top.""", IBERT_START_DOCSTRING) class IBertForMaskedLM(IBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.bias", "lm_head.decoder.weight"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.bias", "lm_head.decoder.weight"] def __init__(self, config): @@ -969,8 +967,6 @@ def _tie_weights(self): IBERT_START_DOCSTRING, ) class IBertForSequenceClassification(IBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1064,8 +1060,6 @@ def forward( IBERT_START_DOCSTRING, ) class IBertForMultipleChoice(IBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1156,9 +1150,6 @@ def forward( IBERT_START_DOCSTRING, ) class IBertForTokenClassification(IBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1256,9 +1247,6 @@ def forward(self, features, **kwargs): IBERT_START_DOCSTRING, ) class IBertForQuestionAnswering(IBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py index 539119fabf28..f24cf7ae7136 100755 --- a/src/transformers/models/imagegpt/modeling_imagegpt.py +++ b/src/transformers/models/imagegpt/modeling_imagegpt.py @@ -183,8 +183,9 @@ def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view( 1, 1, max_positions, max_positions ), + persistent=False, ) - self.register_buffer("masked_bias", torch.tensor(-1e4)) + self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False) self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads @@ -613,8 +614,6 @@ def _set_gradient_checkpointing(self, module, value=False): IMAGEGPT_START_DOCSTRING, ) class ImageGPTModel(ImageGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = ["attn.masked_bias"] - def __init__(self, config: ImageGPTConfig): super().__init__(config) @@ -893,7 +892,6 @@ def custom_forward(*inputs): IMAGEGPT_START_DOCSTRING, ) class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: ImageGPTConfig): @@ -1085,8 +1083,6 @@ def _reorder_cache( IMAGEGPT_START_DOCSTRING, ) class ImageGPTForImageClassification(ImageGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config: ImageGPTConfig): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/jukebox/modeling_jukebox.py b/src/transformers/models/jukebox/modeling_jukebox.py index f7be47c00588..236d1f4ff37b 100755 --- a/src/transformers/models/jukebox/modeling_jukebox.py +++ b/src/transformers/models/jukebox/modeling_jukebox.py @@ -602,7 +602,6 @@ def forward(self, input_audio): class 
JukeboxVQVAE(PreTrainedModel): config_class = JukeboxVQVAEConfig base_model_prefix = "vqvae" - _keys_to_ignore_on_load_unexpected = [r"priors"] def _init_weights(self, module): if isinstance(module, nn.Embedding): # embed_tokens @@ -1792,7 +1791,6 @@ class JukeboxPrior(PreTrainedModel): """ config_class = JukeboxPriorConfig - _keys_to_ignore_on_load_unexpected = ["vqvae"] def _init_weights(self, module): init_scale = self.config.init_scale @@ -1832,7 +1830,6 @@ def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_en self.level = level if level is not None else config.level self.base_model_prefix = f"priors.{self.level}" - self._keys_to_ignore_on_load_unexpected += [r"priors.[^%d]." % self.level] self.n_ctx = config.n_ctx diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 410f76509422..26c4cd92d6e5 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -68,7 +68,9 @@ def __init__(self, config): self.LayerNorm = LayoutLMLayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -619,7 +621,6 @@ class LayoutLMPreTrainedModel(PreTrainedModel): pretrained_model_archive_map = LAYOUTLM_PRETRAINED_MODEL_ARCHIVE_LIST base_model_prefix = "layoutlm" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -857,11 +858,6 @@ def forward( @add_start_docstrings("""LayoutLM Model with a `language modeling` head on top.""", LAYOUTLM_START_DOCSTRING) class LayoutLMForMaskedLM(LayoutLMPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "cls.predictions.decoder.bias", - "cls.predictions.decoder.weight", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 5a6f39ce31a6..18927fb1fde8 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -77,7 +77,9 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def _calc_spatial_position_embeddings(self, bbox): try: @@ -506,7 +508,6 @@ class LayoutLMv2PreTrainedModel(PreTrainedModel): config_class = LayoutLMv2Config pretrained_model_archive_map = LAYOUTLMV2_PRETRAINED_MODEL_ARCHIVE_LIST base_model_prefix = "layoutlmv2" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -567,8 +568,11 @@ def __init__(self, config): self.register_buffer( "pixel_mean", torch.Tensor(self.cfg.MODEL.PIXEL_MEAN).view(num_channels, 1, 1), + persistent=False, + ) + self.register_buffer( + "pixel_std", 
torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1), persistent=False ) - self.register_buffer("pixel_std", torch.Tensor(self.cfg.MODEL.PIXEL_STD).view(num_channels, 1, 1)) self.out_feature_key = "p2" if torch.are_deterministic_algorithms_enabled(): logger.warning("using `AvgPool2d` instead of `AdaptiveAvgPool2d`") diff --git a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py index db6618caaeaf..1648016b5740 100644 --- a/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py +++ b/src/transformers/models/layoutlmv3/modeling_layoutlmv3.py @@ -245,7 +245,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( @@ -750,8 +752,6 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to LAYOUTLMV3_START_DOCSTRING, ) class LayoutLMv3Model(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.config = config @@ -1038,9 +1038,6 @@ def forward(self, x): LAYOUTLMV3_START_DOCSTRING, ) class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1153,9 +1150,6 @@ def forward( LAYOUTLMV3_START_DOCSTRING, ) class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1286,8 +1280,6 @@ def forward( LAYOUTLMV3_START_DOCSTRING, ) class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index 8de14242bfc7..d98c8d29672e 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -2209,7 +2209,6 @@ def custom_forward(*inputs): LED_START_DOCSTRING, ) class LEDModel(LEDPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config: LEDConfig): @@ -2335,14 +2334,7 @@ def forward( ) class LEDForConditionalGeneration(LEDPreTrainedModel): base_model_prefix = "led" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: LEDConfig): @@ -2530,7 +2522,6 @@ def _reorder_cache(past_key_values, beam_idx): 
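Note that LED (like MBart and Marian further down) keeps `_keys_to_ignore_on_load_missing = ["final_logits_bias"]`: that buffer is zero-initialised and, judging by the retained entry, may be absent from some checkpoints, in which case loading simply leaves the zeros in place. A toy illustration of why such a missing buffer is benign (hypothetical module, not the LED code):

import torch
from torch import nn


class ToyHead(nn.Module):
    def __init__(self, vocab_size=10, hidden_size=4):
        super().__init__()
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)
        # Zero-initialised additive bias on the logits; falling back to zeros
        # is a no-op when a checkpoint does not provide it.
        self.register_buffer("final_logits_bias", torch.zeros((1, vocab_size)))

    def forward(self, hidden_states):
        return self.lm_head(hidden_states) + self.final_logits_bias


model = ToyHead()
checkpoint = {"lm_head.weight": torch.randn(10, 4)}  # older-style checkpoint without the buffer
load_result = model.load_state_dict(checkpoint, strict=False)
print(load_result.missing_keys)  # ['final_logits_bias'] -- safe to ignore, the zeros are kept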
LED_START_DOCSTRING, ) class LEDForSequenceClassification(LEDPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config: LEDConfig, **kwargs): @@ -2667,7 +2658,6 @@ def forward( LED_START_DOCSTRING, ) class LEDForQuestionAnswering(LEDPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] def __init__(self, config): diff --git a/src/transformers/models/levit/modeling_levit.py b/src/transformers/models/levit/modeling_levit.py index e45ffa05b157..0accc28391bd 100644 --- a/src/transformers/models/levit/modeling_levit.py +++ b/src/transformers/models/levit/modeling_levit.py @@ -195,7 +195,9 @@ def __init__(self, hidden_sizes, key_dim, num_attention_heads, attention_ratio, self.attention_bias_cache = {} self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets))) - self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points)) + self.register_buffer( + "attention_bias_idxs", torch.LongTensor(indices).view(len_points, len_points), persistent=False + ) @torch.no_grad() def train(self, mode=True): @@ -271,7 +273,9 @@ def __init__( indices.append(attention_offsets[offset]) self.attention_biases = torch.nn.Parameter(torch.zeros(num_attention_heads, len(attention_offsets))) - self.register_buffer("attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points)) + self.register_buffer( + "attention_bias_idxs", torch.LongTensor(indices).view(len_points_, len_points), persistent=False + ) @torch.no_grad() def train(self, mode=True): diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index 74454d244e8d..e5783b970f87 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -59,7 +59,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # End copy @@ -610,15 +612,6 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, LiltEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - LILT_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. 
Check the superclass documentation for the generic methods the @@ -697,8 +690,6 @@ def update_keys_to_ignore(self, config, del_keys_to_ignore): LILT_START_DOCSTRING, ) class LiltModel(LiltPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -847,8 +838,6 @@ def forward( LILT_START_DOCSTRING, ) class LiltForSequenceClassification(LiltPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.__init__ with Roberta->Lilt, roberta->lilt def __init__(self, config): super().__init__(config) @@ -967,9 +956,6 @@ def forward( LILT_START_DOCSTRING, ) class LiltForTokenClassification(LiltPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.__init__ with Roberta->Lilt, roberta->lilt def __init__(self, config): super().__init__(config) @@ -1096,9 +1082,6 @@ def forward(self, features, **kwargs): LILT_START_DOCSTRING, ) class LiltForQuestionAnswering(LiltPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.__init__ with Roberta->Lilt, roberta->lilt def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index c9debdd252dc..24231c3f777d 100755 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -344,7 +344,6 @@ class LlamaPreTrainedModel(PreTrainedModel): supports_gradient_checkpointing = True _no_split_modules = ["LlamaDecoderLayer"] _skip_keys_device_placement = "past_key_values" - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range @@ -784,8 +783,6 @@ def _reorder_cache(past_key_values, beam_idx): LLAMA_START_DOCSTRING, ) class LlamaForSequenceClassification(LlamaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py index 665e2cb56421..994157daa879 100755 --- a/src/transformers/models/longformer/modeling_longformer.py +++ b/src/transformers/models/longformer/modeling_longformer.py @@ -1421,7 +1421,6 @@ class LongformerPreTrainedModel(PreTrainedModel): config_class = LongformerConfig base_model_prefix = "longformer" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"position_ids"] _no_split_modules = ["LongformerSelfAttention"] def _init_weights(self, module): @@ -1770,8 +1769,6 @@ def forward( @add_start_docstrings("""Longformer Model with a `language modeling` head on top.""", LONGFORMER_START_DOCSTRING) class LongformerForMaskedLM(LongformerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.decoder"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder"] def __init__(self, config): @@ -1886,8 +1883,6 @@ def forward( LONGFORMER_START_DOCSTRING, ) class 
LongformerForSequenceClassification(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -2015,8 +2010,6 @@ def forward(self, hidden_states, **kwargs): LONGFORMER_START_DOCSTRING, ) class LongformerForQuestionAnswering(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -2154,8 +2147,6 @@ def forward( LONGFORMER_START_DOCSTRING, ) class LongformerForTokenClassification(LongformerPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/longt5/modeling_longt5.py b/src/transformers/models/longt5/modeling_longt5.py index 1a49444e8a50..303755ae4338 100644 --- a/src/transformers/models/longt5/modeling_longt5.py +++ b/src/transformers/models/longt5/modeling_longt5.py @@ -1763,10 +1763,6 @@ def custom_forward(*inputs): LONGT5_START_DOCSTRING, ) class LongT5Model(LongT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - ] _keys_to_ignore_on_load_unexpected = [ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", ] @@ -1917,11 +1913,6 @@ def forward( @add_start_docstrings("""LONGT5 Model with a `language modeling` head on top.""", LONGT5_START_DOCSTRING) class LongT5ForConditionalGeneration(LongT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] _keys_to_ignore_on_load_unexpected = [ r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", ] @@ -2160,7 +2151,6 @@ def _reorder_cache(self, past_key_values, beam_idx): LONGT5_START_DOCSTRING, ) class LongT5EncoderModel(LongT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight"] def __init__(self, config: LongT5Config): diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index ba21d3deb32e..8a3ceb14d50e 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -1022,8 +1022,6 @@ def _set_gradient_checkpointing(self, module, value=False): LUKE_START_DOCSTRING, ) class LukeModel(LukePreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config: LukeConfig, add_pooling_layer: bool = True): super().__init__(config) self.config = config @@ -1278,17 +1276,6 @@ def _tie_weights(self): LUKE_START_DOCSTRING, ) class LukeForMaskedLM(LukePreTrainedModel): - _keys_to_ignore_on_save = [ - r"lm_head.decoder.weight", - r"lm_head.decoder.bias", - r"entity_predictions.decoder.weight", - ] - _keys_to_ignore_on_load_missing = [ - r"position_ids", - r"lm_head.decoder.weight", - r"lm_head.decoder.bias", - r"entity_predictions.decoder.weight", - ] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias", "entity_predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index 21a279ec29ca..2a1a21282ec0 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -1018,7 +1018,6 
@@ def forward( LXMERT_START_DOCSTRING, ) class LxmertForPreTraining(LxmertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py index a9cde571f7d2..20db884c6366 100755 --- a/src/transformers/models/m2m_100/modeling_m2m_100.py +++ b/src/transformers/models/m2m_100/modeling_m2m_100.py @@ -131,7 +131,7 @@ def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Opt # in forward put the weights on the correct dtype and device of the param emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device) - self.register_buffer("weights", emb_weights) + self.register_buffer("weights", emb_weights, persistent=False) @staticmethod def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): @@ -1137,14 +1137,6 @@ def custom_forward(*inputs): M2M_100_START_DOCSTRING, ) class M2M100Model(M2M100PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - "encoder.embed_positions.weights", - "encoder.embed_positions.bias", - "decoder.embed_positions.weights", - "decoder.embed_positions.bias", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: M2M100Config): @@ -1258,17 +1250,6 @@ def forward( ) class M2M100ForConditionalGeneration(M2M100PreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"encoder.embed_positions.weights", - r"encoder.embed_positions.bias", - r"decoder.embed_positions.weights", - r"decoder.embed_positions.bias", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: M2M100Config): diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 1d1cbe125e64..d25d1ed4bc22 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -1103,7 +1103,6 @@ def custom_forward(*inputs): "The bare Marian Model outputting raw hidden-states without any specific head on top.", MARIAN_START_DOCSTRING ) class MarianModel(MarianPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: MarianConfig): @@ -1292,13 +1291,9 @@ def forward( class MarianMTModel(MarianPreTrainedModel): base_model_prefix = "model" _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - r"embed_positions", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", + "final_logits_bias", + "encoder.embed_positions.weight", + "decoder.embed_positions.weight", ] _keys_to_ignore_on_save = ["model.encoder.embed_positions.weight", "model.decoder.embed_positions.weight"] _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"] @@ -1561,7 +1556,6 @@ def forward(self, *args, **kwargs): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM 
with Bart->Marian, facebook/bart-base->Helsinki-NLP/opus-mt-fr-en class MarianForCausalLM(MarianPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/markuplm/modeling_markuplm.py b/src/transformers/models/markuplm/modeling_markuplm.py index 0c6847b47815..34435b898fcf 100755 --- a/src/transformers/models/markuplm/modeling_markuplm.py +++ b/src/transformers/models/markuplm/modeling_markuplm.py @@ -143,7 +143,9 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.padding_idx = config.pad_token_id self.position_embeddings = nn.Embedding( @@ -713,7 +715,6 @@ class MarkupLMPreTrainedModel(PreTrainedModel): config_class = MarkupLMConfig pretrained_model_archive_map = MARKUPLM_PRETRAINED_MODEL_ARCHIVE_LIST base_model_prefix = "markuplm" - _keys_to_ignore_on_load_missing = [r"position_ids"] # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with Bert->MarkupLM def _init_weights(self, module): @@ -971,8 +972,6 @@ def _reorder_cache(self, past_key_values, beam_idx): MARKUPLM_START_DOCSTRING, ) class MarkupLMForQuestionAnswering(MarkupLMPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with bert->markuplm, Bert->MarkupLM def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 7bf6b1b37e98..577d950c932d 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -1156,7 +1156,6 @@ def custom_forward(*inputs): MBART_START_DOCSTRING, ) class MBartModel(MBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: MBartConfig): @@ -1277,14 +1276,7 @@ def forward( ) class MBartForConditionalGeneration(MBartPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: MBartConfig): @@ -1452,7 +1444,6 @@ def _reorder_cache(past_key_values, beam_idx): MBART_START_DOCSTRING, ) class MBartForSequenceClassification(MBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"] def __init__(self, config: MBartConfig, **kwargs): @@ -1582,7 +1573,6 @@ def forward( MBART_START_DOCSTRING, ) class MBartForQuestionAnswering(MBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] 
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"] def __init__(self, config): @@ -1716,7 +1706,6 @@ def forward(self, *args, **kwargs): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25 class MBartForCausalLM(MBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/mctct/modeling_mctct.py b/src/transformers/models/mctct/modeling_mctct.py index 7f2de9f952a9..4b965b27ec18 100755 --- a/src/transformers/models/mctct/modeling_mctct.py +++ b/src/transformers/models/mctct/modeling_mctct.py @@ -149,7 +149,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), @@ -443,7 +445,6 @@ class MCTCTPreTrainedModel(PreTrainedModel): config_class = MCTCTConfig base_model_prefix = "mctct" main_input_name = "input_features" - _keys_to_ignore_on_load_missing = ["position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/mega/modeling_mega.py b/src/transformers/models/mega/modeling_mega.py index 19e1cc107504..9381e60905cd 100644 --- a/src/transformers/models/mega/modeling_mega.py +++ b/src/transformers/models/mega/modeling_mega.py @@ -1387,15 +1387,6 @@ def _init_weights(self, module): module.bias.data.zero_() module.weight.data.fill_(1.0) - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! 
- self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - MEGA_START_DOCSTRING = r""" @@ -1474,8 +1465,6 @@ class MegaModel(MegaPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [] - def __init__(self, config: MegaConfig, add_pooling_layer=True): super().__init__(config) self.config = config @@ -1656,9 +1645,6 @@ def forward( """MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING ) class MegaForCausalLM(MegaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.weight", r"lm_head.bias"] - _keys_to_ignore_on_load_missing = [r"lm_head.weight", r"lm_head.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: MegaConfig): @@ -1678,9 +1664,6 @@ def __init__(self, config: MegaConfig): self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1821,9 +1804,6 @@ def _reorder_cache(self, past_key_values, beam_idx): @add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING) class MegaForMaskedLM(MegaPreTrainedModel): - _keys_to_ignore_on_save = [r"mlm_head.weight", r"mlm_head.bias"] - _keys_to_ignore_on_load_missing = [r"mlm_head.weight", r"mlm_head.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["mlm_head.weight"] def __init__(self, config: MegaConfig): @@ -1845,9 +1825,6 @@ def __init__(self, config: MegaConfig): self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size) self.dropout = nn.Dropout(config.dropout_prob) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["mlm_head.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1931,8 +1908,6 @@ def forward( MEGA_START_DOCSTRING, ) class MegaForSequenceClassification(MegaPreTrainedModel): - _keys_to_ignore_on_load_missing = [] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -2024,8 +1999,6 @@ def forward( MEGA_START_DOCSTRING, ) class MegaForMultipleChoice(MegaPreTrainedModel): - _keys_to_ignore_on_load_missing = [] - def __init__(self, config): super().__init__(config) @@ -2111,9 +2084,6 @@ def forward( MEGA_START_DOCSTRING, ) class MegaForTokenClassification(MegaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -2214,9 +2184,6 @@ def forward(self, features, **kwargs): MEGA_START_DOCSTRING, ) class MegaForQuestionAnswering(MegaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index bba7e7369cb8..c28b681326c6 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ 
b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -149,7 +149,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def forward( @@ -713,7 +715,6 @@ class MegatronBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_megatron_bert base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1014,7 +1015,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForPreTraining(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config, add_binary_head=True): @@ -1121,8 +1121,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForCausalLM(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"cls.predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): @@ -1267,8 +1265,6 @@ def _reorder_cache(self, past_key_values, beam_idx): @add_start_docstrings("""MegatronBert Model with a `language modeling` head on top.""", MEGATRON_BERT_START_DOCSTRING) class MegatronBertForMaskedLM(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler", r"seq_relationship"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): @@ -1376,8 +1372,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"predictions"] - def __init__(self, config): super().__init__(config) @@ -1672,8 +1666,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForTokenClassification(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1752,8 +1744,6 @@ def forward( MEGATRON_BERT_START_DOCSTRING, ) class MegatronBertForQuestionAnswering(MegatronBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index fcd49a8f8cf3..06318679faee 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -191,7 +191,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", 
torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -686,7 +688,6 @@ class MobileBertPreTrainedModel(PreTrainedModel): pretrained_model_archive_map = MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST load_tf_weights = load_tf_weights_in_mobilebert base_model_prefix = "mobilebert" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -923,11 +924,6 @@ def forward( MOBILEBERT_START_DOCSTRING, ) class MobileBertForPreTraining(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "cls.predictions.decoder.weight", - "cls.predictions.decoder.bias", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1036,12 +1032,6 @@ def forward( @add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING) class MobileBertForMaskedLM(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [ - "cls.predictions.decoder.weight", - "cls.predictions.decoder.bias", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1350,8 +1340,6 @@ def forward( ) # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering with Bert->MobileBert all-casing class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1553,8 +1541,6 @@ def forward( ) # Copied from transformers.models.bert.modeling_bert.BertForTokenClassification with Bert->MobileBert all-casing class MobileBertForTokenClassification(MobileBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 93e5abe72a48..68bdad1c9fd4 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -83,7 +83,9 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, **kwargs): if position_ids is None: @@ -479,8 +481,6 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: MPNET_START_DOCSTRING, ) class MPNetModel(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -570,8 +570,6 @@ def forward( class MPNetForMaskedLM(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder"] def __init__(self, config): @@ -679,8 +677,6 @@ def forward(self, features, **kwargs): MPNET_START_DOCSTRING, ) class MPNetForSequenceClassification(MPNetPreTrainedModel): - 
_keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -773,8 +769,6 @@ def forward( MPNET_START_DOCSTRING, ) class MPNetForMultipleChoice(MPNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -863,9 +857,6 @@ def forward( MPNET_START_DOCSTRING, ) class MPNetForTokenClassification(MPNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -962,9 +953,6 @@ def forward(self, features, **kwargs): MPNET_START_DOCSTRING, ) class MPNetForQuestionAnswering(MPNetPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py index dfeb3c10a915..03e3581cf05b 100644 --- a/src/transformers/models/mt5/modeling_mt5.py +++ b/src/transformers/models/mt5/modeling_mt5.py @@ -1316,18 +1316,8 @@ class MT5Model(MT5PreTrainedModel): ```""" model_type = "mt5" config_class = MT5Config - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] - _keys_to_ignore_on_save = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] + _keys_to_ignore_on_load_missing = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] + _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] # Copied from transformers.models.t5.modeling_t5.T5Model.__init__ with T5->MT5 @@ -1552,15 +1542,7 @@ class MT5ForConditionalGeneration(MT5PreTrainedModel): model_type = "mt5" config_class = MT5Config - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - ] - _keys_to_ignore_on_save = [ - r"encoder.embed_tokens.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] + _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5 @@ -1897,13 +1879,6 @@ class MT5EncoderModel(MT5PreTrainedModel): model_type = "mt5" config_class = MT5Config - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - ] - _keys_to_ignore_on_save = [ - r"encoder.embed_tokens.weight", - ] - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight"] # Copied from transformers.models.t5.modeling_t5.T5EncoderModel.__init__ with T5->MT5 @@ -2029,14 +2004,7 @@ def forward( MT5_START_DOCSTRING, ) class MT5ForQuestionAnswering(MT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - 
r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] + _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] # Copied from transformers.models.t5.modeling_t5.T5ForQuestionAnswering.__init__ with T5->MT5 diff --git a/src/transformers/models/mvp/modeling_mvp.py b/src/transformers/models/mvp/modeling_mvp.py index d135ee558d16..92e393b39e25 100644 --- a/src/transformers/models/mvp/modeling_mvp.py +++ b/src/transformers/models/mvp/modeling_mvp.py @@ -551,7 +551,6 @@ class MvpPreTrainedModel(PreTrainedModel): config_class = MvpConfig base_model_prefix = "model" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_unexpected = [r"encoder.version", r"decoder.version"] def _init_weights(self, module): std = self.config.init_std @@ -1300,8 +1299,7 @@ def custom_forward(*inputs): MVP_START_DOCSTRING, ) class MvpModel(MvpPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"] - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + _keys_to_ignore_on_load_unexpected = ["final_logits_bias"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: MvpConfig): @@ -1438,7 +1436,6 @@ def forward( "The MVP Model with a language modeling head. Can be used for various text generation tasks.", MVP_START_DOCSTRING ) class MvpForConditionalGeneration(MvpPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: MvpConfig): @@ -1611,8 +1608,6 @@ def _reorder_cache(past_key_values, beam_idx): MVP_START_DOCSTRING, ) class MvpForSequenceClassification(MvpPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"] - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: MvpConfig, **kwargs): @@ -1740,8 +1735,6 @@ def forward( MVP_START_DOCSTRING, ) class MvpForQuestionAnswering(MvpPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"final_logits_bias", r"lm_head.weight"] - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config): @@ -1873,7 +1866,6 @@ def forward(self, *args, **kwargs): class MvpForCausalLM(MvpPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/nezha/modeling_nezha.py b/src/transformers/models/nezha/modeling_nezha.py index 97c5b5a90ec3..8d66bfe41fab 100644 --- a/src/transformers/models/nezha/modeling_nezha.py +++ b/src/transformers/models/nezha/modeling_nezha.py @@ -163,7 +163,7 @@ def __init__(self, length, depth, max_relative_position=127): my_shape = list(final_mat.size()) my_shape.append(depth) positions_encoding = positions_encoding.view(my_shape) - self.register_buffer("positions_encoding", positions_encoding) + self.register_buffer("positions_encoding", positions_encoding, persistent=False) 
def forward(self, length): return self.positions_encoding[:length, :length, :] @@ -735,7 +735,6 @@ class NezhaPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_nezha base_model_prefix = "nezha" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"positions_encoding"] def _init_weights(self, module): """Initialize the weights""" @@ -1037,7 +1036,6 @@ def forward( NEZHA_START_DOCSTRING, ) class NezhaForPreTraining(NezhaPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): @@ -1140,8 +1138,6 @@ def forward( @add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING) class NezhaForMaskedLM(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"cls.predictions.decoder", r"positions_encoding"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): @@ -1542,8 +1538,6 @@ def forward( NEZHA_START_DOCSTRING, ) class NezhaForTokenClassification(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1623,8 +1617,6 @@ def forward( NEZHA_START_DOCSTRING, ) class NezhaForQuestionAnswering(NezhaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/nllb_moe/modeling_nllb_moe.py b/src/transformers/models/nllb_moe/modeling_nllb_moe.py index 3585b1d3b62f..217314555840 100644 --- a/src/transformers/models/nllb_moe/modeling_nllb_moe.py +++ b/src/transformers/models/nllb_moe/modeling_nllb_moe.py @@ -183,7 +183,7 @@ def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Opt # in forward put the weights on the correct dtype and device of the param emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device) - self.register_buffer("weights", emb_weights) + self.register_buffer("weights", emb_weights, persistent=False) @staticmethod def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None): @@ -1500,14 +1500,6 @@ def custom_forward(*inputs): NLLB_MOE_START_DOCSTRING, ) class NllbMoeModel(NllbMoePreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - "encoder.embed_positions.weights", - "encoder.embed_positions.bias", - "decoder.embed_positions.weights", - "decoder.embed_positions.bias", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: NllbMoeConfig): @@ -1641,17 +1633,6 @@ def forward( ) class NllbMoeForConditionalGeneration(NllbMoePreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"encoder.embed_positions.weights", - r"encoder.embed_positions.bias", - r"decoder.embed_positions.weights", - r"decoder.embed_positions.bias", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: NllbMoeConfig): diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index 
b859b0db1d4f..607deb7b0ab6 100755 --- a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -64,7 +64,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", @@ -458,7 +460,6 @@ class NystromformerPreTrainedModel(PreTrainedModel): config_class = NystromformerConfig base_model_prefix = "nystromformer" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -658,7 +659,6 @@ def forward( @add_start_docstrings("""Nyströmformer Model with a `language modeling` head on top.""", NYSTROMFORMER_START_DOCSTRING) class NystromformerForMaskedLM(NystromformerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): diff --git a/src/transformers/models/open_llama/modeling_open_llama.py b/src/transformers/models/open_llama/modeling_open_llama.py index 16ad554dc313..84d5c6e78fa2 100644 --- a/src/transformers/models/open_llama/modeling_open_llama.py +++ b/src/transformers/models/open_llama/modeling_open_llama.py @@ -368,7 +368,6 @@ class OpenLlamaPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["OpenLlamaDecoderLayer"] - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.initializer_range @@ -825,8 +824,6 @@ def _reorder_cache(past_key_values, beam_idx): ) # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->OPEN_LLAMA,Llama->OpenLlama class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 0949b2f7dac7..23f8fc8bc7d3 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -141,7 +141,9 @@ def __init__(self, nx, n_positions, config, scale=False): if n_state % config.n_head != 0: raise ValueError(f"Attention n_state shape: {n_state} must be divisible by config.n_head {config.n_head}") self.register_buffer( - "bias", torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions) + "bias", + torch.tril(torch.ones(n_positions, n_positions)).view(1, 1, n_positions, n_positions), + persistent=False, ) self.n_head = config.n_head self.split_size = n_state @@ -274,7 +276,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel): config_class = OpenAIGPTConfig load_tf_weights = load_tf_weights_in_openai_gpt base_model_prefix = "transformer" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights.""" @@ -407,7 +408,7 @@ def __init__(self, config): self.drop = 
nn.Dropout(config.embd_pdrop) self.h = nn.ModuleList([Block(config.n_positions, config, scale=True) for _ in range(config.n_layer)]) - self.register_buffer("position_ids", torch.arange(config.n_positions)) + self.register_buffer("position_ids", torch.arange(config.n_positions), persistent=False) # Initialize weights and apply final processing self.post_init() @@ -529,7 +530,6 @@ def forward( OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -621,7 +621,6 @@ def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs) - OPENAI_GPT_START_DOCSTRING, ) class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py index 5ad783b92daf..a473d9bd5b6d 100644 --- a/src/transformers/models/opt/modeling_opt.py +++ b/src/transformers/models/opt/modeling_opt.py @@ -399,7 +399,6 @@ class OPTPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["OPTDecoderLayer"] - _keys_to_ignore_on_load_unexpected = [r"decoder\.version"] def _init_weights(self, module): std = self.config.init_std @@ -817,7 +816,6 @@ def forward( class OPTForCausalLM(OPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): @@ -1025,8 +1023,6 @@ def _reorder_cache(past_key_values, beam_idx): OPT_START_DOCSTRING, ) class OPTForSequenceClassification(OPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config: OPTConfig): super().__init__(config) self.num_labels = config.num_labels @@ -1147,8 +1143,6 @@ def set_input_embeddings(self, value): OPT_START_DOCSTRING, ) class OPTForQuestionAnswering(OPTPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_head.weight"] - def __init__(self, config: OPTConfig): super().__init__(config) self.model = OPTModel(config) diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py index f65a0688578e..34ee828a7400 100644 --- a/src/transformers/models/owlvit/modeling_owlvit.py +++ b/src/transformers/models/owlvit/modeling_owlvit.py @@ -304,7 +304,7 @@ def __init__(self, config: OwlViTVisionConfig): self.num_patches = (config.image_size // config.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -325,7 +325,9 @@ def __init__(self, config: OwlViTTextConfig): self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ 
-530,7 +532,6 @@ class OwlViTPreTrainedModel(PreTrainedModel): config_class = OwlViTConfig base_model_prefix = "owlvit" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] _no_split_modules = ["OwlViTEncoderLayer"] def _init_weights(self, module): diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py index 3eac50b327ff..e9121655d13f 100755 --- a/src/transformers/models/pegasus/modeling_pegasus.py +++ b/src/transformers/models/pegasus/modeling_pegasus.py @@ -1156,7 +1156,6 @@ def custom_forward(*inputs): PEGASUS_START_DOCSTRING, ) class PegasusModel(PegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: PegasusConfig): @@ -1309,15 +1308,7 @@ def forward( ) class PegasusForConditionalGeneration(PegasusPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - r"embed_positions.weight", - "encoder.embed_tokens.weight", - "decoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: PegasusConfig): @@ -1518,7 +1509,6 @@ def forward(self, *args, **kwargs): class PegasusForCausalLM(PegasusPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/pegasus_x/modeling_pegasus_x.py b/src/transformers/models/pegasus_x/modeling_pegasus_x.py index 0763aec360fb..caf736ba3ad8 100755 --- a/src/transformers/models/pegasus_x/modeling_pegasus_x.py +++ b/src/transformers/models/pegasus_x/modeling_pegasus_x.py @@ -1391,7 +1391,6 @@ def custom_forward(*inputs): PEGASUS_X_START_DOCSTRING, ) class PegasusXModel(PegasusXPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: PegasusXConfig): @@ -1536,14 +1535,6 @@ def forward( @add_start_docstrings("The PEGASUS-X for conditional generation (e.g. 
summarization).", PEGASUS_X_START_DOCSTRING) class PegasusXForConditionalGeneration(PegasusXPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - r"embed_positions.weight", - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: PegasusXConfig): diff --git a/src/transformers/models/pix2struct/modeling_pix2struct.py b/src/transformers/models/pix2struct/modeling_pix2struct.py index 2db104a5a112..b9cfff26a26a 100644 --- a/src/transformers/models/pix2struct/modeling_pix2struct.py +++ b/src/transformers/models/pix2struct/modeling_pix2struct.py @@ -1597,14 +1597,6 @@ def custom_forward(*inputs): class Pix2StructForConditionalGeneration(Pix2StructPreTrainedModel): config_class = Pix2StructConfig main_input_name = "flattened_patches" - - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.layer.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] _tied_weights_keys = ["decoder.lm_head.weight"] def __init__(self, config: Pix2StructConfig): diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py index 30d9bd0ddc38..cf2901d43d26 100644 --- a/src/transformers/models/plbart/modeling_plbart.py +++ b/src/transformers/models/plbart/modeling_plbart.py @@ -1132,7 +1132,6 @@ def custom_forward(*inputs): PLBART_START_DOCSTRING, ) class PLBartModel(PLBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: PLBartConfig): @@ -1251,14 +1250,7 @@ def forward( ) class PLBartForConditionalGeneration(PLBartPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"final_logits_bias", - r"encoder.version", - r"decoder.version", - r"lm_head.weight", - "decoder.embed_tokens.weight", - "encoder.embed_tokens.weight", - ] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: PLBartConfig): @@ -1423,7 +1415,6 @@ def _reorder_cache(past_key_values, beam_idx): PLBART_START_DOCSTRING, ) class PLBartForSequenceClassification(PLBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: PLBartConfig, **kwargs): @@ -1562,7 +1553,6 @@ def forward(self, *args, **kwargs): # Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->PLBart, facebook/bart-base->uclanlp/plbart-base class PLBartForCausalLM(PLBartPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py index 9160d5e1eb46..1b771705ab75 100644 --- a/src/transformers/models/prophetnet/modeling_prophetnet.py +++ b/src/transformers/models/prophetnet/modeling_prophetnet.py @@ -1744,7 +1744,6 @@ def prepare_predict_attention_mask(self, 
hidden_states, attention_mask): PROPHETNET_START_DOCSTRING, ) class ProphetNetModel(ProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.word_embeddings.weight", "encoder.word_embeddings.weight"] _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"] def __init__(self, config: ProphetNetConfig): @@ -1874,11 +1873,6 @@ def forward( PROPHETNET_START_DOCSTRING, ) class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "decoder.word_embeddings.weight", - "encoder.word_embeddings.weight", - "lm_head.weight", - ] _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"] def __init__(self, config: ProphetNetConfig): @@ -2091,7 +2085,6 @@ def get_decoder(self): PROPHETNET_START_DOCSTRING, ) class ProphetNetForCausalLM(ProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: ProphetNetConfig): diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py index 47a34e959072..da60b8efea1e 100755 --- a/src/transformers/models/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/qdqbert/modeling_qdqbert.py @@ -164,7 +164,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -738,7 +740,6 @@ class QDQBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_qdqbert base_model_prefix = "bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1012,8 +1013,6 @@ def forward( """QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING ) class QDQBertLMHeadModel(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"] def __init__(self, config): @@ -1166,8 +1165,6 @@ def _reorder_cache(self, past_key_values, beam_idx): @add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING) class QDQBertForMaskedLM(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"] def __init__(self, config): @@ -1570,8 +1567,6 @@ def forward( QDQBERT_START_DOCSTRING, ) class QDQBertForTokenClassification(QDQBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1650,8 +1645,6 @@ def forward( QDQBERT_START_DOCSTRING, ) class QDQBertForQuestionAnswering(QDQBertPreTrainedModel): - 
_keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py index 019b26ef08e9..1e615512c91c 100644 --- a/src/transformers/models/rag/modeling_rag.py +++ b/src/transformers/models/rag/modeling_rag.py @@ -231,7 +231,6 @@ class RagPreTrainedModel(PreTrainedModel): """ config_class = RagConfig base_model_prefix = "rag" - _keys_to_ignore_on_load_missing = [r"position_ids"] @classmethod def from_pretrained(cls, *args, **kwargs): diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index f68fc04105de..2e675a4d3425 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -178,7 +178,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -968,7 +970,6 @@ class RealmPreTrainedModel(PreTrainedModel): config_class = RealmConfig load_tf_weights = load_tf_weights_in_realm base_model_prefix = "realm" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1147,7 +1148,6 @@ def forward( REALM_START_DOCSTRING, ) class RealmEmbedder(RealmPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.bias"] def __init__(self, config): @@ -1378,7 +1378,6 @@ def forward( REALM_START_DOCSTRING, ) class RealmKnowledgeAugEncoder(RealmPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder"] _tied_weights_keys = ["cls.predictions.decoder"] def __init__(self, config): @@ -1529,8 +1528,6 @@ def forward( @add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING) class RealmReader(RealmPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py index 98b4577b67d9..7f3979ad21ee 100755 --- a/src/transformers/models/reformer/modeling_reformer.py +++ b/src/transformers/models/reformer/modeling_reformer.py @@ -352,10 +352,10 @@ def __init__(self, config): self.value = nn.Linear(self.hidden_size, self.all_head_size, bias=False) # save mask value here. 
Need fp32 and fp16 mask values - self.register_buffer("self_mask_value_float16", torch.tensor(-1e3)) - self.register_buffer("self_mask_value_float32", torch.tensor(-1e5)) - self.register_buffer("mask_value_float16", torch.tensor(-1e4)) - self.register_buffer("mask_value_float32", torch.tensor(-1e9)) + self.register_buffer("self_mask_value_float16", torch.tensor(-1e3), persistent=False) + self.register_buffer("self_mask_value_float32", torch.tensor(-1e5), persistent=False) + self.register_buffer("mask_value_float16", torch.tensor(-1e4), persistent=False) + self.register_buffer("mask_value_float32", torch.tensor(-1e9), persistent=False) def forward( self, @@ -1049,8 +1049,8 @@ def __init__(self, config): self.dropout = config.local_attention_probs_dropout_prob # save mask value here - self.register_buffer("mask_value_float16", torch.tensor(-1e4)) - self.register_buffer("mask_value_float32", torch.tensor(-1e9)) + self.register_buffer("mask_value_float16", torch.tensor(-1e4), persistent=False) + self.register_buffer("mask_value_float32", torch.tensor(-1e9), persistent=False) def forward( self, @@ -2185,7 +2185,6 @@ def _pad_to_mult_of_chunk_length( @add_start_docstrings("""Reformer Model with a `language modeling` head on top.""", REFORMER_START_DOCSTRING) class ReformerModelWithLMHead(ReformerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.decoder.bias"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index da4ad9608514..e0ab18088aae 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -158,7 +158,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -654,7 +656,6 @@ class RemBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_rembert base_model_prefix = "rembert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1016,7 +1017,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ """RemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", REMBERT_START_DOCSTRING ) class RemBertForCausalLM(RemBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index b0f136924601..cf71ceba7c45 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -80,7 +80,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", 
torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -614,15 +616,6 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, RobertaEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - ROBERTA_START_DOCSTRING = r""" @@ -711,8 +704,6 @@ class RobertaModel(RobertaPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -881,9 +872,6 @@ def forward( """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning.""", ROBERTA_START_DOCSTRING ) class RobertaForCausalLM(RobertaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -895,9 +883,6 @@ def __init__(self, config): self.roberta = RobertaModel(config, add_pooling_layer=False) self.lm_head = RobertaLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1036,9 +1021,6 @@ def _reorder_cache(self, past_key_values, beam_idx): @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top.""", ROBERTA_START_DOCSTRING) class RobertaForMaskedLM(RobertaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1053,9 +1035,6 @@ def __init__(self, config): self.roberta = RobertaModel(config, add_pooling_layer=False) self.lm_head = RobertaLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1173,8 +1152,6 @@ def _tie_weights(self): ROBERTA_START_DOCSTRING, ) class RobertaForSequenceClassification(RobertaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1274,8 +1251,6 @@ def forward( ROBERTA_START_DOCSTRING, ) class RobertaForMultipleChoice(RobertaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, 
config): super().__init__(config) @@ -1368,9 +1343,6 @@ def forward( ROBERTA_START_DOCSTRING, ) class RobertaForTokenClassification(RobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1478,9 +1450,6 @@ def forward(self, features, **kwargs): ROBERTA_START_DOCSTRING, ) class RobertaForQuestionAnswering(RobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index b1e02e27f138..c9b455716fc2 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -83,7 +83,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -617,15 +619,6 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, RobertaPreLayerNormEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! 
- self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - ROBERTA_PRELAYERNORM_START_DOCSTRING = r""" @@ -714,8 +707,6 @@ class RobertaPreLayerNormModel(RobertaPreLayerNormPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config @@ -886,9 +877,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with roberta-base->andreasmadsen/efficient_mlm_m0.40,ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm, RobertaPreLayerNormTokenizer->RobertaTokenizer class RobertaPreLayerNormForCausalLM(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -902,9 +890,6 @@ def __init__(self, config): self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) self.lm_head = RobertaPreLayerNormLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1045,9 +1030,6 @@ def _reorder_cache(self, past_key_values, beam_idx): """RoBERTa-PreLayerNorm Model with a `language modeling` head on top.""", ROBERTA_PRELAYERNORM_START_DOCSTRING ) class RobertaPreLayerNormForMaskedLM(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm @@ -1063,9 +1045,6 @@ def __init__(self, config): self.roberta_prelayernorm = RobertaPreLayerNormModel(config, add_pooling_layer=False) self.lm_head = RobertaPreLayerNormLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1185,8 +1164,6 @@ def _tie_weights(self): ROBERTA_PRELAYERNORM_START_DOCSTRING, ) class RobertaPreLayerNormForSequenceClassification(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1286,8 +1263,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with ROBERTA->ROBERTA_PRELAYERNORM,Roberta->RobertaPreLayerNorm,roberta->roberta_prelayernorm class RobertaPreLayerNormForMultipleChoice(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ 
-1382,9 +1357,6 @@ def forward( ROBERTA_PRELAYERNORM_START_DOCSTRING, ) class RobertaPreLayerNormForTokenClassification(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1492,9 +1464,6 @@ def forward(self, features, **kwargs): ROBERTA_PRELAYERNORM_START_DOCSTRING, ) class RobertaPreLayerNormForQuestionAnswering(RobertaPreLayerNormPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index 7647c14a9ea3..c57537ecf3e4 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -190,7 +190,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", @@ -777,7 +779,6 @@ class RoCBertPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_roc_bert base_model_prefix = "roc_bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -1081,7 +1082,6 @@ def forward( ROC_BERT_START_DOCSTRING, ) class RoCBertForPreTraining(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1267,8 +1267,6 @@ def forward( @add_start_docstrings("""RoCBert Model with a `language modeling` head on top.""", ROC_BERT_START_DOCSTRING) class RoCBertForMaskedLM(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] # Copied from transformers.models.bert.modeling_bert.BertForMaskedLM.__init__ with Bert->RoCBert,bert->roc_bert @@ -1409,8 +1407,6 @@ def prepare_inputs_for_generation( """RoCBert Model with a `language modeling` head on top for CLM fine-tuning.""", ROC_BERT_START_DOCSTRING ) class RoCBertForCausalLM(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] # Copied from transformers.models.bert.modeling_bert.BertLMHeadModel.__init__ with BertLMHeadModel->RoCBertForCausalLM,Bert->RoCBert,bert->roc_bert @@ -1804,8 +1800,6 @@ def forward( ROC_BERT_START_DOCSTRING, ) class RoCBertForTokenClassification(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # 
Copied from transformers.models.bert.modeling_bert.BertForTokenClassification.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) @@ -1892,8 +1886,6 @@ def forward( ROC_BERT_START_DOCSTRING, ) class RoCBertForQuestionAnswering(RoCBertPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - # Copied from transformers.models.bert.modeling_bert.BertForQuestionAnswering.__init__ with Bert->RoCBert,bert->roc_bert def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index b966bf4490a9..ad91766f9660 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -696,11 +696,6 @@ class RoFormerPreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_roformer base_model_prefix = "roformer" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [] - _keys_to_ignore_on_load_unexpected = [ - r"roformer.embeddings_project.weight", - r"roformer.embeddings_project.bias", - ] def _init_weights(self, module): """Initialize the weights""" @@ -952,7 +947,6 @@ def forward( @add_start_docstrings("""RoFormer Model with a `language modeling` head on top.""", ROFORMER_START_DOCSTRING) class RoFormerForMaskedLM(RoFormerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): @@ -1055,7 +1049,6 @@ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_ """RoFormer Model with a `language modeling` head on top for CLM fine-tuning.""", ROFORMER_START_DOCSTRING ) class RoFormerForCausalLM(RoFormerPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): diff --git a/src/transformers/models/sam/modeling_sam.py b/src/transformers/models/sam/modeling_sam.py index c3cbaa9176f0..43d88232e364 100644 --- a/src/transformers/models/sam/modeling_sam.py +++ b/src/transformers/models/sam/modeling_sam.py @@ -1190,7 +1190,6 @@ def _init_weights(self, module): SAM_START_DOCSTRING, ) class SamModel(SamPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"prompt_encoder.shared_embedding.positional_embedding"] _tied_weights_keys = ["prompt_encoder.shared_embedding.positional_embedding"] def __init__(self, config): diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 6b0869c87ad6..67b4bf1a0c6c 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -723,7 +723,6 @@ class SEWPreTrainedModel(PreTrainedModel): base_model_prefix = "sew" main_input_name = "input_values" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 7f7c1977d692..6ae717d9a28a 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -1257,7 +1257,6 @@ class SEWDPreTrainedModel(PreTrainedModel): config_class = SEWDConfig 
base_model_prefix = "sew-d" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py index 862dcac2ce7c..1af805a17905 100755 --- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py +++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py @@ -1266,17 +1266,6 @@ def forward( ) class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"model.encoder.embed_positions.weights", - r"model.decoder.embed_positions.weights", - r"lm_head.weight", - ] - _keys_to_ignore_on_save = [ - r"model.encoder.embed_positions.weights", - r"model.decoder.embed_positions.weights", - ] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: Speech2TextConfig): diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py index a04fd82d4b8f..822025e40ae2 100755 --- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py +++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py @@ -756,7 +756,6 @@ def forward(self, *args, **kwargs): SPEECH_TO_TEXT_2_START_DOCSTRING, ) class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/speecht5/modeling_speecht5.py b/src/transformers/models/speecht5/modeling_speecht5.py index 301ed54af4d6..b77d775714ad 100644 --- a/src/transformers/models/speecht5/modeling_speecht5.py +++ b/src/transformers/models/speecht5/modeling_speecht5.py @@ -441,7 +441,7 @@ def __init__(self, dropout, dim, max_len=5000): pe[:, 1::2] = torch.cos(position.float() * div_term) pe = pe.unsqueeze(0) super().__init__() - self.register_buffer("pe", pe) + self.register_buffer("pe", pe, persistent=False) self.dropout = nn.Dropout(p=dropout) self.dim = dim self.alpha = torch.nn.Parameter(torch.tensor(1.0)) @@ -1251,8 +1251,6 @@ class SpeechT5PreTrainedModel(PreTrainedModel): main_input_name = "input_values" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] - def _init_weights(self, module): """Initialize the weights""" if isinstance(module, SpeechT5PositionalConvEmbedding): @@ -2326,13 +2324,6 @@ def forward( SPEECHT5_START_DOCSTRING, ) class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", - r"text_decoder_postnet.lm_head.weight", - ] - _keys_to_ignore_on_save = [ - r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", - ] _tied_weights_keys = ["text_decoder_postnet.lm_head.weight"] def __init__(self, config: SpeechT5Config): @@ -2638,9 +2629,6 @@ def _generate_speech( SPEECHT5_START_DOCSTRING, ) class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [] - _keys_to_ignore_on_save = [] - main_input_name = "input_ids" def __init__(self, config: SpeechT5Config): @@ -2859,13 +2847,6 @@ def generate_speech( SPEECHT5_START_DOCSTRING, ) class SpeechT5ForSpeechToSpeech(SpeechT5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - 
r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", - ] - _keys_to_ignore_on_save = [ - r"speecht5.encoder.prenet.pos_sinusoidal_embed.weights", - ] - def __init__(self, config: SpeechT5Config): super().__init__(config) diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 6e636fb695da..193481e57f25 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -61,7 +61,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def forward( @@ -524,7 +526,6 @@ class SplinterPreTrainedModel(PreTrainedModel): config_class = SplinterConfig base_model_prefix = "splinter" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights def _init_weights(self, module): diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py index 3264d16ebbb3..b82de3a0b06b 100644 --- a/src/transformers/models/squeezebert/modeling_squeezebert.py +++ b/src/transformers/models/squeezebert/modeling_squeezebert.py @@ -64,7 +64,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if input_ids is not None: @@ -425,7 +427,6 @@ class SqueezeBertPreTrainedModel(PreTrainedModel): config_class = SqueezeBertConfig base_model_prefix = "transformer" - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -643,11 +644,6 @@ def forward( @add_start_docstrings("""SqueezeBERT Model with a `language modeling` head on top.""", SQUEEZEBERT_START_DOCSTRING) class SqueezeBertForMaskedLM(SqueezeBertPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"predictions.decoder.bias", - "cls.predictions.decoder.weight", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py index 008e23531ac1..98899af150a4 100644 --- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py +++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py @@ -1337,7 +1337,6 @@ def custom_forward(*inputs): SWITCH_TRANSFORMERS_START_DOCSTRING, ) class SwitchTransformersModel(SwitchTransformersPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight", r"decoder.embed_tokens.weight"] 
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: SwitchTransformersConfig): @@ -1506,11 +1505,6 @@ def forward( """SWITCH_TRANSFORMERS Model with a `language modeling` head on top.""", SWITCH_TRANSFORMERS_START_DOCSTRING ) class SwitchTransformersForConditionalGeneration(SwitchTransformersPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] def __init__(self, config: SwitchTransformersConfig): @@ -1819,7 +1813,6 @@ def _reorder_cache(self, past_key_values, beam_idx): SWITCH_TRANSFORMERS_START_DOCSTRING, ) class SwitchTransformersEncoderModel(SwitchTransformersPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight"] def __init__(self, config: SwitchTransformersConfig): diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py index f1f7b17c7bfe..7934b10b0a27 100644 --- a/src/transformers/models/t5/modeling_t5.py +++ b/src/transformers/models/t5/modeling_t5.py @@ -1326,12 +1326,8 @@ def custom_forward(*inputs): T5_START_DOCSTRING, ) class T5Model(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - ] _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", + "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] @@ -1530,13 +1526,8 @@ def forward( @add_start_docstrings("""T5 Model with a `language modeling` head on top.""", T5_START_DOCSTRING) class T5ForConditionalGeneration(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", + "decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", ] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] @@ -1845,7 +1836,6 @@ def _reorder_cache(self, past_key_values, beam_idx): T5_START_DOCSTRING, ) class T5EncoderModel(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [r"encoder.embed_tokens.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight"] def __init__(self, config: T5Config): @@ -1963,14 +1953,7 @@ def forward( T5_START_DOCSTRING, ) class T5ForQuestionAnswering(T5PreTrainedModel): - _keys_to_ignore_on_load_missing = [ - r"encoder.embed_tokens.weight", - r"decoder.embed_tokens.weight", - r"lm_head.weight", - ] - _keys_to_ignore_on_load_unexpected = [ - r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight", - ] + _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"] _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] def __init__(self, config: T5Config): diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 1621653f3ee0..832a731b5bf4 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -998,7 +998,6 @@ def forward( 
@add_start_docstrings("""Tapas Model with a `language modeling` head on top.""", TAPAS_START_DOCSTRING) class TapasForMaskedLM(TapasPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] config_class = TapasConfig base_model_prefix = "tapas" diff --git a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py index e8ecedccb5ea..1f634a9893d6 100644 --- a/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py +++ b/src/transformers/models/trajectory_transformer/modeling_trajectory_transformer.py @@ -284,6 +284,7 @@ def __init__(self, config): torch.tril(torch.ones(config.block_size, config.block_size)).view( 1, 1, config.block_size, config.block_size ), + persistent=False, ) # mask previous value estimates diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index d0f6cc029fb3..8ba96905242d 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -1002,7 +1002,6 @@ def forward( TRANSFO_XL_START_DOCSTRING, ) class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"] _tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"] def __init__(self, config): @@ -1191,8 +1190,6 @@ def _reorder_cache(mems: List[torch.Tensor], beam_idx: torch.Tensor) -> List[tor TRANSFO_XL_START_DOCSTRING, ) class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head.weight"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py index 3ad4ff1bac52..cd4e522bf547 100644 --- a/src/transformers/models/trocr/modeling_trocr.py +++ b/src/transformers/models/trocr/modeling_trocr.py @@ -788,7 +788,6 @@ def forward(self, *args, **kwargs): TROCR_START_DOCSTRING, ) class TrOCRForCausalLM(TrOCRPreTrainedModel): - _keys_to_ignore_on_load_missing = ["output_projection.weight"] _tied_weights_keys = ["output_projection.weight"] def __init__(self, config): diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 16c08bbbf3e0..9737433089f8 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -974,7 +974,6 @@ class UniSpeechPreTrainedModel(PreTrainedModel): config_class = UniSpeechConfig base_model_prefix = "unispeech" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index b57369ea6f75..4c4ab4b90f3b 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -988,7 +988,6 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel): config_class = UniSpeechSatConfig 
base_model_prefix = "unispeech_sat" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 6ee1e396a625..4d5283bae60e 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -249,7 +249,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -886,7 +888,6 @@ def forward(self, hidden_states): VILT_START_DOCSTRING, ) class ViltForMaskedLM(ViltPreTrainedModel): - _keys_to_ignore_on_load_missing = ["mlm_score.decoder.bias"] _tied_weights_keys = ["mlm_score.decoder.weight", "mlm_score.decoder.bias"] def __init__(self, config): @@ -1419,8 +1420,6 @@ def forward( VILT_START_DOCSTRING, ) class ViltForTokenClassification(ViltPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index 0bef6e4af9d9..0706eb1f1c48 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -78,7 +78,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) # For Visual Features # Token type and position embedding for image features @@ -531,7 +533,6 @@ class VisualBertPreTrainedModel(PreTrainedModel): config_class = VisualBertConfig base_model_prefix = "visual_bert" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -871,7 +872,6 @@ def forward( VISUAL_BERT_START_DOCSTRING, ) class VisualBertForPreTraining(VisualBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): @@ -1462,7 +1462,6 @@ def forward(self, query, key, attention_mask): VISUAL_BERT_START_DOCSTRING, ) class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel): - _keys_to_ignore_on_load_missing = ["cls.predictions.decoder.bias"] _tied_weights_keys = ["cls.predictions.decoder.bias"] def __init__(self, config): diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 1c8965c96003..3e48dc530dec 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py 
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1089,7 +1089,6 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel): config_class = Wav2Vec2Config base_model_prefix = "wav2vec2" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py index 7a757d0a51f9..d5836de3394f 100644 --- a/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +++ b/src/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py @@ -1087,7 +1087,6 @@ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel): config_class = Wav2Vec2ConformerConfig base_model_prefix = "wav2vec2_conformer" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index d782a47402f0..d573ee601b4c 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -974,7 +974,6 @@ class WavLMPreTrainedModel(PreTrainedModel): config_class = WavLMConfig base_model_prefix = "wavlm" main_input_name = "input_values" - _keys_to_ignore_on_load_missing = [r"position_ids"] supports_gradient_checkpointing = True def _init_weights(self, module): diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py index cffb2810838d..fa9eae4c4797 100644 --- a/src/transformers/models/whisper/modeling_whisper.py +++ b/src/transformers/models/whisper/modeling_whisper.py @@ -1225,8 +1225,6 @@ def custom_forward(*inputs): WHISPER_START_DOCSTRING, ) class WhisperModel(WhisperPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"proj_out.weight"] - def __init__(self, config: WhisperConfig): super().__init__(config) @@ -1396,14 +1394,6 @@ def forward( ) class WhisperForConditionalGeneration(WhisperPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"encoder.version", - r"decoder.version", - r"proj_out.weight", - ] - _keys_to_ignore_on_save = [ - r"proj_out.weight", - ] _tied_weights_keys = ["proj_out.weight"] def __init__(self, config: WhisperConfig): diff --git a/src/transformers/models/x_clip/modeling_x_clip.py b/src/transformers/models/x_clip/modeling_x_clip.py index 8db4ee0fd194..bcf91b0b51d5 100644 --- a/src/transformers/models/x_clip/modeling_x_clip.py +++ b/src/transformers/models/x_clip/modeling_x_clip.py @@ -139,7 +139,7 @@ def __init__(self, config: XCLIPVisionConfig): self.num_patches = (self.image_size // self.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1))) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] @@ -162,7 +162,9 @@ def __init__(self, config: XCLIPTextConfig): self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - 
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def forward( self, @@ -481,7 +483,6 @@ class XCLIPPreTrainedModel(PreTrainedModel): config_class = XCLIPConfig base_model_prefix = "x_clip" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" diff --git a/src/transformers/models/xglm/modeling_xglm.py b/src/transformers/models/xglm/modeling_xglm.py index b7172127d906..9e578cebf190 100755 --- a/src/transformers/models/xglm/modeling_xglm.py +++ b/src/transformers/models/xglm/modeling_xglm.py @@ -749,14 +749,6 @@ def custom_forward(*inputs): ) class XGLMForCausalLM(XGLMPreTrainedModel): base_model_prefix = "model" - _keys_to_ignore_on_load_missing = [ - r"model.embed_positions.weights", - r"embed_positions.weights", - r"lm_head.weight", - ] - _keys_to_ignore_on_save = [ - r"model.embed_positions.weights", - ] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config): diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py index a448b4b11631..d342cde80d3c 100755 --- a/src/transformers/models/xlm/modeling_xlm.py +++ b/src/transformers/models/xlm/modeling_xlm.py @@ -391,8 +391,6 @@ class XLMForQuestionAnsweringOutput(ModelOutput): XLM_START_DOCSTRING, ) class XLMModel(XLMPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -461,7 +459,9 @@ def __init__(self, config): # Initialize weights and apply final processing self.post_init() - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) def get_input_embeddings(self): return self.embeddings @@ -670,7 +670,6 @@ def forward(self, x, y=None): XLM_START_DOCSTRING, ) class XLMWithLMHeadModel(XLMPreTrainedModel): - _keys_to_ignore_on_load_missing = ["pred_layer.proj.weight"] _tied_weights_keys = ["pred_layer.proj.weight"] def __init__(self, config): diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py index 2d14bfb6a7b5..c84e3fac5aeb 100644 --- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py @@ -1768,7 +1768,6 @@ def prepare_predict_attention_mask(self, hidden_states, attention_mask): ) # Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetModel with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET class XLMProphetNetModel(XLMProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["decoder.word_embeddings.weight", "encoder.word_embeddings.weight"] _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"] def __init__(self, config: XLMProphetNetConfig): @@ -1899,11 +1898,6 @@ def forward( ) # Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForConditionalGeneration with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET class 
XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "decoder.word_embeddings.weight", - "encoder.word_embeddings.weight", - "lm_head.weight", - ] _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"] def __init__(self, config: XLMProphetNetConfig): @@ -2119,7 +2113,6 @@ def get_decoder(self): ) # Copied from transformers.models.prophetnet.modeling_prophetnet.ProphetNetForCausalLM with microsoft/prophetnet-large-uncased->patrickvonplaten/xprophetnet-large-uncased-standalone, ProphetNet->XLMProphetNet, PROPHETNET->XLM_PROPHETNET class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel): - _keys_to_ignore_on_load_missing = ["lm_head.weight"] _tied_weights_keys = ["lm_head.weight"] def __init__(self, config: XLMProphetNetConfig): diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index ae8d51a3f8eb..881f60875dbb 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -81,7 +81,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -616,15 +618,6 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, XLMRobertaEncoder): module.gradient_checkpointing = value - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! 
- self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - XLM_ROBERTA_START_DOCSTRING = r""" @@ -713,8 +706,6 @@ class XLMRobertaModel(XLMRobertaPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRoberta def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -885,9 +876,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForCausalLM(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -899,9 +887,6 @@ def __init__(self, config): self.roberta = XLMRobertaModel(config, add_pooling_layer=False) self.lm_head = XLMRobertaLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1044,9 +1029,6 @@ def _reorder_cache(self, past_key_values, beam_idx): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForMaskedLM(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1061,9 +1043,6 @@ def __init__(self, config): self.roberta = XLMRobertaModel(config, add_pooling_layer=False) self.lm_head = XLMRobertaLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1183,8 +1162,6 @@ def _tie_weights(self): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForSequenceClassification(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1285,8 +1262,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForMultipleChoice(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1382,9 +1357,6 @@ def forward( ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForTokenClassification(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - 
_keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1494,9 +1466,6 @@ def forward(self, features, **kwargs): ) # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->XLMRoberta, ROBERTA->XLM_ROBERTA class XLMRobertaForQuestionAnswering(XLMRobertaPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index fb86717e1d7f..4299880e0c4f 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -73,7 +73,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -599,15 +601,6 @@ def _init_weights(self, module): module.bias.data.zero_() module.weight.data.fill_(1.0) - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - XLM_ROBERTA_XL_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the @@ -679,8 +672,6 @@ class XLMRobertaXLModel(XLMRobertaXLPreTrainedModel): an input to the forward pass. .. 
_*Attention is all you need*: https://arxiv.org/abs/1706.03762 """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->XLMRobertaXL def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -850,9 +841,6 @@ def forward( XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForCausalLM(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -864,9 +852,6 @@ def __init__(self, config): self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False) self.lm_head = XLMRobertaXLLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - self.init_weights() def get_output_embeddings(self): @@ -1001,9 +986,6 @@ def _reorder_cache(self, past_key_values, beam_idx): """XLM-RoBERTa-xlarge Model with a `language modeling` head on top.""", XLM_ROBERTA_XL_START_DOCSTRING ) class XLMRobertaXLForMaskedLM(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] def __init__(self, config): @@ -1018,9 +1000,6 @@ def __init__(self, config): self.roberta = XLMRobertaXLModel(config, add_pooling_layer=False) self.lm_head = XLMRobertaXLLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - self.init_weights() def get_output_embeddings(self): @@ -1129,8 +1108,6 @@ def _tie_weights(self): XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForSequenceClassification(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1225,8 +1202,6 @@ def forward( XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForMultipleChoice(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) @@ -1318,9 +1293,6 @@ def forward( XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForTokenClassification(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels @@ -1432,9 +1404,6 @@ def forward(self, features, **kwargs): XLM_ROBERTA_XL_START_DOCSTRING, ) class XLMRobertaXLForQuestionAnswering(XLMRobertaXLPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py index bea8ab643b19..87bf48d61ed5 100755 --- a/src/transformers/models/xlnet/modeling_xlnet.py +++ 
b/src/transformers/models/xlnet/modeling_xlnet.py @@ -1292,7 +1292,6 @@ def forward( XLNET_START_DOCSTRING, ) class XLNetLMHeadModel(XLNetPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"lm_loss.weight"] _tied_weights_keys = ["lm_loss.weight"] def __init__(self, config): diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index d99b77fedda3..c44cded49952 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -74,7 +74,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -682,16 +684,6 @@ def _set_gradient_checkpointing(self, module, value=False): if isinstance(module, XmodEncoder): module.gradient_checkpointing = value - # Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel.update_keys_to_ignore - def update_keys_to_ignore(self, config, del_keys_to_ignore): - """Remove some keys from ignore list""" - if not config.tie_word_embeddings: - # must make a new list, or the class variable gets modified! - self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] - self._keys_to_ignore_on_load_missing = [ - k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore - ] - def set_default_language(self, language: str): """ Set the default language code for the model. This is used when the language is not specified in the input. 
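For reference, a minimal standalone sketch (not part of the patch) of the register_buffer(..., persistent=False) pattern this diff applies to the position_ids-style buffers: a non-persistent buffer stays registered on the module, so it still moves with .to() and .cuda(), but it is excluded from state_dict(), which is why checkpoints stop carrying it and loading no longer reports it as missing or unexpected. The ToyEmbeddings module and its default size below are invented for illustration.

import torch
from torch import nn


class ToyEmbeddings(nn.Module):
    def __init__(self, max_position_embeddings: int = 512):
        super().__init__()
        # Persistent buffer: included in state_dict() and therefore in saved checkpoints.
        self.register_buffer("token_type_ids", torch.zeros((1, max_position_embeddings), dtype=torch.long))
        # Non-persistent buffer: registered on the module but left out of state_dict().
        self.register_buffer(
            "position_ids", torch.arange(max_position_embeddings).expand((1, -1)), persistent=False
        )


module = ToyEmbeddings()
assert "token_type_ids" in module.state_dict()
assert "position_ids" not in module.state_dict()
assert "position_ids" in dict(module.named_buffers())  # still a buffer, just never serialized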
@@ -811,8 +803,6 @@ class XmodModel(XmodPreTrainedModel): """ - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Xmod def __init__(self, config, add_pooling_layer=True): super().__init__(config) @@ -989,9 +979,6 @@ def forward( XMOD_START_DOCSTRING, ) class XmodForCausalLM(XmodPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM.__init__ with Roberta->Xmod @@ -1004,9 +991,6 @@ def __init__(self, config): self.roberta = XmodModel(config, add_pooling_layer=False) self.lm_head = XmodLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1152,9 +1136,6 @@ def _reorder_cache(self, past_key_values, beam_idx): XMOD_START_DOCSTRING, ) class XmodForMaskedLM(XmodPreTrainedModel): - _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - _keys_to_ignore_on_load_unexpected = [r"pooler"] _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM.__init__ with Roberta->Xmod @@ -1170,9 +1151,6 @@ def __init__(self, config): self.roberta = XmodModel(config, add_pooling_layer=False) self.lm_head = XmodLMHead(config) - # The LM head weights require special treatment only when they are tied with the word embeddings - self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) - # Initialize weights and apply final processing self.post_init() @@ -1285,8 +1263,6 @@ def _tie_weights(self): XMOD_START_DOCSTRING, ) class XmodForSequenceClassification(XmodPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification.__init__ with Roberta->Xmod def __init__(self, config): super().__init__(config) @@ -1380,8 +1356,6 @@ def forward( XMOD_START_DOCSTRING, ) class XmodForMultipleChoice(XmodPreTrainedModel): - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice.__init__ with Roberta->Xmod def __init__(self, config): super().__init__(config) @@ -1471,9 +1445,6 @@ def forward( XMOD_START_DOCSTRING, ) class XmodForTokenClassification(XmodPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification.__init__ with Roberta->Xmod def __init__(self, config): super().__init__(config) @@ -1576,9 +1547,6 @@ def forward(self, features, **kwargs): XMOD_START_DOCSTRING, ) class XmodForQuestionAnswering(XmodPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] - _keys_to_ignore_on_load_missing = [r"position_ids"] - # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering.__init__ with Roberta->Xmod def 
__init__(self, config): super().__init__(config) diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index 8c2ff9fa4e07..4d4ef9a4f509 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -252,7 +252,9 @@ def __init__(self, config): self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2) + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False + ) self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", @@ -649,7 +651,6 @@ class YosoPreTrainedModel(PreTrainedModel): config_class = YosoConfig base_model_prefix = "yoso" supports_gradient_checkpointing = True - _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): """Initialize the weights""" @@ -849,11 +850,6 @@ def forward( @add_start_docstrings("""YOSO Model with a `language modeling` head on top.""", YOSO_START_DOCSTRING) class YosoForMaskedLM(YosoPreTrainedModel): - _keys_to_ignore_on_load_missing = [ - "cls.predictions.decoder.bias", - "cls.predictions.decoder.weight", - "embeddings.position_ids", - ] _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] def __init__(self, config): diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 49caa67d4f6c..7ca78e23b7f1 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -15,7 +15,6 @@ import unittest -from copy import deepcopy from transformers import RobertaConfig, is_torch_available from transformers.testing_utils import TestCasePlus, require_torch, slow, torch_device @@ -579,23 +578,3 @@ def test_inference_classification_head(self): # expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach() self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4)) - - # XXX: this might be a candidate for common tests if we have many of those - def test_lm_head_ignore_keys(self): - keys_to_ignore_on_save_tied = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] - keys_to_ignore_on_save_untied = [r"lm_head.decoder.bias"] - config = RobertaConfig.from_pretrained(ROBERTA_TINY) - config_tied = deepcopy(config) - config_tied.tie_word_embeddings = True - config_untied = deepcopy(config) - config_untied.tie_word_embeddings = False - for cls in [RobertaForMaskedLM, RobertaForCausalLM]: - model = cls(config_tied) - self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_tied, cls) - - # the keys should be different when embeddings aren't tied - model = cls(config_untied) - self.assertEqual(model._keys_to_ignore_on_save, keys_to_ignore_on_save_untied, cls) - - # test that saving works with updated ignore keys - just testing that it doesn't fail - model.save_pretrained(self.get_auto_remove_tmp_dir()) diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 07a8b16bfef7..878e3c647302 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -1562,7 +1562,7 @@ def check_same_values(layer_1, layer_2): @require_safetensors def test_can_use_safetensors(self): - config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() + config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: model_tied = model_class(config) with tempfile.TemporaryDirectory() as d: @@ -1579,6 +1579,8 @@ def test_can_use_safetensors(self): torch.testing.assert_close( v, reloaded_state[k], msg=lambda x: f"{model_class.__name__}: Tensor {k}: {x}" ) + # Checking there were no complaints about missing weights + self.assertEqual(infos["missing_keys"], []) # Checking the tensor sharing are correct ptrs = defaultdict(list) @@ -1595,6 +1597,25 @@ def test_can_use_safetensors(self): f"The shared pointers are incorrect, found different pointers for keys {shared_names}", ) + def test_load_save_without_tied_weights(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + config.tie_word_embeddings = False + for model_class in self.all_model_classes: + model = model_class(config) + with tempfile.TemporaryDirectory() as d: + model.save_pretrained(d) + + model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True) + # Checking the state dicts are correct + reloaded_state = model_reloaded.state_dict() + for k, v in model.state_dict().items(): + self.assertIn(k, reloaded_state, f"Key {k} is missing from reloaded") + torch.testing.assert_close( + v, reloaded_state[k], msg=lambda x: f"{model_class.__name__}: Tensor {k}: {x}" + ) + # Checking there were no complaints about missing weights + self.assertEqual(infos["missing_keys"], []) + def test_tied_weights_keys(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() config.tie_word_embeddings = True @@ -1620,55 +1641,72 @@ def test_tied_weights_keys(self): tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None] tied_params = [group for group in tied_params if len(group) > 1] - self.assertListEqual(tied_params, []) + self.assertListEqual( + tied_params, + [], + f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.", + ) - def test_tied_model_weights_key_ignore(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + def test_model_weights_reload_no_missing_tied_weights(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() for model_class in self.all_model_classes: - model_tied = model_class(config) - with tempfile.TemporaryDirectory() as d: - model_tied.save_pretrained(d) + model = model_class(config) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) # We are nuking ALL weights on file, so every parameter should # yell on load. We're going to detect if we yell too much, or too little. - with open(os.path.join(d, "pytorch_model.bin"), "wb") as f: + with open(os.path.join(tmp_dir, "pytorch_model.bin"), "wb") as f: torch.save({}, f) - model_reloaded, infos = model_class.from_pretrained(d, output_loading_info=True) - - # ! Actually we could use `state_dict()` and check iteratively the tensors which are the same (for instance using `tensor.data_ptr()`). to detect the duplicates. - # ```python - # model = GPT2LMHeadModel.from_pretrained("gpt2") - # "lm_head.weight" in model.state_dict().keys() # True - # "lm_head.weight" in model.named_parameters() # False - # In [6]: model.lm_head.weight.data_ptr() - # Out[6]: 139901378371648 - # In [9]: model.transformer.wte.weight.data_ptr() - # Out[9]: 139901378371648 # Same PTR, it's the same DATA ! we would need to check for stride too to be 100% accurate.
- # ``` + model_reloaded, infos = model_class.from_pretrained(tmp_dir, output_loading_info=True) prefix = f"{model_reloaded.base_model_prefix}." params = dict(model_reloaded.named_parameters()) params.update(dict(model_reloaded.named_buffers())) - # param_names = set(k[len(prefix) :] if k.startswith(prefix) else k for k in params.keys()) param_names = {k[len(prefix) :] if k.startswith(prefix) else k for k in params.keys()} missing_keys = set(infos["missing_keys"]) extra_missing = missing_keys - param_names - # missed_missing = param_names - missing_keys + # Remove tied weights from extra missing: they are normally not warned as missing if their tied + # counterpart is present but here there are no weights at all so we do get the warning. + ptrs = collections.defaultdict(list) + for name, tensor in model_reloaded.state_dict().items(): + ptrs[id_tensor_storage(tensor)].append(name) + tied_params = [names for _, names in ptrs.items() if len(names) > 1] + for group in tied_params: + group = {k[len(prefix) :] if k.startswith(prefix) else k for k in group} + # We remove the group from extra_missing if not all weights from group are in it + if len(group - extra_missing) > 0: + extra_missing = extra_missing - set(group) self.assertEqual( extra_missing, set(), - f"This model {model_class.__name__} might be missing some `keys_to_ignore`: {extra_missing}", + f"This model {model_class.__name__} might be missing some `keys_to_ignore`: {extra_missing}. " + f"For debugging, tied parameters are {tied_params}", ) - # self.assertEqual( - # missed_missing, - # set(), - # f"This model {model_class.__name__} ignores keys {missed_missing} but they look like real" - # " parameters", - # ) + missed_missing = param_names - missing_keys + # Remove nonpersistent buffers from missed_missing + buffers = [n for n, _ in model_reloaded.named_buffers()] + nonpersistent_buffers = {n for n in buffers if n not in model_reloaded.state_dict()} + nonpersistent_buffers = { + k[len(prefix) :] if k.startswith(prefix) else k for k in nonpersistent_buffers + } + missed_missing = missed_missing - nonpersistent_buffers + + if model_reloaded._keys_to_ignore_on_load_missing is None: + expected_missing = set() + else: + expected_missing = set(model_reloaded._keys_to_ignore_on_load_missing) + self.assertEqual( + missed_missing, + expected_missing, + f"This model {model_class.__name__} ignores keys {missed_missing} but they look like real" + " parameters. 
If they are non-persistent buffers, make sure to instantiate them with" + " `persistent=False`", + ) def test_model_outputs_equivalence(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 3b441ec7e582..17ddf1963a28 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -500,8 +500,8 @@ def test_checkpoint_variant_local_sharded(self): self.assertTrue(os.path.isfile(weights_index_file)) self.assertFalse(os.path.isfile(os.path.join(tmp_dir, WEIGHTS_INDEX_NAME))) - for i in range(1, 6): - weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00006"] + ["bin"]) + for i in range(1, 5): + weights_name = ".".join(WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00005"] + ["bin"]) weights_name_file = os.path.join(tmp_dir, weights_name) self.assertTrue(os.path.isfile(weights_name_file)) @@ -546,8 +546,8 @@ def test_checkpoint_variant_local_sharded_safe(self): self.assertTrue(os.path.isfile(weights_index_file)) self.assertFalse(os.path.isfile(os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME))) - for i in range(1, 6): - weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00006"] + ["safetensors"]) + for i in range(1, 5): - weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00005"] + ["safetensors"]) + weights_name = ".".join(SAFE_WEIGHTS_NAME.split(".")[:-1] + [f"v2-0000{i}-of-00005"] + ["safetensors"]) weights_name_file = os.path.join(tmp_dir, weights_name) self.assertTrue(os.path.isfile(weights_name_file))
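For reference, a minimal standalone sketch (not part of the patch) of the storage-based detection of tied weights that the reworked common tests lean on: state_dict() entries that share the same underlying storage are tied, so they can be discovered generically instead of being enumerated per model in _keys_to_ignore_on_load_missing. The tests above group names with the id_tensor_storage helper; plain data_ptr() stands in for it here, and TinyLM is an invented toy model with its LM head tied to the embedding matrix.

import collections

import torch
from torch import nn


class TinyLM(nn.Module):
    def __init__(self, vocab_size: int = 10, hidden_size: int = 4):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.decoder.weight = self.embed.weight  # weight tying: both names share one tensor


def find_tied_groups(model: nn.Module):
    # Group state-dict keys by the data pointer of the tensor they reference;
    # any group with more than one name is a set of tied/shared weights.
    ptrs = collections.defaultdict(list)
    for name, tensor in model.state_dict().items():
        ptrs[tensor.data_ptr()].append(name)
    return [names for names in ptrs.values() if len(names) > 1]


print(find_tied_groups(TinyLM()))  # [['embed.weight', 'decoder.weight']]

Grouping like this is what test_model_weights_reload_no_missing_tied_weights does before deciding which missing keys are real problems, and the groups it finds line up with the _tied_weights_keys declarations that replace the old per-model ignore lists throughout the diff.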