diff --git a/convert_tf_checkpoint_to_pytorch.py b/convert_tf_checkpoint_to_pytorch.py
index eeebb3728ee8..c4904af06eaf 100755
--- a/convert_tf_checkpoint_to_pytorch.py
+++ b/convert_tf_checkpoint_to_pytorch.py
@@ -68,6 +68,21 @@ def convert():
         arrays.append(array)
 
     for name, array in zip(names, arrays):
+
+        # include the output_layer in the model
+        if (name=="bert/embeddings/word_embeddings"):
+            pointer = model
+            pointer = getattr(pointer, 'output_layer')
+            pointer = getattr(pointer, 'weight')
+            assert pointer.shape == array.shape
+            pointer.data = torch.from_numpy(array)
+        elif (name=="cls/predictions/output_bias"):
+            pointer = model
+            pointer = getattr(pointer, 'output_layer')
+            pointer = getattr(pointer, 'bias')
+            assert pointer.shape == array.shape
+            pointer.data = torch.from_numpy(array)
+
         if not name.startswith("bert"):
             print("Skipping {}".format(name))
             continue
diff --git a/modeling.py b/modeling.py
index 66b0de68d9b1..456dd0e01512 100644
--- a/modeling.py
+++ b/modeling.py
@@ -277,7 +277,7 @@ class BERTEncoder(nn.Module):
     def __init__(self, config):
         super(BERTEncoder, self).__init__()
         layer = BERTLayer(config)
-        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, attention_mask):
         all_encoder_layers = []
@@ -330,6 +330,10 @@ def __init__(self, config: BertConfig):
         self.encoder = BERTEncoder(config)
         self.pooler = BERTPooler(config)
 
+        # the output weights are the same as the input embeddings,
+        # but there is an output-only bias for each token
+        self.output_layer = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+
     def forward(self, input_ids, token_type_ids=None, attention_mask=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
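
For reference, here is a rough usage sketch of the new `output_layer` (not part of the patch). It assumes the `BertConfig`/`BertModel` names and the `(all_encoder_layers, pooled_output)` return value from the surrounding modeling.py; the config values, inputs, and checkpoint handling are placeholders:

```python
import torch
from modeling import BertConfig, BertModel  # assumes modeling.py from this repo is importable

# Placeholder config; real values would come from the checkpoint's bert_config.json.
config = BertConfig(vocab_size=30522, hidden_size=768, num_hidden_layers=12,
                    num_attention_heads=12, intermediate_size=3072)
model = BertModel(config)  # in practice, the converted checkpoint weights would be loaded here
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))  # fake token ids: batch of 1, length 8
token_type_ids = torch.zeros_like(input_ids)             # single segment
attention_mask = torch.ones_like(input_ids)               # no padding

with torch.no_grad():
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, attention_mask)
    # the new output_layer maps the final hidden states to vocabulary logits for masked-LM prediction
    lm_logits = model.output_layer(all_encoder_layers[-1])  # shape: [1, 8, vocab_size]
```

One design note: the conversion script copies the TF word-embedding matrix into `output_layer.weight` rather than sharing the parameter, so the two matrices start out equal but would drift apart under further training. If true tying is wanted, one option is to assign the parameter directly (e.g. `model.output_layer.weight = model.embeddings.word_embeddings.weight`, using the attribute path from the surrounding modeling.py), so both modules reuse the same tensor.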