diff --git a/convert_tf_checkpoint_to_pytorch.py b/convert_tf_checkpoint_to_pytorch.py
index eeebb3728ee8..c4904af06eaf 100755
--- a/convert_tf_checkpoint_to_pytorch.py
+++ b/convert_tf_checkpoint_to_pytorch.py
@@ -68,6 +68,21 @@ def convert():
         arrays.append(array)
 
     for name, array in zip(names, arrays):
+
+        # include the output_layer in the model
+        if (name=="bert/embeddings/word_embeddings"):
+            pointer = model
+            pointer = getattr(pointer, 'output_layer')
+            pointer = getattr(pointer, 'weight')
+            assert pointer.shape == array.shape
+            pointer.data = torch.from_numpy(array)
+        elif (name=="cls/predictions/output_bias"):
+            pointer = model
+            pointer = getattr(pointer, 'output_layer')
+            pointer = getattr(pointer, 'bias')
+            assert pointer.shape == array.shape
+            pointer.data = torch.from_numpy(array)
+
         if not name.startswith("bert"):
             print("Skipping {}".format(name))
             continue
diff --git a/modeling.py b/modeling.py
index 66b0de68d9b1..456dd0e01512 100644
--- a/modeling.py
+++ b/modeling.py
@@ -277,7 +277,7 @@ class BERTEncoder(nn.Module):
     def __init__(self, config):
         super(BERTEncoder, self).__init__()
         layer = BERTLayer(config)
-        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, attention_mask):
         all_encoder_layers = []
@@ -330,6 +330,10 @@ def __init__(self, config: BertConfig):
         self.encoder = BERTEncoder(config)
         self.pooler = BERTPooler(config)
 
+        # the output weights are the same as the input embeddings,
+        # but there is an output-only bias for each token
+        self.output_layer = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
+
     def forward(self, input_ids, token_type_ids=None, attention_mask=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
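
For reference, here is a rough usage sketch of the new `output_layer` (not part of the patch). It assumes the `BertConfig`/`BertModel` names and the `(all_encoder_layers, pooled_output)` return value from the surrounding modeling.py; the config values, inputs, and checkpoint handling are placeholders:

```python
import torch
from modeling import BertConfig, BertModel  # assumes modeling.py from this repo is importable

# Placeholder config; real values would come from the checkpoint's bert_config.json.
config = BertConfig(vocab_size=30522, hidden_size=768, num_hidden_layers=12,
                    num_attention_heads=12, intermediate_size=3072)
model = BertModel(config)  # in practice, the converted checkpoint weights would be loaded here
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 8))  # fake token ids: batch of 1, length 8
token_type_ids = torch.zeros_like(input_ids)             # single segment
attention_mask = torch.ones_like(input_ids)               # no padding

with torch.no_grad():
    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, attention_mask)
    # the new output_layer maps the final hidden states to vocabulary logits for masked-LM prediction
    lm_logits = model.output_layer(all_encoder_layers[-1])  # shape: [1, 8, vocab_size]
```

One design note: the conversion script copies the TF word-embedding matrix into `output_layer.weight` rather than sharing the parameter, so the two matrices start out equal but would drift apart under further training. If true tying is wanted, one option is to assign the parameter directly (e.g. `model.output_layer.weight = model.embeddings.word_embeddings.weight`, using the attribute path from the surrounding modeling.py), so both modules reuse the same tensor.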