diff --git a/README.md b/README.md index b7d2b724939d..5dad8aa3a830 100644 --- a/README.md +++ b/README.md @@ -164,8 +164,9 @@ At some point in the future, you'll be able to seamlessly move from pre-training 14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper a [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine. 15. **[FlauBERT](https://github.com/getalp/Flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. 16. **[BART](https://github.com/pytorch/fairseq/tree/master/examples/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. -17. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users). -18. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR. +17. **[ELECTRA](https://github.com/google-research/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. +18. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users). +19. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR. These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html). diff --git a/docs/README.md b/docs/README.md index d1a8b24103ba..b4f576501940 100644 --- a/docs/README.md +++ b/docs/README.md @@ -47,6 +47,8 @@ Once you have setup `sphinx`, you can build the documentation by running the fol make html ``` +A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your browser. 
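If you prefer to serve the generated pages over HTTP instead of opening the file directly, a small helper along the lines of the sketch below works. It is only a convenience sketch, not part of the build tooling, and assumes Python 3.7+ run from the `docs` directory after `make html` has completed.

```python
# Serve the freshly built documentation at http://localhost:8000 and open it in a browser.
import functools
import os
import webbrowser
from http.server import HTTPServer, SimpleHTTPRequestHandler

build_dir = os.path.join("_build", "html")
# `directory=` requires Python 3.7+; it points the handler at the Sphinx output folder.
handler = functools.partial(SimpleHTTPRequestHandler, directory=build_dir)

server = HTTPServer(("localhost", 8000), handler)  # the socket is bound as soon as this returns
webbrowser.open("http://localhost:8000/index.html")
server.serve_forever()
```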
+ --- **NOTE** diff --git a/docs/source/conf.py b/docs/source/conf.py index 833832755101..5679ffc201fb 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -26,7 +26,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'2.6.0' +release = u'2.8.0' # -- General configuration --------------------------------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 5ce54d5d9727..aae05dd6047f 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -104,3 +104,4 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train model_doc/flaubert model_doc/bart model_doc/t5 + model_doc/electra \ No newline at end of file diff --git a/docs/source/migration.md b/docs/source/migration.md index f50d1dff0a8e..d838bf5c43b9 100644 --- a/docs/source/migration.md +++ b/docs/source/migration.md @@ -27,7 +27,7 @@ loss = outputs[0] # In transformers you can also have access to the logits: loss, logits = outputs[:2] -# And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation) +# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation) model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True) outputs = model(input_ids, labels=labels) loss, logits, attentions = outputs diff --git a/docs/source/model_doc/electra.rst b/docs/source/model_doc/electra.rst new file mode 100644 index 000000000000..d33a596256ac --- /dev/null +++ b/docs/source/model_doc/electra.rst @@ -0,0 +1,115 @@ +ELECTRA +---------------------------------------------------- + +The ELECTRA model was proposed in the paper. +`ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators `__. +ELECTRA is a new pre-training approach which trains two transformer models: the generator and the discriminator. The +generator's role is to replace tokens in a sequence, and is therefore trained as a masked language model. The discriminator, +which is the model we're interested in, tries to identify which tokens were replaced by the generator in the sequence. + +The abstract from the paper is the following: + +*Masked language modeling (MLM) pre-training methods such as BERT corrupt +the input by replacing some tokens with [MASK] and then train a model to +reconstruct the original tokens. While they produce good results when transferred +to downstream NLP tasks, they generally require large amounts of compute to be +effective. As an alternative, we propose a more sample-efficient pre-training task +called replaced token detection. Instead of masking the input, our approach +corrupts it by replacing some tokens with plausible alternatives sampled from a small +generator network. Then, instead of training a model that predicts the original +identities of the corrupted tokens, we train a discriminative model that predicts +whether each token in the corrupted input was replaced by a generator sample +or not. Thorough experiments demonstrate this new pre-training task is more +efficient than MLM because the task is defined over all input tokens rather than +just the small subset that was masked out. As a result, the contextual representations +learned by our approach substantially outperform the ones learned by BERT +given the same model size, data, and compute. 
The gains are particularly strong +for small models; for example, we train a model on one GPU for 4 days that +outperforms GPT (trained using 30x more compute) on the GLUE natural language +understanding benchmark. Our approach also works well at scale, where it +performs comparably to RoBERTa and XLNet while using less than 1/4 of their +compute and outperforms them when using the same amount of compute.* + +Tips: + +- ELECTRA is the pre-training approach, therefore there is nearly no changes done to the underlying model: BERT. The + only change is the separation of the embedding size and the hidden size -> The embedding size is generally smaller, + while the hidden size is larger. An additional projection layer (linear) is used to project the embeddings from + their embedding size to the hidden size. In the case where the embedding size is the same as the hidden size, no + projection layer is used. +- The ELECTRA checkpoints saved using `Google Research's implementation `__ + contain both the generator and discriminator. The conversion script requires the user to name which model to export + into the correct architecture. Once converted to the HuggingFace format, these checkpoints may be loaded into all + available ELECTRA models, however. This means that the discriminator may be loaded in the `ElectraForMaskedLM` model, + and the generator may be loaded in the `ElectraForPreTraining` model (the classification head will be randomly + initialized as it doesn't exist in the generator). + + +ElectraConfig +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ElectraConfig + :members: + + +ElectraTokenizer +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ElectraTokenizer + :members: + + +ElectraModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ElectraModel + :members: + + +ElectraForPreTraining +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ElectraForPreTraining + :members: + + +ElectraForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ElectraForMaskedLM + :members: + + +ElectraForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.ElectraForTokenClassification + :members: + + +TFElectraModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFElectraModel + :members: + + +TFElectraForPreTraining +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFElectraForPreTraining + :members: + + +TFElectraForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFElectraForMaskedLM + :members: + + +TFElectraForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFElectraForTokenClassification + :members: diff --git a/docs/source/model_doc/t5.rst b/docs/source/model_doc/t5.rst index 3e4c28cf31e1..9ae1499450d5 100644 --- a/docs/source/model_doc/t5.rst +++ b/docs/source/model_doc/t5.rst @@ -16,13 +16,45 @@ To facilitate future work on transfer learning for NLP, we release our dataset, The Authors' code can be found `here `_ . +Training +~~~~~~~~~~~~~~~~~~~~ +T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher forcing. +This means that for training we always need an input sequence and a target sequence. +The input sequence is fed to the model using ``input_ids``. The target sequence is shifted to the right, *i.e.* perprended by a start-sequence token and fed to the decoder using the `decoder_input_ids`. 
In teacher-forcing style, the target sequence is then appended by the EOS token and corresponds to the ``lm_labels``. The PAD token is hereby used as the start-sequence token. +T5 can be trained / fine-tuned both in a supervised and unsupervised fashion. + +- Unsupervised denoising training + In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) + and the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. + Each sentinel tokens represents a unique mask token for this sentence and should start with ````, ````, ... up to ````. As a default 100 sentinel tokens are available in ``T5Tokenizer``. + *E.g.* the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be processed as follows: + +:: + + input_ids = tokenizer.encode('The walks in park', return_tensors='pt') + lm_labels = tokenizer.encode(' cute dog the ', return_tensors='pt') + # the forward function automatically creates the correct decoder_input_ids + model(input_ids=input_ids, lm_labels=lm_labels) + +- Supervised training + In this setup the input sequence and output sequence are standard sequence to sequence input output mapping. + In translation, *e.g.* the input sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar." should + be processed as follows: + +:: + + input_ids = tokenizer.encode('translate English to German: The house is wonderful. ', return_tensors='pt') + lm_labels = tokenizer.encode('Das Haus ist wunderbar. ', return_tensors='pt') + # the forward function automatically creates the correct decoder_input_ids + model(input_ids=input_ids, lm_labels=lm_labels) + Tips ~~~~~~~~~~~~~~~~~~~~ - T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised - and supervised tasks and which each task is cast as a sequence to sequence task. - Therefore T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*. - For more information about the which prefix to use, it is easiest to look into Appendix D of the `paper `_ . -- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generating the decoder output. + and supervised tasks and for which each task is converted into a text-to-text format. + T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*. + For more information about which prefix to use, it is easiest to look into Appendix D of the `paper `_ . +- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output. - T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right. 
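Putting the training description and the tips together, a single supervised fine-tuning step could look like the sketch below. It assumes a PyTorch environment and the public ``t5-small`` checkpoint; the optimizer choice and learning rate are purely illustrative.

::

  from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW

  tokenizer = T5Tokenizer.from_pretrained('t5-small')
  model = T5ForConditionalGeneration.from_pretrained('t5-small')
  optimizer = AdamW(model.parameters(), lr=3e-4)  # illustrative hyperparameter

  # Teacher forcing: the forward pass builds the decoder_input_ids from lm_labels internally.
  input_ids = tokenizer.encode('translate English to German: The house is wonderful. ', return_tensors='pt')
  lm_labels = tokenizer.encode('Das Haus ist wunderbar. ', return_tensors='pt')

  outputs = model(input_ids=input_ids, lm_labels=lm_labels)
  loss = outputs[0]  # cross-entropy over the target tokens
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()

At inference time, ``T5ForConditionalGeneration.generate()`` can then be called on the same prefixed, tokenized input, as recommended in the tips above.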
diff --git a/docs/source/notebooks.md b/docs/source/notebooks.md new file mode 120000 index 000000000000..1ffa21de255f --- /dev/null +++ b/docs/source/notebooks.md @@ -0,0 +1 @@ +../../notebooks/README.md \ No newline at end of file diff --git a/docs/source/notebooks.rst b/docs/source/notebooks.rst deleted file mode 100644 index fe669e8e47f8..000000000000 --- a/docs/source/notebooks.rst +++ /dev/null @@ -1,16 +0,0 @@ -Notebooks -================================================ - -We include `three Jupyter Notebooks `_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model. - - -* - The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb `_\ ) extracts the hidden states of a full sequence on each layers of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden state of the models. - -* - The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb `_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models. - -* - The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb `_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model. - -Please follow the instructions given in the notebooks to run and modify them. diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 8fb7a447279b..5f7147e9c368 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -420,7 +420,7 @@ to generate the tokens following the initial sequence in PyTorch, and creating a sequence = f"Hugging Face is based in DUMBO, New York City, and is" input = tokenizer.encode(sequence, return_tensors="pt") - generated = model.generate(input, max_length=50) + generated = model.generate(input, max_length=50, do_sample=True) resulting_string = tokenizer.decode(generated.tolist()[0]) print(resulting_string) @@ -432,14 +432,10 @@ to generate the tokens following the initial sequence in PyTorch, and creating a model = TFAutoModelWithLMHead.from_pretrained("gpt2") sequence = f"Hugging Face is based in DUMBO, New York City, and is" - generated = tokenizer.encode(sequence) - - for i in range(50): - predictions = model(tf.constant([generated]))[0] - token = tf.argmax(predictions[0], axis=1)[-1].numpy() - generated += [token] + input = tokenizer.encode(sequence, return_tensors="tf") + generated = model.generate(input, max_length=50, do_sample=True) - resulting_string = tokenizer.decode(generated) + resulting_string = tokenizer.decode(generated.tolist()[0]) print(resulting_string) @@ -594,4 +590,138 @@ following array should be the output: :: - [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')] + 
[('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')] +Summarization +---------------------------------------------------- + +Summarization is the task of summarizing a text / an article into a shorter text. + +An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was created for the task of summarization. +If you would like to fine-tune a model on a summarization task, you may leverage the ``examples/summarization/bart/run_train.sh`` (leveraging pytorch-lightning) script. + +Here is an example using the pipelines do to summarization. +It leverages a Bart model that was fine-tuned on the CNN / Daily Mail data set. + +:: + + from transformers import pipeline + + summarizer = pipeline("summarization") + + ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. + A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. + Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. + In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. + Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the + 2010 marriage license application, according to court documents. + Prosecutors said the marriages were part of an immigration scam. + On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. + After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective + Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. + All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. + Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. + Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. + The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s + Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. + Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. + If convicted, Barrientos faces up to four years in prison. 
Her next court appearance is scheduled for May 18. + """ + + print(summarizer(ARTICLE, max_length=130, min_length=30)) + +Because the summarization pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments +of ``PretrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` and ``min_length`` above. +This outputs the following summary: + +:: + + Liana Barrientos has been married 10 times, sometimes within two weeks of each other. Prosecutors say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the Bronx on Friday. + +Here is an example doing summarization using a model and a tokenizer. The process is the following: + +- Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``. +- Define the article that should be summarizaed. +- Leverage the ``PretrainedModel.generate()`` method. +- Add the T5 specific prefix "summarize: ". + +Here Google`s T5 model is used that was only pre-trained on a multi-task mixed data set (including CNN / Daily Mail), but nevertheless yields very good results. +:: + + ## PYTORCH CODE + from transformers import AutoModelWithLMHead, AutoTokenizer + + model = AutoModelWithLMHead.from_pretrained("t5-base") + tokenizer = AutoTokenizer.from_pretrained("t5-base") + + # T5 uses a max_length of 512 so we cut the article to 512 tokens. + inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512) + outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True) + print(outputs) + + ## TENSORFLOW CODE + from transformers import TFAutoModelWithLMHead, AutoTokenizer + + model = TFAutoModelWithLMHead.from_pretrained("t5-base") + tokenizer = AutoTokenizer.from_pretrained("t5-base") + + # T5 uses a max_length of 512 so we cut the article to 512 tokens. + inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512) + outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True) + print(outputs) +Translation +---------------------------------------------------- + +Translation is the task of translating a text from one language to another. + +An example of a translation dataset is the WMT English to German dataset, which has English sentences as the input data +and German sentences as the target data. + +Here is an example using the pipelines do to translation. +It leverages a T5 model that was only pre-trained on a multi-task mixture dataset (including WMT), but yields impressive +translation results nevertheless. + +:: + + from transformers import pipeline + + translator = pipeline("translation_en_to_de") + print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40)) + +Because the translation pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments +of ``PretrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` above. +This outputs the following translation into German: + +:: + + Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris. + +Here is an example doing translation using a model and a tokenizer. The process is the following: + +- Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``. 
+- Define the article that should be summarizaed. +- Leverage the ``PretrainedModel.generate()`` method. +- Add the T5 specific prefix "translate English to German: " + +:: + + ## PYTORCH CODE + from transformers import AutoModelWithLMHead, AutoTokenizer + + model = AutoModelWithLMHead.from_pretrained("t5-base") + tokenizer = AutoTokenizer.from_pretrained("t5-base") + + inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt") + outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True) + + print(outputs) + + ## TENSORFLOW CODE + from transformers import TFAutoModelWithLMHead, AutoTokenizer + + model = TFAutoModelWithLMHead.from_pretrained("t5-base") + tokenizer = AutoTokenizer.from_pretrained("t5-base") + + inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf") + outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True) + + print(outputs) diff --git a/examples/benchmarks.py b/examples/benchmarks.py index b5eec4566aba..fb3f51d1c462 100644 --- a/examples/benchmarks.py +++ b/examples/benchmarks.py @@ -20,9 +20,10 @@ import argparse import csv +import logging import timeit from time import time -from typing import List +from typing import Callable, List from transformers import ( AutoConfig, @@ -46,10 +47,8 @@ input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as the Director of Hatcheries and Conditioning entered the room, in the - - - scarcely breathing silence, the absent-minded, soliloquizing hum or + whistle, of absorbed concentration. A troop of newly arrived students, very young, pink and callow, followed nervously, rather abjectly, at the Director's heels. 
Each of them carried a notebook, in which, whenever @@ -271,8 +270,9 @@ def create_setup_and_compute( amp: bool = False, fp16: bool = False, save_to_csv: bool = False, - csv_filename: str = f"results_{round(time())}.csv", + csv_time_filename: str = f"time_{round(time())}.csv", csv_memory_filename: str = f"memory_{round(time())}.csv", + print_fn: Callable[[str], None] = print, ): if xla: tf.config.optimizer.set_jit(True) @@ -282,7 +282,16 @@ def create_setup_and_compute( if tensorflow: dictionary = {model_name: {} for model_name in model_names} results = _compute_tensorflow( - model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose + model_names, + batch_sizes, + slice_sizes, + dictionary, + average_over, + amp, + no_speed, + no_memory, + verbose, + print_fn, ) else: device = "cuda" if (gpu and torch.cuda.is_available()) else "cpu" @@ -299,100 +308,107 @@ def create_setup_and_compute( no_speed, no_memory, verbose, + print_fn, ) - print("=========== RESULTS ===========") + print_fn("=========== RESULTS ===========") for model_name in model_names: - print("\t" + f"======= MODEL CHECKPOINT: {model_name} =======") + print_fn("\t" + f"======= MODEL CHECKPOINT: {model_name} =======") for batch_size in results[model_name]["bs"]: - print("\t\t" + f"===== BATCH SIZE: {batch_size} =====") + print_fn("\t\t" + f"===== BATCH SIZE: {batch_size} =====") for slice_size in results[model_name]["ss"]: - result = results[model_name]["results"][batch_size][slice_size] + time = results[model_name]["time"][batch_size][slice_size] memory = results[model_name]["memory"][batch_size][slice_size] - if isinstance(result, str): - print(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{result} " f"{memory}") + if isinstance(time, str): + print_fn(f"\t\t{model_name}/{batch_size}/{slice_size}: " f"{time} " f"{memory}") else: - print( + print_fn( f"\t\t{model_name}/{batch_size}/{slice_size}: " - f"{(round(1000 * result) / 1000)}" + f"{(round(1000 * time) / 1000)}" f"s " f"{memory}" ) if save_to_csv: - with open(csv_filename, mode="w") as csv_file, open(csv_memory_filename, mode="w") as csv_memory_file: - fieldnames = [ - "model", - "1x8", - "1x64", - "1x128", - "1x256", - "1x512", - "1x1024", - "2x8", - "2x64", - "2x128", - "2x256", - "2x512", - "2x1024", - "4x8", - "4x64", - "4x128", - "4x256", - "4x512", - "4x1024", - "8x8", - "8x64", - "8x128", - "8x256", - "8x512", - "8x1024", - ] - - writer = csv.DictWriter(csv_file, fieldnames=fieldnames) - writer.writeheader() - memory_writer = csv.DictWriter(csv_memory_file, fieldnames=fieldnames) + with open(csv_time_filename, mode="w") as csv_time_file, open( + csv_memory_filename, mode="w" + ) as csv_memory_file: + + assert len(model_names) > 0, "At least 1 model should be defined, but got {}".format(model_names) + + fieldnames = ["model", "batch_size", "sequence_length"] + time_writer = csv.DictWriter(csv_time_file, fieldnames=fieldnames + ["time_in_s"]) + time_writer.writeheader() + memory_writer = csv.DictWriter(csv_memory_file, fieldnames=fieldnames + ["memory"]) memory_writer.writeheader() for model_name in model_names: - model_results = { - f"{bs}x{ss}": results[model_name]["results"][bs][ss] - for bs in results[model_name]["results"] - for ss in results[model_name]["results"][bs] - } - writer.writerow({"model": model_name, **model_results}) - - model_memory_results = { - f"{bs}x{ss}": results[model_name]["memory"][bs][ss] - for bs in results[model_name]["memory"] - for ss in results[model_name]["memory"][bs] - } - 
memory_writer.writerow({"model": model_name, **model_memory_results}) - - -def print_summary_statistics(summary: MemorySummary): - print( + time_dict = results[model_name]["time"] + memory_dict = results[model_name]["memory"] + for bs in time_dict: + for ss in time_dict[bs]: + time_writer.writerow( + { + "model": model_name, + "batch_size": bs, + "sequence_length": ss, + "time_in_s": "{:.4f}".format(time_dict[bs][ss]), + } + ) + + for bs in memory_dict: + for ss in time_dict[bs]: + memory_writer.writerow( + { + "model": model_name, + "batch_size": bs, + "sequence_length": ss, + "memory": memory_dict[bs][ss], + } + ) + + +def print_summary_statistics(summary: MemorySummary, print_fn: Callable[[str], None]): + print_fn( "\nLines by line memory consumption:\n" + "\n".join( f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.sequential ) ) - print( + print_fn( "\nLines with top memory consumption:\n" + "\n".join( f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.cumulative[:6] ) ) - print( + print_fn( "\nLines with lowest memory consumption:\n" + "\n".join( f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}" for state in summary.cumulative[-6:] ) ) - print(f"\nTotal memory increase: {summary.total}") + print_fn(f"\nTotal memory increase: {summary.total}") + + +def get_print_function(save_print_log, log_filename): + if save_print_log: + logging.basicConfig( + level=logging.DEBUG, + filename=log_filename, + filemode="a+", + format="%(asctime)-15s %(levelname)-8s %(message)s", + ) + + def print_with_print_log(*args): + logging.info(*args) + print(*args) + + return print_with_print_log + else: + return print def _compute_pytorch( @@ -407,9 +423,10 @@ def _compute_pytorch( no_speed, no_memory, verbose, + print_fn, ): for c, model_name in enumerate(model_names): - print(f"{c + 1} / {len(model_names)}") + print_fn(f"{c + 1} / {len(model_names)}") config = AutoConfig.from_pretrained(model_name, torchscript=torchscript) model = AutoModel.from_pretrained(model_name, config=config) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -418,10 +435,13 @@ def _compute_pytorch( max_input_size = tokenizer.max_model_input_sizes[model_name] - dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}} - dictionary[model_name]["results"] = {i: {} for i in batch_sizes} + dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "time": {}, "memory": {}} + dictionary[model_name]["time"] = {i: {} for i in batch_sizes} dictionary[model_name]["memory"] = {i: {} for i in batch_sizes} + print_fn("Using model {}".format(model)) + print_fn("Number of all parameters {}".format(model.num_parameters())) + for batch_size in batch_sizes: if fp16: model.half() @@ -430,12 +450,12 @@ def _compute_pytorch( for slice_size in slice_sizes: if max_input_size is not None and slice_size > max_input_size: - dictionary[model_name]["results"][batch_size][slice_size] = "N/A" + dictionary[model_name]["time"][batch_size][slice_size] = "N/A" else: sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1) try: if torchscript: - print("Tracing model with sequence size", sequence.shape) + print_fn("Tracing model with sequence size {}".format(sequence.shape)) inference = torch.jit.trace(model, sequence) inference(sequence) else: @@ -451,33 +471,33 @@ def _compute_pytorch( summary = 
stop_memory_tracing(trace) if verbose: - print_summary_statistics(summary) + print_summary_statistics(summary, print_fn) dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total) else: dictionary[model_name]["memory"][batch_size][slice_size] = "N/A" if not no_speed: - print("Going through model with sequence of shape", sequence.shape) + print_fn("Going through model with sequence of shape".format(sequence.shape)) runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3) average_time = sum(runtimes) / float(len(runtimes)) / 3.0 - dictionary[model_name]["results"][batch_size][slice_size] = average_time + dictionary[model_name]["time"][batch_size][slice_size] = average_time else: - dictionary[model_name]["results"][batch_size][slice_size] = "N/A" + dictionary[model_name]["time"][batch_size][slice_size] = "N/A" except RuntimeError as e: - print("Doesn't fit on GPU.", e) + print_fn("Doesn't fit on GPU. {}".format(e)) torch.cuda.empty_cache() - dictionary[model_name]["results"][batch_size][slice_size] = "N/A" + dictionary[model_name]["time"][batch_size][slice_size] = "N/A" dictionary[model_name]["memory"][batch_size][slice_size] = "N/A" return dictionary def _compute_tensorflow( - model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose + model_names, batch_sizes, slice_sizes, dictionary, average_over, amp, no_speed, no_memory, verbose, print_fn ): for c, model_name in enumerate(model_names): - print(f"{c + 1} / {len(model_names)}") + print_fn(f"{c + 1} / {len(model_names)}") config = AutoConfig.from_pretrained(model_name) model = TFAutoModel.from_pretrained(model_name, config=config) tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -486,11 +506,12 @@ def _compute_tensorflow( max_input_size = tokenizer.max_model_input_sizes[model_name] - dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {}, "memory": {}} - dictionary[model_name]["results"] = {i: {} for i in batch_sizes} + dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "time": {}, "memory": {}} + dictionary[model_name]["time"] = {i: {} for i in batch_sizes} dictionary[model_name]["memory"] = {i: {} for i in batch_sizes} - print("Using model", model) + print_fn("Using model {}".format(model)) + print_fn("Number of all parameters {}".format(model.num_parameters())) @tf.function def inference(inputs): @@ -499,14 +520,14 @@ def inference(inputs): for batch_size in batch_sizes: for slice_size in slice_sizes: if max_input_size is not None and slice_size > max_input_size: - dictionary[model_name]["results"][batch_size][slice_size] = "N/A" + dictionary[model_name]["time"][batch_size][slice_size] = "N/A" else: sequence = tf.stack( [tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size ) try: - print("Going through model with sequence of shape", sequence.shape) + print_fn("Going through model with sequence of shape {}".format(sequence.shape)) # To make sure that the model is traced + that the tensors are on the appropriate device inference(sequence) @@ -517,7 +538,7 @@ def inference(inputs): summary = stop_memory_tracing(trace) if verbose: - print_summary_statistics(summary) + print_summary_statistics(summary, print_fn) dictionary[model_name]["memory"][batch_size][slice_size] = str(summary.total) else: @@ -526,13 +547,13 @@ def inference(inputs): if not no_speed: runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3) average_time = sum(runtimes) / 
float(len(runtimes)) / 3.0 - dictionary[model_name]["results"][batch_size][slice_size] = average_time + dictionary[model_name]["time"][batch_size][slice_size] = average_time else: - dictionary[model_name]["results"][batch_size][slice_size] = "N/A" + dictionary[model_name]["time"][batch_size][slice_size] = "N/A" except tf.errors.ResourceExhaustedError as e: - print("Doesn't fit on GPU.", e) - dictionary[model_name]["results"][batch_size][slice_size] = "N/A" + print_fn("Doesn't fit on GPU. {}".format(e)) + dictionary[model_name]["time"][batch_size][slice_size] = "N/A" dictionary[model_name]["memory"][batch_size][slice_size] = "N/A" return dictionary @@ -593,7 +614,25 @@ def main(): ) parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.") parser.add_argument( - "--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv." + "--log_print", required=False, action="store_true", help="Save all print statements in log file." + ) + parser.add_argument( + "--csv_time_filename", + required=False, + default=f"time_{round(time())}.csv", + help="CSV filename used if saving time results to csv.", + ) + parser.add_argument( + "--csv_memory_filename", + required=False, + default=f"memory_{round(time())}.csv", + help="CSV filename used if saving memory results to csv.", + ) + parser.add_argument( + "--log_filename", + required=False, + default=f"log_{round(time())}.txt", + help="Log filename used if print statements are saved in log.", ) parser.add_argument( "--average_over", required=False, default=30, type=int, help="Times an experiment will be run." @@ -614,11 +653,14 @@ def main(): "distilgpt2", "roberta-base", "ctrl", + "t5-base", + "bart-large", ] else: args.models = args.models.split() - print("Running with arguments", args) + print_fn = get_print_function(args.log_print, args.log_filename) + print_fn("Running with arguments: {}".format(args)) if args.torch: if is_torch_available(): @@ -631,11 +673,13 @@ def main(): torchscript=args.torchscript, fp16=args.fp16, save_to_csv=args.save_to_csv, - csv_filename=args.csv_filename, + csv_time_filename=args.csv_time_filename, + csv_memory_filename=args.csv_memory_filename, average_over=args.average_over, no_speed=args.no_speed, no_memory=args.no_memory, verbose=args.verbose, + print_fn=print_fn, ) else: raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.") @@ -650,11 +694,13 @@ def main(): xla=args.xla, amp=args.amp, save_to_csv=args.save_to_csv, - csv_filename=args.csv_filename, + csv_time_filename=args.csv_time_filename, + csv_memory_filename=args.csv_memory_filename, average_over=args.average_over, no_speed=args.no_speed, no_memory=args.no_memory, verbose=args.verbose, + print_fn=print_fn, ) else: raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.") diff --git a/examples/glue/run_pl_glue.py b/examples/glue/run_pl_glue.py index 18361f36db17..80cc1f8124f9 100644 --- a/examples/glue/run_pl_glue.py +++ b/examples/glue/run_pl_glue.py @@ -68,7 +68,7 @@ def prepare_data(self): output_mode=args.glue_output_mode, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token_segment_id=self.tokenizer.pad_token_type_id, ) logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, 
cached_features_file) @@ -192,5 +192,5 @@ def add_model_specific_args(parser, root_dir): # Optionally, predict on dev set and write to output_dir if args.do_predict: checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "checkpointepoch=*.ckpt"), recursive=True))) - GLUETransformer.load_from_checkpoint(checkpoints[-1]) + model = model.load_from_checkpoint(checkpoints[-1]) trainer.test(model) diff --git a/examples/hans/test_hans.py b/examples/hans/test_hans.py index a5d4e76149d4..56416b28bd82 100644 --- a/examples/hans/test_hans.py +++ b/examples/hans/test_hans.py @@ -342,8 +342,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): max_length=args.max_seq_length, output_mode=output_mode, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/ner/run_ner.py b/examples/ner/run_ner.py index 818ff91136b1..ba08e51da882 100644 --- a/examples/ner/run_ner.py +++ b/examples/ner/run_ner.py @@ -348,8 +348,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode): # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, pad_token_label_id=pad_token_label_id, ) if args.local_rank in [-1, 0]: diff --git a/examples/ner/run_pl_ner.py b/examples/ner/run_pl_ner.py index 6b484faa3887..6b84697891b8 100644 --- a/examples/ner/run_pl_ner.py +++ b/examples/ner/run_pl_ner.py @@ -64,8 +64,8 @@ def prepare_data(self): sep_token=self.tokenizer.sep_token, sep_token_extra=bool(args.model_type in ["roberta"]), pad_on_left=bool(args.model_type in ["xlnet"]), - pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=self.tokenizer.pad_token_id, + pad_token_segment_id=self.tokenizer.pad_token_type_id, pad_token_label_id=self.pad_token_label_id, ) logger.info("Saving features into cached file %s", cached_features_file) @@ -192,5 +192,5 @@ def add_model_specific_args(parser, root_dir): # https://github.com/PyTorchLightning/pytorch-lightning/blob/master\ # /pytorch_lightning/callbacks/model_checkpoint.py#L169 checkpoints = list(sorted(glob.glob(os.path.join(args.output_dir, "checkpointepoch=*.ckpt"), recursive=True))) - NERTransformer.load_from_checkpoint(checkpoints[-1]) + model = model.load_from_checkpoint(checkpoints[-1]) trainer.test(model) diff --git a/examples/ner/run_tf_ner.py b/examples/ner/run_tf_ner.py index 0a607ff662ea..cc76989cd737 100644 --- a/examples/ner/run_tf_ner.py +++ b/examples/ner/run_tf_ner.py @@ -157,7 +157,9 @@ def train( writer = tf.summary.create_file_writer("/tmp/mylogs") with strategy.scope(): - loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE) + loss_fct = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, 
reduction=tf.keras.losses.Reduction.NONE + ) optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"]) if args["fp16"]: @@ -205,11 +207,9 @@ def step_fn(train_features, train_labels): with tf.GradientTape() as tape: logits = model(train_features["input_ids"], **inputs)[0] - logits = tf.reshape(logits, (-1, len(labels) + 1)) - active_loss = tf.reshape(train_features["input_mask"], (-1,)) - active_logits = tf.boolean_mask(logits, active_loss) - train_labels = tf.reshape(train_labels, (-1,)) - active_labels = tf.boolean_mask(train_labels, active_loss) + active_loss = tf.reshape(train_labels, (-1,)) != pad_token_label_id + active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss) + active_labels = tf.boolean_mask(tf.reshape(train_labels, (-1,)), active_loss) cross_entropy = loss_fct(active_labels, active_logits) loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size) grads = tape.gradient(loss, model.trainable_variables) @@ -329,11 +329,9 @@ def evaluate(args, strategy, model, tokenizer, labels, pad_token_label_id, mode) with strategy.scope(): logits = model(eval_features["input_ids"], **inputs)[0] - tmp_logits = tf.reshape(logits, (-1, len(labels) + 1)) - active_loss = tf.reshape(eval_features["input_mask"], (-1,)) - active_logits = tf.boolean_mask(tmp_logits, active_loss) - tmp_eval_labels = tf.reshape(eval_labels, (-1,)) - active_labels = tf.boolean_mask(tmp_eval_labels, active_loss) + active_loss = tf.reshape(eval_labels, (-1,)) != pad_token_label_id + active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss) + active_labels = tf.boolean_mask(tf.reshape(eval_labels, (-1,)), active_loss) cross_entropy = loss_fct(active_labels, active_logits) loss += tf.reduce_sum(cross_entropy) * (1.0 / eval_batch_size) @@ -436,8 +434,8 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, batch_s # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 pad_on_left=bool(args["model_type"] in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, pad_token_label_id=pad_token_label_id, ) logging.info("Saving features into cached file %s", cached_features_file) @@ -497,8 +495,8 @@ def main(_): ) labels = get_labels(args["labels"]) - num_labels = len(labels) + 1 - pad_token_label_id = 0 + num_labels = len(labels) + pad_token_label_id = -1 config = AutoConfig.from_pretrained( args["config_name"] if args["config_name"] else args["model_name_or_path"], num_labels=num_labels, @@ -522,7 +520,6 @@ def main(_): config=config, cache_dir=args["cache_dir"] if args["cache_dir"] else None, ) - model.layers[-1].activation = tf.keras.activations.softmax train_batch_size = args["per_device_train_batch_size"] * args["n_device"] train_dataset, num_train_examples = load_and_cache_examples( diff --git a/examples/run_glue.py b/examples/run_glue.py index 72fdc2b497ab..818223bf80c1 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -360,8 +360,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): max_length=args.max_seq_length, output_mode=output_mode, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/run_language_modeling.py b/examples/run_language_modeling.py index 5ab9ca31f995..2b0163d96a67 100644 --- a/examples/run_language_modeling.py +++ b/examples/run_language_modeling.py @@ -233,6 +233,9 @@ def collate(examples: List[torch.Tensor]): else: t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + model = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training + model.resize_token_embeddings(len(tokenizer)) + # Prepare optimizer and schedule (linear warmup and decay) no_decay = ["bias", "LayerNorm.weight"] optimizer_grouped_parameters = [ @@ -309,9 +312,6 @@ def collate(examples: List[torch.Tensor]): tr_loss, logging_loss = 0.0, 0.0 - model_to_resize = model.module if hasattr(model, "module") else model # Take care of distributed/parallel training - model_to_resize.resize_token_embeddings(len(tokenizer)) - model.zero_grad() train_iterator = trange( epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0] @@ -624,6 +624,7 @@ def main(): and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir + and not args.should_continue ): raise ValueError( "Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format( diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py index c4f90bbad75e..578ce0122610 100644 --- a/examples/run_multiple_choice.py +++ b/examples/run_multiple_choice.py @@ -361,7 +361,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False): args.max_seq_length, tokenizer, pad_on_left=bool(args.model_type in ["xlnet"]), # pad on the left for xlnet - pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/run_xnli.py b/examples/run_xnli.py index 9dcae8568fb3..e51a8408b8d7 100644 --- a/examples/run_xnli.py +++ b/examples/run_xnli.py @@ -350,8 +350,8 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): max_length=args.max_seq_length, output_mode=output_mode, pad_on_left=False, - pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0], - pad_token_segment_id=0, + pad_token=tokenizer.pad_token_id, + pad_token_segment_id=tokenizer.pad_token_type_id, ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/examples/summarization/bart/evaluate_cnn.py b/examples/summarization/bart/evaluate_cnn.py index 5c69dc921fc4..fe682257e311 100644 --- a/examples/summarization/bart/evaluate_cnn.py +++ b/examples/summarization/bart/evaluate_cnn.py @@ -16,15 +16,17 @@ def chunks(lst, n): yield lst[i : i + n] -def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE): +def generate_summaries( + examples: list, out_file: str, model_name: str, batch_size: int = 8, device: str = DEFAULT_DEVICE +): fout = Path(out_file).open("w") - model = BartForConditionalGeneration.from_pretrained("bart-large-cnn", output_past=True,).to(device) + model = BartForConditionalGeneration.from_pretrained(model_name, output_past=True,).to(device) tokenizer = BartTokenizer.from_pretrained("bart-large") max_length = 140 min_length = 55 - for batch in tqdm(list(chunks(lns, batch_size))): + for batch in tqdm(list(chunks(examples, batch_size))): dct = tokenizer.batch_encode_plus(batch, max_length=1024, return_tensors="pt", pad_to_max_length=True) summaries = model.generate( input_ids=dct["input_ids"].to(device), @@ -43,7 +45,7 @@ def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE): fout.flush() -def _run_generate(): +def run_generate(): parser = argparse.ArgumentParser() parser.add_argument( "source_path", type=str, help="like cnn_dm/test.source", @@ -51,6 +53,9 @@ def _run_generate(): parser.add_argument( "output_path", type=str, help="where to save summaries", ) + parser.add_argument( + "model_name", type=str, default="bart-large-cnn", help="like bart-large-cnn", + ) parser.add_argument( "--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.", ) @@ -58,9 +63,9 @@ def _run_generate(): "--bs", type=int, default=8, required=False, help="batch size: how many to summarize at a time", ) args = parser.parse_args() - lns = [" " + x.rstrip() for x in open(args.source_path).readlines()] - generate_summaries(lns, args.output_path, batch_size=args.bs, device=args.device) + examples = [" " + x.rstrip() for x in open(args.source_path).readlines()] + generate_summaries(examples, args.output_path, args.model_name, batch_size=args.bs, device=args.device) if __name__ == 
"__main__": - _run_generate() + run_generate() diff --git a/examples/summarization/bart/test_bart_examples.py b/examples/summarization/bart/test_bart_examples.py index 18064cc5d23c..40be3b5668d2 100644 --- a/examples/summarization/bart/test_bart_examples.py +++ b/examples/summarization/bart/test_bart_examples.py @@ -1,16 +1,13 @@ import logging -import os import sys import tempfile import unittest from pathlib import Path from unittest.mock import patch -from .evaluate_cnn import _run_generate +from .evaluate_cnn import run_generate -output_file_name = "output_bart_sum.txt" - articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."] logging.basicConfig(level=logging.DEBUG) @@ -25,8 +22,11 @@ def test_bart_cnn_cli(self): tmp = Path(tempfile.gettempdir()) / "utest_generations_bart_sum.hypo" with tmp.open("w") as f: f.write("\n".join(articles)) - testargs = ["evaluate_cnn.py", str(tmp), output_file_name] + + output_file_name = Path(tempfile.gettempdir()) / "utest_output_bart_sum.hypo" + + testargs = ["evaluate_cnn.py", str(tmp), str(output_file_name), "sshleifer/bart-tiny-random"] + with patch.object(sys, "argv", testargs): - _run_generate() + run_generate() self.assertTrue(Path(output_file_name).exists()) - os.remove(Path(output_file_name)) diff --git a/examples/summarization/t5/evaluate_cnn.py b/examples/summarization/t5/evaluate_cnn.py index 535c11093b68..3c923a46d7ad 100644 --- a/examples/summarization/t5/evaluate_cnn.py +++ b/examples/summarization/t5/evaluate_cnn.py @@ -64,7 +64,7 @@ def run_generate(): parser.add_argument( "model_size", type=str, - help="T5 model size, either 't5-small', 't5-base' or 't5-large'. Defaults to base.", + help="T5 model size, either 't5-small', 't5-base', 't5-large', 't5-3b', 't5-11b'. 
Defaults to 't5-base'.", default="t5-base", ) parser.add_argument( diff --git a/examples/summarization/t5/test_t5_examples.py b/examples/summarization/t5/test_t5_examples.py index 57f3e342d77b..340ea49d8cbb 100644 --- a/examples/summarization/t5/test_t5_examples.py +++ b/examples/summarization/t5/test_t5_examples.py @@ -1,5 +1,4 @@ import logging -import os import sys import tempfile import unittest @@ -26,10 +25,20 @@ def test_t5_cli(self): tmp = Path(tempfile.gettempdir()) / "utest_generations_t5_sum.hypo" with tmp.open("w") as f: f.write("\n".join(articles)) - testargs = ["evaluate_cnn.py", "t5-small", str(tmp), output_file_name, str(tmp), score_file_name] + + output_file_name = Path(tempfile.gettempdir()) / "utest_output_t5_sum.hypo" + score_file_name = Path(tempfile.gettempdir()) / "utest_score_t5_sum.hypo" + + testargs = [ + "evaluate_cnn.py", + "patrickvonplaten/t5-tiny-random", + str(tmp), + str(output_file_name), + str(tmp), + str(score_file_name), + ] + with patch.object(sys, "argv", testargs): run_generate() self.assertTrue(Path(output_file_name).exists()) self.assertTrue(Path(score_file_name).exists()) - os.remove(Path(output_file_name)) - os.remove(Path(score_file_name)) diff --git a/examples/translation/t5/evaluate_wmt.py b/examples/translation/t5/evaluate_wmt.py index 307065d0a996..533811271b7a 100644 --- a/examples/translation/t5/evaluate_wmt.py +++ b/examples/translation/t5/evaluate_wmt.py @@ -14,13 +14,13 @@ def chunks(lst, n): yield lst[i : i + n] -def generate_translations(lns, output_file_path, batch_size, device): +def generate_translations(lns, output_file_path, model_size, batch_size, device): output_file = Path(output_file_path).open("w") - model = T5ForConditionalGeneration.from_pretrained("t5-base") + model = T5ForConditionalGeneration.from_pretrained(model_size) model.to(device) - tokenizer = T5Tokenizer.from_pretrained("t5-base") + tokenizer = T5Tokenizer.from_pretrained(model_size) # update config with summarization specific params task_specific_params = model.config.task_specific_params @@ -52,6 +52,12 @@ def calculate_bleu_score(output_lns, refs_lns, score_path): def run_generate(): parser = argparse.ArgumentParser() + parser.add_argument( + "model_size", + type=str, + help="T5 model size, either 't5-small', 't5-base', 't5-large', 't5-3b', 't5-11b'. 
Defaults to 't5-base'.", + default="t5-base", + ) parser.add_argument( "input_path", type=str, help="like wmt/newstest2013.en", ) @@ -78,7 +84,7 @@ def run_generate(): input_lns = [x.strip().replace(dash_pattern[0], dash_pattern[1]) for x in open(args.input_path).readlines()] - generate_translations(input_lns, args.output_path, args.batch_size, args.device) + generate_translations(input_lns, args.output_path, args.model_size, args.batch_size, args.device) output_lns = [x.strip() for x in open(args.output_path).readlines()] refs_lns = [x.strip().replace(dash_pattern[0], dash_pattern[1]) for x in open(args.reference_path).readlines()] diff --git a/examples/translation/t5/test_t5_examples.py b/examples/translation/t5/test_t5_examples.py index eea17c227a14..b33cba11c2da 100644 --- a/examples/translation/t5/test_t5_examples.py +++ b/examples/translation/t5/test_t5_examples.py @@ -1,5 +1,4 @@ import logging -import os import sys import tempfile import unittest @@ -33,11 +32,19 @@ def test_t5_cli(self): with tmp_target.open("w") as f: f.write("\n".join(translation)) - testargs = ["evaluate_wmt.py", str(tmp_source), output_file_name, str(tmp_target), score_file_name] + output_file_name = Path(tempfile.gettempdir()) / "utest_output_trans.hypo" + score_file_name = Path(tempfile.gettempdir()) / "utest_score.hypo" + + testargs = [ + "evaluate_wmt.py", + "patrickvonplaten/t5-tiny-random", + str(tmp_source), + str(output_file_name), + str(tmp_target), + str(score_file_name), + ] with patch.object(sys, "argv", testargs): run_generate() self.assertTrue(Path(output_file_name).exists()) self.assertTrue(Path(score_file_name).exists()) - os.remove(Path(output_file_name)) - os.remove(Path(score_file_name)) diff --git a/model_cards/NLP4H/ms_bert/README.md b/model_cards/NLP4H/ms_bert/README.md new file mode 100644 index 000000000000..a31ec2827a85 --- /dev/null +++ b/model_cards/NLP4H/ms_bert/README.md @@ -0,0 +1,54 @@ +# MS-bert + +## Introduction + +This repository provides codes and models of MS-BERT. +MS-BERT was pre-trained on notes from neurological examination for Multiple Sclerosis (MS) patients at St. Michael's Hospital in Toronto, Canada. + +## Data + +The dataset contained approximately 75,000 clinical notes, for about 5000 patients, totaling to over 35.7 million words. +These notes were collected from patients who visited St. Michael's Hospital MS Clinic between 2015 to 2019. +The notes contained a variety of information pertaining to a neurological exam. +For example, a note can contain information on the patient's condition, their progress over time and diagnosis. +The gender split within the dataset was observed to be 72% female and 28% male ([which reflects the natural discrepancy seen in MS][1]). +Further sections will describe how MS-BERT was pre trained through the use of these clinically relevant and rich neurological notes. + +## Data pre-processing + +The data was pre-processed to remove any identifying information. This includes information on: patient names, doctor names, hospital names, patient identification numbers, phone numbers, addresses, and time. In order to de-identify the information, we used a curated database that contained patient and doctor information. This curated database was paired with regular expressions to find and remove any identifying pieces of information. Each of these identifiers were replaced with a specific token. 
These tokens were chosen based on three criteria: (1) they belong to the current BERT vocab, (2) they have roughly the same semantic meaning as the word they are replacing, and (3) the token is not found in the original unprocessed dataset. The replacements that met the criteria above were as follows: + +Female first names -> Lucie + +Male first names -> Ezekiel + +Last/family names -> Salamanca + +Dates -> 2010s + +Patient IDs -> 999 + +Phone numbers -> 1718 + +Addresses -> Silesia + +Time -> 1610 + +Locations/Hospital/Clinic names -> Troy + +## Pre-training + +The starting point for our model is the already pre-trained and fine-tuned BLUE-BERT base. We further pre-train it using the masked language modelling task from the huggingface transformers [library](https://github.com/huggingface). + +The hyperparameters can be found in the config file in this repository or [here](https://s3.amazonaws.com/models.huggingface.co/bert/NLP4H/ms_bert/config.json). + +## Acknowledgements + +We would like to thank the researchers and staff at the Data Science and Advanced Analytics (DSAA) team, St. Michael’s Hospital, for providing consistent support and guidance throughout this project. +We would also like to thank Dr. Marzyeh Ghassemi, Taylor Killan, Nathan Ng and Haoran Zhang for providing us with the opportunity to work on this exciting project. + +## Disclaimer + +MS-BERT shows the results of research conducted at the Data Science and Advanced Analytics (DSAA) team, St. Michael’s Hospital. The results produced by MS-BERT are not intended for direct diagnostic use or medical decision-making without review and oversight by a clinical professional. Individuals should not make decisions about their health solely on the basis of the results produced by MS-BERT. St. Michael’s Hospital does not independently verify the validity or utility of the results produced by MS-BERT. If you have questions about the results produced by MS-BERT, please consult a healthcare professional. More information about St. Michael’s Hospital’s disclaimer policy can be found.
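For readers who want to try the checkpoint, a minimal loading sketch (our addition, not part of the original card; the `NLP4H/ms_bert` identifier is inferred from the config URL above, and the standard BERT auto classes are assumed to apply):

```python
from transformers import AutoModel, AutoTokenizer

# Assumed model ID, inferred from the config URL above.
model_name = "NLP4H/ms_bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Encode a short, de-identified style note and extract contextual features.
note = "Lucie was seen in the MS clinic and reports improved gait since the last visit."
input_ids = tokenizer.encode(note, return_tensors="pt")
sequence_output, pooled_output = model(input_ids)
```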
+ +[1]: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3707353/ diff --git a/model_cards/ahotrod/albert_xxlargev1_squad2_512/README.md b/model_cards/ahotrod/albert_xxlargev1_squad2_512/README.md index a7cccda85cb0..61e0c291a5c9 100644 --- a/model_cards/ahotrod/albert_xxlargev1_squad2_512/README.md +++ b/model_cards/ahotrod/albert_xxlargev1_squad2_512/README.md @@ -64,27 +64,8 @@ TensorFlow: 2.1.0 Python: 3.7.6 ``` -### Inferencing / prediction works with the current Transformers v2.4.1 - -### Access this albert_xxlargev1_sqd2_512 fine-tuned model with "tried & true" code: +### Access this albert_xxlargev1_sqd2_512 fine-tuned model with: ```python -config_class, model_class, tokenizer_class = \ - AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer - -model_name_or_path = "ahotrod/albert_xxlargev1_squad2_512" -config = config_class.from_pretrained(model_name_or_path) -tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True) -model = model_class.from_pretrained(model_name_or_path, config=config) -``` - -### or the AutoModels (AutoConfig, AutoTokenizer & AutoModel) should also work, however I have yet to use them in my app & confirm: - -```python -from transformers import AutoConfig, AutoTokenizer, AutoModel - -model_name_or_path = "ahotrod/albert_xxlargev1_squad2_512" -config = AutoConfig.from_pretrained(model_name_or_path) -tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True) -model = AutoModel.from_pretrained(model_name_or_path, config=config) -``` \ No newline at end of file +tokenizer = AutoTokenizer.from_pretrained("ahotrod/albert_xxlargev1_squad2_512") +model = AutoModelForQuestionAnswering.from_pretrained("ahotrod/albert_xxlargev1_squad2_512") diff --git a/model_cards/ahotrod/roberta_large_squad2/README.md b/model_cards/ahotrod/roberta_large_squad2/README.md new file mode 100644 index 000000000000..77a0c5841b63 --- /dev/null +++ b/model_cards/ahotrod/roberta_large_squad2/README.md @@ -0,0 +1,68 @@ +## RoBERTa-large language model fine-tuned on SQuAD2.0 + +### with the following results: + +``` + "exact": 84.02257222269014, + "f1": 87.47063479332766, + "total": 11873, + "HasAns_exact": 81.19095816464238, + "HasAns_f1": 88.0969714745582, + "HasAns_total": 5928, + "NoAns_exact": 86.84608915054667, + "NoAns_f1": 86.84608915054667, + "NoAns_total": 5945, + "best_exact": 84.02257222269014, + "best_exact_thresh": 0.0, + "best_f1": 87.47063479332759, + "best_f1_thresh": 0.0 +``` +### from script: +``` +python -m torch.distributed.launch --nproc_per_node=2 ${RUN_SQUAD_DIR}/run_squad.py \ + --model_type roberta \ + --model_name_or_path roberta-large \ + --do_train \ + --train_file ${SQUAD_DIR}/train-v2.0.json \ + --predict_file ${SQUAD_DIR}/dev-v2.0.json \ + --version_2_with_negative \ + --num_train_epochs 2 \ + --warmup_steps 328 \ + --weight_decay 0.01 \ + --do_lower_case \ + --learning_rate 1.5e-5 \ + --max_seq_length 512 \ + --doc_stride 128 \ + --save_steps 1000 \ + --per_gpu_train_batch_size 1 \ + --gradient_accumulation_steps 24 \ + --logging_steps 50 \ + --threads 10 \ + --overwrite_cache \ + --overwrite_output_dir \ + --output_dir ${MODEL_PATH} + +python ${RUN_SQUAD_DIR}/run_squad.py \ + --model_type roberta \ + --model_name_or_path ${MODEL_PATH} \ + --do_eval \ + --train_file ${SQUAD_DIR}/train-v2.0.json \ + --predict_file ${SQUAD_DIR}/dev-v2.0.json \ + --version_2_with_negative \ + --do_lower_case \ + --max_seq_length 512 \ + --per_gpu_eval_batch_size 24 \ + --eval_all_checkpoints \ + --overwrite_output_dir \ + --output_dir 
${MODEL_PATH} +$@ +``` +### using the following system & software: +``` +OS/Platform: Linux-4.15.0-91-generic-x86_64-with-debian-buster-sid +GPU/CPU: 2 x NVIDIA 1080Ti / Intel i7-8700 +Transformers: 2.7.0 +PyTorch: 1.4.0 +TensorFlow: 2.1.0 +Python: 3.7.7 +``` diff --git a/model_cards/ahotrod/xlnet_large_squad2_512/README.md b/model_cards/ahotrod/xlnet_large_squad2_512/README.md index f2850032ec2f..a680704af15b 100644 --- a/model_cards/ahotrod/xlnet_large_squad2_512/README.md +++ b/model_cards/ahotrod/xlnet_large_squad2_512/README.md @@ -56,22 +56,8 @@ PyTorch: 1.4.0 TensorFlow: 2.1.0 Python: 3.7.6 ``` -### Inferencing / prediction works with Transformers v2.4.1, the latest version tested - ### Utilize this xlnet_large_squad2_512 fine-tuned model with: ```python -config_class, model_class, tokenizer_class = \ - XLNetConfig, XLNetforQuestionAnswering, XLNetTokenizer -model_name_or_path = "ahotrod/xlnet_large_squad2_512" -config = config_class.from_pretrained(model_name_or_path) -tokenizer = tokenizer_class.from_pretrained(model_name_or_path, do_lower_case=True) -model = model_class.from_pretrained(model_name_or_path, config=config) -``` -### or the AutoModels (AutoConfig, AutoTokenizer & AutoModel) should also work, however I have yet to use them in my apps & confirm: -```python -from transformers import AutoConfig, AutoTokenizer, AutoModel -model_name_or_path = "ahotrod/xlnet_large_squad2_512" -config = AutoConfig.from_pretrained(model_name_or_path) -tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, do_lower_case=True) -model = AutoModel.from_pretrained(model_name_or_path, config=config) +tokenizer = AutoTokenizer.from_pretrained("ahotrod/xlnet_large_squad2_512") +model = AutoModelForQuestionAnswering.from_pretrained("ahotrod/xlnet_large_squad2_512") ``` diff --git a/model_cards/albert-base-v1-README.md b/model_cards/albert-base-v1-README.md new file mode 100644 index 000000000000..909441c232b1 --- /dev/null +++ b/model_cards/albert-base-v1-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=albert-base-v1) diff --git a/model_cards/albert-xxlarge-v2-README.md b/model_cards/albert-xxlarge-v2-README.md new file mode 100644 index 000000000000..c8df2f754444 --- /dev/null +++ b/model_cards/albert-xxlarge-v2-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=albert-xxlarge-v2) diff --git a/model_cards/allenai/biomed_roberta_base/README.md b/model_cards/allenai/biomed_roberta_base/README.md new file mode 100644 index 000000000000..66c0371d1134 --- /dev/null +++ b/model_cards/allenai/biomed_roberta_base/README.md @@ -0,0 +1,38 @@ +--- +thumbnail: https://huggingface.co/front/thumbnails/allenai.png +--- + +# BioMed-RoBERTa-base + +BioMed-RoBERTa-base is a language model based on the RoBERTa-base (Liu et. al, 2019) architecture. We adapt RoBERTa-base to 2.68 million scientific papers from the [Semantic Scholar](https://www.semanticscholar.org) corpus via continued pretraining. This amounts to 7.55B tokens and 47GB of data. We use the full text of the papers in training, not just abstracts. + +Specific details of the adaptive pretraining procedure can be found in Gururangan et. al, 2020. 
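As a quick orientation, a minimal feature-extraction sketch (our addition; the card itself does not prescribe usage code, and we assume the standard auto classes resolve the RoBERTa-base architecture):

```python
from transformers import AutoModel, AutoTokenizer

# Assumed usage: BioMed-RoBERTa-base keeps the RoBERTa-base architecture.
tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
model = AutoModel.from_pretrained("allenai/biomed_roberta_base")

sentence = "The patient was started on 50 mg of prednisone daily."
input_ids = tokenizer.encode(sentence, return_tensors="pt")
sequence_output, pooled_output = model(input_ids)  # features for downstream fine-tuning
```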
+ + +## Evaluation + +BioMed-RoBERTa achieves competitive performance to state of the art models on a number of NLP tasks in the biomedical domain (numbers are mean (standard deviation) over 3+ random seeds) + + +| Task | Task Type | RoBERTa-base | BioMed-RoBERTa-base | +|--------------|---------------------|--------------|---------------------| +| RCT-180K | Text Classification | 86.4 (0.3) | 86.9 (0.2) | +| ChemProt | Relation Extraction | 81.1 (1.1) | 83.0 (0.7) | +| JNLPBA | NER | 74.3 (0.2) | 75.2 (0.1) | +| BC5CDR | NER | 85.6 (0.1) | 87.8 (0.1) | +| NCBI-Disease | NER | 86.6 (0.3) | 87.1 (0.8) | + +More evaluations TBD. + +## Citation + +If using this model, please cite the following paper: + +```bibtex +@inproceedings{domains, + author = {Suchin Gururangan and Ana Marasović and Swabha Swayamdipta and Kyle Lo and Iz Beltagy and Doug Downey and Noah A. Smith}, + title = {Don't Stop Pretraining: Adapt Language Models to Domains and Tasks}, + year = {2020}, + booktitle = {Proceedings of ACL}, +} +``` diff --git a/model_cards/bert-base-cased-README.md b/model_cards/bert-base-cased-README.md new file mode 100644 index 000000000000..e6e1f4b5ea94 --- /dev/null +++ b/model_cards/bert-base-cased-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=bert-base-cased) diff --git a/model_cards/bert-base-german-cased-README.md b/model_cards/bert-base-german-cased-README.md index b18a118c18ce..f058bdd956eb 100644 --- a/model_cards/bert-base-german-cased-README.md +++ b/model_cards/bert-base-german-cased-README.md @@ -1,8 +1,12 @@ --- language: german thumbnail: https://static.tildacdn.com/tild6438-3730-4164-b266-613634323466/german_bert.png +tags: +- exbert --- +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=bert-base-german-cased) + # German BERT ![bert_image](https://static.tildacdn.com/tild6438-3730-4164-b266-613634323466/german_bert.png) ## Overview @@ -18,6 +22,7 @@ thumbnail: https://static.tildacdn.com/tild6438-3730-4164-b266-613634323466/germ - We trained 810k steps with a batch size of 1024 for sequence length 128 and 30k steps with sequence length 512. Training took about 9 days. - As training data we used the latest German Wikipedia dump (6GB of raw txt files), the OpenLegalData dump (2.4 GB) and news articles (3.6 GB). - We cleaned the data dumps with tailored scripts and segmented sentences with spacy v2.1. To create tensorflow records we used the recommended sentencepiece library for creating the word piece vocabulary and tensorflow scripts to convert the text to data usable by BERT. +- Update April 3rd, 2020: updated the vocab file on deepset s3 to adjust tokenization of punctuation. 
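To sanity-check tokenization after a vocab update like the one above, a short sketch (illustrative only, not part of the original card):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-german-cased")

# Punctuation should be split into separate tokens rather than glued to word pieces.
print(tokenizer.tokenize("Der Zug hält nicht in Wuppertal, oder?"))
```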
See https://deepset.ai/german-bert for more details diff --git a/model_cards/bert-base-uncased-README.md b/model_cards/bert-base-uncased-README.md new file mode 100644 index 000000000000..9dd9d22b3abe --- /dev/null +++ b/model_cards/bert-base-uncased-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=bert-base-uncased) diff --git a/model_cards/camembert-base-README.md b/model_cards/camembert-base-README.md index 0bf0ad55aa95..3389052c05a2 100644 --- a/model_cards/camembert-base-README.md +++ b/model_cards/camembert-base-README.md @@ -1,3 +1,7 @@ +--- +language: french +--- + # CamemBERT CamemBERT is a state-of-the-art language model for French based on the RoBERTa architecture pretrained on the French subcorpus of the newly available multilingual corpus OSCAR. diff --git a/model_cards/deepset/quora_dedup_bert_base/README.md b/model_cards/deepset/quora_dedup_bert_base/README.md new file mode 100644 index 000000000000..317b14e69e41 --- /dev/null +++ b/model_cards/deepset/quora_dedup_bert_base/README.md @@ -0,0 +1,59 @@ +This language model is trained using sentence_transformers (https://github.com/UKPLab/sentence-transformers) +Started with bert-base-nli-stsb-mean-tokens +Continue training on quora questions deduplication dataset (https://www.kaggle.com/c/quora-question-pairs) +See train_script.py for script used + +Below is the performance over the course of training +epoch,steps,cosine_pearson,cosine_spearman,euclidean_pearson,euclidean_spearman,manhattan_pearson,manhattan_spearman,dot_pearson,dot_spearman +0,1000,0.5944576426835938,0.6010801382777033,0.5942803776859142,0.5934485776801595,0.5939676679774666,0.593162725602328,0.5905591590826669,0.5921674789994058 +0,2000,0.6404080440207146,0.6416811632113405,0.6384419354012121,0.6352050423100778,0.6379917744471867,0.6347884067391001,0.6410544760582826,0.6379252046791412 +0,3000,0.6710168301884945,0.6676529324662036,0.6660195209784969,0.6618423144808695,0.6656461098096684,0.6615366331956389,0.6724401903484759,0.666073727723655 +0,4000,0.6886373265097949,0.6808948140300153,0.67907655686838,0.6714218133850957,0.6786809551564443,0.6711577956884357,0.6926435869763303,0.68190855298609 +0,5000,0.6991409753700026,0.6919630610321864,0.6991041519437052,0.6868961486499775,0.6987076032270729,0.6865385550504007,0.7035518148330993,0.6916275246101342 +0,6000,0.7120367327025509,0.6975005265298305,0.7065567493967201,0.6922375503495235,0.7060005509843024,0.6916475765570651,0.7147094303373102,0.6981390706722722 +0,7000,0.7254672394728687,0.7130118465900485,0.7261844956277705,0.7086213543110718,0.7257479964972307,0.7079315661881832,0.728729909455115,0.7122743793160531 +0,8000,0.7402421930101399,0.7216774208330149,0.7367901914441078,0.7166256588352043,0.7362607046874481,0.7158881916281887,0.7433902441373252,0.7220998491980078 +0,9000,0.7381005358120434,0.7197216844469877,0.7343228719349923,0.7139462687943793,0.7345247569255238,0.7145106206467152,0.7421843672419275,0.720686853053079 +0,10000,0.7465436564646095,0.7260327107480364,0.7467524239596304,0.7230195666847953,0.7467721566237211,0.7231367593302213,0.749792199122442,0.7263143296580317 +0,11000,0.7521805421706547,0.7323771570146701,0.7530672061250105,0.729223203496722,0.7530616532823367,0.7293818369675622,0.7552399002305836,0.7320808333541338 
+0,12000,0.7579359969644401,0.7340677616737238,0.7570017235719905,0.7305965412825544,0.7570601853520393,0.730718189957289,0.7611254136080384,0.7351501229591327 +0,-1,0.7573407371218097,0.7329952035782198,0.755595312163209,0.7291445551777086,0.7557737117990928,0.7295404703700227,0.7607276219361719,0.7342415455980179 +1,1000,0.7619907683805341,0.7374667949734767,0.7629820517114324,0.7330364216044966,0.7628369522755882,0.7331912674450544,0.7658583898073758,0.7381503446695727 +1,2000,0.7618972640071228,0.7362151058969478,0.764582212425539,0.7335856230046062,0.7643125513700815,0.7334501607097152,0.7652852805583232,0.7369104639809163 +1,3000,0.7687362955240467,0.7404674623181671,0.7708304819979073,0.7380959815601529,0.7707835692712482,0.7379796800453193,0.772074854759756,0.7414513460702766 +1,4000,0.7685047787908202,0.7403088288815168,0.7703522257474043,0.7379787888808298,0.7701221475099808,0.7377898546753812,0.7713755359045312,0.7409415801952219 +1,5000,0.7696438109797803,0.7410393893292365,0.773270389327895,0.7392953127251652,0.7729880866533291,0.7389853982789335,0.7726236305835863,0.7416278035580925 +1,6000,0.7749538363837081,0.7436499342062207,0.774879168058157,0.7401827241766746,0.7745754601165837,0.739763415043146,0.7788801166152383,0.7446249060022169 +1,7000,0.7794560817870597,0.7480970176267153,0.7803506944510302,0.7453305130502859,0.7799867949176531,0.7447100155494814,0.7828208193123926,0.7486740690324809 +1,8000,0.7855844359073243,0.7496742172376921,0.7828816645965887,0.747176409009761,0.7827584875358967,0.7471037762845532,0.7879159073496309,0.7507349669102151 +1,9000,0.7844110753729492,0.7507746252693759,0.7847208586489722,0.7485172180290892,0.7846408087474059,0.748491818820158,0.7872061334510225,0.7514470349769437 +1,10000,0.7881311227435004,0.7530048509727403,0.7886917756879734,0.7508018068765787,0.7883332502188707,0.7505037008187275,0.7910707228932787,0.7537200382362567 +1,11000,0.7883300109606874,0.7513494487126553,0.7879329130497712,0.749818368689255,0.7876525616593218,0.7494872882301785,0.7911454269743292,0.7522843165147303 +1,12000,0.7853334933336618,0.7516809747712728,0.7893895316714998,0.749780492728257,0.7890075986655403,0.7494079715118533,0.7885959664070629,0.7523827940133203 +1,-1,0.7887529238148887,0.7534076729932393,0.7896864404801204,0.7513080079201105,0.7894077512343298,0.7510009899066772,0.7919617393746149,0.7542173273241598 +2,1000,0.7919209063905188,0.7550167329363414,0.7917464066515253,0.7523043685293455,0.7914371703225378,0.7520285423781206,0.7950297421784158,0.7562599556207076 +2,2000,0.7924507768792486,0.7542908512484463,0.7934519001953887,0.7517491515010692,0.7931885648751081,0.751521004535999,0.7951637852162545,0.7551495215642072 +2,3000,0.7937606244038364,0.755599577136169,0.7933633347508111,0.7527922999916203,0.7931581019714242,0.7527132061436363,0.797275652800117,0.7569827180764233 +2,4000,0.7938389298721445,0.7578716892320315,0.7963783770097079,0.7555928931784702,0.796150381773947,0.7555438771581088,0.7972911620482322,0.759178632650707 +2,5000,0.7935330563129844,0.7551129824372304,0.7970775059297484,0.7527285792572385,0.7967359830546507,0.7524478515463257,0.7966395126138969,0.756319220359678 +2,6000,0.7929852776759999,0.7525490026774382,0.7952484474454824,0.7503695753216607,0.7950784132079611,0.7503677929234961,0.7956152082976395,0.7535275392698093 +2,7000,0.794956504054517,0.756119591765251,0.7982025041673655,0.7532521587180684,0.7980261618830962,0.7532107179960499,0.7983222918908033,0.7571226363678287 
+2,8000,0.7934568432535339,0.7538336661192452,0.797015698241178,0.7514773358161916,0.7968076980315735,0.7513458838811067,0.7960694134685949,0.754143803399873 +2,9000,0.7970040626682157,0.7576497805894974,0.7987855332059015,0.7550996144509958,0.7984693921009676,0.7548260162973456,0.7999509314900626,0.758347143906916 +2,10000,0.7979442987735523,0.7585338500791028,0.8018677081664496,0.7557412777548302,0.8015397301245205,0.7552916678886369,0.8007921348414564,0.7589772216225288 +2,11000,0.7985519561040211,0.7579986850302035,0.8021236875460913,0.7555826443181872,0.8019861620475348,0.7553763317660516,0.8009230128897853,0.7586541619907702 +2,12000,0.7986842143860736,0.7599570950134775,0.8029131054823838,0.7577678644678973,0.8027922603736795,0.7575152095990927,0.8020896747930555,0.7608540869254408 +2,-1,0.7994135319568432,0.7596286881516635,0.8022087183675333,0.7570593611974978,0.8020218401019292,0.7567291719729909,0.8026346812258125,0.7603928913647044 +3,1000,0.7985505039929134,0.7592588405681144,0.8023296699449267,0.7569345933969436,0.8023622066009718,0.7570237132696928,0.8013054275981851,0.759643838536062 +3,2000,0.7995482191699455,0.759205368623176,0.8026859405513612,0.7565709841358819,0.8024845263367439,0.7562920388231202,0.8021318586127523,0.7596496313300967 +3,3000,0.7991070423195897,0.7582027696555826,0.8016352550470427,0.7555585819429662,0.8014268261947898,0.7551838327642736,0.8013136081494014,0.7584429477727118 +3,4000,0.7999188836884763,0.7586764419322649,0.802987646214278,0.7561111254802977,0.8026549791861386,0.7556463650525692,0.8024068858366156,0.7591238238715613 +3,5000,0.7988075932525881,0.7583533823004922,0.8019498750207454,0.755792967372457,0.8016459824731964,0.7553834613587099,0.8015528810821693,0.7589527136833425 +3,6000,0.8003341798460688,0.7585432077405799,0.8032464035902267,0.7563722467405277,0.8028695045742804,0.7557626665682309,0.8027937010871594,0.7590404967573696 +3,7000,0.799187592384933,0.7579358555659604,0.8028413548398412,0.7555875459131398,0.8025187078191003,0.7551196665011402,0.8018680475193432,0.7585565756912578 +3,8000,0.797725037202641,0.757439012042047,0.802048241301358,0.7548888458326453,0.8017608103042271,0.7544606246736175,0.8005479449399782,0.758037452190282 +3,9000,0.7990232649360067,0.7573703896772077,0.8021375332910405,0.754873027155089,0.8018733796679427,0.7545680141630304,0.8016400687760605,0.7579461042843499 +3,10000,0.7994934439260372,0.758368978248884,0.8035693504115055,0.75619400688862,0.8032990505007025,0.7559016935896375,0.8022819185772518,0.7589558328445544 +3,11000,0.8002954591825011,0.758710753096932,0.8043310859792212,0.7566387152306694,0.8040865016706966,0.7564221538891368,0.8030873114870971,0.7592722085543488 +3,12000,0.8003726616196549,0.7588056657991931,0.8044000317617518,0.7566146528909147,0.8041705213966136,0.7563419459362758,0.8031760015719815,0.7593194421057111 +3,-1,0.8004926728141455,0.7587192194882135,0.8043340929890026,0.756546030526114,0.8041028559910275,0.7563103085106637,0.8032542493776693,0.7592325501951863 diff --git a/model_cards/distilbert-base-uncased-README.md b/model_cards/distilbert-base-uncased-README.md new file mode 100644 index 000000000000..76f22ef9b8c3 --- /dev/null +++ b/model_cards/distilbert-base-uncased-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=distilbert-base-uncased) diff --git a/model_cards/distilgpt2-README.md b/model_cards/distilgpt2-README.md new file mode 100644 index 
000000000000..460a72eefc92 --- /dev/null +++ b/model_cards/distilgpt2-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=distilgpt2) diff --git a/model_cards/distilroberta-base-README.md b/model_cards/distilroberta-base-README.md new file mode 100644 index 000000000000..dd3ffe628bbb --- /dev/null +++ b/model_cards/distilroberta-base-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=distilroberta-base) diff --git a/model_cards/google/electra-base-discriminator/README.md b/model_cards/google/electra-base-discriminator/README.md new file mode 100644 index 000000000000..06de5a4d41ab --- /dev/null +++ b/model_cards/google/electra-base-discriminator/README.md @@ -0,0 +1,34 @@ +--- +language: english +thumbnail: https://huggingface.co/front/thumbnails/google.png +--- + +## ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators + +**ELECTRA** is a new method for self-supervised language representation learning. It can be used to pre-train transformer networks using relatively little compute. ELECTRA models are trained to distinguish "real" input tokens vs "fake" input tokens generated by another neural network, similar to the discriminator of a [GAN](https://arxiv.org/pdf/1406.2661.pdf). At small scale, ELECTRA achieves strong results even when trained on a single GPU. At large scale, ELECTRA achieves state-of-the-art results on the [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset. + +For a detailed description and experimental results, please refer to our paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/pdf?id=r1xMH1BtvB). + +This repository contains code to pre-train ELECTRA, including small ELECTRA models on a single GPU. It also supports fine-tuning ELECTRA on downstream tasks including classification tasks (e.g,. [GLUE](https://gluebenchmark.com/)), QA tasks (e.g., [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)), and sequence tagging tasks (e.g., [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)). 
+ +## How to use the discriminator in `transformers` + +```python +from transformers import ElectraForPreTraining, ElectraTokenizerFast +import torch + +discriminator = ElectraForPreTraining.from_pretrained("google/electra-base-discriminator") +tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-base-discriminator") + +sentence = "The quick brown fox jumps over the lazy dog" +fake_sentence = "The quick brown fox fake over the lazy dog" + +fake_tokens = tokenizer.tokenize(fake_sentence) +fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt") +discriminator_outputs = discriminator(fake_inputs) +predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2) + +[print("%7s" % token, end="") for token in fake_tokens] + +[print("%7s" % int(prediction), end="") for prediction in predictions.tolist()] +``` diff --git a/model_cards/google/electra-base-generator/README.md b/model_cards/google/electra-base-generator/README.md new file mode 100644 index 000000000000..ed6d7be6481d --- /dev/null +++ b/model_cards/google/electra-base-generator/README.md @@ -0,0 +1,29 @@ +--- +language: english +thumbnail: https://huggingface.co/front/thumbnails/google.png +--- + +## ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators + +**ELECTRA** is a new method for self-supervised language representation learning. It can be used to pre-train transformer networks using relatively little compute. ELECTRA models are trained to distinguish "real" input tokens vs "fake" input tokens generated by another neural network, similar to the discriminator of a [GAN](https://arxiv.org/pdf/1406.2661.pdf). At small scale, ELECTRA achieves strong results even when trained on a single GPU. At large scale, ELECTRA achieves state-of-the-art results on the [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset. + +For a detailed description and experimental results, please refer to our paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/pdf?id=r1xMH1BtvB). + +This repository contains code to pre-train ELECTRA, including small ELECTRA models on a single GPU. It also supports fine-tuning ELECTRA on downstream tasks including classification tasks (e.g,. [GLUE](https://gluebenchmark.com/)), QA tasks (e.g., [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)), and sequence tagging tasks (e.g., [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)). + +## How to use the generator in `transformers` + +```python +from transformers import pipeline + +fill_mask = pipeline( + "fill-mask", + model="google/electra-base-generator", + tokenizer="google/electra-base-generator" +) + +print( + fill_mask(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks.") +) + +``` diff --git a/model_cards/google/electra-large-discriminator/README.md b/model_cards/google/electra-large-discriminator/README.md new file mode 100644 index 000000000000..efdc1020a108 --- /dev/null +++ b/model_cards/google/electra-large-discriminator/README.md @@ -0,0 +1,34 @@ +--- +language: english +thumbnail: https://huggingface.co/front/thumbnails/google.png +--- + +## ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators + +**ELECTRA** is a new method for self-supervised language representation learning. It can be used to pre-train transformer networks using relatively little compute. 
ELECTRA models are trained to distinguish "real" input tokens vs "fake" input tokens generated by another neural network, similar to the discriminator of a [GAN](https://arxiv.org/pdf/1406.2661.pdf). At small scale, ELECTRA achieves strong results even when trained on a single GPU. At large scale, ELECTRA achieves state-of-the-art results on the [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset. + +For a detailed description and experimental results, please refer to our paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/pdf?id=r1xMH1BtvB). + +This repository contains code to pre-train ELECTRA, including small ELECTRA models on a single GPU. It also supports fine-tuning ELECTRA on downstream tasks including classification tasks (e.g,. [GLUE](https://gluebenchmark.com/)), QA tasks (e.g., [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)), and sequence tagging tasks (e.g., [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)). + +## How to use the discriminator in `transformers` + +```python +from transformers import ElectraForPreTraining, ElectraTokenizerFast +import torch + +discriminator = ElectraForPreTraining.from_pretrained("google/electra-large-discriminator") +tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-large-discriminator") + +sentence = "The quick brown fox jumps over the lazy dog" +fake_sentence = "The quick brown fox fake over the lazy dog" + +fake_tokens = tokenizer.tokenize(fake_sentence) +fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt") +discriminator_outputs = discriminator(fake_inputs) +predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2) + +[print("%7s" % token, end="") for token in fake_tokens] + +[print("%7s" % int(prediction), end="") for prediction in predictions.tolist()] +``` diff --git a/model_cards/google/electra-large-generator/README.md b/model_cards/google/electra-large-generator/README.md new file mode 100644 index 000000000000..34812aa10903 --- /dev/null +++ b/model_cards/google/electra-large-generator/README.md @@ -0,0 +1,29 @@ +--- +language: english +thumbnail: https://huggingface.co/front/thumbnails/google.png +--- + +## ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators + +**ELECTRA** is a new method for self-supervised language representation learning. It can be used to pre-train transformer networks using relatively little compute. ELECTRA models are trained to distinguish "real" input tokens vs "fake" input tokens generated by another neural network, similar to the discriminator of a [GAN](https://arxiv.org/pdf/1406.2661.pdf). At small scale, ELECTRA achieves strong results even when trained on a single GPU. At large scale, ELECTRA achieves state-of-the-art results on the [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset. + +For a detailed description and experimental results, please refer to our paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/pdf?id=r1xMH1BtvB). + +This repository contains code to pre-train ELECTRA, including small ELECTRA models on a single GPU. It also supports fine-tuning ELECTRA on downstream tasks including classification tasks (e.g,. [GLUE](https://gluebenchmark.com/)), QA tasks (e.g., [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)), and sequence tagging tasks (e.g., [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)). 
+ +## How to use the generator in `transformers` + +```python +from transformers import pipeline + +fill_mask = pipeline( + "fill-mask", + model="google/electra-large-generator", + tokenizer="google/electra-large-generator" +) + +print( + fill_mask(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks.") +) + +``` diff --git a/model_cards/google/electra-small-discriminator/README.md b/model_cards/google/electra-small-discriminator/README.md new file mode 100644 index 000000000000..ae9097ebfb82 --- /dev/null +++ b/model_cards/google/electra-small-discriminator/README.md @@ -0,0 +1,34 @@ +--- +language: english +thumbnail: https://huggingface.co/front/thumbnails/google.png +--- + +## ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators + +**ELECTRA** is a new method for self-supervised language representation learning. It can be used to pre-train transformer networks using relatively little compute. ELECTRA models are trained to distinguish "real" input tokens vs "fake" input tokens generated by another neural network, similar to the discriminator of a [GAN](https://arxiv.org/pdf/1406.2661.pdf). At small scale, ELECTRA achieves strong results even when trained on a single GPU. At large scale, ELECTRA achieves state-of-the-art results on the [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset. + +For a detailed description and experimental results, please refer to our paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/pdf?id=r1xMH1BtvB). + +This repository contains code to pre-train ELECTRA, including small ELECTRA models on a single GPU. It also supports fine-tuning ELECTRA on downstream tasks including classification tasks (e.g,. [GLUE](https://gluebenchmark.com/)), QA tasks (e.g., [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)), and sequence tagging tasks (e.g., [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)). + +## How to use the discriminator in `transformers` + +```python +from transformers import ElectraForPreTraining, ElectraTokenizerFast +import torch + +discriminator = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator") +tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator") + +sentence = "The quick brown fox jumps over the lazy dog" +fake_sentence = "The quick brown fox fake over the lazy dog" + +fake_tokens = tokenizer.tokenize(fake_sentence) +fake_inputs = tokenizer.encode(fake_sentence, return_tensors="pt") +discriminator_outputs = discriminator(fake_inputs) +predictions = torch.round((torch.sign(discriminator_outputs[0]) + 1) / 2) + +[print("%7s" % token, end="") for token in fake_tokens] + +[print("%7s" % int(prediction), end="") for prediction in predictions.tolist()] +``` diff --git a/model_cards/google/electra-small-generator/README.md b/model_cards/google/electra-small-generator/README.md new file mode 100644 index 000000000000..c6cc1a851bbf --- /dev/null +++ b/model_cards/google/electra-small-generator/README.md @@ -0,0 +1,29 @@ +--- +language: english +thumbnail: https://huggingface.co/front/thumbnails/google.png +--- + +## ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators + +**ELECTRA** is a new method for self-supervised language representation learning. It can be used to pre-train transformer networks using relatively little compute. 
ELECTRA models are trained to distinguish "real" input tokens vs "fake" input tokens generated by another neural network, similar to the discriminator of a [GAN](https://arxiv.org/pdf/1406.2661.pdf). At small scale, ELECTRA achieves strong results even when trained on a single GPU. At large scale, ELECTRA achieves state-of-the-art results on the [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset. + +For a detailed description and experimental results, please refer to our paper [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators](https://openreview.net/pdf?id=r1xMH1BtvB). + +This repository contains code to pre-train ELECTRA, including small ELECTRA models on a single GPU. It also supports fine-tuning ELECTRA on downstream tasks including classification tasks (e.g,. [GLUE](https://gluebenchmark.com/)), QA tasks (e.g., [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)), and sequence tagging tasks (e.g., [text chunking](https://www.clips.uantwerpen.be/conll2000/chunking/)). + +## How to use the generator in `transformers` + +```python +from transformers import pipeline + +fill_mask = pipeline( + "fill-mask", + model="google/electra-small-generator", + tokenizer="google/electra-small-generator" +) + +print( + fill_mask(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks.") +) + +``` diff --git a/model_cards/gpt2-README.md b/model_cards/gpt2-README.md new file mode 100644 index 000000000000..89642b75d42b --- /dev/null +++ b/model_cards/gpt2-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=gpt2) diff --git a/model_cards/gsarti/covidbert-nli/README.md b/model_cards/gsarti/covidbert-nli/README.md new file mode 100644 index 000000000000..45037dcbc0da --- /dev/null +++ b/model_cards/gsarti/covidbert-nli/README.md @@ -0,0 +1,38 @@ +# CovidBERT-NLI + +This is the model **CovidBERT** trained by DeepSet on AllenAI's [CORD19 Dataset](https://pages.semanticscholar.org/coronavirus-research) of scientific articles about coronaviruses. + +The model uses the original BERT wordpiece vocabulary and was subsequently fine-tuned on the [SNLI](https://nlp.stanford.edu/projects/snli/) and the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) datasets using the [`sentence-transformers` library](https://github.com/UKPLab/sentence-transformers/) to produce universal sentence embeddings [1] using the **average pooling strategy** and a **softmax loss**. + +Parameter details for the original training on CORD-19 are available on [DeepSet's MLFlow](https://public-mlflow.deepset.ai/#/experiments/2/runs/ba27d00c30044ef6a33b1d307b4a6cba) + +**Base model**: `deepset/covid_bert_base` from HuggingFace's `AutoModel`. + +**Training time**: ~6 hours on the NVIDIA Tesla P100 GPU provided in Kaggle Notebooks. + +**Parameters**: + +| Parameter | Value | +|------------------|-------| +| Batch size | 64 | +| Training steps | 23000 | +| Warmup steps | 1450 | +| Lowercasing | True | +| Max. Seq. Length | 128 | + +**Performances**: The performance was evaluated on the test portion of the [STS dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) using Spearman rank correlation and compared to the performances of similar models obtained with the same procedure to verify its performances. 
+ +| Model | Score | +|-------------------------------|-------------| +| `covidbert-nli` (this) | 67.52 | +| `gsarti/biobert-nli` | 73.40 | +| `gsarti/scibert-nli` | 74.50 | +| `bert-base-nli-mean-tokens`[2]| 77.12 | + +An example usage for similarity-based scientific paper retrieval is provided in the [Covid-19 Semantic Browser](https://github.com/gsarti/covid-papers-browser) repository. + +**References:** + +[1] A. Conneau et al., [Supervised Learning of Universal Sentence Representations from Natural Language Inference Data](https://www.aclweb.org/anthology/D17-1070/) + +[2] N. Reimers et I. Gurevych, [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://www.aclweb.org/anthology/D19-1410/) diff --git a/model_cards/henryk/bert-base-multilingual-cased-finetuned-polish-squad1/README.md b/model_cards/henryk/bert-base-multilingual-cased-finetuned-polish-squad1/README.md new file mode 100644 index 000000000000..63916774cb99 --- /dev/null +++ b/model_cards/henryk/bert-base-multilingual-cased-finetuned-polish-squad1/README.md @@ -0,0 +1,94 @@ +--- +language: polish +--- + +# Multilingual + Polish SQuAD1.1 + +This model is the multilingual model provided by the Google research team with a fine-tuned polish Q&A downstream task. + +## Details of the language model + +Language model ([**bert-base-multilingual-cased**](https://github.com/google-research/bert/blob/master/multilingual.md)): +12-layer, 768-hidden, 12-heads, 110M parameters. +Trained on cased text in the top 104 languages with the largest Wikipedias. + +## Details of the downstream task +Using the `mtranslate` Python module, [**SQuAD1.1**](https://rajpurkar.github.io/SQuAD-explorer/) was machine-translated. In order to find the start tokens, the direct translations of the answers were searched in the corresponding paragraphs. Due to the different translations depending on the context (missing context in the pure answer), the answer could not always be found in the text, and thus a loss of question-answer examples occurred. This is a potential problem where errors can occur in the data set. 
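A rough sketch of that translate-and-realign step (illustrative only; the exact preprocessing script is not included in this card, and the `mtranslate` call shown here is an assumption):

```python
from mtranslate import translate  # assumed API of the mtranslate module mentioned above

def translate_squad_example(context, question, answer_text):
    """Translate one SQuAD example to Polish and re-locate the answer span."""
    pl_context = translate(context, "pl", "en")
    pl_question = translate(question, "pl", "en")
    pl_answer = translate(answer_text, "pl", "en")

    start = pl_context.find(pl_answer)
    if start == -1:
        # The translated answer does not appear verbatim in the translated paragraph
        # (context-dependent translation), so the example is dropped; this is why
        # the Polish splits in the table below are smaller than the English ones.
        return None
    return {
        "context": pl_context,
        "question": pl_question,
        "answers": [{"text": pl_answer, "answer_start": start}],
    }
```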
+ +| Dataset | # Q&A | +| ---------------------- | ----- | +| SQuAD1.1 Train | 87.7 K | +| Polish SQuAD1.1 Train | 39.5 K | +| SQuAD1.1 Dev | 10.6 K | +| Polish SQuAD1.1 Dev | 2.6 K | + + +## Model benchmark + +| Model | EM | F1 | +| ---------------------- | ----- | ----- | +| [SlavicBERT](https://huggingface.co/DeepPavlov/bert-base-bg-cs-pl-ru-cased) | **60.89** | 71.68 | +| [polBERT](https://huggingface.co/dkleczek/bert-base-polish-uncased-v1) | 57.46 | 68.87 | +| [multiBERT](https://huggingface.co/bert-base-multilingual-cased) | 60.67 | **71.89** | +| [xlm](https://huggingface.co/xlm-mlm-100-1280) | 47.98 | 59.42 | +## Model training + +The model was trained on a **Tesla V100** GPU with the following command: + +```python +export SQUAD_DIR=path/to/pl_squad + +python run_squad.py + --model_type bert \ + --model_name_or_path bert-base-multilingual-cased \ + --do_train \ + --do_eval \ + --train_file $SQUAD_DIR/pl_squadv1_train_clean.json \ + --predict_file $SQUAD_DIR/pl_squadv1_dev_clean.json \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --save_steps=8000 \ + --output_dir ../../output \ + --overwrite_cache \ + --overwrite_output_dir +``` + +**Results**: + +{'exact': 60.670731707317074, 'f1': 71.8952193697293, 'total': 2624, 'HasAns_exact': 60.670731707317074, 'HasAns_f1': 71.8952193697293, +'HasAns_total': 2624, 'best_exact': 60.670731707317074, 'best_exact_thresh': 0.0, 'best_f1': 71.8952193697293, 'best_f1_thresh': 0.0} + +## Model in action + +Fast usage with **pipelines**: + +```python +from transformers import pipeline + +qa_pipeline = pipeline( + "question-answering", + model="henryk/bert-base-multilingual-cased-finetuned-polish-squad1", + tokenizer="henryk/bert-base-multilingual-cased-finetuned-polish-squad1" +) + +qa_pipeline({ + 'context': "Warszawa jest największym miastem w Polsce pod względem liczby ludności i powierzchni", + 'question': "Jakie jest największe miasto w Polsce?"}) + +``` + +# Output: + +```json +{ + "score": 0.9988, + "start": 0, + "end": 8, + "answer": "Warszawa" +} +``` + +## Contact + +Please do not hesitate to contact me via [LinkedIn](https://www.linkedin.com/in/henryk-borzymowski-0755a2167/) if you want to discuss or get access to the Polish version of SQuAD. \ No newline at end of file diff --git a/model_cards/henryk/bert-base-multilingual-cased-finetuned-polish-squad2/README.md b/model_cards/henryk/bert-base-multilingual-cased-finetuned-polish-squad2/README.md new file mode 100644 index 000000000000..52f738a1f5ba --- /dev/null +++ b/model_cards/henryk/bert-base-multilingual-cased-finetuned-polish-squad2/README.md @@ -0,0 +1,96 @@ +--- +language: polish +--- + +# Multilingual + Polish SQuAD2.0 + +This model is the multilingual model provided by the Google research team with a fine-tuned polish Q&A downstream task. + +## Details of the language model + +Language model ([**bert-base-multilingual-cased**](https://github.com/google-research/bert/blob/master/multilingual.md)): +12-layer, 768-hidden, 12-heads, 110M parameters. +Trained on cased text in the top 104 languages with the largest Wikipedias. + +## Details of the downstream task +Using the `mtranslate` Python module, [**SQuAD2.0**](https://rajpurkar.github.io/SQuAD-explorer/) was machine-translated. In order to find the start tokens, the direct translations of the answers were searched in the corresponding paragraphs. 
Due to the different translations depending on the context (missing context in the pure answer), the answer could not always be found in the text, and thus a loss of question-answer examples occurred. This is a potential problem where errors can occur in the data set. + +| Dataset | # Q&A | +| ---------------------- | ----- | +| SQuAD2.0 Train | 130 K | +| Polish SQuAD2.0 Train | 83.1 K | +| SQuAD2.0 Dev | 12 K | +| Polish SQuAD2.0 Dev | 8.5 K | + + +## Model benchmark + +| Model | EM/F1 |HasAns (EM/F1) | NoAns | +| ---------------------- | ----- | ----- | ----- | +| [SlavicBERT](https://huggingface.co/DeepPavlov/bert-base-bg-cs-pl-ru-cased) | 69.35/71.51 | 47.02/54.09 | 79.20 | +| [polBERT](https://huggingface.co/dkleczek/bert-base-polish-uncased-v1) | 67.33/69.80| 45.73/53.80 | 76.87 | +| [multiBERT](https://huggingface.co/bert-base-multilingual-cased) | **70.76**/**72.92** |45.00/52.04 | 82.13 | + +## Model training + +The model was trained on a **Tesla V100** GPU with the following command: + +```python +export SQUAD_DIR=path/to/pl_squad + +python run_squad.py + --model_type bert \ + --model_name_or_path bert-base-multilingual-cased \ + --do_train \ + --do_eval \ + --version_2_with_negative \ + --train_file $SQUAD_DIR/pl_squadv2_train.json \ + --predict_file $SQUAD_DIR/pl_squadv2_dev.json \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --save_steps=8000 \ + --output_dir ../../output \ + --overwrite_cache \ + --overwrite_output_dir +``` + +**Results**: + +{'exact': 70.76671723655035, 'f1': 72.92156947155917, 'total': 8569, 'HasAns_exact': 45.00762195121951, 'HasAns_f1': 52.04456128116991, 'HasAns_total': 2624, 'NoAns_exact': 82.13624894869638, ' +NoAns_f1': 82.13624894869638, 'NoAns_total': 5945, 'best_exact': 71.72365503559342, 'best_exact_thresh': 0.0, 'best_f1': 73.62662512059369, 'best_f1_thresh': 0.0} + + +## Model in action + +Fast usage with **pipelines**: + +```python +from transformers import pipeline + +qa_pipeline = pipeline( + "question-answering", + model="henryk/bert-base-multilingual-cased-finetuned-polish-squad2", + tokenizer="henryk/bert-base-multilingual-cased-finetuned-polish-squad2" +) + +qa_pipeline({ + 'context': "Warszawa jest największym miastem w Polsce pod względem liczby ludności i powierzchni", + 'question': "Jakie jest największe miasto w Polsce?"}) + +``` + +# Output: + +```json +{ + "score": 0.9986, + "start": 0, + "end": 8, + "answer": "Warszawa" +} +``` + +## Contact + +Please do not hesitate to contact me via [LinkedIn](https://www.linkedin.com/in/henryk-borzymowski-0755a2167/) if you want to discuss or get access to the Polish version of SQuAD. \ No newline at end of file diff --git a/model_cards/huseinzol05/albert-base-bahasa-cased/README.md b/model_cards/huseinzol05/albert-base-bahasa-cased/README.md new file mode 100644 index 000000000000..27f56308605c --- /dev/null +++ b/model_cards/huseinzol05/albert-base-bahasa-cased/README.md @@ -0,0 +1,86 @@ +--- +language: malay +--- + +# Bahasa Albert Model + +Pretrained Albert base language model for Malay and Indonesian. + +## Pretraining Corpus + +`albert-base-bahasa-cased` model was pretrained on ~1.8 Billion words. We trained on both standard and social media language structures, and below is list of data we trained on, + +1. [dumping wikipedia](https://github.com/huseinzol05/Malaya-Dataset#wikipedia-1). +2. [local instagram](https://github.com/huseinzol05/Malaya-Dataset#instagram). +3. 
[local twitter](https://github.com/huseinzol05/Malaya-Dataset#twitter-1). +4. [local news](https://github.com/huseinzol05/Malaya-Dataset#public-news). +5. [local parliament text](https://github.com/huseinzol05/Malaya-Dataset#parliament). +6. [local singlish/manglish text](https://github.com/huseinzol05/Malaya-Dataset#singlish-text). +7. [IIUM Confession](https://github.com/huseinzol05/Malaya-Dataset#iium-confession). +8. [Wattpad](https://github.com/huseinzol05/Malaya-Dataset#wattpad). +9. [Academia PDF](https://github.com/huseinzol05/Malaya-Dataset#academia-pdf). + +Preprocessing steps can reproduce from here, [Malaya/pretrained-model/preprocess](https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/preprocess). + +## Pretraining details + +- This model was trained using Google Albert's github [repository](https://github.com/google-research/ALBERT) on v3-8 TPU. +- All steps can reproduce from here, [Malaya/pretrained-model/albert](https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/albert). + +## Load Pretrained Model + +You can use this model by installing `torch` or `tensorflow` and Huggingface library `transformers`. And you can use it directly by initializing it like this: + +```python +from transformers import AlbertTokenizer, AlbertModel + +model = BertModel.from_pretrained('huseinzol05/albert-base-bahasa-cased') +tokenizer = AlbertTokenizer.from_pretrained( + 'huseinzol05/albert-base-bahasa-cased', + do_lower_case = False, +) +``` + +## Example using AutoModelWithLMHead + +```python +from transformers import AlbertTokenizer, AutoModelWithLMHead, pipeline + +model = AutoModelWithLMHead.from_pretrained('huseinzol05/albert-base-bahasa-cased') +tokenizer = AlbertTokenizer.from_pretrained( + 'huseinzol05/albert-base-bahasa-cased', + do_lower_case = False, +) +fill_mask = pipeline('fill-mask', model = model, tokenizer = tokenizer) +print(fill_mask('makan ayam dengan [MASK]')) +``` + +Output is, + +```text +[{'sequence': '[CLS] makan ayam dengan ayam[SEP]', + 'score': 0.044952988624572754, + 'token': 629}, + {'sequence': '[CLS] makan ayam dengan sayur[SEP]', + 'score': 0.03621877357363701, + 'token': 1639}, + {'sequence': '[CLS] makan ayam dengan ikan[SEP]', + 'score': 0.034429922699928284, + 'token': 758}, + {'sequence': '[CLS] makan ayam dengan nasi[SEP]', + 'score': 0.032447945326566696, + 'token': 453}, + {'sequence': '[CLS] makan ayam dengan rendang[SEP]', + 'score': 0.028885239735245705, + 'token': 2451}] +``` + +## Results + +For further details on the model performance, simply checkout accuracy page from Malaya, https://malaya.readthedocs.io/en/latest/Accuracy.html, we compared with traditional models. + +## Acknowledgement + +Thanks to [Im Big](https://www.facebook.com/imbigofficial/), [LigBlou](https://www.facebook.com/ligblou), [Mesolitica](https://mesolitica.com/) and [KeyReply](https://www.keyreply.com/) for sponsoring AWS, Google and GPU clouds to train Albert for Bahasa. + + diff --git a/model_cards/huseinzol05/tiny-bert-bahasa-cased/README.md b/model_cards/huseinzol05/tiny-bert-bahasa-cased/README.md new file mode 100644 index 000000000000..967e870e997a --- /dev/null +++ b/model_cards/huseinzol05/tiny-bert-bahasa-cased/README.md @@ -0,0 +1,92 @@ +--- +language: malay +--- + +# Bahasa Tiny-BERT Model + +General Distilled Tiny BERT language model for Malay and Indonesian. 
+ +## Pretraining Corpus + +`tiny-bert-bahasa-cased` model was distilled on ~1.8 Billion words. We distilled on both standard and social media language structures, and below is list of data we distilled on, + +1. [dumping wikipedia](https://github.com/huseinzol05/Malaya-Dataset#wikipedia-1). +2. [local instagram](https://github.com/huseinzol05/Malaya-Dataset#instagram). +3. [local twitter](https://github.com/huseinzol05/Malaya-Dataset#twitter-1). +4. [local news](https://github.com/huseinzol05/Malaya-Dataset#public-news). +5. [local parliament text](https://github.com/huseinzol05/Malaya-Dataset#parliament). +6. [local singlish/manglish text](https://github.com/huseinzol05/Malaya-Dataset#singlish-text). +7. [IIUM Confession](https://github.com/huseinzol05/Malaya-Dataset#iium-confession). +8. [Wattpad](https://github.com/huseinzol05/Malaya-Dataset#wattpad). +9. [Academia PDF](https://github.com/huseinzol05/Malaya-Dataset#academia-pdf). + +Preprocessing steps can reproduce from here, [Malaya/pretrained-model/preprocess](https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/preprocess). + +## Distilling details + +- This model was distilled using huawei-noah Tiny-BERT's github [repository](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/TinyBERT) on 3 Titan V100 32GB VRAM. +- All steps can reproduce from here, [Malaya/pretrained-model/tiny-bert](https://github.com/huseinzol05/Malaya/tree/master/pretrained-model/tiny-bert). + +## Load Distilled Model + +You can use this model by installing `torch` or `tensorflow` and Huggingface library `transformers`. And you can use it directly by initializing it like this: + +```python +from transformers import AlbertTokenizer, BertModel + +model = BertModel.from_pretrained('huseinzol05/tiny-bert-bahasa-cased') +tokenizer = AlbertTokenizer.from_pretrained( + 'huseinzol05/tiny-bert-bahasa-cased', + unk_token = '[UNK]', + pad_token = '[PAD]', + do_lower_case = False, +) +``` + +We use [google/sentencepiece](https://github.com/google/sentencepiece) to train the tokenizer, so to use it, need to load from `AlbertTokenizer`. + +## Example using AutoModelWithLMHead + +```python +from transformers import AlbertTokenizer, AutoModelWithLMHead, pipeline + +model = AutoModelWithLMHead.from_pretrained('huseinzol05/tiny-bert-bahasa-cased') +tokenizer = AlbertTokenizer.from_pretrained( + 'huseinzol05/tiny-bert-bahasa-cased', + unk_token = '[UNK]', + pad_token = '[PAD]', + do_lower_case = False, +) +fill_mask = pipeline('fill-mask', model = model, tokenizer = tokenizer) +print(fill_mask('makan ayam dengan [MASK]')) +``` + +Output is, + +```text +[{'sequence': '[CLS] makan ayam dengan berbual[SEP]', + 'score': 0.00015769545279908925, + 'token': 17859}, + {'sequence': '[CLS] makan ayam dengan kembar[SEP]', + 'score': 0.0001448775001335889, + 'token': 8289}, + {'sequence': '[CLS] makan ayam dengan memaklumkan[SEP]', + 'score': 0.00013484008377417922, + 'token': 6881}, + {'sequence': '[CLS] makan ayam dengan Senarai[SEP]', + 'score': 0.00013061291247140616, + 'token': 11698}, + {'sequence': '[CLS] makan ayam dengan Tiga[SEP]', + 'score': 0.00012453157978598028, + 'token': 4232}] +``` + +## Results + +For further details on the model performance, simply checkout accuracy page from Malaya, https://malaya.readthedocs.io/en/latest/Accuracy.html, we compared with traditional models. 
+ +## Acknowledgement + +Thanks to [Im Big](https://www.facebook.com/imbigofficial/), [LigBlou](https://www.facebook.com/ligblou), [Mesolitica](https://mesolitica.com/) and [KeyReply](https://www.keyreply.com/) for sponsoring AWS, Google and GPU clouds to train BERT for Bahasa. + + diff --git a/model_cards/huseinzol05/xlnet-base-bahasa-cased/README.md b/model_cards/huseinzol05/xlnet-base-bahasa-cased/README.md index a4762e617541..f4d8bf88add4 100644 --- a/model_cards/huseinzol05/xlnet-base-bahasa-cased/README.md +++ b/model_cards/huseinzol05/xlnet-base-bahasa-cased/README.md @@ -50,7 +50,7 @@ tokenizer = XLNetTokenizer.from_pretrained( 'huseinzol05/xlnet-base-bahasa-cased', do_lower_case = False ) fill_mask = pipeline('fill-mask', model = model, tokenizer = tokenizer) -print(fill_mask('makan ayam dengan [MASK]')) +print(fill_mask('makan ayam dengan <mask>')) ``` ## Results diff --git a/model_cards/ixa-ehu/berteus-base-cased/README.md b/model_cards/ixa-ehu/berteus-base-cased/README.md new file mode 100644 index 000000000000..d6785cdcd47c --- /dev/null +++ b/model_cards/ixa-ehu/berteus-base-cased/README.md @@ -0,0 +1,29 @@ +--- +language: +- basque +--- + +# BERTeus base cased + +This is the pretrained Basque language model presented in [Give your Text Representation Models some Love: the Case for Basque](https://arxiv.org/pdf/2004.00033.pdf). This model has been trained on a Basque corpus comprising Basque crawled news articles from online newspapers and the Basque Wikipedia. The training corpus contains 224.6 million tokens, of which 35 million come from Wikipedia.

BERTeus has been tested on four different downstream tasks for Basque: part-of-speech (POS) tagging, named entity recognition (NER), sentiment analysis and topic classification; improving the state of the art for all tasks.
See summary of results below: + + +| Downstream task | BERTeus | mBERT | Previous SOTA | +| --------------- | ------- | ------| ------------- | +| Topic Classification | **76.77** | 68.42 | 63.00 | +| Sentiment | **78.10** | 71.02 | 74.02 | +| POS | **97.76** | 96.37 | 96.10 | +| NER | **87.06** | 81.52 | 76.72 | + + +If using this model, please cite the following paper: +``` +@inproceedings{agerri2020give, + title={Give your Text Representation Models some Love: the Case for Basque}, + author={Rodrigo Agerri and I{\~n}aki San Vicente and Jon Ander Campos and Ander Barrena and Xabier Saralegi and Aitor Soroa and Eneko Agirre}, + booktitle={Proceedings of the 12th International Conference on Language Resources and Evaluation}, + year={2020} +} +``` diff --git a/model_cards/ktrapeznikov/albert-xlarge-v2-squad-v2/README.md b/model_cards/ktrapeznikov/albert-xlarge-v2-squad-v2/README.md new file mode 100644 index 000000000000..33bb4e18a21f --- /dev/null +++ b/model_cards/ktrapeznikov/albert-xlarge-v2-squad-v2/README.md @@ -0,0 +1,61 @@ +### Model +**[`albert-xlarge-v2`](https://huggingface.co/albert-xlarge-v2)** fine-tuned on **[`SQuAD V2`](https://rajpurkar.github.io/SQuAD-explorer/)** using **[`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py)** + +### Training Parameters +Trained on 4 NVIDIA GeForce RTX 2080 Ti 11Gb +```bash +BASE_MODEL=albert-xlarge-v2 +python run_squad.py \ + --version_2_with_negative \ + --model_type albert \ + --model_name_or_path $BASE_MODEL \ + --output_dir $OUTPUT_MODEL \ + --do_eval \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v2.0.json \ + --predict_file $SQUAD_DIR/dev-v2.0.json \ + --per_gpu_train_batch_size 3 \ + --per_gpu_eval_batch_size 64 \ + --learning_rate 3e-5 \ + --num_train_epochs 3.0 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --save_steps 2000 \ + --threads 24 \ + --warmup_steps 814 \ + --gradient_accumulation_steps 4 \ + --fp16 \ + --do_train +``` + +### Evaluation + +Evaluation on the dev set. I did not sweep for best threshold. + +| | val | +|-------------------|-------------------| +| exact | 84.41842836688285 | +| f1 | 87.4628460501696 | +| total | 11873.0 | +| HasAns_exact | 80.68488529014844 | +| HasAns_f1 | 86.78245127423482 | +| HasAns_total | 5928.0 | +| NoAns_exact | 88.1412952060555 | +| NoAns_f1 | 88.1412952060555 | +| NoAns_total | 5945.0 | +| best_exact | 84.41842836688285 | +| best_exact_thresh | 0.0 | +| best_f1 | 87.46284605016956 | +| best_f1_thresh | 0.0 | + + +### Usage + +See [huggingface documentation](https://huggingface.co/transformers/model_doc/albert.html#albertforquestionanswering). 
Training on `SQuAD V2` allows the model to score if a paragraph contains an answer: +```python +start_scores, end_scores = model(input_ids) +span_scores = start_scores.softmax(dim=1).log()[:,:,None] + end_scores.softmax(dim=1).log()[:,None,:] +ignore_score = span_scores[:,0,0] #no answer scores + +``` + diff --git a/model_cards/ktrapeznikov/biobert_v1.1_pubmed_squad_v2/README.md b/model_cards/ktrapeznikov/biobert_v1.1_pubmed_squad_v2/README.md new file mode 100644 index 000000000000..2f4c081dc372 --- /dev/null +++ b/model_cards/ktrapeznikov/biobert_v1.1_pubmed_squad_v2/README.md @@ -0,0 +1,64 @@ +### Model +**[`monologg/biobert_v1.1_pubmed`](https://huggingface.co/monologg/biobert_v1.1_pubmed)** fine-tuned on **[`SQuAD V2`](https://rajpurkar.github.io/SQuAD-explorer/)** using **[`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py)** + +This model is cased. + +### Training Parameters +Trained on 4 NVIDIA GeForce RTX 2080 Ti 11Gb +```bash +BASE_MODEL=monologg/biobert_v1.1_pubmed +python run_squad.py \ + --version_2_with_negative \ + --model_type albert \ + --model_name_or_path $BASE_MODEL \ + --output_dir $OUTPUT_MODEL \ + --do_eval \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v2.0.json \ + --predict_file $SQUAD_DIR/dev-v2.0.json \ + --per_gpu_train_batch_size 18 \ + --per_gpu_eval_batch_size 64 \ + --learning_rate 3e-5 \ + --num_train_epochs 3.0 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --save_steps 2000 \ + --threads 24 \ + --warmup_steps 550 \ + --gradient_accumulation_steps 1 \ + --fp16 \ + --logging_steps 50 \ + --do_train +``` + +### Evaluation + +Evaluation on the dev set. I did not sweep for best threshold. + +| | val | +|-------------------|-------------------| +| exact | 75.97068980038743 | +| f1 | 79.37043950121722 | +| total | 11873.0 | +| HasAns_exact | 74.13967611336032 | +| HasAns_f1 | 80.94892513460755 | +| HasAns_total | 5928.0 | +| NoAns_exact | 77.79646761984861 | +| NoAns_f1 | 77.79646761984861 | +| NoAns_total | 5945.0 | +| best_exact | 75.97068980038743 | +| best_exact_thresh | 0.0 | +| best_f1 | 79.37043950121729 | +| best_f1_thresh | 0.0 | + + +### Usage + +See [huggingface documentation](https://huggingface.co/transformers/model_doc/bert.html#bertforquestionanswering). 
Training on `SQuAD V2` allows the model to score if a paragraph contains an answer: +```python +start_scores, end_scores = model(input_ids) +span_scores = start_scores.softmax(dim=1).log()[:,:,None] + end_scores.softmax(dim=1).log()[:,None,:] +ignore_score = span_scores[:,0,0] #no answer scores + +``` + diff --git a/model_cards/ktrapeznikov/scibert_scivocab_uncased_squad_v2/README.md b/model_cards/ktrapeznikov/scibert_scivocab_uncased_squad_v2/README.md new file mode 100644 index 000000000000..b368750138d4 --- /dev/null +++ b/model_cards/ktrapeznikov/scibert_scivocab_uncased_squad_v2/README.md @@ -0,0 +1,61 @@ +### Model +**[`allenai/scibert_scivocab_uncased`](https://huggingface.co/allenai/scibert_scivocab_uncased)** fine-tuned on **[`SQuAD V2`](https://rajpurkar.github.io/SQuAD-explorer/)** using **[`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py)** + +### Training Parameters +Trained on 4 NVIDIA GeForce RTX 2080 Ti 11Gb +```bash +BASE_MODEL=allenai/scibert_scivocab_uncased +python run_squad.py \ + --version_2_with_negative \ + --model_type albert \ + --model_name_or_path $BASE_MODEL \ + --output_dir $OUTPUT_MODEL \ + --do_eval \ + --do_lower_case \ + --train_file $SQUAD_DIR/train-v2.0.json \ + --predict_file $SQUAD_DIR/dev-v2.0.json \ + --per_gpu_train_batch_size 18 \ + --per_gpu_eval_batch_size 64 \ + --learning_rate 3e-5 \ + --num_train_epochs 3.0 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --save_steps 2000 \ + --threads 24 \ + --warmup_steps 550 \ + --gradient_accumulation_steps 1 \ + --fp16 \ + --logging_steps 50 \ + --do_train +``` + +### Evaluation + +Evaluation on the dev set. I did not sweep for best threshold. + +| | val | +|-------------------|-------------------| +| exact | 75.07790785816559 | +| f1 | 78.47735207283013 | +| total | 11873.0 | +| HasAns_exact | 70.76585695006747 | +| HasAns_f1 | 77.57449412292718 | +| HasAns_total | 5928.0 | +| NoAns_exact | 79.37762825904122 | +| NoAns_f1 | 79.37762825904122 | +| NoAns_total | 5945.0 | +| best_exact | 75.08633032931863 | +| best_exact_thresh | 0.0 | +| best_f1 | 78.48577454398324 | +| best_f1_thresh | 0.0 | + +### Usage + +See [huggingface documentation](https://huggingface.co/transformers/model_doc/bert.html#bertforquestionanswering). Training on `SQuAD V2` allows the model to score if a paragraph contains an answer: +```python +start_scores, end_scores = model(input_ids) +span_scores = start_scores.softmax(dim=1).log()[:,:,None] + end_scores.softmax(dim=1).log()[:,None,:] +ignore_score = span_scores[:,0,0] #no answer scores + +``` + diff --git a/model_cards/lvwerra/bert-imdb/README.md b/model_cards/lvwerra/bert-imdb/README.md new file mode 100644 index 000000000000..dcc9932979db --- /dev/null +++ b/model_cards/lvwerra/bert-imdb/README.md @@ -0,0 +1,14 @@ +# BERT-IMDB + +## What is it? +BERT (`bert-large-cased`) trained for sentiment classification on the [IMDB dataset](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews). + +## Training setting + +The model was trained on 80% of the IMDB dataset for sentiment classification for three epochs with a learning rate of `1e-5` with the `simpletransformers` library. The library uses a learning rate schedule. + +## Result +The model achieved 90% classification accuracy on the validation set. + +## Reference +The full experiment is available in the [tlr repo](https://lvwerra.github.io/trl/03-bert-imdb-training/). 
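The card above has no inference snippet, so here is a minimal sketch. It assumes the checkpoint is published as `lvwerra/bert-imdb` (inferred from this card's path) and carries a standard two-class sequence-classification head; the card does not state which index is the positive class, so verify the mapping on a few known reviews.

```python
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Checkpoint ID assumed from this card's path.
MODEL_ID = "lvwerra/bert-imdb"

tokenizer = BertTokenizer.from_pretrained(MODEL_ID)
model = BertForSequenceClassification.from_pretrained(MODEL_ID)

review = "A wonderful little film with great performances all around."
inputs = tokenizer.encode_plus(review, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)[0]

# Two class probabilities; which column means "positive" is an assumption to check.
print(torch.softmax(logits, dim=-1))
```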
diff --git a/model_cards/lvwerra/gpt2-imdb-pos/README.md b/model_cards/lvwerra/gpt2-imdb-pos/README.md new file mode 100644 index 000000000000..f9d9e4942531 --- /dev/null +++ b/model_cards/lvwerra/gpt2-imdb-pos/README.md @@ -0,0 +1,18 @@ +# GPT2-IMDB-pos + +## What is it? +A small GPT2 (`lvwerra/gpt2-imdb`) language model fine-tuned to produce positive movie reviews based the [IMDB dataset](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews). The model is trained with rewards from a BERT sentiment classifier (`lvwerra/gpt2-imdb`) via PPO. + +## Training setting +The model was trained for `100` optimisation steps with a batch size of `256` which corresponds to `25600` training samples. The full experiment setup can be found in the Jupyter notebook in the [trl repo](https://lvwerra.github.io/trl/04-gpt2-sentiment-ppo-training/). + +## Examples +A few examples of the model response to a query before and after optimisation: + +| query | response (before) | response (after) | rewards (before) | rewards (after) | +|-------|-------------------|------------------|------------------|-----------------| +|I'd never seen a |heavier, woodier example of Victorian archite... |film of this caliber, and I think it's wonder... |3.297736 |4.158653| +|I love John's work |but I actually have to write language as in w... |and I hereby recommend this film. I am really... |-1.904006 |4.159198 | +|I's a big struggle |to see anyone who acts in that way. by Jim Th... |, but overall I'm happy with the changes even ... |-1.595925 |2.651260| + + diff --git a/model_cards/lvwerra/gpt2-imdb/README.md b/model_cards/lvwerra/gpt2-imdb/README.md new file mode 100644 index 000000000000..6922a169e2cc --- /dev/null +++ b/model_cards/lvwerra/gpt2-imdb/README.md @@ -0,0 +1,27 @@ +# GPT2-IMDB + +## What is it? +A GPT2 (`gpt2`) language model fine-tuned on the [IMDB dataset](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews). + +## Training setting + +The GPT2 language model was fine-tuned for 1 epoch on the IMDB dataset. All comments were joined into a single text file separated by the EOS token: + +``` +import pandas as pd +df = pd.read_csv("imdb-dataset.csv") +imdb_str = " <|endoftext|> ".join(df['review'].tolist()) + +with open ('imdb.txt', 'w') as f: + f.write(imdb_str) +``` + +To train the model the `run_language_modeling.py` script in the `transformer` library was used: + +``` +python run_language_modeling.py + --train_data_file imdb.txt + --output_dir gpt2-imdb + --model_type gpt2 + --model_name_or_path gpt2 +``` diff --git a/model_cards/mrm8488/GPT-2-finetuned-CORD19/README.md b/model_cards/mrm8488/GPT-2-finetuned-CORD19/README.md index b88399bd848e..6d128d82d766 100644 --- a/model_cards/mrm8488/GPT-2-finetuned-CORD19/README.md +++ b/model_cards/mrm8488/GPT-2-finetuned-CORD19/README.md @@ -5,15 +5,16 @@ thumbnail: # GPT-2 + CORD19 dataset : 🦠 ✍ ⚕ -**GPT-2** fine-tuned on **biorxiv_medrxiv** and **comm_use_subset files** from [CORD-19](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge) dataset. +**GPT-2** fine-tuned on **biorxiv_medrxiv**, **comm_use_subset** and **custom_license** files from [CORD-19](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge) dataset. 
-## Datasets details: +## Datasets details | Dataset | # Files | | ---------------------- | ----- | | biorxiv_medrxiv | 885 | -| comm_use_subse | 9K | +| comm_use_subset | 9K | +| custom_license | 20.6K | ## Model training @@ -37,7 +38,7 @@ python run_language_modeling.py \ training loss -## Model in action / Example of usage: ✒ +## Model in action / Example of usage ✒ You can get the following script [here](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py) diff --git a/model_cards/mrm8488/GPT-2-finetuned-covid-bio-medrxiv/README.md b/model_cards/mrm8488/GPT-2-finetuned-covid-bio-medrxiv/README.md new file mode 100644 index 000000000000..cb0251c5a601 --- /dev/null +++ b/model_cards/mrm8488/GPT-2-finetuned-covid-bio-medrxiv/README.md @@ -0,0 +1,62 @@ +--- +language: english +thumbnail: +--- + +# GPT-2 + bio/medrxiv files from CORD19: 🦠 ✍ ⚕ + +**GPT-2** fine-tuned on **biorxiv_medrxiv** files from [CORD-19](https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge) dataset. + + +## Datasets details: + +| Dataset | # Files | +| ---------------------- | ----- | +| biorxiv_medrxiv | 885 | + + +## Model training: + +The model was trained on a Tesla P100 GPU and 25GB of RAM with the following command: + +```bash + +export TRAIN_FILE=/path/to/dataset/train.txt + +python run_language_modeling.py \ + --model_type gpt2 \ + --model_name_or_path gpt2 \ + --do_train \ + --train_data_file $TRAIN_FILE \ + --num_train_epochs 4 \ + --output_dir model_output \ + --overwrite_output_dir \ + --save_steps 2000 \ + --per_gpu_train_batch_size 3 +``` + +## Model in action / Example of usage: ✒ + +You can get the following script [here](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py) + +```bash +python run_generation.py \ + --model_type gpt2 \ + --model_name_or_path mrm8488/GPT-2-finetuned-CORD19 \ + --length 200 +``` +```txt +👵👴🦠 +# Input: Old people with COVID-19 tends to suffer +# Output: === GENERATED SEQUENCE 1 === +Old people with COVID-19 tends to suffer more symptom onset time and death. It is well known that many people with COVID-19 have high homozygous ZIKV infection in the face of severe symptoms in both severe and severe cases. +The origin of Wuhan Fever was investigated by Prof. Shen Jiang at the outbreak of Wuhan Fever [34]. As Huanan Province is the epicenter of this outbreak, Huanan, the epicenter of epidemic Wuhan Fever, is the most potential location for the direct transmission of infection (source: Zhongzhen et al., 2020). A negative risk ratio indicates more frequent underlying signs in the people in Huanan Province with COVID-19 patients. Further analysis of reported Huanan Fever onset data in the past two years indicated that the intensity of exposure is the key risk factor for developing MERS-CoV infection in this region, especially among children and elderly. 
To be continued to develop infected patients would be a very important area for +``` + +![Model in action](https://media.giphy.com/media/TgUdO72Iwk9h7hhm7G/giphy.gif) + + + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) | [LinkedIn](https://www.linkedin.com/in/manuel-romero-cs/) + +> Made with in Spain diff --git a/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md b/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md index e50b8cfd0ce7..17fcb792046d 100644 --- a/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md +++ b/model_cards/mrm8488/bert-spanish-cased-finetuned-ner/README.md @@ -66,6 +66,8 @@ nlp_ner = pipeline( {"use_fast": False} )) +text = 'Mis amigos están pensando viajar a Londres este verano' + nlp_ner(text) #Output: [{'entity': 'B-LOC', 'score': 0.9998720288276672, 'word': 'Londres'}] diff --git a/model_cards/mrm8488/bert-spanish-cased-finetuned-pos-syntax/README.md b/model_cards/mrm8488/bert-spanish-cased-finetuned-pos-syntax/README.md new file mode 100644 index 000000000000..0ee382d9ed30 --- /dev/null +++ b/model_cards/mrm8488/bert-spanish-cased-finetuned-pos-syntax/README.md @@ -0,0 +1,83 @@ +--- +language: spanish +thumbnail: +--- + +# Spanish BERT (BETO) + Syntax POS tagging ✍🏷 + +This model is a fine-tuned version of the Spanish BERT [(BETO)](https://github.com/dccuchile/beto) on Spanish **syntax** annotations in [CONLL CORPORA](https://www.kaggle.com/nltkdata/conll-corpora) dataset for **syntax POS** (Part of Speech tagging) downstream task. + +## Details of the downstream task (Syntax POS) - Dataset + +- [Dataset: CONLL Corpora ES](https://www.kaggle.com/nltkdata/conll-corpora) + +#### [Fine-tune script on NER dataset provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) + +#### 21 Syntax annotations (Labels) covered: + +- \_ +- ATR +- ATR.d +- CAG +- CC +- CD +- CD.Q +- CI +- CPRED +- CPRED.CD +- CPRED.SUJ +- CREG +- ET +- IMPERS +- MOD +- NEG +- PASS +- PUNC +- ROOT +- SUJ +- VOC + +## Metrics on test set 📋 + +| Metric | # score | +| :-------: | :-------: | +| F1 | **89.27** | +| Precision | **89.44** | +| Recall | **89.11** | + +## Model in action 🔨 + +Fast usage with **pipelines** 🧪 + +```python +from transformers import pipeline + +nlp_pos_syntax = pipeline( + "ner", + model="mrm8488/bert-spanish-cased-finetuned-pos-syntax", + tokenizer="mrm8488/bert-spanish-cased-finetuned-pos-syntax" +) + +text = 'Mis amigos están pensando viajar a Londres este verano.' + +nlp_pos_syntax(text)[1:len(nlp_pos_syntax(text))-1] +``` + +```json +[ + { "entity": "_", "score": 0.9999216794967651, "word": "Mis" }, + { "entity": "SUJ", "score": 0.999882698059082, "word": "amigos" }, + { "entity": "_", "score": 0.9998869299888611, "word": "están" }, + { "entity": "ROOT", "score": 0.9980518221855164, "word": "pensando" }, + { "entity": "_", "score": 0.9998420476913452, "word": "viajar" }, + { "entity": "CD", "score": 0.999351978302002, "word": "a" }, + { "entity": "_", "score": 0.999959409236908, "word": "Londres" }, + { "entity": "_", "score": 0.9998968839645386, "word": "este" }, + { "entity": "CC", "score": 0.99931401014328, "word": "verano" }, + { "entity": "PUNC", "score": 0.9998534917831421, "word": "." 
} +] +``` + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/mrm8488/bert-spanish-cased-finetuned-pos/README.md b/model_cards/mrm8488/bert-spanish-cased-finetuned-pos/README.md index ed7c5cf74d0e..6c3cb8386f77 100644 --- a/model_cards/mrm8488/bert-spanish-cased-finetuned-pos/README.md +++ b/model_cards/mrm8488/bert-spanish-cased-finetuned-pos/README.md @@ -21,7 +21,7 @@ I preprocessed the dataset and splitted it as train / dev (80/20) - [Fine-tune on NER script provided by Huggingface](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) -- Labels covered: +- **60** Labels covered: ``` AO, AQ, CC, CS, DA, DD, DE, DI, DN, DP, DT, Faa, Fat, Fc, Fd, Fe, Fg, Fh, Fia, Fit, Fp, Fpa, Fpt, Fs, Ft, Fx, Fz, I, NC, NP, P0, PD, PI, PN, PP, PR, PT, PX, RG, RN, SP, VAI, VAM, VAN, VAP, VAS, VMG, VMI, VMM, VMN, VMP, VMS, VSG, VSI, VSM, VSN, VSP, VSS, Y and Z @@ -74,6 +74,8 @@ nlp_pos(text) ``` ![model in action](https://media.giphy.com/media/jVC9m1cNrdIWuAAtjy/giphy.gif) +16 POS tags version also available [here](https://huggingface.co/mrm8488/bert-spanish-cased-finetuned-pos-16-tags) + > Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) diff --git a/model_cards/mrm8488/distilbert-multi-finetuned-for-xqua-on-tydiqa/README.md b/model_cards/mrm8488/distilbert-multi-finetuned-for-xqua-on-tydiqa/README.md new file mode 100644 index 000000000000..67e0477170d2 --- /dev/null +++ b/model_cards/mrm8488/distilbert-multi-finetuned-for-xqua-on-tydiqa/README.md @@ -0,0 +1,82 @@ +--- +language: multilingual +thumbnail: +--- + +# DistilBERT multilingual fine-tuned on TydiQA (GoldP task) dataset for multilingual Q&A 😛🌍❓ + + +## Details of the language model + +[distilbert-base-multilingual-cased](https://huggingface.co/distilbert-base-multilingual-cased) + + +## Details of the Tydi QA dataset + +TyDi QA contains 200k human-annotated question-answer pairs in 11 Typologically Diverse languages, written without seeing the answer and without the use of translation, and is designed for the **training and evaluation** of automatic question answering systems. This repository provides evaluation code and a baseline system for the dataset. https://ai.google.com/research/tydiqa + + +## Details of the downstream task (Gold Passage or GoldP aka the secondary task) + +Given a passage that is guaranteed to contain the answer, predict the single contiguous span of characters that answers the question. the gold passage task differs from the [primary task](https://github.com/google-research-datasets/tydiqa/blob/master/README.md#the-tasks) in several ways: +* only the gold answer passage is provided rather than the entire Wikipedia article; +* unanswerable questions have been discarded, similar to MLQA and XQuAD; +* we evaluate with the SQuAD 1.1 metrics like XQuAD; and +* Thai and Japanese are removed since the lack of whitespace breaks some tools. + + +## Model training 💪🏋️‍ + +The model was fine-tuned on a Tesla P100 GPU and 25GB of RAM. 
+The script is the following: + +```python +python transformers/examples/run_squad.py \ + --model_type distilbert \ + --model_name_or_path distilbert-base-multilingual-cased \ + --do_train \ + --do_eval \ + --train_file /path/to/dataset/train.json \ + --predict_file /path/to/dataset/dev.json \ + --per_gpu_train_batch_size 24 \ + --per_gpu_eval_batch_size 24 \ + --learning_rate 3e-5 \ + --num_train_epochs 5 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /content/model_output \ + --overwrite_output_dir \ + --save_steps 1000 \ + --threads 400 + ``` + +## Global Results (dev set) 📝 + +| Metric | # Value | +| --------- | ----------- | +| **EM** | **63.85** | +| **F1** | **75.70** | + +## Specific Results (per language) 🌍📝 + +| Language | # Samples | # EM | # F1 | +| --------- | ----------- |--------| ------ | +| Arabic | 1314 | 66.66 | 80.02 | +| Bengali | 180 | 53.09 | 63.50 | +| English | 654 | 62.42 | 73.12 | +| Finnish | 1031 | 64.57 | 75.15 | +| Indonesian| 773 | 67.89 | 79.70 | +| Korean | 414 | 51.29 | 61.73 | +| Russian | 1079 | 55.42 | 70.08 | +| Swahili | 596 | 74.51 | 81.15 | +| Telegu | 874 | 66.21 | 79.85 | + + +## Similar models + +You can also try [bert-multi-cased-finedtuned-xquad-tydiqa-goldp](https://huggingface.co/mrm8488/bert-multi-cased-finedtuned-xquad-tydiqa-goldp) that achieves **F1 = 82.16** and **EM = 71.06** (And of course better marks per language). + + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/mrm8488/gpt2-imdb-neg/README.md b/model_cards/mrm8488/gpt2-imdb-neg/README.md new file mode 100644 index 000000000000..a58e1712bc9e --- /dev/null +++ b/model_cards/mrm8488/gpt2-imdb-neg/README.md @@ -0,0 +1,27 @@ +# GPT2-IMDB-neg (LM + RL) 🎞😡✍ + +All credits to [@lvwerra](https://twitter.com/lvwerra) + +## What is it? +A small GPT2 (`lvwerra/gpt2-imdb`) language model fine-tuned to produce **negative** movie reviews based the [IMDB dataset](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews). The model is trained with rewards from a BERT sentiment classifier (`lvwerra/gpt2-imdb`) via **PPO**. + +## Why? +I wanted to reproduce the experiment [lvwerra/gpt2-imdb-pos](https://huggingface.co/lvwerra/gpt2-imdb-pos) but for generating **negative** movie reviews. + +## Training setting +The model was trained for `100` optimisation steps with a batch size of `256` which corresponds to `25600` training samples. The full experiment setup (for positive samples) in [trl repo](https://lvwerra.github.io/trl/04-gpt2-sentiment-ppo-training/). 
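Before the examples, a minimal sampling sketch of how such continuations can be produced. The checkpoint ID is assumed from this card's path and the sampling settings are illustrative only, not the configuration used to build the table in the Examples section below.

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint ID assumed from this card's path; sampling settings are illustrative.
MODEL_ID = "mrm8488/gpt2-imdb-neg"

tokenizer = GPT2Tokenizer.from_pretrained(MODEL_ID)
model = GPT2LMHeadModel.from_pretrained(MODEL_ID)

query = "This movie is a fine"
input_ids = tokenizer.encode(query, return_tensors="pt")

output_ids = model.generate(
    input_ids,
    max_length=40,
    do_sample=True,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```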
+ +## Examples +A few examples of the model response to a query before and after optimisation: + +| query | response (before) | response (after) | rewards (before) | rewards (after) | +|-------|-------------------|------------------|------------------|-----------------| +|This movie is a fine | attempt as far as live action is concerned, n...|example of how bad Hollywood in theatrics pla...| 2.118391 | -3.31625| +|I have watched 3 episodes |with this guy and he is such a talented actor...| but the show is just plain awful and there ne...| 2.681171| -4.512792| +|We know that firefighters and| police officers are forced to become populari...| other chains have going to get this disaster ...| 1.367811| -3.34017| + + + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/mrm8488/spanbert-base-finetuned-squadv1/README.md b/model_cards/mrm8488/spanbert-base-finetuned-squadv1/README.md new file mode 100644 index 000000000000..3296d2b41992 --- /dev/null +++ b/model_cards/mrm8488/spanbert-base-finetuned-squadv1/README.md @@ -0,0 +1,80 @@ +--- +language: english +thumbnail: +--- + +# SpanBERT base fine-tuned on SQuAD v1 + +[SpanBERT](https://github.com/facebookresearch/SpanBERT) created by [Facebook Research](https://github.com/facebookresearch) and fine-tuned on [SQuAD 1.1](https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/) for **Q&A** downstream task ([by them](https://github.com/facebookresearch/SpanBERT#finetuned-models-squad-1120-relation-extraction-coreference-resolution)). + +## Details of SpanBERT + +[SpanBERT: Improving Pre-training by Representing and Predicting Spans](https://arxiv.org/abs/1907.10529) + +## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓ + +[SQuAD1.1](https://rajpurkar.github.io/SQuAD-explorer/) + +## Model fine-tuning 🏋️‍ + +You can get the fine-tuning script [here](https://github.com/facebookresearch/SpanBERT) + +```bash +python code/run_squad.py \ + --do_train \ + --do_eval \ + --model spanbert-base-cased \ + --train_file train-v1.1.json \ + --dev_file dev-v1.1.json \ + --train_batch_size 32 \ + --eval_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 4 \ + --max_seq_length 512 \ + --doc_stride 128 \ + --eval_metric f1 \ + --output_dir squad_output \ + --fp16 +``` + +## Results Comparison 📝 + +| | SQuAD 1.1 | SQuAD 2.0 | Coref | TACRED | +| ---------------------- | ------------- | --------- | ------- | ------ | +| | F1 | F1 | avg. F1 | F1 | +| BERT (base) | 88.5 | 76.5 | 73.1 | 67.7 | +| SpanBERT (base) | **92.4** (this one) | [83.6](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv2) | 77.4 | [68.2](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred) | +| BERT (large) | 91.3 | 83.3 | 77.1 | 66.4 | +| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) | + + +Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers. 
+ +## Model in action + +Fast usage with **pipelines**: + +```python +from transformers import pipeline + +qa_pipeline = pipeline( + "question-answering", + model="mrm8488/spanbert-base-finetuned-squadv1", + tokenizer="SpanBERT/spanbert-base-cased" +) + +qa_pipeline({ + 'context': "Manuel Romero has been working very hard in the repository hugginface/transformers lately", + 'question': "How has been working Manuel Romero lately?" + +}) + +# Output: {'answer': 'very hard in the repository hugginface/transformers', + 'end': 82, + 'score': 0.327230326857725, + 'start': 31} +``` + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/mrm8488/spanbert-base-finetuned-squadv2/README.md b/model_cards/mrm8488/spanbert-base-finetuned-squadv2/README.md new file mode 100644 index 000000000000..9c4fb2059330 --- /dev/null +++ b/model_cards/mrm8488/spanbert-base-finetuned-squadv2/README.md @@ -0,0 +1,82 @@ +--- +language: english +thumbnail: +--- + +# SpanBERT base fine-tuned on SQuAD v2 + +[SpanBERT](https://github.com/facebookresearch/SpanBERT) created by [Facebook Research](https://github.com/facebookresearch) and fine-tuned on [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) for **Q&A** downstream task ([by them](https://github.com/facebookresearch/SpanBERT#finetuned-models-squad-1120-relation-extraction-coreference-resolution)). + +## Details of SpanBERT + +[SpanBERT: Improving Pre-training by Representing and Predicting Spans](https://arxiv.org/abs/1907.10529) + +## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓ + +[SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering. + +| Dataset | Split | # samples | +| -------- | ----- | --------- | +| SQuAD2.0 | train | 130k | +| SQuAD2.0 | eval | 12.3k | + +## Model fine-tuning 🏋️‍ + +You can get the fine-tuning script [here](https://github.com/facebookresearch/SpanBERT) + +```bash +python code/run_squad.py \ + --do_train \ + --do_eval \ + --model spanbert-base-cased \ + --train_file train-v2.0.json \ + --dev_file dev-v2.0.json \ + --train_batch_size 32 \ + --eval_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 4 \ + --max_seq_length 512 \ + --doc_stride 128 \ + --eval_metric best_f1 \ + --output_dir squad2_output \ + --version_2_with_negative \ + --fp16 +``` + +## Results Comparison 📝 + +| | SQuAD 1.1 | SQuAD 2.0 | Coref | TACRED | +| ---------------------- | ------------- | --------- | ------- | ------ | +| | F1 | F1 | avg. F1 | F1 | +| BERT (base) | 88.5 | 76.5 | 73.1 | 67.7 | +| SpanBERT (base) | [92.4](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv1) | **83.6** (this one) | 77.4 | [68.2](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred) | +| BERT (large) | 91.3 | 83.3 | 77.1 | 66.4 | +| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) | + + +Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. 
All the other numbers are test numbers. + +## Model in action + +Fast usage with **pipelines**: + +```python +from transformers import pipeline + +qa_pipeline = pipeline( + "question-answering", + model="mrm8488/spanbert-base-finetuned-squadv2", + tokenizer="SpanBERT/spanbert-base-cased" +) + +qa_pipeline({ + 'context': "Manuel Romero has been working very hard in the repository hugginface/transformers lately", + 'question': "How has been working Manuel Romero lately?" + +}) +# Output: {'answer': 'very hard', 'end': 40, 'score': 0.9052708846768347, 'start': 31} +``` + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/mrm8488/spanbert-base-finetuned-tacred/README.md b/model_cards/mrm8488/spanbert-base-finetuned-tacred/README.md new file mode 100644 index 000000000000..df33342008c4 --- /dev/null +++ b/model_cards/mrm8488/spanbert-base-finetuned-tacred/README.md @@ -0,0 +1,53 @@ +--- +language: english +thumbnail: +--- + +# SpanBERT base fine-tuned on TACRED + +[SpanBERT](https://github.com/facebookresearch/SpanBERT) created by [Facebook Research](https://github.com/facebookresearch) and fine-tuned on [TACRED](https://nlp.stanford.edu/projects/tacred/) dataset by [them](https://github.com/facebookresearch/SpanBERT#finetuned-models-squad-1120-relation-extraction-coreference-resolution) + +## Details of SpanBERT + +[SpanBERT: Improving Pre-training by Representing and Predicting Spans](https://arxiv.org/abs/1907.10529) + +## Dataset 📚 + +[TACRED](https://nlp.stanford.edu/projects/tacred/) A large-scale relation extraction dataset with 106k+ examples over 42 TAC KBP relation types. + +## Model fine-tuning 🏋️‍ + +You can get the fine-tuning script [here](https://github.com/facebookresearch/SpanBERT) + +```bash +python code/run_tacred.py \ + --do_train \ + --do_eval \ + --data_dir \ + --model spanbert-base-cased \ + --train_batch_size 32 \ + --eval_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 10 \ + --max_seq_length 128 \ + --output_dir tacred_dir \ + --fp16 +``` + +## Results Comparison 📝 + +| | SQuAD 1.1 | SQuAD 2.0 | Coref | TACRED | +| ---------------------- | ------------- | --------- | ------- | ------ | +| | F1 | F1 | avg. F1 | F1 | +| BERT (base) | 88.5* | 76.5* | 73.1 | 67.7 | +| SpanBERT (base) | [92.4*](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv1) | [83.6*](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv2) | 77.4 | **68.2** (this one) | +| BERT (large) | 91.3 | 83.3 | 77.1 | 66.4 | +| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred) | + + +Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers. 
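This card has no inference snippet, so here is a heavily hedged sketch. Everything model-specific in it is an assumption: the checkpoint ID is inferred from this card's path, the uploaded weights are assumed to include the 42-way relation-classification head produced by SpanBERT's `run_tacred.py`, and the entity-marking convention and id-to-relation mapping have to be taken from that script; the sentence below is only placeholder input.

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Assumed checkpoint ID (from this card's path); assumed to ship the TACRED
# relation-classification head. Entity markers and the id-to-relation mapping
# come from SpanBERT's run_tacred.py and are not reproduced here.
MODEL_ID = "mrm8488/spanbert-base-finetuned-tacred"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)

sentence = "Manuel Romero works for Hugging Face."  # placeholder; real inputs need entity markers
inputs = tokenizer.encode_plus(sentence, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs)[0]

relation_id = int(torch.argmax(logits, dim=-1))
print(relation_id)  # map to a relation name via the label list used during fine-tuning
```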
+ + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/mrm8488/spanbert-large-finetuned-squadv1/README.md b/model_cards/mrm8488/spanbert-large-finetuned-squadv1/README.md new file mode 100644 index 000000000000..04936da4aa98 --- /dev/null +++ b/model_cards/mrm8488/spanbert-large-finetuned-squadv1/README.md @@ -0,0 +1,80 @@ +--- +language: english +thumbnail: +--- + +# SpanBERT large fine-tuned on SQuAD v1 + +[SpanBERT](https://github.com/facebookresearch/SpanBERT) created by [Facebook Research](https://github.com/facebookresearch) and fine-tuned on [SQuAD 1.1](https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/) for **Q&A** downstream task ([by them](https://github.com/facebookresearch/SpanBERT#finetuned-models-squad-1120-relation-extraction-coreference-resolution)). + +## Details of SpanBERT + +[SpanBERT: Improving Pre-training by Representing and Predicting Spans](https://arxiv.org/abs/1907.10529) + +## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓ + +[SQuAD1.1](https://rajpurkar.github.io/SQuAD-explorer/) + +## Model fine-tuning 🏋️‍ + +You can get the fine-tuning script [here](https://github.com/facebookresearch/SpanBERT) + +```bash +python code/run_squad.py \ + --do_train \ + --do_eval \ + --model spanbert-large-cased \ + --train_file train-v1.1.json \ + --dev_file dev-v1.1.json \ + --train_batch_size 32 \ + --eval_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 4 \ + --max_seq_length 512 \ + --doc_stride 128 \ + --eval_metric f1 \ + --output_dir squad_output \ + --fp16 +``` + +## Results Comparison 📝 + +| | SQuAD 1.1 | SQuAD 2.0 | Coref | TACRED | +| ---------------------- | ------------- | --------- | ------- | ------ | +| | F1 | F1 | avg. F1 | F1 | +| BERT (base) | 88.5* | 76.5* | 73.1 | 67.7 | +| SpanBERT (base) | [92.4*](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv1) | [83.6*](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv2) | 77.4 | [68.2](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred) | +| BERT (large) | 91.3 | 83.3 | 77.1 | 66.4 | +| SpanBERT (large) | **94.6** (this) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) | + + +Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers. + +## Model in action + +Fast usage with **pipelines**: + +```python +from transformers import pipeline + +qa_pipeline = pipeline( + "question-answering", + model="mrm8488/spanbert-large-finetuned-squadv1", + tokenizer="SpanBERT/spanbert-large-cased" +) + +qa_pipeline({ + 'context': "Manuel Romero has been working very hard in the repository hugginface/transformers lately", + 'question': "How has been working Manuel Romero lately?" 
+ +}) + +# Output: {'answer': 'very hard in the repository hugginface/transformers', + 'end': 82, + 'score': 0.327230326857725, + 'start': 31} +``` + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/mrm8488/spanbert-large-finetuned-squadv2/README.md b/model_cards/mrm8488/spanbert-large-finetuned-squadv2/README.md new file mode 100644 index 000000000000..fb4af6413c36 --- /dev/null +++ b/model_cards/mrm8488/spanbert-large-finetuned-squadv2/README.md @@ -0,0 +1,82 @@ +--- +language: english +thumbnail: +--- + +# SpanBERT large fine-tuned on SQuAD v2 + +[SpanBERT](https://github.com/facebookresearch/SpanBERT) created by [Facebook Research](https://github.com/facebookresearch) and fine-tuned on [SQuAD 2.0](https://rajpurkar.github.io/SQuAD-explorer/) for **Q&A** downstream task ([by them](https://github.com/facebookresearch/SpanBERT#finetuned-models-squad-1120-relation-extraction-coreference-resolution)). + +## Details of SpanBERT + +[SpanBERT: Improving Pre-training by Representing and Predicting Spans](https://arxiv.org/abs/1907.10529) + +## Details of the downstream task (Q&A) - Dataset 📚 🧐 ❓ + +[SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but also determine when no answer is supported by the paragraph and abstain from answering. + +| Dataset | Split | # samples | +| -------- | ----- | --------- | +| SQuAD2.0 | train | 130k | +| SQuAD2.0 | eval | 12.3k | + +## Model fine-tuning 🏋️‍ + +You can get the fine-tuning script [here](https://github.com/facebookresearch/SpanBERT) + +```bash +python code/run_squad.py \ + --do_train \ + --do_eval \ + --model spanbert-large-cased \ + --train_file train-v2.0.json \ + --dev_file dev-v2.0.json \ + --train_batch_size 32 \ + --eval_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 4 \ + --max_seq_length 512 \ + --doc_stride 128 \ + --eval_metric best_f1 \ + --output_dir squad2_output \ + --version_2_with_negative \ + --fp16 +``` + +## Results Comparison 📝 + +| | SQuAD 1.1 | SQuAD 2.0 | Coref | TACRED | +| ---------------------- | ------------- | --------- | ------- | ------ | +| | F1 | F1 | avg. F1 | F1 | +| BERT (base) | 88.5* | 76.5* | 73.1 | 67.7 | +| SpanBERT (base) | [92.4*](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv1) | [83.6*](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv2) | 77.4 | [68.2](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred) | +| BERT (large) | 91.3 | 83.3 | 77.1 | 66.4 | +| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | **88.7** (this) | 79.6 | [70.8](https://huggingface.co/mrm8488/spanbert-large-finetuned-tacred) | + + +Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers. 
+ +## Model in action + +Fast usage with **pipelines**: + +```python +from transformers import pipeline + +qa_pipeline = pipeline( + "question-answering", + model="mrm8488/spanbert-large-finetuned-squadv2", + tokenizer="SpanBERT/spanbert-large-cased" +) + +qa_pipeline({ + 'context': "Manuel Romero has been working very hard in the repository hugginface/transformers lately", + 'question': "How has been working Manuel Romero lately?" + +}) +# Output: {'answer': 'very hard', 'end': 40, 'score': 0.9052708846768347, 'start': 31} +``` + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/mrm8488/spanbert-large-finetuned-tacred/README.md b/model_cards/mrm8488/spanbert-large-finetuned-tacred/README.md new file mode 100644 index 000000000000..1377745d2e92 --- /dev/null +++ b/model_cards/mrm8488/spanbert-large-finetuned-tacred/README.md @@ -0,0 +1,53 @@ +--- +language: english +thumbnail: +--- + +# SpanBERT large fine-tuned on TACRED + +[SpanBERT](https://github.com/facebookresearch/SpanBERT) created by [Facebook Research](https://github.com/facebookresearch) and fine-tuned on [TACRED](https://nlp.stanford.edu/projects/tacred/) dataset by [them](https://github.com/facebookresearch/SpanBERT#finetuned-models-squad-1120-relation-extraction-coreference-resolution) + +## Details of SpanBERT + +[SpanBERT: Improving Pre-training by Representing and Predicting Spans](https://arxiv.org/abs/1907.10529) + +## Dataset 📚 + +[TACRED](https://nlp.stanford.edu/projects/tacred/) A large-scale relation extraction dataset with 106k+ examples over 42 TAC KBP relation types. + +## Model fine-tuning 🏋️‍ + +You can get the fine-tuning script [here](https://github.com/facebookresearch/SpanBERT) + +```bash +python code/run_tacred.py \ + --do_train \ + --do_eval \ + --data_dir \ + --model spanbert-large-cased \ + --train_batch_size 32 \ + --eval_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 10 \ + --max_seq_length 128 \ + --output_dir tacred_dir \ + --fp16 +``` + +## Results Comparison 📝 + +| | SQuAD 1.1 | SQuAD 2.0 | Coref | TACRED | +| ---------------------- | ------------- | --------- | ------- | ------ | +| | F1 | F1 | avg. F1 | F1 | +| BERT (base) | 88.5* | 76.5* | 73.1 | 67.7 | +| SpanBERT (base) | [92.4*](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv1) | [83.6*](https://huggingface.co/mrm8488/spanbert-base-finetuned-squadv2) | 77.4 | [68.2](https://huggingface.co/mrm8488/spanbert-base-finetuned-tacred) | +| BERT (large) | 91.3 | 83.3 | 77.1 | 66.4 | +| SpanBERT (large) | [94.6](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv1) | [88.7](https://huggingface.co/mrm8488/spanbert-large-finetuned-squadv2) | 79.6 | **70.8** (this one) | + + +Note: The numbers marked as * are evaluated on the development sets becaus those models were not submitted to the official SQuAD leaderboard. All the other numbers are test numbers. 
+ + +> Created by [Manuel Romero/@mrm8488](https://twitter.com/mrm8488) + +> Made with in Spain diff --git a/model_cards/redewiedergabe/bert-base-historical-german-rw-cased/README.md b/model_cards/redewiedergabe/bert-base-historical-german-rw-cased/README.md new file mode 100644 index 000000000000..cf5b098935b1 --- /dev/null +++ b/model_cards/redewiedergabe/bert-base-historical-german-rw-cased/README.md @@ -0,0 +1,57 @@ +--- +language: german +--- + +# Model description +## Dataset +Trained on fictional and non-fictional German texts written between 1840 and 1920: +* Narrative texts from Digitale Bibliothek (https://textgrid.de/digitale-bibliothek) +* Fairy tales and sagas from Grimm Korpus (https://www1.ids-mannheim.de/kl/projekte/korpora/archiv/gri.html) +* Newspaper and magazine article from Mannheimer Korpus Historischer Zeitungen und Zeitschriften (https://repos.ids-mannheim.de/mkhz-beschreibung.html) +* Magazine article from the journal „Die Grenzboten“ (http://www.deutschestextarchiv.de/doku/textquellen#grenzboten) +* Fictional and non-fictional texts from Projekt Gutenberg (https://www.projekt-gutenberg.org) + +## Hardware used +1 Tesla P4 GPU + +## Hyperparameters + +| Parameter | Value | +|-------------------------------|----------| +| Epochs | 3 | +| Gradient_accumulation_steps | 1 | +| Train_batch_size | 32 | +| Learning_rate | 0.00003 | +| Max_seq_len | 128 | + +## Evaluation results: Automatic tagging of four forms of speech/thought/writing representation in historical fictional and non-fictional German texts + +The language model was used in the task to tag direct, indirect, reported and free indirect speech/thought/writing representation in fictional and non-fictional German texts. The tagger is available and described in detail at https://github.com/redewiedergabe/tagger. + +The tagging model was trained using the SequenceTagger Class of the Flair framework ([Akbik et al., 2019](https://www.aclweb.org/anthology/N19-4010)) which implements a BiLSTM-CRF architecture on top of a language embedding (as proposed by [Huang et al. (2015)](https://arxiv.org/abs/1508.01991)). + + +Hyperparameters + +| Parameter | Value | +|-------------------------------|------------| +| Hidden_size | 256 | +| Learning_rate | 0.1 | +| Mini_batch_size | 8 | +| Max_epochs | 150 | + +Results are reported below in comparison to a custom trained flair embedding, which was stacked onto a custom trained fastText-model. Both models were trained on the same dataset. 
+ +| | BERT ||| FastText+Flair |||Test data| +|----------------|----------|-----------|----------|------|-----------|--------|--------| +| | F1 | Precision | Recall | F1 | Precision | Recall || +| Direct | 0.80 | 0.86 | 0.74 | 0.84 | 0.90 | 0.79 |historical German, fictional & non-fictional| +| Indirect | **0.76** | **0.79** | **0.73** | 0.73 | 0.78 | 0.68 |historical German, fictional & non-fictional| +| Reported | **0.58** | **0.69** | **0.51** | 0.56 | 0.68 | 0.48 |historical German, fictional & non-fictional| +| Free indirect | **0.57** | **0.80** | **0.44** | 0.47 | 0.78 | 0.34 |modern German, fictional| + +## Intended use: +Historical German Texts (1840 to 1920) + +(Showed good performance with modern German fictional texts as well) + diff --git a/model_cards/roberta-base-README.md b/model_cards/roberta-base-README.md new file mode 100644 index 000000000000..59ca24e29802 --- /dev/null +++ b/model_cards/roberta-base-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=roberta-base) diff --git a/model_cards/shoarora/alectra-small-owt/README.md b/model_cards/shoarora/alectra-small-owt/README.md new file mode 100644 index 000000000000..046db2a82b3e --- /dev/null +++ b/model_cards/shoarora/alectra-small-owt/README.md @@ -0,0 +1,60 @@ +# ALECTRA-small-OWT + +This is an extension of +[ELECTRA](https://openreview.net/forum?id=r1xMH1BtvB) small model, trained on the +[OpenWebText corpus](https://skylion007.github.io/OpenWebTextCorpus/). +The training task (discriminative LM / replaced-token-detection) can be generalized to any transformer type. Here, we train an ALBERT model under the same scheme. + +## Pretraining task +![electra task diagram](https://github.com/shoarora/lmtuners/raw/master/assets/electra.png) +(figure from [Clark et al. 2020](https://openreview.net/pdf?id=r1xMH1BtvB)) + +ELECTRA uses discriminative LM / replaced-token-detection for pretraining. +This involves a generator (a Masked LM model) creating examples for a discriminator +to classify as original or replaced for each token. + +The generator generalizes to any `*ForMaskedLM` model and the discriminator could be +any `*ForTokenClassification` model. Therefore, we can extend the task to ALBERT models, +not just BERT as in the original paper. + +## Usage +```python +from transformers import AlbertForSequenceClassification, BertTokenizer + +# Both models use the bert-base-uncased tokenizer and vocab. +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +alectra = AlbertForSequenceClassification.from_pretrained('shoarora/alectra-small-owt') +``` +NOTE: this ALBERT model uses a BERT WordPiece tokenizer. + +## Code +The pytorch module that implements this task is available [here](https://github.com/shoarora/lmtuners/blob/master/lmtuners/lightning_modules/discriminative_lm.py). + +Further implementation information [here](https://github.com/shoarora/lmtuners/tree/master/experiments/disc_lm_small), +and [here](https://github.com/shoarora/lmtuners/blob/master/experiments/disc_lm_small/train_alectra_small.py) is the script that created this model. 
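For intuition, here is a toy sketch (an illustration, not the lmtuners code) of how the discriminator's targets are built in replaced-token detection: corrupt a few positions with generator proposals, then label each token by whether it was actually changed.

```python
import torch

torch.manual_seed(0)
vocab_size = 100
original_ids = torch.randint(vocab_size, (1, 12))        # the uncorrupted input ids

corrupt_mask = torch.rand(original_ids.shape) < 0.15     # ~15% of positions get corrupted

# In the real task these come from a masked-LM generator; random ids stand in here.
generator_samples = torch.randint(vocab_size, original_ids.shape)
corrupted_ids = torch.where(corrupt_mask, generator_samples, original_ids)

# Discriminator target: 1 where the token was replaced, 0 otherwise. Positions where
# the generator happens to reproduce the original token stay 0.
labels = (corrupted_ids != original_ids).long()

# corrupted_ids feed a *ForTokenClassification discriminator; labels supervise it.
print(corrupted_ids)
print(labels)
```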
+ +This specific model was trained with the following params: +- `batch_size: 512` +- `training_steps: 5e5` +- `warmup_steps: 4e4` +- `learning_rate: 2e-3` + + +## Downstream tasks +#### GLUE Dev results +| Model | # Params | CoLA | SST | MRPC | STS | QQP | MNLI | QNLI | RTE | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| ELECTRA-Small++ | 14M | 57.0 | 91. | 88.0 | 87.5 | 89.0 | 81.3 | 88.4 | 66.7| +| ELECTRA-Small-OWT | 14M | 56.8 | 88.3| 87.4 | 86.8 | 88.3 | 78.9 | 87.9 | 68.5| +| ELECTRA-Small-OWT (ours) | 17M | 56.3 | 88.4| 75.0 | 86.1 | 89.1 | 77.9 | 83.0 | 67.1| +| ALECTRA-Small-OWT (ours) | 4M | 50.6 | 89.1| 86.3 | 87.2 | 89.1 | 78.2 | 85.9 | 69.6| + + +#### GLUE Test results +| Model | # Params | CoLA | SST | MRPC | STS | QQP | MNLI | QNLI | RTE | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| BERT-Base | 110M | 52.1 | 93.5| 84.8 | 85.9 | 89.2 | 84.6 | 90.5 | 66.4| +| GPT | 117M | 45.4 | 91.3| 75.7 | 80.0 | 88.5 | 82.1 | 88.1 | 56.0| +| ELECTRA-Small++ | 14M | 57.0 | 91.2| 88.0 | 87.5 | 89.0 | 81.3 | 88.4 | 66.7| +| ELECTRA-Small-OWT (ours) | 17M | 57.4 | 89.3| 76.2 | 81.9 | 87.5 | 78.1 | 82.4 | 68.1| +| ALECTRA-Small-OWT (ours) | 4M | 43.9 | 87.9| 82.1 | 82.0 | 87.6 | 77.9 | 85.8 | 67.5| diff --git a/model_cards/shoarora/electra-small-owt/README.md b/model_cards/shoarora/electra-small-owt/README.md new file mode 100644 index 000000000000..a1d1c8f93f9a --- /dev/null +++ b/model_cards/shoarora/electra-small-owt/README.md @@ -0,0 +1,59 @@ +# ELECTRA-small-OWT + +This is an unnoficial implementation of an +[ELECTRA](https://openreview.net/forum?id=r1xMH1BtvB) small model, trained on the +[OpenWebText corpus](https://skylion007.github.io/OpenWebTextCorpus/). + +Differences from official ELECTRA models: + - we use a `BertForMaskedLM` as the generator and `BertForTokenClassification` as the discriminator + - they use an embedding projection layer, but Bert doesn't have one + +## Pretraining ttask +![electra task diagram](https://github.com/shoarora/lmtuners/raw/master/assets/electra.png) +(figure from [Clark et al. 2020](https://openreview.net/pdf?id=r1xMH1BtvB)) + +ELECTRA uses discriminative LM / replaced-token-detection for pretraining. +This involves a generator (a Masked LM model) creating examples for a discriminator +to classify as original or replaced for each token. + + +## Usage +```python +from transformers import BertForSequenceClassification, BertTokenizer + +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') +electra = BertForSequenceClassification.from_pretrained('shoarora/electra-small-owt') +``` + +## Code +The pytorch module that implements this task is available [here](https://github.com/shoarora/lmtuners/blob/master/lmtuners/lightning_modules/discriminative_lm.py). + +Further implementation information [here](https://github.com/shoarora/lmtuners/tree/master/experiments/disc_lm_small), +and [here](https://github.com/shoarora/lmtuners/blob/master/experiments/disc_lm_small/train_electra_small.py) is the script that created this model. + +This specific model was trained with the following params: +- `batch_size: 512` +- `training_steps: 5e5` +- `warmup_steps: 4e4` +- `learning_rate: 2e-3` + + +## Downstream tasks +#### GLUE Dev results +| Model | # Params | CoLA | SST | MRPC | STS | QQP | MNLI | QNLI | RTE | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| ELECTRA-Small++ | 14M | 57.0 | 91. 
| 88.0 | 87.5 | 89.0 | 81.3 | 88.4 | 66.7| +| ELECTRA-Small-OWT | 14M | 56.8 | 88.3| 87.4 | 86.8 | 88.3 | 78.9 | 87.9 | 68.5| +| ELECTRA-Small-OWT (ours) | 17M | 56.3 | 88.4| 75.0 | 86.1 | 89.1 | 77.9 | 83.0 | 67.1| +| ALECTRA-Small-OWT (ours) | 4M | 50.6 | 89.1| 86.3 | 87.2 | 89.1 | 78.2 | 85.9 | 69.6| + +- Table initialized from [ELECTRA github repo](https://github.com/google-research/electra) + +#### GLUE Test results +| Model | # Params | CoLA | SST | MRPC | STS | QQP | MNLI | QNLI | RTE | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| BERT-Base | 110M | 52.1 | 93.5| 84.8 | 85.9 | 89.2 | 84.6 | 90.5 | 66.4| +| GPT | 117M | 45.4 | 91.3| 75.7 | 80.0 | 88.5 | 82.1 | 88.1 | 56.0| +| ELECTRA-Small++ | 14M | 57.0 | 91.2| 88.0 | 87.5 | 89.0 | 81.3 | 88.4 | 66.7| +| ELECTRA-Small-OWT (ours) | 17M | 57.4 | 89.3| 76.2 | 81.9 | 87.5 | 78.1 | 82.4 | 68.1| +| ALECTRA-Small-OWT (ours) | 4M | 43.9 | 87.9| 82.1 | 82.0 | 87.6 | 77.9 | 85.8 | 67.5| diff --git a/model_cards/xlm-mlm-en-2048-README.md b/model_cards/xlm-mlm-en-2048-README.md new file mode 100644 index 000000000000..6291005386ba --- /dev/null +++ b/model_cards/xlm-mlm-en-2048-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=xlm-mlm-en-2048) diff --git a/model_cards/xlm-roberta-base-README.md b/model_cards/xlm-roberta-base-README.md new file mode 100644 index 000000000000..8afea16712bd --- /dev/null +++ b/model_cards/xlm-roberta-base-README.md @@ -0,0 +1,6 @@ +--- +tags: +- exbert +--- + +[![ExBERT](https://img.shields.io/badge/Visualize%20Attentions-ExBERT-green)](https://huggingface.co/exbert/?model=xlm-roberta-base) diff --git a/notebooks/README.md b/notebooks/README.md index 7afe17490a20..569fef606919 100644 --- a/notebooks/README.md +++ b/notebooks/README.md @@ -11,8 +11,8 @@ Pull Request and we'll review it so it can be included here. 
| Notebook | Description | | |:----------|:-------------:|------:| -| [Getting Started Tokenizers](01-training-tokenizers.ipynb) | How to train and use your very own tokenizer |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) | -| [Getting Started Transformers](02-transformers.ipynb) | How to easily start using transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb) | -| [How to use Pipelines](03-pipelines.ipynb) | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/03-pipelines.ipynb) | +| [Getting Started Tokenizers](https://github.com/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) | How to train and use your very own tokenizer |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/01-training-tokenizers.ipynb) | +| [Getting Started Transformers](https://github.com/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb) | How to easily start using transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/02-transformers.ipynb) | +| [How to use Pipelines](https://github.com/huggingface/transformers/blob/master/notebooks/03-pipelines.ipynb) | Simple and efficient way to use State-of-the-Art models on downstream tasks through transformers | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/transformers/blob/master/notebooks/03-pipelines.ipynb) | | [How to train a language model](https://github.com/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)| | [How to generate text](https://github.com/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)| diff --git a/setup.py b/setup.py index 1edb14aadd20..d201ae88d15b 100644 --- a/setup.py +++ b/setup.py @@ -83,7 +83,7 @@ setup( name="transformers", - version="2.6.0", + version="2.8.0", author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Sam Shleifer, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", author_email="thomas@huggingface.co", description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", @@ -96,7 +96,7 @@ packages=find_packages("src"), install_requires=[ "numpy", - "tokenizers == 0.5.2", + "tokenizers == 
0.7.0rc3", # dataclasses for Python versions that don't have it "dataclasses;python_version<'3.7'", # accessing files from S3 directly diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 91693d2f9982..b28cd3f616e1 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -__version__ = "2.6.0" +__version__ = "2.8.0" # Work around to update TensorFlow's absl.logging threshold which alters the # default Python logging output behavior when present. @@ -38,6 +38,7 @@ from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig +from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config from .configuration_mmbt import MMBTConfig @@ -127,6 +128,7 @@ from .tokenization_camembert import CamembertTokenizer from .tokenization_ctrl import CTRLTokenizer from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast +from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast from .tokenization_flaubert import FlaubertTokenizer from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast @@ -297,6 +299,15 @@ FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ) + from .modeling_electra import ( + ElectraForPreTraining, + ElectraForMaskedLM, + ElectraForTokenClassification, + ElectraModel, + load_tf_weights_in_electra, + ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + # Optimization from .optimization import ( AdamW, @@ -463,6 +474,15 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, ) + from .modeling_tf_electra import ( + TFElectraPreTrainedModel, + TFElectraModel, + TFElectraForPreTraining, + TFElectraForMaskedLM, + TFElectraForTokenClassification, + TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, + ) + # Optimization from .optimization_tf import WarmUp, create_optimizer, AdamWeightDecay, GradientAccumulator diff --git a/src/transformers/activations.py b/src/transformers/activations.py index 7968b88ba926..004189556c8c 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py @@ -18,12 +18,6 @@ def _gelu_python(x): return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) -if torch.__version__ < "1.4.0": - gelu = _gelu_python -else: - gelu = F.gelu - - def gelu_new(x): """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). 
Also see https://arxiv.org/abs/1606.08415 @@ -31,6 +25,12 @@ def gelu_new(x): return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) +if torch.__version__ < "1.4.0": + gelu = _gelu_python +else: + gelu = F.gelu + gelu_new = torch.jit.script(gelu_new) + ACT2FN = { "relu": F.relu, "swish": swish, diff --git a/src/transformers/configuration_auto.py b/src/transformers/configuration_auto.py index 3b112704ccaf..2951da44d445 100644 --- a/src/transformers/configuration_auto.py +++ b/src/transformers/configuration_auto.py @@ -24,6 +24,7 @@ from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig +from .configuration_electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig from .configuration_flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig @@ -57,6 +58,7 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ] for key, value, in pretrained_map.items() ) @@ -79,6 +81,7 @@ ("xlnet", XLNetConfig,), ("xlm", XLMConfig,), ("ctrl", CTRLConfig,), + ("electra", ElectraConfig,), ] ) @@ -133,6 +136,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - contains `xlm`: :class:`~transformers.XLMConfig` (XLM model) - contains `ctrl` : :class:`~transformers.CTRLConfig` (CTRL model) - contains `flaubert` : :class:`~transformers.FlaubertConfig` (Flaubert model) + - contains `electra` : :class:`~transformers.ElectraConfig` (ELECTRA model) Args: diff --git a/src/transformers/configuration_electra.py b/src/transformers/configuration_electra.py new file mode 100644 index 000000000000..8cfba54be092 --- /dev/null +++ b/src/transformers/configuration_electra.py @@ -0,0 +1,132 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
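As a quick illustration of the `activations.py` hunk above, here is a minimal standalone sketch (not part of the patch) checking that the TorchScript-compiled tanh approximation used by `gelu_new` stays numerically close to the exact erf-based GELU; the helper name `gelu_new_ref` is purely illustrative.

```python
# Standalone sketch (not part of the patch): the scripted tanh approximation
# of GELU should remain numerically close to the exact erf-based F.gelu.
import math

import torch
import torch.nn.functional as F


def gelu_new_ref(x):
    # Same tanh approximation as transformers.activations.gelu_new
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


scripted_gelu_new = torch.jit.script(gelu_new_ref)

x = torch.randn(8, 16)
max_gap = torch.max(torch.abs(scripted_gelu_new(x) - F.gelu(x)))
print(max_gap)  # small approximation gap; the two curves are nearly identical
```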
+""" ELECTRA model configuration """ + + +import logging + +from .configuration_utils import PretrainedConfig + + +logger = logging.getLogger(__name__) + +ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/config.json", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/config.json", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/config.json", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/config.json", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/config.json", +} + + +class ElectraConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel`. + It is used to instantiate an ELECTRA model according to the specified arguments, defining the model + architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of + the ELECTRA `google/electra-small-discriminator `__ + architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used + to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` + for more information. + + + Args: + vocab_size (:obj:`int`, optional, defaults to 30522): + Vocabulary size of the ELECTRA model. Defines the different tokens that + can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.ElectraModel`. + embedding_size (:obj:`int`, optional, defaults to 128): + Dimensionality of the encoder layers and the pooler layer. + hidden_size (:obj:`int`, optional, defaults to 256): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, optional, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, optional, defaults to 4): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, optional, defaults to 1024): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"): + The non-linear activation function (function or string) in the encoder and pooler. + If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, optional, defaults to 512): + The maximum sequence length that this model might ever be used with. + Typically set this to something large just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, optional, defaults to 2): + The vocabulary size of the `token_type_ids` passed into :class:`~transformers.ElectraModel`. 
+ initializer_range (:obj:`float`, optional, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, optional, defaults to 1e-12): + The epsilon used by the layer normalization layers. + + Example:: + + from transformers import ElectraModel, ElectraConfig + + # Initializing an ELECTRA google/electra-small-discriminator style configuration + configuration = ElectraConfig() + + # Initializing a model from the google/electra-small-discriminator style configuration + model = ElectraModel(configuration) + + # Accessing the model configuration + configuration = model.config + + Attributes: + pretrained_config_archive_map (Dict[str, str]): + A dictionary containing all the available pre-trained checkpoints. + """ + pretrained_config_archive_map = ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP + model_type = "electra" + + def __init__( + self, + vocab_size=30522, + embedding_size=128, + hidden_size=256, + num_hidden_layers=12, + num_attention_heads=4, + intermediate_size=1024, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 01f6b6554a88..67477b76d0c3 100644 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -80,6 +80,7 @@ def __init__(self, **kwargs): self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) self.length_penalty = kwargs.pop("length_penalty", 1.0) self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) + self.bad_words_ids = kwargs.pop("bad_words_ids", None) self.num_return_sequences = kwargs.pop("num_return_sequences", 1) # Fine-tuning task arguments diff --git a/src/transformers/convert_electra_original_tf_checkpoint_to_pytorch.py b/src/transformers/convert_electra_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 000000000000..1b7579524bc5 --- /dev/null +++ b/src/transformers/convert_electra_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,79 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
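To make the new `ElectraConfig` class concrete, a small usage sketch (not part of the patch): the defaults mirror `google/electra-small-discriminator`, while the overridden values below are illustrative only and do not correspond to a released checkpoint.

```python
# Sketch (not part of the patch): instantiating the new ElectraConfig.
from transformers import ElectraConfig, ElectraModel

default_config = ElectraConfig()  # small-discriminator-like defaults documented above
custom_config = ElectraConfig(hidden_size=768, num_attention_heads=12, intermediate_size=3072)

# Randomly initialised model; embedding_size (128) is projected to hidden_size (768)
# by the extra linear layer ElectraModel adds when the two sizes differ.
model = ElectraModel(custom_config)
print(default_config.hidden_size, custom_config.hidden_size)  # 256 768
```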
+"""Convert ELECTRA checkpoint.""" + + +import argparse +import logging + +import torch + +from transformers import ElectraConfig, ElectraForMaskedLM, ElectraForPreTraining, load_tf_weights_in_electra + + +logging.basicConfig(level=logging.INFO) + + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path, discriminator_or_generator): + # Initialise PyTorch model + config = ElectraConfig.from_json_file(config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + + if discriminator_or_generator == "discriminator": + model = ElectraForPreTraining(config) + elif discriminator_or_generator == "generator": + model = ElectraForMaskedLM(config) + else: + raise ValueError("The discriminator_or_generator argument should be either 'discriminator' or 'generator'") + + # Load weights from tf checkpoint + load_tf_weights_in_electra( + model, config, tf_checkpoint_path, discriminator_or_generator=discriminator_or_generator + ) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the pre-trained model. \n" + "This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--discriminator_or_generator", + default=None, + type=str, + required=True, + help="Whether to export the generator or the discriminator. 
Should be a string, either 'discriminator' or " + "'generator'.", + ) + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch( + args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path, args.discriminator_or_generator + ) diff --git a/src/transformers/convert_pytorch_checkpoint_to_tf2.py b/src/transformers/convert_pytorch_checkpoint_to_tf2.py index 3f3c923d521a..1699af5884ba 100755 --- a/src/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/src/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -25,6 +25,7 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, @@ -39,6 +40,7 @@ CamembertConfig, CTRLConfig, DistilBertConfig, + ElectraConfig, FlaubertConfig, GPT2Config, OpenAIGPTConfig, @@ -52,6 +54,7 @@ TFCTRLLMHeadModel, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, + TFElectraForPreTraining, TFFlaubertWithLMHeadModel, TFGPT2LMHeadModel, TFOpenAIGPTLMHeadModel, @@ -110,6 +113,8 @@ ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, T5ForConditionalGeneration, T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ElectraForPreTraining, + ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, ) else: ( @@ -147,6 +152,8 @@ ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, T5ForConditionalGeneration, T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ElectraForPreTraining, + ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, ) = ( None, None, @@ -182,6 +189,8 @@ None, None, None, + None, + None, ) @@ -321,6 +330,13 @@ T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP, ), + "electra": ( + ElectraConfig, + TFElectraForPreTraining, + ElectraForPreTraining, + ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, + ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, + ), } diff --git a/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 39e4b82019ef..869568580da2 100644 --- a/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -25,15 +25,8 @@ from fairseq.modules import TransformerSentenceEncoderLayer from packaging import version -from transformers.modeling_bert import ( - BertConfig, - BertIntermediate, - BertLayer, - BertOutput, - BertSelfAttention, - BertSelfOutput, -) -from transformers.modeling_roberta import RobertaForMaskedLM, RobertaForSequenceClassification +from transformers.modeling_bert import BertIntermediate, BertLayer, BertOutput, BertSelfAttention, BertSelfOutput +from transformers.modeling_roberta import RobertaConfig, RobertaForMaskedLM, RobertaForSequenceClassification if version.parse(fairseq.__version__) < version.parse("0.9.0"): @@ -55,7 +48,7 @@ def convert_roberta_checkpoint_to_pytorch( roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) roberta.eval() # disable dropout roberta_sent_encoder = roberta.model.decoder.sentence_encoder - config = BertConfig( + config = RobertaConfig( vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings, hidden_size=roberta.args.encoder_embed_dim, num_hidden_layers=roberta.args.encoder_layers, @@ -138,7 +131,7 @@ def convert_roberta_checkpoint_to_pytorch( model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias model.lm_head.decoder.weight = 
roberta.model.decoder.lm_head.weight - model.lm_head.bias = roberta.model.decoder.lm_head.bias + model.lm_head.decoder.bias = roberta.model.decoder.lm_head.bias # Let's check that we get the same results. input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py index 82fec40223f5..ac929decbb10 100644 --- a/src/transformers/data/processors/utils.py +++ b/src/transformers/data/processors/utils.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) -@dataclass(frozen=True) +@dataclass(frozen=False) class InputExample: """ A single training/test example for simple sequence classification. diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py index 7661a3615485..8b21c619d0ce 100644 --- a/src/transformers/modelcard.py +++ b/src/transformers/modelcard.py @@ -35,8 +35,8 @@ logger = logging.getLogger(__name__) -class ModelCard(object): - r""" Model Card class. +class ModelCard: + r""" Structured Model Card class. Store model card as well as methods for loading/downloading/saving model cards. Please read the following paper for details and explanation on the sections: @@ -93,7 +93,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing a mode card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing a model card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``. cache_dir: (`optional`) string: @@ -163,33 +163,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): # Load model card modelcard = cls.from_json_file(resolved_model_card_file) - except EnvironmentError: - if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: - logger.warning("Couldn't reach server at '{}' to download model card file.".format(model_card_file)) - else: - logger.warning( - "Model name '{}' was not found in model name list ({}). " - "We assumed '{}' was a path or url to a model card file named {} or " - "a directory containing such a file but couldn't find any such file at this path or url.".format( - pretrained_model_name_or_path, - ", ".join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), - model_card_file, - MODEL_CARD_NAME, - ) - ) - logger.warning("Creating an empty model card.") - - # We fall back on creating an empty model card - modelcard = cls() - - except json.JSONDecodeError: - logger.warning( - "Couldn't reach server at '{}' to download model card file or " - "model card file is not a valid JSON file. 
" - "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file) - ) - logger.warning("Creating an empty model card.") - + except (EnvironmentError, json.JSONDecodeError): # We fall back on creating an empty model card modelcard = cls() diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py index 016767369f1a..89fbe6e59d08 100644 --- a/src/transformers/modeling_auto.py +++ b/src/transformers/modeling_auto.py @@ -26,6 +26,7 @@ CamembertConfig, CTRLConfig, DistilBertConfig, + ElectraConfig, FlaubertConfig, GPT2Config, OpenAIGPTConfig, @@ -76,6 +77,13 @@ DistilBertForTokenClassification, DistilBertModel, ) +from .modeling_electra import ( + ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, + ElectraForMaskedLM, + ElectraForPreTraining, + ElectraForTokenClassification, + ElectraModel, +) from .modeling_flaubert import ( FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP, FlaubertForQuestionAnsweringSimple, @@ -141,6 +149,7 @@ T5_PRETRAINED_MODEL_ARCHIVE_MAP, FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP, ] for key, value, in pretrained_map.items() ) @@ -162,6 +171,7 @@ (FlaubertConfig, FlaubertModel), (XLMConfig, XLMModel), (CTRLConfig, CTRLModel), + (ElectraConfig, ElectraModel), ] ) @@ -182,6 +192,7 @@ (FlaubertConfig, FlaubertWithLMHeadModel), (XLMConfig, XLMWithLMHeadModel), (CTRLConfig, CTRLLMHeadModel), + (ElectraConfig, ElectraForPreTraining), ] ) @@ -202,6 +213,7 @@ (FlaubertConfig, FlaubertWithLMHeadModel), (XLMConfig, XLMWithLMHeadModel), (CTRLConfig, CTRLLMHeadModel), + (ElectraConfig, ElectraForMaskedLM), ] ) @@ -242,6 +254,7 @@ (BertConfig, BertForTokenClassification), (XLNetConfig, XLNetForTokenClassification), (AlbertConfig, AlbertForTokenClassification), + (ElectraConfig, ElectraForTokenClassification), ] ) @@ -281,7 +294,8 @@ def from_config(cls, config): - isInstance of `transfo-xl` configuration class: :class:`~transformers.TransfoXLModel` (Transformer-XL model) - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers.XLMModel` (XLM model) - - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertModel` (XLM model) + - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertModel` (Flaubert model) + - isInstance of `electra` configuration class: :class:`~transformers.ElectraModel` (Electra model) Examples:: @@ -322,7 +336,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - contains `xlnet`: :class:`~transformers.XLNetModel` (XLNet model) - contains `xlm`: :class:`~transformers.XLMModel` (XLM model) - contains `ctrl`: :class:`~transformers.CTRLModel` (Salesforce CTRL model) - - contains `flaubert`: :class:`~transformers.Flaubert` (Flaubert model) + - contains `flaubert`: :class:`~transformers.FlaubertModel` (Flaubert model) + - contains `electra`: :class:`~transformers.ElectraModel` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` @@ -430,6 +445,7 @@ def from_config(cls, config): - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetLMHeadModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers.XLMWithLMHeadModel` (XLM model) - isInstance of `flaubert` configuration class: 
:class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model) + - isInstance of `electra` configuration class: :class:`~transformers.ElectraForPreTraining` (Electra model) Examples:: @@ -470,6 +486,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - contains `xlm`: :class:`~transformers.XLMWithLMHeadModel` (XLM model) - contains `ctrl`: :class:`~transformers.CTRLLMHeadModel` (Salesforce CTRL model) - contains `flaubert`: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model) + - contains `electra`: :class:`~transformers.ElectraForPreTraining` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` @@ -571,6 +588,7 @@ def from_config(cls, config): - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetLMHeadModel` (XLNet model) - isInstance of `xlm` configuration class: :class:`~transformers.XLMWithLMHeadModel` (XLM model) - isInstance of `flaubert` configuration class: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model) + - isInstance of `electra` configuration class: :class:`~transformers.ElectraForMaskedLM` (Electra model) Examples:: @@ -612,6 +630,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - contains `xlm`: :class:`~transformers.XLMWithLMHeadModel` (XLM model) - contains `ctrl`: :class:`~transformers.CTRLLMHeadModel` (Salesforce CTRL model) - contains `flaubert`: :class:`~transformers.FlaubertWithLMHeadModel` (Flaubert model) + - contains `electra`: :class:`~transformers.ElectraForMaskedLM` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` @@ -998,6 +1017,7 @@ def from_config(cls, config): - isInstance of `xlnet` configuration class: :class:`~transformers.XLNetModelForTokenClassification` (XLNet model) - isInstance of `camembert` configuration class: :class:`~transformers.CamembertModelForTokenClassification` (Camembert model) - isInstance of `roberta` configuration class: :class:`~transformers.RobertaModelForTokenClassification` (Roberta model) + - isInstance of `electra` configuration class: :class:`~transformers.ElectraForTokenClassification` (Electra model) Examples:: @@ -1035,6 +1055,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - contains `bert`: :class:`~transformers.BertForTokenClassification` (Bert model) - contains `xlnet`: :class:`~transformers.XLNetForTokenClassification` (XLNet model) - contains `roberta`: :class:`~transformers.RobertaForTokenClassification` (Roberta model) + - contains `electra`: :class:`~transformers.ElectraForTokenClassification` (Electra model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py index cf88f9d5942e..bec39811b6f5 100644 --- a/src/transformers/modeling_bart.py +++ b/src/transformers/modeling_bart.py @@ -73,7 +73,7 @@ Mask to avoid performing attention on padding token indices in input_ids. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 
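The `modeling_auto.py` additions above wire ELECTRA into the auto classes; the sketch below (not part of the patch) shows the intended effect for the discriminator checkpoint referenced elsewhere in this diff. Note that a token-classification head loaded this way starts from a randomly initialised classifier layer.

```python
# Sketch (not part of the patch): the auto classes now resolve ELECTRA checkpoints.
from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification

config = AutoConfig.from_pretrained("google/electra-small-discriminator")
print(type(config).__name__)  # ElectraConfig

model = AutoModel.from_pretrained("google/electra-small-discriminator")
print(type(model).__name__)  # ElectraModel

tagger = AutoModelForTokenClassification.from_pretrained("google/electra-small-discriminator")
print(type(tagger).__name__)  # ElectraForTokenClassification
```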
- encoder_outputs (tuple(:obj:`tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. @@ -116,7 +116,6 @@ class PretrainedBartModel(PreTrainedModel): config_class = BartConfig base_model_prefix = "model" pretrained_model_archive_map = BART_PRETRAINED_MODEL_ARCHIVE_MAP - encoder_outputs_batch_dim_idx = 1 # outputs shaped (seq_len, bs, ...) def _init_weights(self, module): std = self.config.init_std @@ -294,7 +293,10 @@ def forward( if self.output_hidden_states: encoder_states.append(x) + # T x B x C -> B x T x C encoder_states = [hidden_state.transpose(0, 1) for hidden_state in encoder_states] + x = x.transpose(0, 1) + return x, encoder_states, all_attentions @@ -448,7 +450,11 @@ def forward( x = self.layernorm_embedding(x) x = F.dropout(x, p=self.dropout, training=self.training) - x = x.transpose(0, 1) # (seq_len, BS, model_dim) + + # Convert to Bart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) + x = x.transpose(0, 1) + encoder_hidden_states = encoder_hidden_states.transpose(0, 1) + # decoder layers all_hidden_states = () all_self_attns = () @@ -477,9 +483,10 @@ def forward( if self.output_attentions: all_self_attns += (layer_self_attn,) - # Convert shapes from (seq_len, BS, model_dim) to (BS, seq_len, model_dim) + # Convert to standart output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim) all_hidden_states = [hidden_state.transpose(0, 1) for hidden_state in all_hidden_states] x = x.transpose(0, 1) + encoder_hidden_states = encoder_hidden_states.transpose(0, 1) if self.output_past: next_cache = ((encoder_hidden_states, encoder_padding_mask), next_decoder_cache) @@ -805,6 +812,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared def get_output_embeddings(self): return _make_linear_from_emb(self.shared) # make it on the fly @@ -928,10 +937,9 @@ def _reorder_cache(past, beam_idx): layer_past_new = { attn_key: _reorder_buffer(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items() } - # reordered_layer_past = [layer_past[:, i].unsqueeze(1).clone().detach() for i in beam_idx] - # reordered_layer_past = torch.cat(reordered_layer_past, dim=1) reordered_past.append(layer_past_new) - new_enc_out = enc_out if enc_out is None else enc_out.index_select(1, beam_idx) + + new_enc_out = enc_out if enc_out is None else enc_out.index_select(0, beam_idx) new_enc_mask = enc_mask if enc_mask is None else enc_mask.index_select(0, beam_idx) past = ((new_enc_out, new_enc_mask), reordered_past) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 47d7b2301f94..d0231d5bd18f 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -183,7 +183,7 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs class BertSelfAttention(nn.Module): def __init__(self, config): super().__init__() - if config.hidden_size % config.num_attention_heads != 0: + if config.hidden_size % config.num_attention_heads != 0 and not 
hasattr(config, "embedding_size"): raise ValueError( "The hidden size (%d) is not a multiple of the number of attention " "heads (%d)" % (config.hidden_size, config.num_attention_heads) @@ -845,7 +845,7 @@ def forward( Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, 2)`): + seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): @@ -1048,7 +1048,7 @@ def forward( :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): Next sequence prediction (classification) loss. - seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, 2)`): + seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py new file mode 100644 index 000000000000..85996cb6a17b --- /dev/null +++ b/src/transformers/modeling_electra.py @@ -0,0 +1,671 @@ +import logging +import os + +import torch +import torch.nn as nn + +from transformers import ElectraConfig, add_start_docstrings +from transformers.activations import get_activation + +from .file_utils import add_start_docstrings_to_callable +from .modeling_bert import BertEmbeddings, BertEncoder, BertLayerNorm, BertPreTrainedModel + + +logger = logging.getLogger(__name__) + + +ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP = { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/pytorch_model.bin", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/pytorch_model.bin", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/pytorch_model.bin", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/pytorch_model.bin", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/pytorch_model.bin", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/pytorch_model.bin", +} + + +def load_tf_weights_in_electra(model, config, tf_checkpoint_path, discriminator_or_generator="discriminator"): + """ Load tf checkpoints in a pytorch model. 
+ """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + for name, array in zip(names, arrays): + original_name: str = name + + try: + if isinstance(model, ElectraForMaskedLM): + name = name.replace("electra/embeddings/", "generator/embeddings/") + + if discriminator_or_generator == "generator": + name = name.replace("electra/", "discriminator/") + name = name.replace("generator/", "electra/") + + name = name.replace("dense_1", "dense_prediction") + name = name.replace("generator_predictions/output_bias", "generator_lm_head/bias") + + name = name.split("/") + # print(original_name, name) + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["global_step", "temperature"] for n in name): + logger.info("Skipping {}".format(original_name)) + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + pointer = getattr(pointer, scope_names[0]) + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name.endswith("_embeddings"): + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert pointer.shape == array.shape, original_name + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name), original_name) + pointer.data = torch.from_numpy(array) + except AttributeError as e: + print("Skipping {}".format(original_name), name, e) + continue + return model + + +class ElectraEmbeddings(BertEmbeddings): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__(config) + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.embedding_size, eps=config.layer_norm_eps) + + +class ElectraDiscriminatorPredictions(nn.Module): + """Prediction module for the discriminator, made up of two dense layers.""" + + def __init__(self, config): + 
super().__init__() + + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dense_prediction = nn.Linear(config.hidden_size, 1) + self.config = config + + def forward(self, discriminator_hidden_states, attention_mask): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = get_activation(self.config.hidden_act)(hidden_states) + logits = self.dense_prediction(hidden_states).squeeze() + + return logits + + +class ElectraGeneratorPredictions(nn.Module): + """Prediction module for the generator, made up of two dense layers.""" + + def __init__(self, config): + super().__init__() + + self.LayerNorm = BertLayerNorm(config.embedding_size) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + + def forward(self, generator_hidden_states): + hidden_states = self.dense(generator_hidden_states) + hidden_states = get_activation("gelu")(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +class ElectraPreTrainedModel(BertPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + + config_class = ElectraConfig + pretrained_model_archive_map = ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_electra + base_model_prefix = "electra" + + def get_extended_attention_mask(self, attention_mask, input_shape, device): + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + causal_mask = causal_mask.to( + attention_mask.dtype + ) # causal and attention masks must have same type with pytorch version < 1.3 + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + return extended_attention_mask + + def get_head_mask(self, head_mask): + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + num_hidden_layers = self.config.num_hidden_layers + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = ( + head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) + ) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype + ) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + +ELECTRA_START_DOCSTRING = r""" + This model is a PyTorch `torch.nn.Module `_ sub-class. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general + usage and behavior. + + Parameters: + config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +ELECTRA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.ElectraTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): + Mask to nullify selected heads of the self-attention modules. 
+ Mask values selected in ``[0, 1]``: + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask + is used in the cross-attention if the model is configured as a decoder. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. +""" + + +@add_start_docstrings( + "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " + "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " + "hidden size and embedding size are different." + "" + "Both the generator and discriminator checkpoints may be loaded into this model.", + ELECTRA_START_DOCSTRING, +) +class ElectraModel(ElectraPreTrainedModel): + + config_class = ElectraConfig + + def __init__(self, config): + super().__init__(config) + self.embeddings = ElectraEmbeddings(config) + + if config.embedding_size != config.hidden_size: + self.embeddings_project = nn.Linear(config.embedding_size, config.hidden_size) + + self.encoder = BertEncoder(config) + self.config = config + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + ): + r""" + Return: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + from transformers import ElectraModel, ElectraTokenizer + import torch + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = ElectraModel.from_pretrained('google/electra-small-discriminator') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + head_mask = self.get_head_mask(head_mask) + + hidden_states = self.embeddings( + input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states) + + hidden_states = self.encoder(hidden_states, attention_mask=extended_attention_mask, head_mask=head_mask) + + return hidden_states + + +@add_start_docstrings( + """ + Electra model with a binary classification head on top as used during pre-training for identifying generated + tokens. + + It is recommended to load the discriminator checkpoint into that model.""", + ELECTRA_START_DOCSTRING, +) +class ElectraForPreTraining(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.discriminator_predictions = ElectraDiscriminatorPredictions(config) + self.init_weights() + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + r""" + labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): + Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates the token is an original token, + ``1`` indicates the token was replaced. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total loss of the ELECTRA objective. 
+ scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`) + Prediction scores of the head (scores for each token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + + Examples:: + + from transformers import ElectraTokenizer, ElectraForPreTraining + import torch + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + + scores = outputs[0] # Replaced-token detection scores are the first element of the output tuple + + """ + + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds + ) + discriminator_sequence_output = discriminator_hidden_states[0] + + logits = self.discriminator_predictions(discriminator_sequence_output, attention_mask) + + output = (logits,) + + if labels is not None: + loss_fct = nn.BCEWithLogitsLoss() + if attention_mask is not None: + active_loss = attention_mask.view(-1, discriminator_sequence_output.shape[1]) == 1 + active_logits = logits.view(-1, discriminator_sequence_output.shape[1])[active_loss] + active_labels = labels[active_loss] + loss = loss_fct(active_logits, active_labels.float()) + else: + loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float()) + + output = (loss,) + output + + output += discriminator_hidden_states[1:] + + return output # (loss), scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """ + Electra model with a language modeling head on top. + + Even though both the discriminator and generator may be loaded into this model, the generator is + the only model of the two to have been trained for the masked language modeling task.""", + ELECTRA_START_DOCSTRING, +) +class ElectraForMaskedLM(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.generator_predictions = ElectraGeneratorPredictions(config) + + self.generator_lm_head = nn.Linear(config.embedding_size, config.vocab_size) + self.init_weights() + + def get_output_embeddings(self): + return self.generator_lm_head + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + masked_lm_labels=None, + ): + r""" + masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss.
+ Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + from transformers import ElectraTokenizer, ElectraForMaskedLM + import torch + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') + model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + + loss, prediction_scores = outputs[:2] + + """ + + generator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds + ) + generator_sequence_output = generator_hidden_states[0] + + prediction_scores = self.generator_predictions(generator_sequence_output) + prediction_scores = self.generator_lm_head(prediction_scores) + + output = (prediction_scores,) + + # Masked language modeling softmax layer + if masked_lm_labels is not None: + loss_fct = nn.CrossEntropyLoss() # -100 index = padding token + loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + output = (loss,) + output + + output += generator_hidden_states[1:] + + return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """ + Electra model with a token classification head on top. 
+ + Both the discriminator and generator may be loaded into this model.""", + ELECTRA_START_DOCSTRING, +) +class ElectraForTokenClassification(ElectraPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.electra = ElectraModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ + Examples:: + + from transformers import ElectraTokenizer, ElectraForTokenClassification + import torch + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = ElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + + loss, scores = outputs[:2] + + """ + + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds + ) + discriminator_sequence_output = discriminator_hidden_states[0] + + discriminator_sequence_output = self.dropout(discriminator_sequence_output) + logits = self.classifier(discriminator_sequence_output) + + output = (logits,) + + if labels is not None: + loss_fct = nn.CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.config.num_labels)[active_loss] + active_labels = labels.view(-1)[active_loss] + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + output = (loss,) + output + + output += discriminator_hidden_states[1:] + + return output # (loss), scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 94fb3ac1db91..c89fc46113f7 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -104,7 +104,10 @@ def __init__(self, nx, n_ctx, config, scale=False): n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 - self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.register_buffer( + "bias", torch.tril(torch.ones((n_ctx, n_ctx), dtype=torch.uint8)).view(1, 1, n_ctx, n_ctx) + ) + self.register_buffer("masked_bias", torch.tensor(-1e4)) self.n_head = config.n_head self.split_size = n_state self.scale = scale @@ -142,8 +145,8 @@ def _attn(self, q, k, v, attention_mask=None, head_mask=None): if self.scale: w = w / math.sqrt(v.size(-1)) nd, ns = w.size(-2), w.size(-1) - b = self.bias[:, :, ns - nd : ns, :ns] - w = w * b - 1e4 * (1 - b) + mask = self.bias[:, :, ns - nd : ns, :ns] + w = torch.where(mask, w, self.masked_bias) if attention_mask is not None: # Apply the attention mask diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index cdfa1ce91704..5235629c60aa 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -457,7 +457,6 @@ class T5PreTrainedModel(PreTrainedModel): pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_t5 base_model_prefix = "transformer" - encoder_outputs_batch_dim_idx = 0 # outputs shaped (bs, ...) 
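A minimal, self-contained sketch of the loss masking used in `ElectraForTokenClassification.forward` above: when an `attention_mask` is given, only the non-padding positions contribute to the cross-entropy (toy shapes and random tensors, not real model outputs):

```python
import torch
from torch import nn

batch_size, seq_len, num_labels = 2, 6, 3
logits = torch.randn(batch_size, seq_len, num_labels)          # stand-in for the classifier output
labels = torch.randint(0, num_labels, (batch_size, seq_len))   # gold label per token
attention_mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                               [1, 1, 1, 0, 0, 0]])            # 0 marks padding

loss_fct = nn.CrossEntropyLoss()

# Keep only the "active" (non-padded) positions before computing the loss,
# exactly the pattern used in the forward pass above.
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, num_labels)[active_loss]
active_labels = labels.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
print(loss.item())
```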
@property def dummy_inputs(self): @@ -502,6 +501,27 @@ def _init_weights(self, module): if module.has_relative_attention_bias: module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor * ((d_model) ** -0.5)) + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert ( + decoder_start_token_id is not None + ), "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. See T5 docs for more information" + + # shift inputs to the right + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[..., 1:] = input_ids[..., :-1].clone() + shifted_input_ids[..., 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in lm_labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + assert torch.all(shifted_input_ids >= 0).item(), "Verify that `lm_labels` has only positive values and -100" + + return shifted_input_ids + class T5Stack(T5PreTrainedModel): def __init__(self, config, embed_tokens=None): @@ -699,32 +719,24 @@ def forward( Args: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. - To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows: - - (a) For sequence pairs: - - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - - (b) For single sequences: - - ``tokens: [CLS] the dog is hairy . [SEP]`` - T5 is a model with relative position embeddings so you should be able to pad the inputs on - the right or the left. - Indices can be obtained using :class:`transformers.T5Tokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + To know more on how to prepare :obj:`input_ids` for pre-training take a look at + `T5 Training <./t5.html#training>`_ . attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - encoder_outputs (tuple(:obj:`tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation. + To know more on how to prepare :obj:`decoder_input_ids` for pre-training take a look at + `T5 Training <./t5.html#training>`_ . decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`, defaults to :obj:`None`): Default behavior: generate a tensor that ignores pad tokens in decoder_input_ids. 
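A worked toy example of what `_shift_right` produces from `lm_labels`: prepend `decoder_start_token_id`, drop the last position, and replace any `-100` by `pad_token_id` (both ids are set to 0 here, which is the usual T5 setting):

```python
import torch

decoder_start_token_id = 0   # for T5 this is usually the pad token id
pad_token_id = 0

lm_labels = torch.tensor([[37, 629, 19, 1627, 5, 1, -100, -100]])  # -100 marks ignored positions

shifted_input_ids = lm_labels.new_zeros(lm_labels.shape)
shifted_input_ids[..., 1:] = lm_labels[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

print(shifted_input_ids)  # tensor([[   0,   37,  629,   19, 1627,    5,    1,    0]])
```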
Causal mask will also be used by default. inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): @@ -887,8 +899,9 @@ def forward( r""" lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.vocab_size - 1]`. - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`. + All labels set to ``-100`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs. @@ -931,6 +944,10 @@ def forward( hidden_states = encoder_outputs[0] + if lm_labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(lm_labels) + # Decode decoder_outputs = self.decoder( input_ids=decoder_input_ids, @@ -949,10 +966,8 @@ def forward( decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here if lm_labels is not None: - shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) decoder_outputs = ( loss, ) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py new file mode 100644 index 000000000000..58763be59335 --- /dev/null +++ b/src/transformers/modeling_tf_electra.py @@ -0,0 +1,615 @@ +import logging + +import tensorflow as tf + +from transformers import ElectraConfig + +from .file_utils import add_start_docstrings, add_start_docstrings_to_callable +from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel +from .modeling_tf_utils import get_initializer, shape_list + + +logger = logging.getLogger(__name__) + + +TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP = { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/tf_model.h5", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/tf_model.h5", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/tf_model.h5", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/tf_model.h5", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/tf_model.h5", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/tf_model.h5", +} + + +class TFElectraEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings. 
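With the decoder inputs now derived internally, a supervised forward pass only needs `input_ids` and `lm_labels`; the model right-shifts the labels itself and computes the cross-entropy with `ignore_index=-100`. A usage sketch (the `t5-small` checkpoint and the `(loss, lm_logits, ...)` output layout follow the surrounding code):

```python
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

input_ids = tokenizer.encode("translate English to German: The house is wonderful.", return_tensors="pt")
lm_labels = tokenizer.encode("Das Haus ist wunderbar.", return_tensors="pt")

# No decoder_input_ids are passed: they are built from lm_labels via _shift_right,
# and the loss is computed position-for-position against the unshifted labels.
outputs = model(input_ids=input_ids, lm_labels=lm_labels)
loss, lm_logits = outputs[:2]
print(loss.item(), lm_logits.shape)
```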
+ """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.embedding_size = config.embedding_size + self.initializer_range = config.initializer_range + + self.position_embeddings = tf.keras.layers.Embedding( + config.max_position_embeddings, + config.embedding_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="position_embeddings", + ) + self.token_type_embeddings = tf.keras.layers.Embedding( + config.type_vocab_size, + config.embedding_size, + embeddings_initializer=get_initializer(self.initializer_range), + name="token_type_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + def build(self, input_shape): + """Build shared word embedding layer """ + with tf.name_scope("word_embeddings"): + # Create and initialize weights. The random normal initializer was chosen + # arbitrarily, and works well. + self.word_embeddings = self.add_weight( + "weight", + shape=[self.vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + super().build(input_shape) + + def call(self, inputs, mode="embedding", training=False): + """Get token embeddings of inputs. + Args: + inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + mode: string, a valid value is one of "embedding" and "linear". + Returns: + outputs: (1) If mode == "embedding", output embedding tensor, float32 with + shape [batch_size, length, embedding_size]; (2) mode == "linear", output + linear tensor, float32 with shape [batch_size, length, vocab_size]. + Raises: + ValueError: if mode is not valid. + + Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + if mode == "embedding": + return self._embedding(inputs, training=training) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError("mode {} is not valid.".format(mode)) + + def _embedding(self, inputs, training=False): + """Applies embedding based on inputs tensor.""" + input_ids, position_ids, token_type_ids, inputs_embeds = inputs + + if input_ids is not None: + input_shape = shape_list(input_ids) + else: + input_shape = shape_list(inputs_embeds)[:-1] + + seq_length = input_shape[1] + if position_ids is None: + position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + if inputs_embeds is None: + inputs_embeds = tf.gather(self.word_embeddings, input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def _linear(self, inputs): + """Computes logits by running inputs through a linear layer. + Args: + inputs: A float32 tensor with shape [batch_size, length, hidden_size] + Returns: + float32 tensor with shape [batch_size, length, vocab_size]. 
+ """ + batch_size = shape_list(inputs)[0] + length = shape_list(inputs)[1] + + x = tf.reshape(inputs, [-1, self.embedding_size]) + logits = tf.matmul(x, self.word_embeddings, transpose_b=True) + + return tf.reshape(logits, [batch_size, length, self.vocab_size]) + + +class TFElectraDiscriminatorPredictions(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") + self.dense_prediction = tf.keras.layers.Dense(1, name="dense_prediction") + self.config = config + + def call(self, discriminator_hidden_states, training=False): + hidden_states = self.dense(discriminator_hidden_states) + hidden_states = ACT2FN[self.config.hidden_act](hidden_states) + logits = tf.squeeze(self.dense_prediction(hidden_states)) + + return logits + + +class TFElectraGeneratorPredictions(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dense = tf.keras.layers.Dense(config.embedding_size, name="dense") + + def call(self, generator_hidden_states, training=False): + hidden_states = self.dense(generator_hidden_states) + hidden_states = ACT2FN["gelu"](hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +class TFElectraPreTrainedModel(TFBertPreTrainedModel): + + config_class = ElectraConfig + pretrained_model_archive_map = TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "electra" + + def get_extended_attention_mask(self, attention_mask, input_shape): + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + + extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + return extended_attention_mask + + def get_head_mask(self, head_mask): + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + return head_mask + + +class TFElectraMainLayer(TFElectraPreTrainedModel): + + config_class = ElectraConfig + + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + self.embeddings = TFElectraEmbeddings(config, name="embeddings") + + if config.embedding_size != config.hidden_size: + self.embeddings_project = tf.keras.layers.Dense(config.hidden_size, name="embeddings_project") + self.encoder = TFBertEncoder(config, name="encoder") + self.config = config + + def get_input_embeddings(self): + return self.embeddings + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
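The additive mask built by `get_extended_attention_mask` can be illustrated on its own: a 2D padding mask is broadcast to `[batch_size, 1, 1, seq_len]` and mapped to 0 for real tokens and -10000 for padding, so adding it to the raw attention scores effectively removes the padded keys (a standalone sketch):

```python
import tensorflow as tf

attention_mask = tf.constant([[1, 1, 1, 0, 0]])            # 1 = real token, 0 = padding

extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]   # [batch, 1, 1, seq_len]
extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

print(extended_attention_mask.numpy())  # 0. for real tokens, -10000. for padded positions
```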
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError + + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + inputs_embeds = inputs[5] if len(inputs) > 5 else inputs_embeds + assert len(inputs) <= 6, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + token_type_ids = inputs.get("token_type_ids", token_type_ids) + position_ids = inputs.get("position_ids", position_ids) + head_mask = inputs.get("head_mask", head_mask) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + assert len(inputs) <= 6, "Too many inputs." + else: + input_ids = inputs + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.fill(input_shape, 1) + if token_type_ids is None: + token_type_ids = tf.fill(input_shape, 0) + + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + head_mask = self.get_head_mask(head_mask) + + hidden_states = self.embeddings([input_ids, position_ids, token_type_ids, inputs_embeds], training=training) + + if hasattr(self, "embeddings_project"): + hidden_states = self.embeddings_project(hidden_states, training=training) + + hidden_states = self.encoder([hidden_states, extended_attention_mask, head_mask], training=training) + + return hidden_states + + +ELECTRA_START_DOCSTRING = r""" + This model is a `tf.keras.Model `__ sub-class. + Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :obj:`tf.keras.Model.fit()` method which currently requires having + all the tensors in the first argument of the model call function: :obj:`model(inputs)`. 
+ + If you choose this second option, there are three possibilities you can use to gather all the input Tensors + in the first positional argument : + + - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.ElectraConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +ELECTRA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.ElectraTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, embedding_dim)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + training (:obj:`boolean`, `optional`, defaults to :obj:`False`): + Whether to activate dropout modules (if set to :obj:`True`) during training or to de-activate them + (if set to :obj:`False`) for evaluation. + +""" + + +@add_start_docstrings( + "The bare Electra Model transformer outputting raw hidden-states without any specific head on top. Identical to " + "the BERT model except that it uses an additional linear layer between the embedding layer and the encoder if the " + "hidden size and embedding size are different." 
+ "" + "Both the generator and discriminator checkpoints may be loaded into this model.", + ELECTRA_START_DOCSTRING, +) +class TFElectraModel(TFElectraPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.electra = TFElectraMainLayer(config, name="electra") + + def get_input_embeddings(self): + return self.electra.embeddings + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call(self, inputs, **kwargs): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraModel + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraModel.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + """ + outputs = self.electra(inputs, **kwargs) + return outputs + + +@add_start_docstrings( + """ +Electra model with a binary classification head on top as used during pre-training for identifying generated +tokens. + +Even though both the discriminator and generator may be loaded into this model, the discriminator is +the only model of the two to have the correct classification head to be used for this model.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForPreTraining(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.discriminator_predictions = TFElectraDiscriminatorPredictions(config, name="discriminator_predictions") + + def get_input_embeddings(self): + return self.electra.embeddings + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Prediction scores of the head (scores for each token before SoftMax). 
+ hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForPreTraining + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + """ + + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = discriminator_hidden_states[0] + logits = self.discriminator_predictions(discriminator_sequence_output) + output = (logits,) + output += discriminator_hidden_states[1:] + + return output # (loss), scores, (hidden_states), (attentions) + + +class TFElectraMaskedLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + self.vocab_size = config.vocab_size + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + super().build(input_shape) + + def call(self, hidden_states, training=False): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +@add_start_docstrings( + """ +Electra model with a language modeling head on top. 
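The pre-training head above emits one logit per token; thresholding it (sigmoid > 0.5, i.e. logit > 0) gives a per-token guess of whether the generator replaced that token. A usage sketch with a deliberately corrupted input (batch size 1, so the squeeze leaves a 1-D tensor of logits):

```python
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForPreTraining

tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')

# "rock" plays the role of a token swapped in by the generator.
input_ids = tf.constant(tokenizer.encode("Hello, my rock is cute"))[None, :]
logits = model(input_ids)[0]                                    # shape (sequence_length,)
predictions = tf.cast(tf.nn.sigmoid(logits) > 0.5, tf.int32)    # 1 = predicted "replaced"
print(predictions.numpy())
```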
+ +Even though both the discriminator and generator may be loaded into this model, the generator is +the only model of the two to have been trained for the masked language modeling task.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForMaskedLM(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.vocab_size = config.vocab_size + self.electra = TFElectraMainLayer(config, name="electra") + self.generator_predictions = TFElectraGeneratorPredictions(config, name="generator_predictions") + if isinstance(config.hidden_act, str): + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = config.hidden_act + self.generator_lm_head = TFElectraMaskedLMHead(config, self.electra.embeddings, name="generator_lm_head") + + def get_input_embeddings(self): + return self.electra.embeddings + + def get_output_embeddings(self): + return self.generator_lm_head + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForMaskedLM + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator') + model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores = outputs[0] + + """ + + generator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + generator_sequence_output = generator_hidden_states[0] + prediction_scores = self.generator_predictions(generator_sequence_output, training=training) + prediction_scores = self.generator_lm_head(prediction_scores, training=training) + output = (prediction_scores,) + output += generator_hidden_states[1:] + + return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings( + """ +Electra model with a token classification head on top. 
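A usage sketch for the generator checkpoint as a masked language model, decoding the highest-scoring vocabulary token at the `[MASK]` position (the standard BERT-style mask token is assumed, as used by `ElectraTokenizer`):

```python
import tensorflow as tf
from transformers import ElectraTokenizer, TFElectraForMaskedLM

tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = TFElectraForMaskedLM.from_pretrained('google/electra-small-generator')

text = "Hello, my dog is [MASK]"
token_ids = tokenizer.encode(text, add_special_tokens=True)
input_ids = tf.constant(token_ids)[None, :]                     # batch size 1

prediction_scores = model(input_ids)[0]                         # (1, seq_len, vocab_size)

mask_index = token_ids.index(tokenizer.convert_tokens_to_ids(tokenizer.mask_token))
predicted_id = int(tf.argmax(prediction_scores[0, mask_index]))
print(tokenizer.decode([predicted_id]))
```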
+ +Both the discriminator and generator may be loaded into this model.""", + ELECTRA_START_DOCSTRING, +) +class TFElectraForTokenClassification(TFElectraPreTrainedModel): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.electra = TFElectraMainLayer(config, name="electra") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense(config.num_labels, name="classifier") + + @add_start_docstrings_to_callable(ELECTRA_INPUTS_DOCSTRING) + def call( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + training=False, + ): + r""" + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: + scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import ElectraTokenizer, TFElectraForTokenClassification + + tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator') + model = TFElectraForTokenClassification.from_pretrained('google/electra-small-discriminator') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + """ + + discriminator_hidden_states = self.electra( + input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, training=training + ) + discriminator_sequence_output = discriminator_hidden_states[0] + discriminator_sequence_output = self.dropout(discriminator_sequence_output) + logits = self.classifier(discriminator_sequence_output) + output = (logits,) + output += discriminator_hidden_states[1:] + + return output # (loss), scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index ddc6b7a801b0..eede1fd675b7 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -119,7 +119,7 @@ def __init__(self, config, has_relative_attention_bias=False, **kwargs): if self.has_relative_attention_bias: self.relative_attention_bias = tf.keras.layers.Embedding( - self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias" + self.relative_attention_num_buckets, self.n_heads, name="relative_attention_bias", ) self.pruned_heads = set() @@ -178,13 +178,15 @@ def compute_bias(self, qlen, klen): memory_position = tf.range(klen)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) rp_bucket = self._relative_position_bucket( - relative_position, bidirectional=not self.is_decoder, 
num_buckets=self.relative_attention_num_buckets + relative_position, bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets, ) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) return values - def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False): + def call( + self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False, + ): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). """ @@ -261,15 +263,17 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.SelfAttention = TFT5Attention( - config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention" + config, has_relative_attention_bias=has_relative_attention_bias, name="SelfAttention", ) self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) - def call(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, training=False): + def call( + self, hidden_states, attention_mask=None, position_bias=None, head_mask=None, training=False, + ): norm_x = self.layer_norm(hidden_states) attention_output = self.SelfAttention( - norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, training=training + norm_x, mask=attention_mask, position_bias=position_bias, head_mask=head_mask, training=training, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) @@ -281,15 +285,17 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer): def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.EncDecAttention = TFT5Attention( - config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention" + config, has_relative_attention_bias=has_relative_attention_bias, name="EncDecAttention", ) self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="layer_norm") self.dropout = tf.keras.layers.Dropout(config.dropout_rate) - def call(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, training=False): + def call( + self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None, training=False, + ): norm_x = self.layer_norm(hidden_states) attention_output = self.EncDecAttention( - norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, training=training + norm_x, mask=attention_mask, kv=kv, position_bias=position_bias, head_mask=head_mask, training=training, ) y = attention_output[0] layer_output = hidden_states + self.dropout(y, training=training) @@ -303,12 +309,12 @@ def __init__(self, config, has_relative_attention_bias=False, **kwargs): self.is_decoder = config.is_decoder self.layer = [] self.layer.append( - TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0") + TFT5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._0",) ) if self.is_decoder: self.layer.append( TFT5LayerCrossAttention( - config, has_relative_attention_bias=has_relative_attention_bias, name="layer_._1" + config, has_relative_attention_bias=has_relative_attention_bias, 
name="layer_._1", ) ) self.layer.append(TFT5LayerFF(config, name="layer_._2")) @@ -402,7 +408,7 @@ def __init__(self, config, embed_tokens=None, **kwargs): self.num_hidden_layers = config.num_layers self.block = [ - TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i)) + TFT5Block(config, has_relative_attention_bias=bool(i == 0), name="block_._{}".format(i),) for i in range(config.num_layers) ] self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, name="final_layer_norm") @@ -469,7 +475,7 @@ def call( if self.config.is_decoder: seq_ids = tf.range(seq_length) causal_mask = tf.less_equal( - tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), seq_ids[None, :, None] + tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), seq_ids[None, :, None], ) causal_mask = tf.cast(causal_mask, dtype=tf.float32) extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] @@ -586,8 +592,8 @@ def dummy_inputs(self): input_ids = tf.constant(DUMMY_INPUTS) input_mask = tf.constant(DUMMY_MASK) dummy_inputs = { + "inputs": input_ids, "decoder_input_ids": input_ids, - "input_ids": input_ids, "decoder_attention_mask": input_mask, } return dummy_inputs @@ -631,33 +637,24 @@ def dummy_inputs(self): T5_INPUTS_DOCSTRING = r""" Args: - decoder_input_ids are usually used as a `dict` (see T5 description above for more information) containing all the following. - decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): - Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation. + inputs are usually used as a `dict` (see T5 description above for more information) containing all the following. - input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + inputs (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. - To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows: - - (a) For sequence pairs: - - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - - (b) For single sequences: - - ``tokens: [CLS] the dog is hairy . [SEP]`` - T5 is a model with relative position embeddings so you should be able to pad the inputs on the right or the left. - Indices can be obtained using :class:`transformers.T5Tokenizer`. + To know more on how to prepare :obj:`input_ids` for pre-training take a look at + `T5 Training <./t5.html#training>`_ . See :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`, defaults to :obj:`None`): + Provide for sequence to sequence training. T5 uses the pad_token_id as the starting token for decoder_input_ids generation. attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. 
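The decoder-side causal mask assembled above can be reproduced in isolation: position i may only attend to key positions j <= i, and the result is multiplied with the broadcast padding mask (a standalone sketch with toy sizes):

```python
import tensorflow as tf

batch_size, seq_length = 1, 4
attention_mask = tf.constant([[1, 1, 1, 0]], dtype=tf.float32)      # last position is padding

seq_ids = tf.range(seq_length)
causal_mask = tf.less_equal(
    tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), seq_ids[None, :, None],
)
causal_mask = tf.cast(causal_mask, dtype=tf.float32)                # lower-triangular [batch, q, k]
extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]

# Each row i keeps 1s only for allowed, non-padded key positions j <= i.
print(extended_attention_mask.numpy()[0, 0])
```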
- encoder_outputs (tuple(:obj:`tuple(tf.FloatTensor)`, `optional`, defaults to :obj:`None`): + encoder_outputs (:obj:`tuple(tuple(tf.FloatTensor)`, `optional`, defaults to :obj:`None`): Tuple consists of (`last_hidden_state`, `optional`: `hidden_states`, `optional`: `attentions`) `last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. @@ -671,6 +668,8 @@ def dummy_inputs(self): Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + To know more on how to prepare :obj:`decoder_input_ids` for pre-training take a look at + `T5 Training <./t5.html#training>`_ . head_mask: (:obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: @@ -707,7 +706,7 @@ def get_output_embeddings(self): return self.shared @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) - def call(self, decoder_input_ids, **kwargs): + def call(self, inputs, **kwargs): r""" Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs. @@ -732,18 +731,18 @@ def call(self, decoder_input_ids, **kwargs): tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5Model.from_pretrained('t5-small') input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 - outputs = model(input_ids, input_ids=input_ids) + outputs = model(input_ids, decoder_input_ids=input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ - if isinstance(decoder_input_ids, dict): - kwargs.update(decoder_input_ids) + if isinstance(inputs, dict): + kwargs.update(inputs) else: - kwargs["decoder_input_ids"] = decoder_input_ids + kwargs["inputs"] = inputs # retrieve arguments - input_ids = kwargs.get("input_ids", None) + input_ids = kwargs.get("inputs", None) decoder_input_ids = kwargs.get("decoder_input_ids", None) attention_mask = kwargs.get("attention_mask", None) encoder_outputs = kwargs.get("encoder_outputs", None) @@ -755,7 +754,7 @@ def call(self, decoder_input_ids, **kwargs): # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( - input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask + input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, ) hidden_states = encoder_outputs[0] @@ -804,13 +803,8 @@ def get_encoder(self): return self.encoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) - def call(self, decoder_input_ids, **kwargs): + def call(self, inputs, **kwargs): r""" - lm_labels (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): - Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.vocab_size - 1]`. - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
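After the argument rename, the first positional argument of the TF models is the encoder input, matching the PyTorch convention; a dict keyed as in the docstring above also works, which is convenient with `tf.keras.Model.fit()`. A usage sketch:

```python
from transformers import T5Tokenizer, TFT5Model

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = TFT5Model.from_pretrained('t5-small')
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="tf")

# Positional encoder inputs plus decoder_input_ids as a keyword argument ...
outputs = model(input_ids, decoder_input_ids=input_ids)

# ... or everything packed into a single dict using the documented key names.
outputs = model({"inputs": input_ids, "decoder_input_ids": input_ids})

last_hidden_states = outputs[0]
```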
- Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs. loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): @@ -835,8 +829,8 @@ def call(self, decoder_input_ids, **kwargs): tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5ForConditionalGeneration.from_pretrained('t5-small') input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="tf") # Batch size 1 - outputs = model(input_ids, input_ids=input_ids, lm_labels=input_ids) - prediction_scores = outputs[:1] # TODO: TFT5 still needs to implement + outputs = model(input_ids, decoder_input_ids=input_ids) + prediction_scores = outputs[0] tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5ForConditionalGeneration.from_pretrained('t5-small') @@ -845,13 +839,13 @@ def call(self, decoder_input_ids, **kwargs): """ - if isinstance(decoder_input_ids, dict): - kwargs.update(decoder_input_ids) + if isinstance(inputs, dict): + kwargs.update(inputs) else: - kwargs["decoder_input_ids"] = decoder_input_ids + kwargs["inputs"] = inputs # retrieve arguments - input_ids = kwargs.get("input_ids", None) + input_ids = kwargs.get("inputs", None) decoder_input_ids = kwargs.get("decoder_input_ids", None) attention_mask = kwargs.get("attention_mask", None) encoder_outputs = kwargs.get("encoder_outputs", None) @@ -864,7 +858,7 @@ def call(self, decoder_input_ids, **kwargs): if encoder_outputs is None: # Convert encoder inputs in embeddings if needed encoder_outputs = self.encoder( - input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask + input_ids, attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, ) hidden_states = encoder_outputs[0] @@ -879,7 +873,6 @@ def call(self, decoder_input_ids, **kwargs): head_mask=head_mask, ) - # TODO (thom / patrick): add lm_labels for loss function sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) embed_tokens = self.get_output_embeddings() lm_logits = embed_tokens(sequence_output, mode="linear") @@ -897,7 +890,8 @@ def prepare_inputs_for_generation(self, input_ids, past, attention_mask, **kwarg encoder_outputs = (past,) return { - "inputs": input_ids, + "inputs": None, # inputs don't have to be defined, but still need to be passed to make Keras.layer.__call__ happy + "decoder_input_ids": input_ids, # input_ids are the decoder_input_ids "encoder_outputs": encoder_outputs, "attention_mask": attention_mask, } diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py index b53f0e9307f7..e614aa18c852 100644 --- a/src/transformers/modeling_tf_transfo_xl.py +++ b/src/transformers/modeling_tf_transfo_xl.py @@ -488,7 +488,7 @@ def init_mems(self, bsz): else: return None - def _update_mems(self, hids, mems, qlen, mlen): + def _update_mems(self, hids, mems, mlen, qlen): # does not deal with None if mems is None: return None diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 9e4a155e2d84..fc75984c8bf6 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -467,6 +467,7 @@ def generate( top_k=None, top_p=None, repetition_penalty=None, + bad_words_ids=None, bos_token_id=None, pad_token_id=None, eos_token_id=None, @@ -523,8 +524,8 @@ def generate( pad_token_id: (`optional`) int Pad token. Defaults to pad_token_id as defined in the models config. 
- eos_token_ids: (`optional`) int or list of int - End of sequence token or list of tokens to stop the generation. Default to 0. + eos_token_id: (`optional`) int + EOS token. Defaults to eos_token_id as defined in the models config. length_penalty: (`optional`) float Exponential penalty to the length. Default to 1. @@ -532,6 +533,9 @@ def generate( no_repeat_ngram_size: (`optional`) int If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. + bad_words_ids: (`optional`) list of lists of int + `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. + num_return_sequences: (`optional`) int The number of independently computed returned sequences for each element in the batch. Default to 1. @@ -582,6 +586,12 @@ def generate( outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer + model = TFAutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl + bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] + input_ids = tokenizer.encode(input_context, return_tensors='tf') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated """ # We cannot generate if the model does not have a LM head @@ -607,6 +617,7 @@ def generate( no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) @@ -641,6 +652,9 @@ def generate( assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictely positive integer." 
+ assert ( + bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( @@ -742,6 +756,7 @@ def generate( top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, eos_token_id=eos_token_id, @@ -766,6 +781,7 @@ def generate( top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, eos_token_id=eos_token_id, @@ -790,6 +806,7 @@ def _generate_no_beam_search( top_p, repetition_penalty, no_repeat_ngram_size, + bad_words_ids, bos_token_id, pad_token_id, eos_token_id, @@ -828,7 +845,7 @@ def _generate_no_beam_search( if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_tokens = calc_banned_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) + banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) # create banned_tokens boolean mask banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: @@ -840,6 +857,20 @@ def _generate_no_beam_search( next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + next_token_logits = set_tensor_by_indices_to_value( + next_token_logits, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + # set eos token prob to zero if min_length is not reached if eos_token_id is not None and cur_len < min_length: # create eos_token_id boolean mask @@ -936,6 +967,7 @@ def _generate_beam_search( top_p, repetition_penalty, no_repeat_ngram_size, + bad_words_ids, bos_token_id, pad_token_id, decoder_start_token_id, @@ -1012,7 +1044,9 @@ def _generate_beam_search( # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 num_batch_hypotheses = batch_size * num_beams - banned_tokens = calc_banned_tokens(input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len) + banned_tokens = calc_banned_ngram_tokens( + input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len + ) # create banned_tokens boolean mask banned_tokens_indices_mask = [] for banned_tokens_slice in banned_tokens: @@ -1024,6 +1058,20 @@ def _generate_beam_search( scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") ) + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + banned_tokens_indices_mask = [] + for banned_tokens_slice in 
banned_tokens: + banned_tokens_indices_mask.append( + [True if token in banned_tokens_slice else False for token in range(vocab_size)] + ) + + scores = set_tensor_by_indices_to_value( + scores, tf.convert_to_tensor(banned_tokens_indices_mask, dtype=tf.bool), -float("inf") + ) + assert shape_list(scores) == [batch_size * num_beams, vocab_size] if do_sample: @@ -1064,12 +1112,12 @@ def _generate_beam_search( assert shape_list(next_scores) == shape_list(next_tokens) == [batch_size, 2 * num_beams] # next batch beam content - # list of (batch_size * num_beams) tuple(next hypothesis score, next token, current position in the batch) next_batch_beam = [] # for each sentence for batch_idx in range(batch_size): + # if we are done with this sentence if done[batch_idx]: assert ( len(generated_hyps[batch_idx]) >= num_beams @@ -1087,14 +1135,13 @@ def _generate_beam_search( for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx]) ): - # get beam and token IDs beam_id = beam_token_id // vocab_size token_id = beam_token_id % vocab_size effective_beam_id = batch_idx * num_beams + beam_id # add to generated hypotheses if end of sentence or last iteration - if eos_token_id is not None and token_id.numpy() is eos_token_id: + if (eos_token_id is not None) and (token_id.numpy() == eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams if is_beam_token_worse_than_top_num_beams: @@ -1110,9 +1157,9 @@ def _generate_beam_search( if len(next_sent_beam) == num_beams: break - # if we are done with this sentence + # Check if were done so that we can save a pad step if all(done) done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( - tf.reduce_max(next_scores[batch_idx]).numpy() + tf.reduce_max(next_scores[batch_idx]).numpy(), cur_len=cur_len ) # update next beam content @@ -1137,6 +1184,7 @@ def _generate_beam_search( if past is not None: past = self._reorder_cache(past, beam_idx) + # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = tf.concat( [attention_mask, tf.ones((shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 @@ -1196,16 +1244,26 @@ def _generate_beam_search( # fill with hypothesis and eos_token_id if necessary for i, hypo in enumerate(best): - padding = tf.ones((sent_max_len - shape_list(hypo)[0],), dtype=tf.int32) * pad_token_id - decoded_hypo = tf.concat([hypo, padding], axis=0) - - if sent_lengths[i] < max_length: - decoded_hypo = tf.where( - tf.range(max_length) == sent_lengths[i], - eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32), - decoded_hypo, - ) - decoded_list.append(decoded_hypo) + assert sent_lengths[i] == shape_list(hypo)[0] + # if sent_length is max_len do not pad + if sent_lengths[i] == sent_max_len: + decoded_slice = hypo + else: + # else pad to sent_max_len + num_pad_tokens = sent_max_len - sent_lengths[i] + padding = pad_token_id * tf.ones((num_pad_tokens,), dtype=tf.int32) + decoded_slice = tf.concat([hypo, padding], axis=-1) + + # finish sentence with EOS token + if sent_lengths[i] < max_length: + decoded_slice = tf.where( + tf.range(sent_max_len, dtype=tf.int32) == sent_lengths[i], + eos_token_id * tf.ones((sent_max_len,), dtype=tf.int32), + decoded_slice, + ) + # add to list + decoded_list.append(decoded_slice) + decoded = tf.stack(decoded_list) else: # none of the hypotheses have an eos_token @@ -1243,7 
+1301,7 @@ def _create_next_token_logits_penalties(input_ids, logits, repetition_penalty): return tf.convert_to_tensor(token_penalties, dtype=tf.float32) -def calc_banned_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): +def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): # Copied from fairseq for no_repeat_ngram in beam_search""" if cur_len + 1 < no_repeat_ngram_size: # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet @@ -1266,6 +1324,42 @@ def _get_generated_ngrams(hypo_idx): return banned_tokens +def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_input_ids): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in bad_words_ids: + assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( + bad_words_ids + ) + + if _tokens_match(prev_input_ids_slice.numpy().tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py index dd626c2cf968..1ed840896655 100644 --- a/src/transformers/modeling_transfo_xl.py +++ b/src/transformers/modeling_transfo_xl.py @@ -136,7 +136,7 @@ def load_tf_weights_in_transfo_xl(model, config, tf_path): if "kernel" in name or "proj" in name: array = np.transpose(array) if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1: - # Here we will split the TF weigths + # Here we will split the TF weights assert len(pointer) == array.shape[0] for i, p_i in enumerate(pointer): arr_i = array[i, ...] @@ -859,7 +859,7 @@ def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None, Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.TransfoXLConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when ``labels`` is provided) + loss (:obj:`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`, returned when ``labels`` is provided) Language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9d4abb2dedd8..685605e77301 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -667,6 +667,7 @@ def generate( top_k=None, top_p=None, repetition_penalty=None, + bad_words_ids=None, bos_token_id=None, pad_token_id=None, eos_token_id=None, @@ -721,19 +722,18 @@ def generate( Padding token. 
Defaults to the model-specific pad_token_id or None if it does not exist. bos_token_id: (`optional`) int - BOS token. Defaults to bos_token_id as defined in the models config. + BOS token. Defaults to `bos_token_id` as defined in the model's config. - pad_token_id: (`optional`) int - Pad token. Defaults to pad_token_id as defined in the models config. - - eos_token_ids: (`optional`) int or list of int - End of sequence token or list of tokens to stop the generation. Default to eos_token_ids as defined in the models config. + eos_token_id: (`optional`) int + EOS token. Defaults to `eos_token_id` as defined in the model's config. length_penalty: (`optional`) float Exponential penalty to the length. Default to 1. no_repeat_ngram_size: (`optional`) int If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once. + bad_words_ids: (`optional`) list of lists of int + `bad_words_ids` contains tokens that are not allowed to be generated. To get the token ids of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`. num_return_sequences: (`optional`) int The number of independently computed returned sequences for each element in the batch. Default to 1. @@ -785,6 +785,12 @@ def generate( outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) + tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer + model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + input_context = 'My cute dog' # context to condition the generated sequences on + bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] + input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context + outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated """ # We cannot generate if the model does not have a LM head @@ -810,6 +816,7 @@ def generate( no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) @@ -847,6 +854,9 @@ def generate( assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictly positive integer."
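The bad-words filtering that this patch wires into both `_generate_no_beam_search` and `_generate_beam_search` comes down to prefix matching followed by logit masking. The snippet below is a minimal, self-contained sketch of that idea; the helper name `ban_bad_words` and the toy vocabulary are illustrative, not part of the library.

```python
import torch


def ban_bad_words(next_token_logits, prev_input_ids, bad_words_ids):
    """Mask banned continuations by setting their logits to -inf.

    A single-token bad word is always banned; a multi-token bad word is only
    banned once all of its tokens except the last already end the hypothesis.
    """
    for batch_idx, prev_ids in enumerate(prev_input_ids.tolist()):
        banned = []
        for bad_word in bad_words_ids:
            prefix, last = bad_word[:-1], bad_word[-1]
            if len(prefix) == 0 or prev_ids[-len(prefix):] == prefix:
                banned.append(last)
        if banned:
            next_token_logits[batch_idx, banned] = -float("inf")
    return next_token_logits


# Toy example with a vocabulary of size 10: ban token 7 and the sequence [3, 4].
logits = torch.zeros(2, 10)
prev = torch.tensor([[1, 2, 3], [5, 6, 2]])
masked = ban_bad_words(logits, prev, bad_words_ids=[[7], [3, 4]])
print(masked[0])  # tokens 7 and 4 are -inf (this hypothesis already ends with 3)
print(masked[1])  # only token 7 is -inf
```

A multi-token entry such as `[3, 4]` only bans `4` once the hypothesis already ends with `3`, which is exactly why `calc_banned_bad_words_ids` compares `banned_token_seq[:-1]` against the tail of `prev_input_ids`.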
+ assert ( + bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) + ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( @@ -938,18 +948,22 @@ def generate( device=next(self.parameters()).device, ) cur_len = 1 - batch_idx = self.encoder_outputs_batch_dim_idx + assert ( - batch_size == encoder_outputs[0].shape[batch_idx] - ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[1]} " - expanded_idx = ( + batch_size == encoder_outputs[0].shape[0] + ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " + + # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) + expanded_batch_idxs = ( torch.arange(batch_size) .view(-1, 1) .repeat(1, num_beams * effective_batch_mult) .view(-1) .to(input_ids.device) ) - encoder_outputs = (encoder_outputs[0].index_select(batch_idx, expanded_idx), *encoder_outputs[1:]) + # expand encoder_outputs + encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:]) + else: encoder_outputs = None cur_len = input_ids.shape[-1] @@ -967,6 +981,7 @@ def generate( top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, decoder_start_token_id=decoder_start_token_id, @@ -991,6 +1006,7 @@ def generate( top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, bos_token_id=bos_token_id, pad_token_id=pad_token_id, decoder_start_token_id=decoder_start_token_id, @@ -1014,6 +1030,7 @@ def _generate_no_beam_search( top_p, repetition_penalty, no_repeat_ngram_size, + bad_words_ids, bos_token_id, pad_token_id, eos_token_id, @@ -1048,7 +1065,14 @@ def _generate_no_beam_search( if no_repeat_ngram_size > 0: # calculate a list of banned tokens to prevent repetitively generating the same ngrams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_tokens = calc_banned_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) + banned_tokens = calc_banned_ngram_tokens(input_ids, batch_size, no_repeat_ngram_size, cur_len) + for batch_idx in range(batch_size): + next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") + + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + for batch_idx in range(batch_size): next_token_logits[batch_idx, banned_tokens[batch_idx]] = -float("inf") @@ -1124,6 +1148,7 @@ def _generate_beam_search( top_p, repetition_penalty, no_repeat_ngram_size, + bad_words_ids, bos_token_id, pad_token_id, eos_token_id, @@ -1190,12 +1215,19 @@ def _generate_beam_search( # calculate a list of banned tokens to prevent repetitively generating the same ngrams num_batch_hypotheses = batch_size * num_beams # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345 - banned_batch_tokens = calc_banned_tokens( + banned_batch_tokens = calc_banned_ngram_tokens( input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len ) for i, 
banned_tokens in enumerate(banned_batch_tokens): scores[i, banned_tokens] = -float("inf") + if bad_words_ids is not None: + # calculate a list of banned tokens according to bad words + banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids) + + for i, banned_tokens in enumerate(banned_tokens): + scores[i, banned_tokens] = -float("inf") + assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( scores.shape, (batch_size * num_beams, vocab_size) ) @@ -1256,14 +1288,13 @@ def _generate_beam_search( for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( zip(next_tokens[batch_idx], next_scores[batch_idx]) ): - # get beam and word IDs + # get beam and token IDs beam_id = beam_token_id // vocab_size token_id = beam_token_id % vocab_size effective_beam_id = batch_idx * num_beams + beam_id - - # add to generated hypotheses if end of sentence - if (eos_token_id is not None) and (token_id.item() is eos_token_id): + # add to generated hypotheses if end of sentence or last iteration + if (eos_token_id is not None) and (token_id.item() == eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams if is_beam_token_worse_than_top_num_beams: @@ -1272,7 +1303,7 @@ def _generate_beam_search( input_ids[effective_beam_id].clone(), beam_token_score.item(), ) else: - # add next predicted word if it is not eos_token + # add next predicted token if it is not eos_token next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) # the beam for next step is full @@ -1302,7 +1333,6 @@ def _generate_beam_search( # re-order batch input_ids = input_ids[beam_idx, :] input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) - # re-order internal states if past is not None: past = self._reorder_cache(past, beam_idx) @@ -1400,7 +1430,7 @@ def _reorder_cache(past, beam_idx): return past -def calc_banned_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): +def calc_banned_ngram_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): # Copied from fairseq for no_repeat_ngram in beam_search""" if cur_len + 1 < no_repeat_ngram_size: # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet @@ -1423,6 +1453,42 @@ def _get_generated_ngrams(hypo_idx): return banned_tokens +def calc_banned_bad_words_ids(prev_input_ids, bad_words_ids): + banned_tokens = [] + + def _tokens_match(prev_tokens, tokens): + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + if len(tokens) > len(prev_input_ids): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + + if prev_tokens[-len(tokens) :] == tokens: + # if tokens match + return True + else: + return False + + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + + for banned_token_seq in bad_words_ids: + assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format( + bad_words_ids + ) + + if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False: + # if tokens do not match continue + continue + + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("Inf"), min_tokens_to_keep=1): """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering 
Args: diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index 7d34e7ef2bda..aad18ccb1cf9 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -156,7 +156,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path): logger.info("Transposing") array = np.transpose(array) if isinstance(pointer, list): - # Here we will split the TF weigths + # Here we will split the TF weights assert len(pointer) == array.shape[0] for i, p_i in enumerate(pointer): arr_i = array[i, ...] diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py index d232370905e2..db87eda0bec0 100644 --- a/src/transformers/optimization_tf.py +++ b/src/transformers/optimization_tf.py @@ -214,7 +214,7 @@ def __call__(self, gradients): raise ValueError("Expected %s gradients, but got %d" % (len(self._gradients), len(gradients))) for accum_gradient, gradient in zip(self._get_replica_gradients(), gradients): - if accum_gradient is not None: + if accum_gradient is not None and gradient is not None: accum_gradient.assign_add(gradient) self._accum_steps.assign_add(1) @@ -241,6 +241,7 @@ def _get_replica_gradients(self): return ( gradient.device_map.select_for_current_replica(gradient.values, replica_context) for gradient in self._gradients + if gradient is not None ) else: return self._gradients diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 89caf192cd26..37eafab9e9c9 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -459,7 +459,7 @@ def _parse_and_tokenize(self, *texts, pad_to_max_length=False, **kwargs): ) # Filter out features not available on specific models - inputs = self.inputs_for_model(inputs) + # inputs = self.inputs_for_model(inputs) return inputs @@ -480,7 +480,7 @@ def _forward(self, inputs, return_tensors=False): with self.device_placement(): if self.framework == "tf": # TODO trace model - predictions = self.model(inputs, training=False)[0] + predictions = self.model(inputs.data, training=False)[0] else: with torch.no_grad(): inputs = self.ensure_tensor_on_device(**inputs) @@ -778,7 +778,7 @@ def __call__(self, *texts, **kwargs): # Forward if self.framework == "tf": - entities = self.model(tokens)[0][0].numpy() + entities = self.model(tokens.data)[0][0].numpy() input_ids = tokens["input_ids"].numpy()[0] else: with torch.no_grad(): @@ -1235,17 +1235,19 @@ def __call__( elif self.framework == "tf": input_length = tf.shape(inputs["input_ids"])[-1].numpy() - if input_length < self.model.config.min_length // 2: + min_length = generate_kwargs.get("min_length", self.model.config.min_length) + if input_length < min_length // 2: logger.warning( "Your min_length is set to {}, but you input_length is only {}. You might consider decreasing min_length manually, e.g. summarizer('...', min_length=10)".format( - self.model.config.min_length, input_length + min_length, input_length ) ) - if input_length < self.model.config.max_length: + max_length = generate_kwargs.get("max_length", self.model.config.max_length) + if input_length < max_length: logger.warning( "Your max_length is set to {}, but you input_length is only {}. You might consider decreasing max_length manually, e.g. 
summarizer('...', max_length=50)".format( - self.model.config.max_length, input_length + max_length, input_length ) ) @@ -1349,10 +1351,11 @@ def __call__( elif self.framework == "tf": input_length = tf.shape(inputs["input_ids"])[-1].numpy() - if input_length > 0.9 * self.model.config.max_length: + max_length = generate_kwargs.get("max_length", self.model.config.max_length) + if input_length > 0.9 * max_length: logger.warning( "Your input_length: {} is bigger than 0.9 * max_length: {}. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)".format( - input_length, self.model.config.max_length + input_length, max_length ) ) @@ -1396,7 +1399,7 @@ def __call__( "tf": "distilbert-base-uncased-finetuned-sst-2-english", }, "config": "distilbert-base-uncased-finetuned-sst-2-english", - "tokenizer": "distilbert-base-uncased", + "tokenizer": "distilbert-base-cased", }, }, "ner": { diff --git a/src/transformers/tokenization_auto.py b/src/transformers/tokenization_auto.py index d59a934e98a5..b62531469f90 100644 --- a/src/transformers/tokenization_auto.py +++ b/src/transformers/tokenization_auto.py @@ -26,6 +26,7 @@ CamembertConfig, CTRLConfig, DistilBertConfig, + ElectraConfig, FlaubertConfig, GPT2Config, OpenAIGPTConfig, @@ -44,6 +45,7 @@ from .tokenization_camembert import CamembertTokenizer from .tokenization_ctrl import CTRLTokenizer from .tokenization_distilbert import DistilBertTokenizer, DistilBertTokenizerFast +from .tokenization_electra import ElectraTokenizer, ElectraTokenizerFast from .tokenization_flaubert import FlaubertTokenizer from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast from .tokenization_openai import OpenAIGPTTokenizer, OpenAIGPTTokenizerFast @@ -67,6 +69,7 @@ (XLMRobertaConfig, (XLMRobertaTokenizer, None)), (BartConfig, (BartTokenizer, None)), (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)), + (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)), (BertConfig, (BertTokenizer, BertTokenizerFast)), (OpenAIGPTConfig, (OpenAIGPTTokenizer, OpenAIGPTTokenizerFast)), (GPT2Config, (GPT2Tokenizer, GPT2TokenizerFast)), @@ -104,6 +107,7 @@ class method. - contains `xlnet`: XLNetTokenizer (XLNet model) - contains `xlm`: XLMTokenizer (XLM model) - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model) + - contains `electra`: ElectraTokenizer (Google ELECTRA model) This class cannot be instantiated using `__init__()` (throw an error). 
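As a usage sketch of the mapping entry added above (assuming one of the `google/electra-*` checkpoints referenced elsewhere in this patch is reachable), `AutoTokenizer` now resolves ELECTRA checkpoints to the new ELECTRA tokenizer:

```python
from transformers import AutoTokenizer

# The (ElectraConfig, (ElectraTokenizer, ElectraTokenizerFast)) entry added above means
# the checkpoint's configuration class decides which tokenizer class is instantiated.
tokenizer = AutoTokenizer.from_pretrained("google/electra-small-discriminator")
print(type(tokenizer).__name__)  # expected: ElectraTokenizer
print(tokenizer.tokenize("The discriminator detects replaced tokens."))
```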
""" @@ -135,6 +139,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - contains `xlnet`: XLNetTokenizer (XLNet model) - contains `xlm`: XLMTokenizer (XLM model) - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model) + - contains `electra`: ElectraTokenizer (Google ELECTRA model) Params: pretrained_model_name_or_path: either: diff --git a/src/transformers/tokenization_bert.py b/src/transformers/tokenization_bert.py index 159600a37c88..21e39732344f 100644 --- a/src/transformers/tokenization_bert.py +++ b/src/transformers/tokenization_bert.py @@ -592,8 +592,6 @@ def __init__( self, vocab_file, do_lower_case=True, - do_basic_tokenize=True, - never_split=None, unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", @@ -601,7 +599,6 @@ def __init__( mask_token="[MASK]", clean_text=True, tokenize_chinese_chars=True, - add_special_tokens=True, strip_accents=True, wordpieces_prefix="##", **kwargs @@ -609,7 +606,6 @@ def __init__( super().__init__( BertWordPieceTokenizer( vocab_file=vocab_file, - add_special_tokens=add_special_tokens, unk_token=unk_token, sep_token=sep_token, cls_token=cls_token, diff --git a/src/transformers/tokenization_bert_japanese.py b/src/transformers/tokenization_bert_japanese.py index aaf82c54b320..6c9124043599 100644 --- a/src/transformers/tokenization_bert_japanese.py +++ b/src/transformers/tokenization_bert_japanese.py @@ -19,6 +19,7 @@ import logging import os import unicodedata +from typing import Optional from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer, load_vocab @@ -89,6 +90,7 @@ def __init__( pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", + mecab_kwargs=None, **kwargs ): """Constructs a MecabBertTokenizer. @@ -106,6 +108,7 @@ def __init__( Type of word tokenizer. **subword_tokenizer_type**: (`optional`) string (default "wordpiece") Type of subword tokenizer. + **mecab_kwargs**: (`optional`) dict passed to `MecabTokenizer` constructor (default None) """ super(BertTokenizer, self).__init__( unk_token=unk_token, @@ -134,7 +137,9 @@ def __init__( do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False ) elif word_tokenizer_type == "mecab": - self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, never_split=never_split) + self.word_tokenizer = MecabTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {}) + ) else: raise ValueError("Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) @@ -161,10 +166,10 @@ def _tokenize(self, text): return split_tokens -class MecabTokenizer(object): +class MecabTokenizer: """Runs basic tokenization with MeCab morphological parser.""" - def __init__(self, do_lower_case=False, never_split=None, normalize_text=True): + def __init__(self, do_lower_case=False, never_split=None, normalize_text=True, mecab_option: Optional[str] = None): """Constructs a MecabTokenizer. Args: @@ -176,6 +181,7 @@ def __init__(self, do_lower_case=False, never_split=None, normalize_text=True): List of token not to split. **normalize_text**: (`optional`) boolean (default True) Whether to apply unicode normalization to text before tokenization. 
+ **mecab_option**: (`optional`) string passed to `MeCab.Tagger` constructor (default "") """ self.do_lower_case = do_lower_case self.never_split = never_split if never_split is not None else [] @@ -183,7 +189,7 @@ def __init__(self, do_lower_case=False, never_split=None, normalize_text=True): import MeCab - self.mecab = MeCab.Tagger() + self.mecab = MeCab.Tagger(mecab_option) if mecab_option is not None else MeCab.Tagger() def tokenize(self, text, never_split=None, **kwargs): """Tokenizes a piece of text.""" diff --git a/src/transformers/tokenization_electra.py b/src/transformers/tokenization_electra.py new file mode 100644 index 000000000000..ceb7eedccba0 --- /dev/null +++ b/src/transformers/tokenization_electra.py @@ -0,0 +1,80 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Team, Stanford University and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tokenization_bert import BertTokenizer, BertTokenizerFast + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "google/electra-small-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-generator/vocab.txt", + "google/electra-base-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt", + "google/electra-large-generator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-generator/vocab.txt", + "google/electra-small-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-small-discriminator/vocab.txt", + "google/electra-base-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-discriminator/vocab.txt", + "google/electra-large-discriminator": "https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-large-discriminator/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "google/electra-small-generator": 512, + "google/electra-base-generator": 512, + "google/electra-large-generator": 512, + "google/electra-small-discriminator": 512, + "google/electra-base-discriminator": 512, + "google/electra-large-discriminator": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "google/electra-small-generator": {"do_lower_case": True}, + "google/electra-base-generator": {"do_lower_case": True}, + "google/electra-large-generator": {"do_lower_case": True}, + "google/electra-small-discriminator": {"do_lower_case": True}, + "google/electra-base-discriminator": {"do_lower_case": True}, + "google/electra-large-discriminator": {"do_lower_case": True}, +} + + +class ElectraTokenizer(BertTokenizer): + r""" + Constructs an Electra tokenizer. + :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end + tokenization: punctuation splitting + wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning + parameters. 
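Because `ElectraTokenizer` only swaps in the vocabulary and configuration maps defined above, it tokenizes exactly like `BertTokenizer`: lower-cased WordPiece with `[CLS]`/`[SEP]` special tokens. A small usage sketch, assuming one of the listed checkpoints is available:

```python
from transformers import ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained("google/electra-base-discriminator")

encoded = tokenizer.encode_plus("Replaced token detection", add_special_tokens=True)
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
# e.g. ['[CLS]', 'replaced', 'token', 'detection', '[SEP]'] (exact pieces depend on the vocabulary)
```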
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + + +class ElectraTokenizerFast(BertTokenizerFast): + r""" + Constructs an Electra Fast tokenizer. + :class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end + tokenization: punctuation splitting + wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION diff --git a/src/transformers/tokenization_roberta.py b/src/transformers/tokenization_roberta.py index 5076f8764cc3..02f352b6968f 100644 --- a/src/transformers/tokenization_roberta.py +++ b/src/transformers/tokenization_roberta.py @@ -18,9 +18,11 @@ import logging from typing import List, Optional +from tokenizers import AddedToken from tokenizers.processors import RobertaProcessing from .tokenization_gpt2 import GPT2Tokenizer, GPT2TokenizerFast +from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) @@ -259,7 +261,7 @@ def __init__( unk_token="", pad_token="", mask_token="", - add_prefix_space=False, + add_prefix_space=True, **kwargs ): kwargs.setdefault("pad_token", pad_token) @@ -281,16 +283,24 @@ def __init__( (sep_token, self.sep_token_id), (cls_token, self.cls_token_id) ) + self.tokenizer.add_special_tokens([kwargs["mask_token"]]) + # As we override the post_processor post super.__init__ the computed num_added_tokens is wrong in super(). # We need to recompute max_len according to the newly register post_processor to get real values. - self.max_len_single_sentence = self.max_len - self.num_added_tokens(False) # take into account special tokens - self.max_len_sentences_pair = self.max_len - self.num_added_tokens(True) # take into account special tokens - - logger.warning( - "RobertaTokenizerFast has an issue when working on mask language modeling " - "where it introduces an extra encoded space before the mask token." - "See https://github.com/huggingface/transformers/pull/2778 for more information." 
- ) + self.max_len_single_sentence = self.max_len - self.num_special_tokens_to_add( + False + ) # take into account special tokens + self.max_len_sentences_pair = self.max_len - self.num_special_tokens_to_add( + True + ) # take into account special tokens + + @PreTrainedTokenizer.mask_token.setter + def mask_token(self, value): + if not isinstance(value, AddedToken): + value = AddedToken(value, lstrip=True) + + self._mask_token = str(value) + self.tokenizer.add_special_tokens([value]) def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] diff --git a/src/transformers/tokenization_transfo_xl.py b/src/transformers/tokenization_transfo_xl.py index a26d9b63716a..b67f98594f92 100644 --- a/src/transformers/tokenization_transfo_xl.py +++ b/src/transformers/tokenization_transfo_xl.py @@ -24,13 +24,13 @@ import pickle import re from collections import Counter, OrderedDict -from typing import List, Optional, Tuple, Union +from typing import Optional import numpy as np -from tokenizers import Encoding, Tokenizer +from tokenizers import Tokenizer from tokenizers.implementations import BaseTokenizer from tokenizers.models import WordLevel -from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str +from tokenizers.normalizers import Lowercase, Sequence, Strip, unicode_normalizer_from_str from tokenizers.pre_tokenizers import CharDelimiterSplit, WhitespaceSplit from tokenizers.processors import BertProcessing @@ -381,6 +381,9 @@ def __init__( if lowercase: normalizer += [Lowercase()] + # Strip normalizer at the end + normalizer += [Strip(left=True, right=True)] + if len(normalizer) > 0: tokenizer.normalizer = Sequence(normalizer) if len(normalizer) > 1 else normalizer[0] @@ -404,14 +407,6 @@ def __init__( super().__init__(tokenizer, parameters) - def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]: - return super().encode_batch( - [seq.strip() if isinstance(seq, str) else (seq[0].strip(), seq[1].strip()) for seq in sequences] - ) - - def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding: - return super().encode(sequence.strip(), pair.strip() if pair else pair) - class TransfoXLTokenizerFast(PreTrainedTokenizerFast): diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 1379912757a6..b45cbd2ea02b 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -15,15 +15,19 @@ """Tokenization classes for OpenAI GPT.""" import copy +import functools import itertools import json import logging +import operator import os import re -from collections import defaultdict +from collections import UserDict, defaultdict from contextlib import contextmanager -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Sequence, Tuple, Union +from tokenizers import AddedToken, Encoding +from tokenizers.decoders import Decoder from tokenizers.implementations import BaseTokenizer from .file_utils import cached_path, hf_bucket_url, is_remote_url, is_tf_available, is_torch_available @@ -41,6 +45,27 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json" +# Define type aliases +TextInput = str +TextPairInput = Tuple[str, str] +PreTokenizedInput = List[str] +PreTokenizedInputPair = Tuple[List[str], List[str]] + + +def flatten(x: Sequence): + """ + Flatten the provided (potentially nested) sequence + + Args: + x (Sequence): Potentially nested sequence to flatten + 
+ Returns: + list: Flattened sequence + """ + + return functools.reduce(operator.iconcat, x, []) + + @contextmanager def truncate_and_pad( tokenizer: BaseTokenizer, @@ -61,16 +86,19 @@ def truncate_and_pad( before the managed section. If your tokenizer set a padding / truncation strategy before, then it will be reset to no padding/truncation when exiting the managed section. - :param tokenizer: - :param max_length: - :param stride: - :param strategy: - :param pad_to_max_length: - :param padding_side: - :param pad_token_id: - :param pad_token_type_id: - :param pad_token: - :return: + Args: + tokenizer (BaseTokenizer): The tokenizer which will be used + max_length (int): The maximum size of the sequence + stride (int): The stride to use when handling overflow + strategy (str): Overflowing logic to use + pad_to_max_length (bool): Boolean indicating if the output needs to be padded up to max_length + padding_side (str): "left" or "right" indicating the direction the output sequence will be padded + pad_token_id (int): The integer representation of the padding token to use + pad_token_type_id (int): The integer representation of the padding token type to use + pad_token (str): The string representation of the padding token to use + + Returns: + """ # Handle all the truncation and padding stuff @@ -103,44 +131,118 @@ def truncate_and_pad( tokenizer.no_padding() -class PreTrainedTokenizer(object): - """ Base class for all tokenizers. - Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. +class BatchEncoding(UserDict): + """ + Data structure derived from Dictionary holding all the required information to forward through + a model. - This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + In addition, this structure expose utility methods to map from word/char space to token space. + """ - Class attributes (overridden by derived classes): + def __init__(self, data: dict, encoding: Optional[Union[Encoding, Sequence[Encoding]]] = None): + super().__init__(data) - - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). - - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. - - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. - - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. 
+ if isinstance(encoding, Encoding): + encoding = [encoding] - Parameters: + self._encodings = encoding - - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` + def __getitem__(self, item: Union[int, str]) -> Encoding: + if isinstance(item, str): + return self.data[item] + elif self._encodings is not None: + return self._encodings[item] + else: + raise KeyError("int index is supported only on {} from a Rust tokenizer".format(type(self).__name__)) - - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` + def __getattr__(self, item: str): + return self.data[item] - - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id`` + @property + def encodings(self) -> Optional[List[Encoding]]: + """ + Return the list all encoding from the tokenization process - - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` + Returns: List[Encoding] or None if input was tokenized through Python tokenizer + """ + return self._encodings - - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` + def keys(self): + return self.data.keys() - - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` + def values(self): + return self.data.values() - - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` + def items(self): + return self.data.items() - - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` - """ + def char_to_token_offsets(self, sentence: int, char: int) -> Tuple[int, int]: + """ + Find the Offsets of the token containing the character at the specified position - vocab_files_names = {} - pretrained_vocab_files_map = {} - pretrained_init_configuration = {} - max_model_input_sizes = {} - model_input_names = ["token_type_ids", "attention_mask"] + Args: + sentence: Index of the sentence relative to the batch provided to the tokenizer + char: Char index to get the relative token offsets + Returns: + tuple: (token start, token end) + + """ + + if not self._encodings: + raise ValueError("char_to_token_offsets() is not available when using Python based tokenizers") + return self[sentence].char_to_token_offsets(char) + + def char_to_token(self, sentence: int, char: int) -> int: + """ + Return the index of the token at position of the given char. 
+ + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + char (int): Char index to get the relative token offsets + + Returns: + int: Integer referring to the position of the token in the returned set of tokens for the sentence + """ + + if not self._encodings: + raise ValueError("char_to_token() is not available when using Python based tokenizers") + return self[sentence].char_to_token(char) + + def char_to_word_offsets(self, sentence: int, char: int) -> Tuple[int, int]: + """ + Find the Offsets of the word containing the character at the specified position + + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + char (int): Char index to get the relative token offsets + + Returns: + tuple: (word start, word end) representing the first and last characters of the word + """ + + if not self._encodings: + raise ValueError("char_to_word_offsets() is not available when using Python based tokenizers") + return self[sentence].char_to_word_offsets(char) + + def token_to_word_offsets(self, sentence: int, index: int) -> Optional[Tuple[int, int]]: + """ + Find the Offsets of the word containing the token at the given index + + Args: + sentence (int): Index of the sentence relative to the batch provided to the tokenizer + index (int): Index of the token to map to the original word offsets + + Returns: + Optional[tuple]: (word start, word end) or None + """ + + if not self._encodings: + raise ValueError("token_to_word_offsets() is not available when using Python based tokenizers") + return self[sentence].token_to_word_offsets(index) + + +class SpecialTokensMixin: SPECIAL_TOKENS_ATTRIBUTES = [ "bos_token", "eos_token", @@ -152,19 +254,30 @@ class PreTrainedTokenizer(object): "additional_special_tokens", ] - padding_side = "right" + def __init__(self, **kwargs): - NO_PAD_TOKEN_FOR_BATCH_MSG = ( - "No padding token is set for this model, therefore no batch can be made with uneven " - "sequences. Set a padding token or adjust the lengths of the sequences building the " - "batch so that every sequence is of the same length." - ) + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._pad_token_type_id = 0 + self._additional_special_tokens = [] - UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( - "The sequences building the batch are not of the same size, no tensor " - "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" - "up to the larger sequence's length." - ) + for key, value in kwargs.items(): + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) + elif isinstance(value, AddedToken): + setattr(self, key, str(value)) + elif isinstance(value, str): + setattr(self, key, value) + else: + raise TypeError( + "special token {} has to be either str or AddedToken but got: {}".format(key, type(value)) + ) @property def bos_token(self): @@ -250,10 +363,6 @@ def cls_token(self, value): def mask_token(self, value): self._mask_token = value - @additional_special_tokens.setter - def additional_special_tokens(self, value): - self._additional_special_tokens = value - @property def bos_token_id(self): """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. 
""" @@ -299,20 +408,112 @@ def additional_special_tokens_ids(self): """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.additional_special_tokens) + @property + def special_tokens_map(self): + """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their + values ('', ''...) + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self): + """ List all the special tokens ('', ''...) mapped to class attributes + (cls_token, unk_token...). + """ + all_toks = [] + set_attr = self.special_tokens_map + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """ List the vocabulary indices of the special tokens ('', ''...) mapped to + class attributes (cls_token, unk_token...). + """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + +class PreTrainedTokenizer(SpecialTokensMixin): + """ Base class for all tokenizers. + Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + + Class attributes (overridden by derived classes): + + - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). + - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. + - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. + - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionnary of specific arguments to pass to the ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method. + + Parameters: + + - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id`` + + - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id`` + + - ``unk_token``: (`Optional`) string: an unknown token. 
Will be associated to ``self.unk_token`` and ``self.unk_token_id`` + + - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id`` + + - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id`` + + - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id`` + + - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id`` + + - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids`` + """ + + vocab_files_names = {} + pretrained_vocab_files_map = {} + pretrained_init_configuration = {} + max_model_input_sizes = {} + model_input_names = ["token_type_ids", "attention_mask"] + + padding_side = "right" + + NO_PAD_TOKEN_FOR_BATCH_MSG = ( + "No padding token is set for this model, therefore no batch can be made with uneven " + "sequences. Set a padding token or adjust the lengths of the sequences building the " + "batch so that every sequence is of the same length." + ) + + UNEVEN_SEQUENCES_FOR_BATCH_MSG = ( + "The sequences building the batch are not of the same size, no tensor " + "can be built. Set `pad_to_max_length=True` to pad the smaller sequences" + "up to the larger sequence's length." + ) + + @property + def vocab_size(self) -> int: + """ Size of the base vocabulary (without the added tokens) """ + raise NotImplementedError + + @property + def is_fast(self): + return False + def get_vocab(self): """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. 
""" raise NotImplementedError() def __init__(self, max_len=None, **kwargs): - self._bos_token = None - self._eos_token = None - self._unk_token = None - self._sep_token = None - self._pad_token = None - self._cls_token = None - self._mask_token = None - self._pad_token_type_id = 0 - self._additional_special_tokens = [] + + super().__init__(**kwargs) self.max_len = max_len if max_len is not None else int(1e12) @@ -329,13 +530,9 @@ def __init__(self, max_len=None, **kwargs): self.init_inputs = () self.init_kwargs = {} - for key, value in kwargs.items(): - if key in self.SPECIAL_TOKENS_ATTRIBUTES: - if key == "additional_special_tokens": - assert isinstance(value, (list, tuple)) and all(isinstance(t, str) for t in value) - else: - assert isinstance(value, str) - setattr(self, key, value) + def __len__(self): + """ Size of the full vocabulary with the added tokens """ + return self.vocab_size + len(self.added_tokens_encoder) @classmethod def from_pretrained(cls, *inputs, **kwargs): @@ -614,14 +811,6 @@ def save_vocabulary(self, save_directory): """ raise NotImplementedError - def vocab_size(self): - """ Size of the base vocabulary (without the added tokens) """ - raise NotImplementedError - - def __len__(self): - """ Size of the full vocabulary with the added tokens """ - return self.vocab_size + len(self.added_tokens_encoder) - def add_tokens(self, new_tokens): """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the @@ -670,7 +859,7 @@ def add_tokens(self, new_tokens): return len(to_add_tokens) - def num_added_tokens(self, pair=False): + def num_special_tokens_to_add(self, pair=False): """ Returns the number of added tokens when encoding a sequence with special tokens. @@ -743,7 +932,7 @@ def add_special_tokens(self, special_tokens_dict): return added_tokens - def tokenize(self, text, **kwargs): + def tokenize(self, text: TextInput, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). @@ -852,8 +1041,8 @@ def _convert_token_to_id(self, token): def encode( self, - text: str, - text_pair: Optional[str] = None, + text: TextInput, + text_pair: Optional[TextInput] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, @@ -923,13 +1112,14 @@ def encode( def encode_plus( self, - text: str, - text_pair: Optional[str] = None, + text: TextInput, + text_pair: Optional[TextInput] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, + is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -937,7 +1127,7 @@ def encode_plus( return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, **kwargs - ): + ) -> BatchEncoding: """ Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. @@ -977,6 +1167,8 @@ def encode_plus( - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. 
+ is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Set to True to indicate the input is already tokenized return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` or PyTorch :obj:`torch.Tensor` instead of a list of python integers. @@ -1071,12 +1263,15 @@ def get_input_ids(text): def batch_encode_plus( self, - batch_text_or_text_pairs: Union[str, List[str]], + batch_text_or_text_pairs: Union[ + List[TextInput], List[TextPairInput], List[PreTokenizedInput], List[PreTokenizedInputPair] + ], add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, + is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_masks: Optional[bool] = None, @@ -1085,7 +1280,7 @@ def batch_encode_plus( return_offsets_mapping: bool = False, return_input_lengths: bool = False, **kwargs - ): + ) -> BatchEncoding: """ Returns a dictionary containing the encoded sequence or sequence pair and additional information: the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. @@ -1121,6 +1316,8 @@ def batch_encode_plus( - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences Defaults to False: no padding. + is_pretokenized (:obj:`bool`, defaults to :obj:`False`): + Set to True to indicate the input is already tokenized return_tensors (:obj:`str`, `optional`, defaults to :obj:`None`): Can be set to 'tf' or 'pt' to return respectively TensorFlow :obj:`tf.constant` or PyTorch :obj:`torch.Tensor` instead of a list of python integers. @@ -1199,7 +1396,7 @@ def get_input_ids(text): input_ids = [] for ids_or_pair_ids in batch_text_or_text_pairs: - if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2: + if isinstance(ids_or_pair_ids, (list, tuple)) and len(ids_or_pair_ids) == 2 and not is_pretokenized: ids, pair_ids = ids_or_pair_ids else: ids, pair_ids = ids_or_pair_ids, None @@ -1213,9 +1410,9 @@ def get_input_ids(text): def total_sequence_length(input_pairs): first_ids, second_ids = input_pairs return len(first_ids) + ( - self.num_added_tokens() + self.num_special_tokens_to_add() if second_ids is None - else (len(second_ids) + self.num_added_tokens(pair=True)) + else (len(second_ids) + self.num_special_tokens_to_add(pair=True)) ) max_length = max([total_sequence_length(ids) for ids in input_ids]) @@ -1277,7 +1474,7 @@ def total_sequence_length(input_pairs): ) ) - return batch_outputs + return BatchEncoding(batch_outputs) def prepare_for_model( self, @@ -1361,7 +1558,7 @@ def prepare_for_model( encoded_inputs = {} # Handle max sequence length - total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0) + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) if max_length and total_len > max_length: ids, pair_ids, overflowing_tokens = self.truncate_sequences( ids, @@ -1474,7 +1671,7 @@ def prepare_for_model( ) ) - return encoded_inputs + return BatchEncoding(encoded_inputs) def prepare_for_tokenization(self, text, **kwargs): """ Performs any necessary transformations before tokenization """ @@ -1629,39 +1826,6 @@ def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spa else: return text - @property - def special_tokens_map(self): - """ A 
dictionary mapping special token class attribute (cls_token, unk_token...) to their - values ('', ''...) - """ - set_attr = {} - for attr in self.SPECIAL_TOKENS_ATTRIBUTES: - attr_value = getattr(self, "_" + attr) - if attr_value: - set_attr[attr] = attr_value - return set_attr - - @property - def all_special_tokens(self): - """ List all the special tokens ('', ''...) mapped to class attributes - (cls_token, unk_token...). - """ - all_toks = [] - set_attr = self.special_tokens_map - for attr_value in set_attr.values(): - all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) - all_toks = list(set(all_toks)) - return all_toks - - @property - def all_special_ids(self): - """ List the vocabulary indices of the special tokens ('', ''...) mapped to - class attributes (cls_token, unk_token...). - """ - all_toks = self.all_special_tokens - all_ids = self.convert_tokens_to_ids(all_toks) - return all_ids - @staticmethod def clean_up_tokenization(out_string): """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. @@ -1692,67 +1856,71 @@ def __init__(self, tokenizer: BaseTokenizer, **kwargs): self._tokenizer = tokenizer super().__init__(**kwargs) - self.max_len_single_sentence = self.max_len - self.num_added_tokens(False) # take into account special tokens - self.max_len_sentences_pair = self.max_len - self.num_added_tokens(True) # take into account special tokens + self.max_len_single_sentence = self.max_len - self.num_special_tokens_to_add( + False + ) # take into account special tokens + self.max_len_sentences_pair = self.max_len - self.num_special_tokens_to_add( + True + ) # take into account special tokens @property - def tokenizer(self): + def tokenizer(self) -> BaseTokenizer: return self._tokenizer @property - def decoder(self): + def decoder(self) -> Decoder: return self._tokenizer._tokenizer.decoder @property - def vocab_size(self): + def is_fast(self) -> bool: + return True + + @property + def vocab_size(self) -> int: return self._tokenizer.get_vocab_size(with_added_tokens=False) - def __len__(self): + def __len__(self) -> int: return self._tokenizer.get_vocab_size(with_added_tokens=True) @PreTrainedTokenizer.bos_token.setter def bos_token(self, value): self._bos_token = value - self._update_special_tokens() + self._tokenizer.add_special_tokens([self._bos_token]) @PreTrainedTokenizer.eos_token.setter def eos_token(self, value): self._eos_token = value - self._update_special_tokens() + self._tokenizer.add_special_tokens([self._eos_token]) @PreTrainedTokenizer.unk_token.setter def unk_token(self, value): self._unk_token = value - self._update_special_tokens() + self._tokenizer.add_special_tokens([self._unk_token]) @PreTrainedTokenizer.sep_token.setter def sep_token(self, value): self._sep_token = value - self._update_special_tokens() + self._tokenizer.add_special_tokens([self._sep_token]) @PreTrainedTokenizer.pad_token.setter def pad_token(self, value): self._pad_token = value - self._update_special_tokens() + self._tokenizer.add_special_tokens([self._pad_token]) @PreTrainedTokenizer.cls_token.setter def cls_token(self, value): self._cls_token = value - self._update_special_tokens() + self._tokenizer.add_special_tokens([self._cls_token]) @PreTrainedTokenizer.mask_token.setter def mask_token(self, value): self._mask_token = value - self._update_special_tokens() + self._tokenizer.add_special_tokens([self._mask_token]) @PreTrainedTokenizer.additional_special_tokens.setter def 
additional_special_tokens(self, value): self._additional_special_tokens = value - self._update_special_tokens() - - def _update_special_tokens(self): - if self._tokenizer is not None: - self._tokenizer.add_special_tokens(self.all_special_tokens) + self._tokenizer.add_special_tokens(self.all_special_tokens) def _convert_encoding( self, @@ -1785,7 +1953,7 @@ def _convert_encoding( if return_special_tokens_mask: encoding_dict["special_tokens_mask"].append(e.special_tokens_mask) if return_offsets_mapping: - encoding_dict["offset_mapping"].append([e.original_str.offsets(o) for o in e.offsets]) + encoding_dict["offset_mapping"].append(e.offsets) # Prepare inputs as tensors if asked if return_tensors == "tf" and is_tf_available(): @@ -1818,42 +1986,50 @@ def _convert_token_to_id_with_added_voc(self, token): return self.unk_token_id return id - def _convert_id_to_token(self, index): + def _convert_id_to_token(self, index: int) -> str: return self._tokenizer.id_to_token(int(index)) - def convert_tokens_to_string(self, tokens): - return self._tokenizer.decode(tokens) + def convert_tokens_to_string(self, tokens: List[int], skip_special_tokens: bool = False) -> str: + return self._tokenizer.decode(tokens, skip_special_tokens) - def add_tokens(self, new_tokens): + def add_tokens(self, new_tokens: List[Union[str, AddedToken]]) -> int: if isinstance(new_tokens, str): new_tokens = [new_tokens] return self._tokenizer.add_tokens(new_tokens) - def add_special_tokens(self, special_tokens_dict): + def add_special_tokens(self, special_tokens_dict: dict) -> int: added = super().add_special_tokens(special_tokens_dict) - self._update_special_tokens() + tokens = flatten(special_tokens_dict.values()) + self._tokenizer.add_special_tokens(tokens) return added - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: if token_ids_1 is None: return token_ids_0 else: return token_ids_0 + token_ids_1 - def num_added_tokens(self, pair=False): + def num_special_tokens_to_add(self, pair: bool = False) -> int: return self.tokenizer.num_special_tokens_to_add(pair) - def tokenize(self, text, **kwargs): - return self.tokenizer.encode(text).tokens + def tokenize( + self, text: TextInput, pair: Optional[TextInput] = None, add_special_tokens: bool = False + ) -> List[str]: + return self.tokenizer.encode(text, pair, add_special_tokens).tokens def batch_encode_plus( self, - batch_text_or_text_pairs: Optional[Union[List[str], List[Tuple[str]]]] = None, + batch_text_or_text_pairs: Union[ + List[TextInput], List[TextPairInput], List[PreTokenizedInput], List[PreTokenizedInputPair] + ] = None, add_special_tokens: bool = True, max_length: Optional[int] = None, stride: int = 0, truncation_strategy: str = "longest_first", pad_to_max_length: bool = False, + is_pretokenized: bool = False, return_tensors: Optional[str] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1861,12 +2037,14 @@ def batch_encode_plus( return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, **kwargs - ): - if not add_special_tokens: - logger.warning( - "Fast tokenizers add special tokens by default. To remove special tokens, please specify" - "`add_special_tokens=False` during the initialisation rather than when calling `encode`," - "`encode_plus` or `batch_encode_plus`." 
+ ) -> BatchEncoding: + + if batch_text_or_text_pairs is None: + raise ValueError( + "None is not a valid input. " + "Should be a list/tuple of strings, " + "a list/tuple of integers, " + "A list of list of strings or tuple of strings." ) # Needed if we have to return a tensor @@ -1894,15 +2072,67 @@ def batch_encode_plus( "batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs)) ) - # Avoid thread overhead if only one example. - if len(batch_text_or_text_pairs) == 1: - if isinstance(batch_text_or_text_pairs[0], (tuple, list)): - tokens = self._tokenizer.encode(*batch_text_or_text_pairs[0]) - else: - tokens = self._tokenizer.encode(batch_text_or_text_pairs[0]) - tokens = [tokens] + # Check for the pretokenized path + if is_pretokenized: + encodings = [] + + # Iterate over each sample (we don't know yet if they are pairs or simple input + for i, sample in enumerate(batch_text_or_text_pairs): + + if not isinstance(sample, (list, tuple)): + raise TypeError( + "batch_encode_plus(..., is_pretokenized=True) requires batch_text_or_text_pairs " + "to be either List[List[str]] or List[Tuple[List[str], List[str]]] but sample at " + "index {} is of type {}".format(i, type(sample)) + ) + + # Convert to tuple for convenience + if isinstance(sample, list): + sample = (sample,) + + encodings_text = Encoding.merge(self._tokenizer.encode_batch(sample[0], False), True) + + # Check if we have pairs + if len(sample) == 2: + encodings_pair = Encoding.merge( + self._tokenizer.encode_batch([("", s) for s in sample[1]], False), True + ) + + # No pair, default to None + elif len(sample) == 1: + encodings_pair = None + + # Something else is invalid + else: + raise ValueError( + "batch_encode_plus(..., is_pretokenized=True) requires batch_text_or_text_pairs " + "to be either List[List[str]] or List[Tuple[List[str], List[str]]] but sample at " + "index {} has too much dimensions (required 1 or 2, got: {}, type {})".format( + i, len(sample), type(sample) + ) + ) + + # Post-process + encoding = self._tokenizer.post_process(encodings_text, encodings_pair, add_special_tokens) + encodings += [encoding] + + # Classical path with strings input else: - tokens = self._tokenizer.encode_batch(batch_text_or_text_pairs) + # Avoid thread overhead if only one example. 
+ if len(batch_text_or_text_pairs) == 1: + if isinstance(batch_text_or_text_pairs[0], (tuple, list)): + encodings = self._tokenizer.encode( + *batch_text_or_text_pairs[0], add_special_tokens=add_special_tokens + ) + else: + encodings = self._tokenizer.encode( + batch_text_or_text_pairs[0], add_special_tokens=add_special_tokens + ) + encodings = [encodings] + else: + encodings = self._tokenizer.encode_batch( + batch_text_or_text_pairs, add_special_tokens=add_special_tokens + ) # Convert encoding to dict tokens = [ @@ -1915,7 +2145,7 @@ def batch_encode_plus( return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, ) - for encoding in tokens + for encoding in encodings ] # Sanitize the output to have dict[list] from list[dict] @@ -1926,8 +2156,8 @@ def batch_encode_plus( stack = tf.stack(stack, axis=0) elif return_tensors == "pt": stack = torch.stack(stack, dim=0) - elif not return_tensors and len(stack) == 1: - stack = stack[0] + # elif not return_tensors and len(stack) == 1: + # stack = stack[0] sanitized[key] = stack @@ -1938,17 +2168,19 @@ def batch_encode_plus( i if len(item["input_ids"]) == 1 else [i] * len(item["input_ids"]) for i, item in enumerate(tokens) ] sanitized["overflow_to_sample_mapping"] = overflow_to_sample_mapping - return sanitized + + return BatchEncoding(sanitized, encodings) def encode_plus( self, - text: str, - text_pair: Optional[str] = None, - add_special_tokens: bool = False, + text: Union[TextInput, PreTokenizedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None, + add_special_tokens: bool = True, max_length: Optional[int] = None, pad_to_max_length: bool = False, stride: int = 0, truncation_strategy: str = "longest_first", + is_pretokenized: bool = False, return_tensors: Optional[bool] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -1956,31 +2188,84 @@ def encode_plus( return_special_tokens_mask: bool = False, return_offsets_mapping: bool = False, **kwargs - ): - batched_input = [(text, text_pair)] if text_pair else [text] - batched_output = self.batch_encode_plus( - batched_input, - add_special_tokens=add_special_tokens, - max_length=max_length, - stride=stride, - truncation_strategy=truncation_strategy, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - pad_to_max_length=pad_to_max_length, - **kwargs, - ) + ) -> BatchEncoding: + + # Check for pretokenized path (ie [token1, token2, ..., tokenN] -> [id1, id2, ..., idN] + if is_pretokenized: + if isinstance(text, list) and len(text) > 0: + + # Encode through encode_batch with sequence of only one word which will be merged after hand + encoding = self._tokenizer.encode_batch(text, add_special_tokens=False) + encoding = Encoding.merge(encoding, True) + + # Let's do the same for pairs if provided + if isinstance(text_pair, list): + # We prepend empty string before each word so that encoding is aware content is a pair + encoding_pair = self._tokenizer.encode_batch( + [("", p) for p in text_pair], add_special_tokens=False + ) + encoding_pair = Encoding.merge(encoding_pair, True) + elif text_pair is None: + encoding_pair = None + else: + raise TypeError( + "encode_plus(..., is_pretokenized=True) requires text and text_pair to be List[str] " + "but got (text={}, 
text_pair={})".format(type(text), type(text_pair)) + ) + + # Post process and if asked to do so, insert special tokens where needed + encoding = self._tokenizer.post_process(encoding, encoding_pair, add_special_tokens) + + batched_output = BatchEncoding( + self._convert_encoding( + encoding, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + ), + encoding, + ) + else: + raise TypeError( + "encode_plus(..., is_pretokenized=True) requires text to be List[str] " + "but got (text={}, text_pair={})".format(type(text), type(text_pair)) + ) + else: + batched_input = [(text, text_pair)] if text_pair else [text] + batched_output = self.batch_encode_plus( + batched_input, + add_special_tokens=add_special_tokens, + max_length=max_length, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + pad_to_max_length=pad_to_max_length, + **kwargs, + ) # Return tensor is None, then we can remove the leading batch axis if not return_tensors: - return {key: value[0] if isinstance(value[0], list) else value for key, value in batched_output.items()} - else: - return batched_output + batched_output = BatchEncoding( + { + key: value[0] if len(value) > 0 and isinstance(value[0], list) else value + for key, value in batched_output.items() + }, + batched_output.encodings, + ) - def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + return batched_output + + def decode( + self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True + ): text = self.tokenizer.decode(token_ids, skip_special_tokens) if clean_up_tokenization_spaces: @@ -1989,7 +2274,7 @@ def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spa else: return text - def save_vocabulary(self, save_directory): + def save_vocabulary(self, save_directory: str) -> Tuple[str]: if os.path.isdir(save_directory): files = self._tokenizer.save(save_directory) else: diff --git a/templates/adding_a_new_model/README.md b/templates/adding_a_new_model/README.md index 5397ca4c7898..81af5cfba5cd 100644 --- a/templates/adding_a_new_model/README.md +++ b/templates/adding_a_new_model/README.md @@ -59,4 +59,4 @@ You can then finish the addition step by adding imports for your classes in the - [ ] add a link to your conversion script in the main conversion utility (in `commands/convert.py`) - [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file - [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`. -- [ ] upload the pretrained weigths, configurations and vocabulary files. +- [ ] upload the pretrained weights, configurations and vocabulary files. 
diff --git a/tests/test_modeling_bart.py b/tests/test_modeling_bart.py index db7ce6331701..10dfd5b6c8e4 100644 --- a/tests/test_modeling_bart.py +++ b/tests/test_modeling_bart.py @@ -27,7 +27,9 @@ if is_torch_available(): import torch from transformers import ( + AutoModel, AutoModelForSequenceClassification, + AutoTokenizer, BartModel, BartForConditionalGeneration, BartForSequenceClassification, @@ -183,6 +185,15 @@ def test_save_load_strict(self): def test_inputs_embeds(self): pass + def test_tiny_model(self): + model_name = "sshleifer/bart-tiny-random" + tiny = AutoModel.from_pretrained(model_name) # same vocab size + tok = AutoTokenizer.from_pretrained(model_name) # same tokenizer + inputs_dict = tok.batch_encode_plus(["Hello my friends"], return_tensors="pt") + + with torch.no_grad(): + tiny(**inputs_dict) + @require_torch class BartHeadTests(unittest.TestCase): @@ -468,7 +479,7 @@ def test_xsum_summarization_same_as_fairseq(self): length_penalty=1.0, no_repeat_ngram_size=3, early_stopping=True, - decoder_start_token_id=model.config.eos_token_ids[0], + decoder_start_token_id=model.config.eos_token_id, ) decoded = [ diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index b284ee6ec2e9..7ac52a672d37 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -624,60 +624,114 @@ def test_inputs_embeds(self): with torch.no_grad(): model(**inputs_dict) - def test_lm_head_model_random_generate(self): - + def test_lm_head_model_random_no_beam_search_generate(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict.get("input_ids") - - if self.is_encoder_decoder: - config.output_past = True # needed for Bart TODO: might have to update for other encoder-decoder models + input_ids = inputs_dict["input_ids"] if "input_ids" in inputs_dict else inputs_dict["inputs"] + # iterate over all generative models for model_class in self.all_generative_model_classes: model = model_class(config) - model.to(torch_device) - model.eval() if config.bos_token_id is None: + # if bos token id is not defined mobel needs input_ids with self.assertRaises(AssertionError): model.generate(do_sample=True, max_length=5) - # batch_size = 1 - self._check_generated_tokens(model.generate(input_ids, do_sample=True)) - # batch_size = 1, num_beams > 1 - self._check_generated_tokens(model.generate(input_ids, do_sample=True, num_beams=3)) + # num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True)) else: - # batch_size = 1 - self._check_generated_tokens(model.generate(do_sample=True, max_length=5)) - # batch_size = 1, num_beams > 1 - self._check_generated_tokens(model.generate(do_sample=True, max_length=5, num_beams=3)) + # num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5)) with self.assertRaises(AssertionError): - # generating multiple sequences when greedy no beam generation + # generating multiple sequences when no beam search generation # is not allowed as it would always generate the same sequences model.generate(input_ids, do_sample=False, num_return_sequences=2) + # num_return_sequences > 1, sample + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2)) + + # check bad words tokens language generation + # create list of 1-seq bad token and list of 2-seq of bad tokens + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + output_tokens = 
model.generate( + input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 + ) + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.tolist(), bad_words_ids)) + + def test_lm_head_model_random_beam_search_generate(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict["input_ids"] if "input_ids" in inputs_dict else inputs_dict["inputs"] + + if self.is_encoder_decoder: + # needed for Bart beam search + config.output_past = True + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + if config.bos_token_id is None: + # if bos token id is not defined mobel needs input_ids, num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) + else: + # num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2)) + with self.assertRaises(AssertionError): # generating more sequences than having beams leads is not possible model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) - # batch_size > 1, sample - self._check_generated_tokens(model.generate(input_ids, do_sample=True, num_return_sequences=3)) - # batch_size > 1, greedy - self._check_generated_tokens(model.generate(input_ids, do_sample=False)) + # num_return_sequences > 1, sample + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2, num_return_sequences=2,)) + # num_return_sequences > 1, greedy + self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) - # batch_size > 1, num_beams > 1, sample - self._check_generated_tokens( - model.generate(input_ids, do_sample=True, num_beams=3, num_return_sequences=3,) - ) - # batch_size > 1, num_beams > 1, greedy - self._check_generated_tokens( - model.generate(input_ids, do_sample=False, num_beams=3, num_return_sequences=3) + # check bad words tokens language generation + # create list of 1-seq bad token and list of 2-seq of bad tokens + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + output_tokens = model.generate( + input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 ) - - def _check_generated_tokens(self, output_ids): + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.tolist(), bad_words_ids)) + + def _generate_random_bad_tokens(self, num_bad_tokens, model): + # special tokens cannot be bad tokens + special_tokens = [] + if model.config.bos_token_id is not None: + special_tokens.append(model.config.bos_token_id) + if model.config.pad_token_id is not None: + special_tokens.append(model.config.pad_token_id) + if model.config.eos_token_id is not None: + special_tokens.append(model.config.eos_token_id) + + # create random bad tokens that are not special tokens + bad_tokens = [] + while len(bad_tokens) < num_bad_tokens: + token = ids_tensor((1, 1), self.model_tester.vocab_size).squeeze(0).numpy()[0] + if token not in special_tokens: + bad_tokens.append(token) + return bad_tokens + + def _check_generated_ids(self, output_ids): for token_id in output_ids[0].tolist(): self.assertGreaterEqual(token_id, 0) self.assertLess(token_id, self.model_tester.vocab_size) + def _check_match_tokens(self, generated_ids, 
bad_words_ids): + # for all bad word tokens + for bad_word_ids in bad_words_ids: + # for all slices in batch + for generated_ids_slice in generated_ids: + # for all word idx + for i in range(len(bad_word_ids), len(generated_ids_slice)): + # if tokens match + if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids: + return True + return False + global_rng = random.Random() diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py new file mode 100644 index 000000000000..5e7cc8ad2fdc --- /dev/null +++ b/tests/test_modeling_electra.py @@ -0,0 +1,287 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import is_torch_available + +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor +from .utils import CACHE_DIR, require_torch, slow, torch_device + + +if is_torch_available(): + from transformers import ( + ElectraConfig, + ElectraModel, + ElectraForMaskedLM, + ElectraForTokenClassification, + ElectraForPreTraining, + ) + from transformers.modeling_electra import ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP + + +@require_torch +class ElectraModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + (ElectraModel, ElectraForMaskedLM, ElectraForTokenClassification,) if is_torch_available() else () + ) + + class ElectraModelTester(object): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = 
ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + fake_token_labels = ids_tensor([self.batch_size, self.seq_length], 1) + + config = ElectraConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) + + def check_loss_output(self, result): + self.parent.assertListEqual(list(result["loss"].size()), []) + + def create_and_check_electra_model( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = ElectraModel(config=config) + model.to(torch_device) + model.eval() + (sequence_output,) = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + (sequence_output,) = model(input_ids, token_type_ids=token_type_ids) + (sequence_output,) = model(input_ids) + + result = { + "sequence_output": sequence_output, + } + self.parent.assertListEqual( + list(result["sequence_output"].size()), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_electra_for_masked_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + model = ElectraForMaskedLM(config=config) + model.to(torch_device) + model.eval() + loss, prediction_scores = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels + ) + result = { + "loss": loss, + "prediction_scores": prediction_scores, + } + self.parent.assertListEqual( + list(result["prediction_scores"].size()), [self.batch_size, self.seq_length, self.vocab_size] + ) + self.check_loss_output(result) + + def create_and_check_electra_for_token_classification( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + config.num_labels = self.num_labels + model = ElectraForTokenClassification(config=config) + model.to(torch_device) + model.eval() + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels + ) + result = { + "loss": loss, + "logits": logits, + } + self.parent.assertListEqual( + list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels] + ) + self.check_loss_output(result) + + def create_and_check_electra_for_pretraining( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ): + 
config.num_labels = self.num_labels + model = ElectraForPreTraining(config=config) + model.to(torch_device) + model.eval() + loss, logits = model( + input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=fake_token_labels + ) + result = { + "loss": loss, + "logits": logits, + } + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length]) + self.check_loss_output(result) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + fake_token_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = ElectraModelTest.ElectraModelTester(self) + self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_electra_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_masked_lm(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_token_classification(*config_and_inputs) + + def test_for_pre_training(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_pretraining(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + for model_name in list(ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = ElectraModel.from_pretrained(model_name, cache_dir=CACHE_DIR) + self.assertIsNotNone(model) diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py index c8f9de3cc986..1d5e4e9a0a7e 100644 --- a/tests/test_modeling_t5.py +++ b/tests/test_modeling_t5.py @@ -24,8 +24,10 @@ if is_torch_available(): + import torch from transformers import T5Config, T5Model, T5ForConditionalGeneration from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP + from transformers.tokenization_t5 import T5Tokenizer @require_torch @@ -57,8 +59,9 @@ def __init__( relative_attention_num_buckets=8, dropout_rate=0.1, initializer_factor=0.002, - eos_token_ids=[1], + eos_token_id=1, pad_token_id=0, + decoder_start_token_id=0, scope=None, ): self.parent = parent @@ -78,8 +81,9 @@ def __init__( self.dropout_rate = dropout_rate self.initializer_factor = initializer_factor self.scope = scope - self.eos_token_ids = eos_token_ids + self.eos_token_id = eos_token_id self.pad_token_id = pad_token_id + self.decoder_start_token_id = decoder_start_token_id def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) @@ -106,9 +110,10 @@ def prepare_config_and_inputs(self): relative_attention_num_buckets=self.relative_attention_num_buckets, dropout_rate=self.dropout_rate, initializer_factor=self.initializer_factor, - eos_token_ids=self.eos_token_ids, + eos_token_id=self.eos_token_id, bos_token_id=self.pad_token_id, pad_token_id=self.pad_token_id, + decoder_start_token_id=self.decoder_start_token_id, ) return ( @@ -123,6 +128,39 @@ def 
prepare_config_and_inputs(self): def check_loss_output(self, result): self.parent.assertListEqual(list(result["loss"].size()), []) + def check_prepare_lm_labels_via_shift_left( + self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels, + ): + model = T5Model(config=config) + model.to(torch_device) + model.eval() + + # make sure that lm_labels are correctly padded from the right + lm_labels.masked_fill_((lm_labels == self.decoder_start_token_id), self.eos_token_id) + + # add casaul pad token mask + triangular_mask = torch.tril(lm_labels.new_ones(lm_labels.shape)).logical_not() + lm_labels.masked_fill_(triangular_mask, self.pad_token_id) + decoder_input_ids = model._shift_right(lm_labels) + + for i, (decoder_input_ids_slice, lm_labels_slice) in enumerate(zip(decoder_input_ids, lm_labels)): + # first item + self.parent.assertEqual(decoder_input_ids_slice[0].item(), self.decoder_start_token_id) + if i < decoder_input_ids_slice.shape[-1]: + if i < decoder_input_ids.shape[-1] - 1: + # items before diagonal + self.parent.assertListEqual( + decoder_input_ids_slice[1 : i + 1].tolist(), lm_labels_slice[:i].tolist() + ) + # pad items after diagonal + if i < decoder_input_ids.shape[-1] - 2: + self.parent.assertListEqual( + decoder_input_ids_slice[i + 2 :].tolist(), lm_labels_slice[i + 1 : -1].tolist() + ) + else: + # all items after square + self.parent.assertListEqual(decoder_input_ids_slice[1:].tolist(), lm_labels_slice[:-1].tolist()) + def create_and_check_t5_model( self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels, ): @@ -197,6 +235,10 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + def test_shift_right(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.check_prepare_lm_labels_via_shift_left(*config_and_inputs) + def test_t5_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_t5_model(*config_and_inputs) @@ -210,3 +252,139 @@ def test_model_from_pretrained(self): for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + + +@require_torch +class T5ModelIntegrationTests(unittest.TestCase): + @slow + def test_summarization(self): + model = T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device) + tok = T5Tokenizer.from_pretrained("t5-base") + + FRANCE_ARTICLE = 'Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. 
"Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. 
In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.' # @noqa + EXPECTED_SUMMARY_FRANCE = 'french prosecutor says he is not aware of any video footage from on board the plane . prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a cell phone video of the final seconds of flight 9525 . all 150 on board were killed when the plane crashed into the french Alps .' + + SHORTER_ARTICLE = '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. 
The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' + EXPECTED_SUMMARY_SHORTER = "the formal accession was marked with a ceremony at The Hague, in the Netherlands . the Palestinians signed the ICC's founding Rome Statute in January . 
they also accepted its jurisdiction over alleged crimes committed in occupied Palestinian territory . as members, Palestinians may be subject to counter-charges as well ." + + IRAN_ARTICLE = "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. 
But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced. The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions." + EXPECTED_SUMMARY_IRAN = "the united states and its negotiating partners reached a very strong framework agreement with Iran . the agreement limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon . expect pushback anyway, if the recent past is any harbinger ." + + ARTICLE_SUBWAY = 'New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. 
Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.' + EXPECTED_SUMMARY_SUBWAY = "in total, barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002 . she is believed to still be married to four men, and at one time, she was married to eight men at once . prosecutors say the marriages were part of an immigration scam ." + + task_specific_config = getattr(model.config, "task_specific_params", {}) + summarization_config = task_specific_config.get("summarization", {}) + model.config.update(summarization_config) + + dct = tok.batch_encode_plus( + [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], + max_length=512, + pad_to_max_length=True, + return_tensors="pt", + ) + self.assertEqual(512, dct["input_ids"].shape[1]) + + hypotheses_batch = model.generate( + input_ids=dct["input_ids"].to(torch_device), + attention_mask=dct["attention_mask"].to(torch_device), + num_beams=4, + length_penalty=2.0, + max_length=142, + min_length=56, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + + decoded = [ + tok.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in hypotheses_batch + ] + + self.assertListEqual( + [EXPECTED_SUMMARY_FRANCE, EXPECTED_SUMMARY_SHORTER, EXPECTED_SUMMARY_IRAN, EXPECTED_SUMMARY_SUBWAY], + decoded, + ) + + @slow + def test_translation_en_to_de(self): + model = T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device) + tok = T5Tokenizer.from_pretrained("t5-base") + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_de", {}) + model.config.update(translation_config) + + original_input = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.' + expected_translation = ( + '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.' 
+ ) + + input_ids = tok.encode(model.config.prefix + original_input, return_tensors="pt") + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=50, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, expected_translation) + + @slow + def test_translation_en_to_fr(self): + model = T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device) + tok = T5Tokenizer.from_pretrained("t5-base") + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_fr", {}) + model.config.update(translation_config) + + original_input = 'This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots, while more difficult to identify are the pink-coloured "new-borns" in the star delivery room.' + expected_translation = "Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre un « portrait familial » de générations innombrables de étoiles : les plus anciennes sont observées sous forme de pointes bleues, alors que les « nouveau-nés » de couleur rose dans la salle des accouchements doivent être plus difficiles " + + input_ids = tok.encode(model.config.prefix + original_input, return_tensors="pt") + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=100, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, expected_translation) + + @slow + def test_translation_en_to_ro(self): + model = T5ForConditionalGeneration.from_pretrained("t5-base").to(torch_device) + tok = T5Tokenizer.from_pretrained("t5-base") + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_ro", {}) + model.config.update(translation_config) + + original_input = "Taco Bell said it plans to add 2,000 locations in the US by 2022." + expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022." 
+ + input_ids = tok.encode(model.config.prefix + original_input, return_tensors="pt") + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=50, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, expected_translation) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index d2d7fd0b4f3f..a5038bad7825 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -162,6 +162,10 @@ def test_pt_tf_model_equivalence(self): pt_inputs_dict = dict( (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() ) + # need to rename encoder-decoder "inputs" for PyTorch + if "inputs" in pt_inputs_dict and self.is_encoder_decoder: + pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") + with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict, training=False) @@ -201,6 +205,10 @@ def test_pt_tf_model_equivalence(self): pt_inputs_dict = dict( (name, torch.from_numpy(key.numpy()).to(torch.long)) for name, key in inputs_dict.items() ) + # need to rename encoder-decoder "inputs" for PyTorch + if "inputs" in pt_inputs_dict and self.is_encoder_decoder: + pt_inputs_dict["input_ids"] = pt_inputs_dict.pop("inputs") + with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict) @@ -223,7 +231,7 @@ def test_compile_tf_model(self): if self.is_encoder_decoder: input_ids = { "decoder_input_ids": tf.keras.Input(batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32"), - "input_ids": tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"), + "inputs": tf.keras.Input(batch_shape=(2, 2000), name="inputs", dtype="int32"), } else: input_ids = tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32") @@ -259,7 +267,7 @@ def test_keyword_and_dict_args(self): outputs_dict = model(inputs_dict) inputs_keywords = copy.deepcopy(inputs_dict) - input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "decoder_input_ids", None,) + input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,) outputs_keywords = model(input_ids, **inputs_keywords) output_dict = outputs_dict[0].numpy() @@ -395,9 +403,9 @@ def test_inputs_embeds(self): input_ids = inputs_dict["input_ids"] del inputs_dict["input_ids"] else: - encoder_input_ids = inputs_dict["input_ids"] + encoder_input_ids = inputs_dict["inputs"] decoder_input_ids = inputs_dict["decoder_input_ids"] - del inputs_dict["input_ids"] + del inputs_dict["inputs"] del inputs_dict["decoder_input_ids"] for model_class in self.all_model_classes: @@ -412,58 +420,114 @@ def test_inputs_embeds(self): model(inputs_dict) - def test_lm_head_model_random_generate(self): - + def test_lm_head_model_random_no_beam_search_generate(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - - if self.is_encoder_decoder: - config.output_past = True # needed for Bart TODO: might have to update for other encoder-decoder models + input_ids = inputs_dict["input_ids"] if "input_ids" in inputs_dict else inputs_dict["inputs"] + # iterate over all generative models for model_class in self.all_generative_model_classes: model = model_class(config) if config.bos_token_id is None: + # if bos token id is not defined mobel needs 
input_ids with self.assertRaises(AssertionError): model.generate(do_sample=True, max_length=5) - # batch_size = 1 - self._check_generated_tokens(model.generate(input_ids, do_sample=True)) - # batch_size = 1, num_beams > 1 - self._check_generated_tokens(model.generate(input_ids, do_sample=True, num_beams=3)) + # num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True)) else: - # batch_size = 1 - self._check_generated_tokens(model.generate(do_sample=True, max_length=5)) - # batch_size = 1, num_beams > 1 - self._check_generated_tokens(model.generate(do_sample=True, max_length=5, num_beams=3)) + # num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5)) with self.assertRaises(AssertionError): - # generating multiple sequences when greedy no beam generation + # generating multiple sequences when no beam search generation # is not allowed as it would always generate the same sequences model.generate(input_ids, do_sample=False, num_return_sequences=2) + # num_return_sequences > 1, sample + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_return_sequences=2)) + + # check bad words tokens language generation + # create list of 1-seq bad token and list of 2-seq of bad tokens + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + output_tokens = model.generate( + input_ids, do_sample=True, bad_words_ids=bad_words_ids, num_return_sequences=2 + ) + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) + + def test_lm_head_model_random_beam_search_generate(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + input_ids = inputs_dict["input_ids"] if "input_ids" in inputs_dict else inputs_dict["inputs"] + + if self.is_encoder_decoder: + # needed for Bart beam search + config.output_past = True + + for model_class in self.all_generative_model_classes: + model = model_class(config) + + if config.bos_token_id is None: + # if bos token id is not defined mobel needs input_ids, num_return_sequences = 1 + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2)) + else: + # num_return_sequences = 1 + self._check_generated_ids(model.generate(do_sample=True, max_length=5, num_beams=2)) + with self.assertRaises(AssertionError): # generating more sequences than having beams leads is not possible model.generate(input_ids, do_sample=False, num_return_sequences=3, num_beams=2) - # batch_size > 1, sample - self._check_generated_tokens(model.generate(input_ids, do_sample=True, num_return_sequences=3)) - # batch_size > 1, greedy - self._check_generated_tokens(model.generate(input_ids, do_sample=False)) + # num_return_sequences > 1, sample + self._check_generated_ids(model.generate(input_ids, do_sample=True, num_beams=2, num_return_sequences=2,)) + # num_return_sequences > 1, greedy + self._check_generated_ids(model.generate(input_ids, do_sample=False, num_beams=2, num_return_sequences=2)) - # batch_size > 1, num_beams > 1, sample - self._check_generated_tokens( - model.generate(input_ids, do_sample=True, num_beams=3, num_return_sequences=3,) - ) - # batch_size > 1, num_beams > 1, greedy - self._check_generated_tokens( - model.generate(input_ids, do_sample=False, num_beams=3, num_return_sequences=3) + # check bad words tokens language generation + # create list of 1-seq bad token and 
list of 2-seq of bad tokens + bad_words_ids = [self._generate_random_bad_tokens(1, model), self._generate_random_bad_tokens(2, model)] + output_tokens = model.generate( + input_ids, do_sample=False, bad_words_ids=bad_words_ids, num_beams=2, num_return_sequences=2 ) - - def _check_generated_tokens(self, output_ids): + # only count generated tokens + generated_ids = output_tokens[:, input_ids.shape[-1] :] + self.assertFalse(self._check_match_tokens(generated_ids.numpy().tolist(), bad_words_ids)) + + def _generate_random_bad_tokens(self, num_bad_tokens, model): + # special tokens cannot be bad tokens + special_tokens = [] + if model.config.bos_token_id is not None: + special_tokens.append(model.config.bos_token_id) + if model.config.pad_token_id is not None: + special_tokens.append(model.config.pad_token_id) + if model.config.eos_token_id is not None: + special_tokens.append(model.config.eos_token_id) + + # create random bad tokens that are not special tokens + bad_tokens = [] + while len(bad_tokens) < num_bad_tokens: + token = tf.squeeze(ids_tensor((1, 1), self.model_tester.vocab_size), 0).numpy()[0] + if token not in special_tokens: + bad_tokens.append(token) + return bad_tokens + + def _check_generated_ids(self, output_ids): for token_id in output_ids[0].numpy().tolist(): self.assertGreaterEqual(token_id, 0) self.assertLess(token_id, self.model_tester.vocab_size) + def _check_match_tokens(self, generated_ids, bad_words_ids): + # for all bad word tokens + for bad_word_ids in bad_words_ids: + # for all slices in batch + for generated_ids_slice in generated_ids: + # for all word idx + for i in range(len(bad_word_ids), len(generated_ids_slice)): + # if tokens match + if generated_ids_slice[i - len(bad_word_ids) : i] == bad_word_ids: + return True + return False + def ids_tensor(shape, vocab_size, rng=None, name=None, dtype=None): """Creates a random int32 tensor of the shape within the vocab size.""" diff --git a/tests/test_modeling_tf_electra.py b/tests/test_modeling_tf_electra.py new file mode 100644 index 000000000000..2a0bfcdbb5fb --- /dev/null +++ b/tests/test_modeling_tf_electra.py @@ -0,0 +1,227 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
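+
+# Tests for the TensorFlow ELECTRA models: TFElectraModel, TFElectraForMaskedLM,
+# TFElectraForPreTraining and TFElectraForTokenClassification.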
+ + +import unittest + +from transformers import ElectraConfig, is_tf_available + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor +from .utils import CACHE_DIR, require_tf, slow + + +if is_tf_available(): + from transformers.modeling_tf_electra import ( + TFElectraModel, + TFElectraForMaskedLM, + TFElectraForPreTraining, + TFElectraForTokenClassification, + ) + + +@require_tf +class TFElectraModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + (TFElectraModel, TFElectraForMaskedLM, TFElectraForPreTraining, TFElectraForTokenClassification,) + if is_tf_available() + else () + ) + + class TFElectraModelTester(object): + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = ElectraConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_electra_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, 
choice_labels + ): + model = TFElectraModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (sequence_output,) = model(inputs) + + inputs = [input_ids, input_mask] + (sequence_output,) = model(inputs) + + (sequence_output,) = model(input_ids) + + result = { + "sequence_output": sequence_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size] + ) + + def create_and_check_electra_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraForMaskedLM(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (prediction_scores,) = model(inputs) + result = { + "prediction_scores": prediction_scores.numpy(), + } + self.parent.assertListEqual( + list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] + ) + + def create_and_check_electra_for_pretraining( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFElectraForPreTraining(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (prediction_scores,) = model(inputs) + result = { + "prediction_scores": prediction_scores.numpy(), + } + self.parent.assertListEqual(list(result["prediction_scores"].shape), [self.batch_size, self.seq_length]) + + def create_and_check_electra_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFElectraForTokenClassification(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + (logits,) = model(inputs) + result = { + "logits": logits.numpy(), + } + self.parent.assertListEqual( + list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels] + ) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFElectraModelTest.TFElectraModelTester(self) + self.config_tester = ConfigTester(self, config_class=ElectraConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_electra_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_masked_lm(*config_and_inputs) + + def test_for_pretraining(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_pretraining(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_electra_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + # for model_name in 
list(TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + for model_name in ["electra-small-discriminator"]: + model = TFElectraModel.from_pretrained(model_name, cache_dir=CACHE_DIR) + self.assertIsNotNone(model) diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index 731de2540db4..37a243b45c44 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -24,7 +24,7 @@ if is_tf_available(): - from transformers.modeling_tf_t5 import TFT5Model, TFT5ForConditionalGeneration + from transformers import TFT5Model, TFT5ForConditionalGeneration, T5Tokenizer @require_tf @@ -52,7 +52,7 @@ def __init__( relative_attention_num_buckets=8, dropout_rate=0.1, initializer_factor=0.002, - eos_token_ids=[1], + eos_token_id=1, pad_token_id=0, scope=None, ): @@ -71,7 +71,7 @@ def __init__( self.relative_attention_num_buckets = relative_attention_num_buckets self.dropout_rate = dropout_rate self.initializer_factor = initializer_factor - self.eos_token_ids = eos_token_ids + self.eos_token_id = eos_token_id self.pad_token_id = pad_token_id self.scope = scope @@ -97,7 +97,7 @@ def prepare_config_and_inputs(self): relative_attention_num_buckets=self.relative_attention_num_buckets, dropout_rate=self.dropout_rate, initializer_factor=self.initializer_factor, - eos_token_ids=self.eos_token_ids, + eos_token_id=self.eos_token_id, bos_token_id=self.pad_token_id, pad_token_id=self.pad_token_id, ) @@ -107,13 +107,15 @@ def prepare_config_and_inputs(self): def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): model = TFT5Model(config=config) inputs = { - "input_ids": input_ids, + "inputs": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, } encoder_output, decoder_output = model(inputs) - encoder_output, decoder_output = model(input_ids, decoder_attention_mask=input_mask, input_ids=input_ids) + encoder_output, decoder_output = model( + input_ids, decoder_attention_mask=input_mask, decoder_input_ids=input_ids + ) result = { "encoder_output": encoder_output.numpy(), @@ -129,7 +131,7 @@ def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels) def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): model = TFT5ForConditionalGeneration(config=config) inputs_dict = { - "input_ids": input_ids, + "inputs": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, } @@ -147,7 +149,7 @@ def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, token_labels) = config_and_inputs inputs_dict = { - "input_ids": input_ids, + "inputs": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, } @@ -173,3 +175,139 @@ def test_model_from_pretrained(self): for model_name in ["t5-small"]: model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) + + +@require_tf +class TFT5ModelIntegrationTests(unittest.TestCase): + @slow + def test_summarization(self): + model = TFT5ForConditionalGeneration.from_pretrained("t5-base") + tok = T5Tokenizer.from_pretrained("t5-base") + + FRANCE_ARTICLE = 'Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." 
He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col. Jean-Marc Menichini, a French Gendarmerie spokesman in charge of communications on rescue efforts around the Germanwings crash site, told CNN that the reports were "completely wrong" and "unwarranted." Cell phones have been collected at the site, he said, but that they "hadn\'t been exploited yet." Menichini said he believed the cell phones would need to be sent to the Criminal Research Institute in Rosny sous-Bois, near Paris, in order to be analyzed by specialized technicians working hand-in-hand with investigators. But none of the cell phones found so far have been sent to the institute, Menichini said. Asked whether staff involved in the search could have leaked a memory card to the media, Menichini answered with a categorical "no." Reichelt told "Erin Burnett: Outfront" that he had watched the video and stood by the report, saying Bild and Paris Match are "very confident" that the clip is real. He noted that investigators only revealed they\'d recovered cell phones from the crash site after Bild and Paris Match published their reports. "That is something we did not know before. ... Overall we can say many things of the investigation weren\'t revealed by the investigation at the beginning," he said. What was mental state of Germanwings co-pilot? German airline Lufthansa confirmed Tuesday that co-pilot Andreas Lubitz had battled depression years before he took the controls of Germanwings Flight 9525, which he\'s accused of deliberately crashing last week in the French Alps. Lubitz told his Lufthansa flight training school in 2009 that he had a "previous episode of severe depression," the airline said Tuesday. Email correspondence between Lubitz and the school discovered in an internal investigation, Lufthansa said, included medical documents he submitted in connection with resuming his flight training. The announcement indicates that Lufthansa, the parent company of Germanwings, knew of Lubitz\'s battle with depression, allowed him to continue training and ultimately put him in the cockpit. Lufthansa, whose CEO Carsten Spohr previously said Lubitz was 100% fit to fly, described its statement Tuesday as a "swift and seamless clarification" and said it was sharing the information and documents -- including training and medical records -- with public prosecutors. 
Spohr traveled to the crash site Wednesday, where recovery teams have been working for the past week to recover human remains and plane debris scattered across a steep mountainside. He saw the crisis center set up in Seyne-les-Alpes, laid a wreath in the village of Le Vernet, closer to the crash site, where grieving families have left flowers at a simple stone memorial. Menichini told CNN late Tuesday that no visible human remains were left at the site but recovery teams would keep searching. French President Francois Hollande, speaking Tuesday, said that it should be possible to identify all the victims using DNA analysis by the end of the week, sooner than authorities had previously suggested. In the meantime, the recovery of the victims\' personal belongings will start Wednesday, Menichini said. Among those personal belongings could be more cell phones belonging to the 144 passengers and six crew on board. Check out the latest from our correspondents . The details about Lubitz\'s correspondence with the flight school during his training were among several developments as investigators continued to delve into what caused the crash and Lubitz\'s possible motive for downing the jet. A Lufthansa spokesperson told CNN on Tuesday that Lubitz had a valid medical certificate, had passed all his examinations and "held all the licenses required." Earlier, a spokesman for the prosecutor\'s office in Dusseldorf, Christoph Kumpa, said medical records reveal Lubitz suffered from suicidal tendencies at some point before his aviation career and underwent psychotherapy before he got his pilot\'s license. Kumpa emphasized there\'s no evidence suggesting Lubitz was suicidal or acting aggressively before the crash. Investigators are looking into whether Lubitz feared his medical condition would cause him to lose his pilot\'s license, a European government official briefed on the investigation told CNN on Tuesday. While flying was "a big part of his life," the source said, it\'s only one theory being considered. Another source, a law enforcement official briefed on the investigation, also told CNN that authorities believe the primary motive for Lubitz to bring down the plane was that he feared he would not be allowed to fly because of his medical problems. Lubitz\'s girlfriend told investigators he had seen an eye doctor and a neuropsychologist, both of whom deemed him unfit to work recently and concluded he had psychological issues, the European government official said. But no matter what details emerge about his previous mental health struggles, there\'s more to the story, said Brian Russell, a forensic psychologist. "Psychology can explain why somebody would turn rage inward on themselves about the fact that maybe they weren\'t going to keep doing their job and they\'re upset about that and so they\'re suicidal," he said. "But there is no mental illness that explains why somebody then feels entitled to also take that rage and turn it outward on 149 other people who had nothing to do with the person\'s problems." Germanwings crash compensation: What we know . Who was the captain of Germanwings Flight 9525? CNN\'s Margot Haddad reported from Marseille and Pamela Brown from Dusseldorf, while Laura Smith-Spark wrote from London. CNN\'s Frederik Pleitgen, Pamela Boykoff, Antonia Mortensen, Sandrine Amiel and Anna-Maja Rappard contributed to this report.' # @noqa + EXPECTED_SUMMARY_FRANCE = 'french prosecutor says he is not aware of any video footage from on board the plane . 
prosecutor: "so far no videos were used in the crash investigation" two magazines claim to have found a cell phone video of the final seconds of flight 9525 . all 150 on board were killed when the plane crashed into the french Alps .' + + SHORTER_ARTICLE = '(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony, said it was a move toward greater justice. "As Palestine formally becomes a State Party to the Rome Statute today, the world is also a step closer to ending a long era of impunity and injustice," he said, according to an ICC news release. "Indeed, today brings us closer to our shared goals of justice and peace." Judge Kuniko Ozaki, a vice president of the ICC, said acceding to the treaty was just the first step for the Palestinians. "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. These are substantive commitments, which cannot be taken lightly," she said. Rights group Human Rights Watch welcomed the development. "Governments seeking to penalize Palestine for joining the ICC should immediately end their pressure, and countries that support universal acceptance of the court\'s treaty should speak out to welcome its membership," said Balkees Jarrah, international justice counsel for the group. "What\'s objectionable is the attempts to undermine international justice, not Palestine\'s decision to join a treaty to which over 100 countries around the world are members." In January, when the preliminary ICC examination was opened, Israeli Prime Minister Benjamin Netanyahu described it as an outrage, saying the court was overstepping its boundaries. The United States also said it "strongly" disagreed with the court\'s decision. "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement. It urged the warring sides to resolve their differences through direct negotiations. "We will continue to oppose actions against Israel at the ICC as counterproductive to the cause of peace," it said. But the ICC begs to differ with the definition of a state for its purposes and refers to the territories as "Palestine." While a preliminary examination is not a formal investigation, it allows the court to review evidence and determine whether to investigate suspects on both sides. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." 
The war between Israel and Hamas militants in Gaza last summer left more than 2,000 people dead. The inquiry will include alleged war crimes committed since June. The International Criminal Court was set up in 2002 to prosecute genocide, crimes against humanity and war crimes. CNN\'s Vasco Cotovio, Kareem Khadder and Faith Karimi contributed to this report.' + EXPECTED_SUMMARY_SHORTER = "the formal accession was marked with a ceremony at The Hague, in the Netherlands . the Palestinians signed the ICC's founding Rome Statute in January . they also accepted its jurisdiction over alleged crimes committed in occupied Palestinian territory . as members, Palestinians may be subject to counter-charges as well ." + + IRAN_ARTICLE = "(CNN)The United States and its negotiating partners reached a very strong framework agreement with Iran in Lausanne, Switzerland, on Thursday that limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon. Expect pushback anyway, if the recent past is any harbinger. Just last month, in an attempt to head off such an agreement, House Speaker John Boehner invited Israeli Prime Minister Benjamin Netanyahu to preemptively blast it before Congress, and 47 senators sent a letter to the Iranian leadership warning them away from a deal. The debate that has already begun since the announcement of the new framework will likely result in more heat than light. It will not be helped by the gathering swirl of dubious assumptions and doubtful assertions. Let us address some of these: . The most misleading assertion, despite universal rejection by experts, is that the negotiations' objective at the outset was the total elimination of any nuclear program in Iran. That is the position of Netanyahu and his acolytes in the U.S. Congress. But that is not and never was the objective. If it had been, there would have been no Iranian team at the negotiating table. Rather, the objective has always been to structure an agreement or series of agreements so that Iran could not covertly develop a nuclear arsenal before the United States and its allies could respond. The new framework has exceeded expectations in achieving that goal. It would reduce Iran's low-enriched uranium stockpile, cut by two-thirds its number of installed centrifuges and implement a rigorous inspection regime. Another dubious assumption of opponents is that the Iranian nuclear program is a covert weapons program. Despite sharp accusations by some in the United States and its allies, Iran denies having such a program, and U.S. intelligence contends that Iran has not yet made the decision to build a nuclear weapon. Iran's continued cooperation with International Atomic Energy Agency inspections is further evidence on this point, and we'll know even more about Iran's program in the coming months and years because of the deal. In fact, the inspections provisions that are part of this agreement are designed to protect against any covert action by the Iranians. What's more, the rhetoric of some members of Congress has implied that the negotiations have been between only the United States and Iran (i.e., the 47 senators' letter warning that a deal might be killed by Congress or a future president). This of course is not the case. The talks were between Iran and the five permanent members of the U.N. Security Council (United States, United Kingdom, France, China and Russia) plus Germany, dubbed the P5+1. 
While the United States has played a leading role in the effort, it negotiated the terms alongside its partners. If the agreement reached by the P5+1 is rejected by Congress, it could result in an unraveling of the sanctions on Iran and threaten NATO cohesion in other areas. Another questionable assertion is that this agreement contains a sunset clause, after which Iran will be free to do as it pleases. Again, this is not the case. Some of the restrictions on Iran's nuclear activities, such as uranium enrichment, will be eased or eliminated over time, as long as 15 years. But most importantly, the framework agreement includes Iran's ratification of the Additional Protocol, which allows IAEA inspectors expanded access to nuclear sites both declared and nondeclared. This provision will be permanent. It does not sunset. Thus, going forward, if Iran decides to enrich uranium to weapons-grade levels, monitors will be able to detect such a move in a matter of days and alert the U.N. Security Council. Many in Congress have said that the agreement should be a formal treaty requiring the Senate to \"advise and consent.\" But the issue is not suited for a treaty. Treaties impose equivalent obligations on all signatories. For example, the New START treaty limits Russia and the United States to 1,550 deployed strategic warheads. But any agreement with Iran will not be so balanced. The restrictions and obligations in the final framework agreement will be imposed almost exclusively on Iran. The P5+1 are obligated only to ease and eventually remove most but not all economic sanctions, which were imposed as leverage to gain this final deal. Finally some insist that any agreement must address Iranian missile programs, human rights violations or support for Hamas or Hezbollah. As important as these issues are, and they must indeed be addressed, they are unrelated to the most important aim of a nuclear deal: preventing a nuclear Iran. To include them in the negotiations would be a poison pill. This agreement should be judged on its merits and on how it affects the security of our negotiating partners and allies, including Israel. Those judgments should be fact-based, not based on questionable assertions or dubious assumptions." + EXPECTED_SUMMARY_IRAN = "the united states and its negotiating partners reached a very strong framework agreement with Iran . the agreement limits Iran's nuclear program in such a way as to effectively block it from building a nuclear weapon . expect pushback anyway, if the recent past is any harbinger ." + + ARTICLE_SUBWAY = 'New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 2010 marriage license application, according to court documents. Prosecutors said the marriages were part of an immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. 
After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. If convicted, Barrientos faces up to four years in prison. Her next court appearance is scheduled for May 18.' + EXPECTED_SUMMARY_SUBWAY = "in total, barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002 . she is believed to still be married to four men, and at one time, she was married to eight men at once . prosecutors say the marriages were part of an immigration scam ." + + task_specific_config = getattr(model.config, "task_specific_params", {}) + summarization_config = task_specific_config.get("summarization", {}) + model.config.update(summarization_config) + + dct = tok.batch_encode_plus( + [model.config.prefix + x for x in [FRANCE_ARTICLE, SHORTER_ARTICLE, IRAN_ARTICLE, ARTICLE_SUBWAY]], + max_length=512, + pad_to_max_length=True, + return_tensors="tf", + ) + self.assertEqual(512, dct["input_ids"].shape[1]) + + hypotheses_batch = model.generate( + input_ids=dct["input_ids"], + attention_mask=dct["attention_mask"], + num_beams=4, + length_penalty=2.0, + max_length=142, + min_length=56, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + + decoded = [ + tok.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in hypotheses_batch + ] + + self.assertListEqual( + [EXPECTED_SUMMARY_FRANCE, EXPECTED_SUMMARY_SHORTER, EXPECTED_SUMMARY_IRAN, EXPECTED_SUMMARY_SUBWAY], + decoded, + ) + + @slow + def test_translation_en_to_de(self): + model = TFT5ForConditionalGeneration.from_pretrained("t5-base") + tok = T5Tokenizer.from_pretrained("t5-base") + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_de", {}) + model.config.update(translation_config) + + original_input = '"Luigi often said to me that he never wanted the brothers to end up in court", she wrote.' + expected_translation = ( + '"Luigi sagte mir oft, dass er nie wollte, dass die Brüder am Gericht sitzen", schrieb sie.' 
+ ) + + input_ids = tok.encode(model.config.prefix + original_input, return_tensors="tf") + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=50, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, expected_translation) + + @slow + def test_translation_en_to_fr(self): + model = TFT5ForConditionalGeneration.from_pretrained("t5-base") + tok = T5Tokenizer.from_pretrained("t5-base") + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_fr", {}) + model.config.update(translation_config) + + original_input = 'This image section from an infrared recording by the Spitzer telescope shows a "family portrait" of countless generations of stars: the oldest stars are seen as blue dots, while more difficult to identify are the pink-coloured "new-borns" in the star delivery room.' + expected_translation = "Cette section d'images provenant de l'enregistrement infrarouge effectué par le télescope Spitzer montre un « portrait familial » de générations innombrables de étoiles : les plus anciennes sont observées sous forme de pointes bleues, alors que les « nouveau-nés » de couleur rose dans la salle des accouchements doivent être plus difficiles " + + input_ids = tok.encode(model.config.prefix + original_input, return_tensors="tf") + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=100, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, expected_translation) + + @slow + def test_translation_en_to_ro(self): + model = TFT5ForConditionalGeneration.from_pretrained("t5-base") + tok = T5Tokenizer.from_pretrained("t5-base") + + task_specific_config = getattr(model.config, "task_specific_params", {}) + translation_config = task_specific_config.get("translation_en_to_ro", {}) + model.config.update(translation_config) + + original_input = "Taco Bell said it plans to add 2,000 locations in the US by 2022." + expected_translation = "Taco Bell a declarat că intenţionează să adauge 2 000 de locaţii în SUA până în 2022." 
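+        # Same pattern as the other translation tests: prepend the task-specific prefix from the
+        # model config, generate with beam search, and expect an exact match with the reference.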
+ + input_ids = tok.encode(model.config.prefix + original_input, return_tensors="tf") + + output = model.generate( + input_ids=input_ids, + num_beams=4, + length_penalty=2.0, + max_length=50, + no_repeat_ngram_size=3, + do_sample=False, + early_stopping=True, + ) + translation = tok.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False) + + self.assertEqual(translation, expected_translation) diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index fad4e8528285..b97cda38340e 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -16,13 +16,11 @@ QA_FINETUNED_MODELS = [ (("bert-base-uncased", {"use_fast": False}), "bert-large-uncased-whole-word-masking-finetuned-squad", None), - (("bert-base-cased", {"use_fast": False}), "bert-large-cased-whole-word-masking-finetuned-squad", None), (("bert-base-cased", {"use_fast": False}), "distilbert-base-cased-distilled-squad", None), ] TF_QA_FINETUNED_MODELS = [ (("bert-base-uncased", {"use_fast": False}), "bert-large-uncased-whole-word-masking-finetuned-squad", None), - (("bert-base-cased", {"use_fast": False}), "bert-large-cased-whole-word-masking-finetuned-squad", None), (("bert-base-cased", {"use_fast": False}), "distilbert-base-cased-distilled-squad", None), ] @@ -49,7 +47,6 @@ } TF_FEATURE_EXTRACT_FINETUNED_MODELS = { - ("bert-base-cased", "bert-base-cased", None), # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 ("distilbert-base-cased", "distilbert-base-cased", None), } @@ -64,7 +61,7 @@ TEXT_CLASSIF_FINETUNED_MODELS = { ( - "bert-base-uncased", + "distilbert-base-cased", "distilbert-base-uncased-finetuned-sst-2-english", "distilbert-base-uncased-finetuned-sst-2-english", ) @@ -78,14 +75,17 @@ (("distilroberta-base", {"use_fast": False}), "distilroberta-base", None), ] -SUMMARIZATION_FINETUNED_MODELS = {("bart-large-cnn", "bart-large-cnn"), ("t5-small", "t5-small")} -TF_SUMMARIZATION_FINETUNED_MODELS = {("t5-small", "t5-small")} +SUMMARIZATION_FINETUNED_MODELS = { + ("sshleifer/bart-tiny-random", "bart-large-cnn"), + ("patrickvonplaten/t5-tiny-random", "t5-small"), +} +TF_SUMMARIZATION_FINETUNED_MODELS = {("patrickvonplaten/t5-tiny-random", "t5-small")} TRANSLATION_FINETUNED_MODELS = { - ("t5-small", "t5-small", "translation_en_to_de"), - ("t5-small", "t5-small", "translation_en_to_ro"), + ("patrickvonplaten/t5-tiny-random", "t5-small", "translation_en_to_de"), + ("patrickvonplaten/t5-tiny-random", "t5-small", "translation_en_to_ro"), } -TF_TRANSLATION_FINETUNED_MODELS = {("t5-small", "t5-small", "translation_en_to_fr")} +TF_TRANSLATION_FINETUNED_MODELS = {("patrickvonplaten/t5-tiny-random", "t5-small", "translation_en_to_fr")} class MonoColumnInputTestCase(unittest.TestCase): diff --git a/tests/test_tokenization_bert.py b/tests/test_tokenization_bert.py index 49bb073351d1..0e81eb1a5a8a 100644 --- a/tests/test_tokenization_bert.py +++ b/tests/test_tokenization_bert.py @@ -82,7 +82,7 @@ def test_rust_and_python_full_tokenizers(self): return tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer(add_special_tokens=False) + rust_tokenizer = self.get_rust_tokenizer() sequence = "UNwant\u00E9d,running" @@ -91,7 +91,7 @@ def test_rust_and_python_full_tokenizers(self): self.assertListEqual(tokens, rust_tokens) ids = tokenizer.encode(sequence, add_special_tokens=False) - rust_ids = rust_tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) self.assertListEqual(ids, rust_ids) rust_tokenizer = 
self.get_rust_tokenizer() diff --git a/tests/test_tokenization_bert_japanese.py b/tests/test_tokenization_bert_japanese.py index 4900ff49da50..4e0925d72969 100644 --- a/tests/test_tokenization_bert_japanese.py +++ b/tests/test_tokenization_bert_japanese.py @@ -91,6 +91,20 @@ def test_mecab_tokenizer_lower(self): ["アップルストア", "で", "iphone", "8", "が", "発売", "さ", "れ", "た", "。"], ) + def test_mecab_tokenizer_with_option(self): + try: + tokenizer = MecabTokenizer( + do_lower_case=True, normalize_text=False, mecab_option="-d /usr/local/lib/mecab/dic/jumandic" + ) + except RuntimeError: + # if dict doesn't exist in the system, previous code raises this error. + return + + self.assertListEqual( + tokenizer.tokenize(" \tアップルストアでiPhone8 が \n 発売された 。 "), + ["アップルストア", "で", "iPhone", "8", "が", "発売", "さ", "れた", "\u3000", "。"], + ) + def test_mecab_tokenizer_no_normalize(self): tokenizer = MecabTokenizer(normalize_text=False) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 3534eeef066d..a7e3881ebae7 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -282,7 +282,7 @@ def test_number_of_added_tokens(self): # Method is implemented (e.g. not GPT-2) if len(attached_sequences) != 2: - self.assertEqual(tokenizer.num_added_tokens(pair=True), len(attached_sequences) - len(sequences)) + self.assertEqual(tokenizer.num_special_tokens_to_add(pair=True), len(attached_sequences) - len(sequences)) def test_maximum_encoding_length_single_input(self): tokenizer = self.get_tokenizer() @@ -291,7 +291,7 @@ def test_maximum_encoding_length_single_input(self): stride = 2 sequence = tokenizer.encode(seq_0, add_special_tokens=False) - num_added_tokens = tokenizer.num_added_tokens() + num_added_tokens = tokenizer.num_special_tokens_to_add() total_length = len(sequence) + num_added_tokens information = tokenizer.encode_plus( seq_0, diff --git a/tests/test_tokenization_fast.py b/tests/test_tokenization_fast.py index 916b86a28ff9..b34552cd0316 100644 --- a/tests/test_tokenization_fast.py +++ b/tests/test_tokenization_fast.py @@ -1,6 +1,6 @@ import unittest - -import numpy as np +from collections import namedtuple +from itertools import takewhile from tests.utils import require_torch from transformers import ( @@ -21,118 +21,113 @@ from transformers.tokenization_transfo_xl import TransfoXLTokenizerFast -class FastTokenizerMatchingTest(unittest.TestCase): - def setUp(self) -> None: - with open("tests/fixtures/sample_text.txt") as f_data: - self._data = f_data.read().replace("\n\n", "\n").strip() +NON_ENGLISH_TAGS = ["chinese", "dutch", "french", "finnish", "german", "multilingual"] +Tokenizer = namedtuple("Tokenizer", ["name", "rust_cls", "python_cls", "vocab_key", "filter"]) + - def assert_sequence_almost_equals(self, a, b, threshold): +def filter_non_english(_: Tokenizer, pretrained_name: str): + """ Filter all the model for non-english language """ + return not any([lang in pretrained_name for lang in NON_ENGLISH_TAGS]) - # Handle padding - if len(a) != len(b): - max_len = max(len(a), len(b)) - # Pad with a negative number as vocab doesnt allow idx < 0 - # if will be tracked as differences - if len(a) < max_len: - a += [-1] * (max_len - len(a)) +def filter_roberta_detectors(_: Tokenizer, pretrained_name: str): + return "detector" not in pretrained_name - if len(b) < max_len: - b += [-1] * (max_len - len(b)) - # Convert to numpy for convenience - a_, b_ = np.array(a), np.array(b) +class CommonFastTokenizerTest(unittest.TestCase): - # Compute elementwise 
difference - inputs_diffs = a_ - b_ - inputs_diff = np.count_nonzero(inputs_diffs) - self.assertLessEqual(inputs_diff / a_.shape[0], threshold) + TOKENIZERS_CLASSES = frozenset([]) + + def setUp(self) -> None: + with open("tests/fixtures/sample_text.txt", encoding="utf-8") as f_data: + self._data = f_data.read().replace("\n\n", "\n").strip() - def assert_tokenization_python_rust_almost_equals(self, tokenizer_p, tokenizer_r, threshold: float): + def test_all_tokenizers(self): + for tok_case in self.TOKENIZERS_CLASSES: + for pretrained_name in tok_case.python_cls.pretrained_vocab_files_map[tok_case.vocab_key].keys(): + + # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the + # information available in Tokenizer (name, rust class, python class, vocab key name) + if tok_case.filter is None or ( + tok_case.filter is not None and tok_case.filter(tok_case, pretrained_name) + ): + with self.subTest("{} ({})".format(tok_case.name, pretrained_name)): + tokenizer_r = tok_case.rust_cls.from_pretrained(pretrained_name) + tokenizer_p = tok_case.python_cls.from_pretrained(pretrained_name) + + self.fast_align_python(tokenizer_r, tokenizer_p) + self.fast_only(tokenizer_r) + + def fast_align_python(self, tokenizer_r, tokenizer_p): + # Check is_fast is set correctly + self.assertFalse(tokenizer_p.is_fast) + self.assertTrue(tokenizer_r.is_fast) + + # Check that Rust and Python align + self.assert_tokenization_python_rust_equals(tokenizer_r, tokenizer_p) + self.assert_num_special_tokens_to_add_equal(tokenizer_r, tokenizer_p) + self.assert_max_length_equal(tokenizer_r, tokenizer_p) + self.assert_special_tokens_map_equal(tokenizer_r, tokenizer_p) + self.assert_embeded_special_tokens(tokenizer_r, tokenizer_p) + self.assert_padding(tokenizer_r, tokenizer_p) + # TODO: enable for v3.0.0 + # self.assert_empty_output_no_special_tokens(tokenizer_r, tokenizer_p) + + def fast_only(self, tokenizer_r): + # Ensure None raise an error + self.assertRaises(ValueError, tokenizer_r.tokenize, None) + self.assertRaises(ValueError, tokenizer_r.encode, None) + self.assertRaises(ValueError, tokenizer_r.encode_plus, None) + self.assertRaises(ValueError, tokenizer_r.batch_encode_plus, None) + + self.assert_add_tokens(tokenizer_r) + self.assert_offsets_mapping(tokenizer_r) + self.assert_add_special_tokens(tokenizer_r) + + def assert_tokenization_python_rust_equals(self, tokenizer_p, tokenizer_r): # Ensure basic input match input_p = tokenizer_p.encode_plus(self._data) input_r = tokenizer_r.encode_plus(self._data) for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assert_sequence_almost_equals(input_p[key], input_r[key], threshold) + self.assertSequenceEqual(input_p[key], input_r[key]) input_pairs_p = tokenizer_p.encode_plus(self._data, self._data) input_pairs_r = tokenizer_r.encode_plus(self._data, self._data) for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assert_sequence_almost_equals(input_pairs_p[key], input_pairs_r[key], threshold) + self.assertSequenceEqual(input_pairs_p[key], input_pairs_r[key]) # Ensure truncation match input_p = tokenizer_p.encode_plus(self._data, max_length=512) input_r = tokenizer_r.encode_plus(self._data, max_length=512) for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assert_sequence_almost_equals(input_p[key], input_r[key], threshold) + self.assertSequenceEqual(input_p[key], input_r[key]) # Ensure 
truncation with stride match input_p = tokenizer_p.encode_plus(self._data, max_length=512, stride=3, return_overflowing_tokens=True) input_r = tokenizer_r.encode_plus(self._data, max_length=512, stride=3, return_overflowing_tokens=True) for key in filter(lambda x: x in ["input_ids", "token_type_ids", "attention_mask"], input_p.keys()): - self.assert_sequence_almost_equals(input_p[key], input_r[key], threshold) - - def assert_padding(self, tokenizer_r, tokenizer_p): - # Simple input - input_r = tokenizer_r.encode("This is a simple input", max_length=15, pad_to_max_length=True) - input_p = tokenizer_p.encode("This is a simple input", max_length=15, pad_to_max_length=True) - - self.assertSequenceEqual(input_r, input_p) - - # Simple input - input_r = tokenizer_r.encode_plus("This is a simple input", max_length=15, pad_to_max_length=True) - input_p = tokenizer_p.encode_plus("This is a simple input", max_length=15, pad_to_max_length=True) - - self.assertSequenceEqual(input_r, input_p) - - # Simple input - # TODO: Re-enable this test when batch_encode_plus with padding correctly handles padding - # input_r = tokenizer_r.batch_encode_plus( - # ["This is a simple input 1", "This is a simple input 2"], max_length=15, pad_to_max_length=True - # ) - # input_p = tokenizer_p.batch_encode_plus( - # ["This is a simple input 1", "This is a simple input 2"], max_length=15, pad_to_max_length=True - # ) - - # self.assertSequenceEqual(input_r, input_p) - - # Pair input - input_r = tokenizer_r.encode("This is a simple input", "This is a pair", max_length=15, pad_to_max_length=True) - input_p = tokenizer_p.encode("This is a simple input", "This is a pair", max_length=15, pad_to_max_length=True) - - self.assertSequenceEqual(input_r, input_p) - - # Pair input - input_r = tokenizer_r.encode_plus( - "This is a simple input", "This is a pair", max_length=15, pad_to_max_length=True - ) - input_p = tokenizer_p.encode_plus( - "This is a simple input", "This is a pair", max_length=15, pad_to_max_length=True + self.assertSequenceEqual(input_p[key], input_r[key]) + + def assert_num_special_tokens_to_add_equal(self, tokenizer_r, tokenizer_p): + # Check we have the same number of added_tokens for both pair and non-pair inputs. + self.assertEqual(tokenizer_r.num_special_tokens_to_add(False), tokenizer_p.num_special_tokens_to_add(False)) + self.assertEqual(tokenizer_r.num_special_tokens_to_add(True), tokenizer_p.num_special_tokens_to_add(True)) + + def assert_max_length_equal(self, tokenizer_r, tokenizer_p): + # Check we have the correct max_length for both pair and non-pair inputs. + self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) + self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) + + def assert_special_tokens_map_equal(self, tokenizer_r, tokenizer_p): + # Assert the set of special tokens match. 
+ self.assertSequenceEqual( + tokenizer_p.special_tokens_map.items(), tokenizer_r.special_tokens_map.items(), ) - self.assertSequenceEqual(input_r, input_p) - - # Pair input - # TODO: Re-enable this test when batch_encode_plus with padding correctly handles padding - # input_r = tokenizer_r.batch_encode_plus( - # ["This is a simple input 1", "This is a simple input 2"], - # ["This is a simple pair 1", "This is a simple pair 2"], - # max_length=15, - # pad_to_max_length=True, - # ) - # input_p = tokenizer_p.batch_encode_plus( - # ["This is a simple input 1", "This is a simple input 2"], - # ["This is a simple pair 1", "This is a simple pair 2"], - # max_length=15, - # pad_to_max_length=True, - # ) - - # self.assertSequenceEqual(input_r, input_p) - def assert_add_tokens(self, tokenizer_r): vocab_size = tokenizer_r.vocab_size self.assertEqual(tokenizer_r.add_tokens(""), 0) @@ -150,34 +145,34 @@ def assert_add_tokens(self, tokenizer_r): ) self.assertEqual(len(tokenizer_r), vocab_size + 6) - def assert_offsets_mapping(self, tokenizer): + def assert_offsets_mapping(self, tokenizer_r): text = "Wonderful no inspiration example with subtoken" pair = "Along with an awesome pair" # No pair - tokens_with_offsets = tokenizer.encode_plus(text, return_special_tokens_mask=True, return_offsets_mapping=True) - added_tokens = tokenizer.num_added_tokens(False) + tokens_with_offsets = tokenizer_r.encode_plus( + text, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True + ) + added_tokens = tokenizer_r.num_special_tokens_to_add(False) offsets = tokens_with_offsets["offset_mapping"] # Assert there is the same number of tokens and offsets self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) # Assert there is online added_tokens special_tokens - self.assertEqual(sum([0 if x else 1 for x in offsets]), added_tokens) self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) # Pairs - tokens_with_offsets = tokenizer.encode_plus( - text, pair, return_special_tokens_mask=True, return_offsets_mapping=True + tokens_with_offsets = tokenizer_r.encode_plus( + text, pair, return_special_tokens_mask=True, return_offsets_mapping=True, add_special_tokens=True ) - added_tokens = tokenizer.num_added_tokens(True) + added_tokens = tokenizer_r.num_special_tokens_to_add(True) offsets = tokens_with_offsets["offset_mapping"] # Assert there is the same number of tokens and offsets self.assertEqual(len(offsets), len(tokens_with_offsets["input_ids"])) # Assert there is online added_tokens special_tokens - self.assertEqual(sum([0 if x else 1 for x in offsets]), added_tokens) self.assertEqual(sum(tokens_with_offsets["special_tokens_mask"]), added_tokens) def assert_batch_encode_dynamic_overflowing(self, tokenizer: PreTrainedTokenizer): @@ -258,281 +253,273 @@ def assert_build_inputs_with_special_tokens(self, tokenizer_r, tokenizer_p): output_p = tokenizer_p.build_inputs_with_special_tokens(input_simple, input_pair) self.assertEqual(output_p, output_r) - def assert_save_pretrained(self, tokenizer_r, tokenizer_p): - - # Checks it save with the same files - self.assertSequenceEqual(tokenizer_r.save_vocabulary("."), tokenizer_p.save_vocabulary(".")) - - # Checks everything loads correctly in the same way - tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained("."), tokenizer_p.from_pretrained(".") - - # Check special tokens are set accordingly on Rust and Python - for key in tokenizer_pp.special_tokens_map: - self.assertTrue(hasattr(tokenizer_rp, key)) - # 
self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) - # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) - - def test_bert(self): - for tokenizer_name in BertTokenizer.pretrained_vocab_files_map["vocab_file"].keys(): - tokenizer_p = BertTokenizer.from_pretrained(tokenizer_name) - tokenizer_r = BertTokenizerFast.from_pretrained(tokenizer_name) - - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False)) - self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True)) - - # Check we have the correct max_length for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) - self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) - - # Assert the set of special tokens match. - self.assertSequenceEqual( - tokenizer_p.special_tokens_map.items(), - tokenizer_r.special_tokens_map.items(), - "Bert tokenizers doesn't have the same set of special_tokens", - ) - - # Assure tokenization overlap between python and rust impl. - self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0) - - # Ensure add_tokens and add_special_tokens return the correct vocab size - self.assert_add_tokens(tokenizer_r) - - # Check for offsets mapping - self.assert_offsets_mapping(tokenizer_r) - - # Check for dynamic encoding sequence handling in batch_encode_plus - self.assert_batch_encode_dynamic_overflowing(tokenizer_r) - - # Check alignment for build_inputs_with_special_tokens - self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p) - - # Check the number of returned files for save_vocabulary - self.assert_save_pretrained(tokenizer_r, tokenizer_p) - - # Check for padding - self.assert_padding(tokenizer_r, tokenizer_p) - - @require_torch - def test_transfoxl(self): - for tokenizer_name in TransfoXLTokenizer.pretrained_vocab_files_map["pretrained_vocab_file"].keys(): - tokenizer_p = TransfoXLTokenizer.from_pretrained(tokenizer_name) - tokenizer_r = TransfoXLTokenizerFast.from_pretrained(tokenizer_name) - - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False)) - self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True)) - - # Check we have the correct max_length for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) - self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) - - # Assert the set of special tokens match. - self.assertSequenceEqual( - tokenizer_p.special_tokens_map.items(), - tokenizer_r.special_tokens_map.items(), - "TransfoXL tokenizers doesn't have the same set of special_tokens", - ) - - # Assure tokenization overlap between python and rust impl. 
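# --- Illustrative standalone sketch (not part of the patch) -----------------
# "Tokenization overlap between python and rust impl." above means both
# backends should produce the same token ids for the same text. A standalone
# version of that check, assuming the ~v2.8 API and a WordPiece model where
# the match is expected to be exact:
from transformers import BertTokenizer, BertTokenizerFast

tokenizer_p = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-uncased")

text = "A sample sentence to compare both implementations."
ids_p = tokenizer_p.encode(text, add_special_tokens=True)
ids_r = tokenizer_r.encode(text, add_special_tokens=True)

assert ids_p == ids_r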
- self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0) - - # Ensure add_tokens and add_special_tokens return the correct vocab size - self.assert_add_tokens(tokenizer_r) - - # Check for offsets mapping - self.assert_offsets_mapping(tokenizer_r) - - # Check for dynamic encoding sequence handling in batch_encode_plus - self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r) - - # Check alignment for build_inputs_with_special_tokens - self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p) - - # Check for padding - self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p) - - # Check the number of returned files for save_vocabulary - # TransfoXL tokenizers comes in a special format which is not compatible at all - # with rust tokenizers. We ensure the errors detection at correctly raised - tokenizer_r_files = tokenizer_r.save_pretrained(".") - self.assertSequenceEqual( - tokenizer_r_files, ["./vocab.json", "./special_tokens_map.json", "./added_tokens.json"] - ) - - # Check loading Python-tokenizer save through Rust doesnt work (and the opposite) - self.assertRaises(ValueError, tokenizer_p.from_pretrained, *tokenizer_r_files) - self.assertRaises(ValueError, tokenizer_r.from_pretrained, *tokenizer_p.save_pretrained(".")) + def assert_padding(self, tokenizer_r, tokenizer_p, max_length=15): + def assert_padded_input_match(input_r: list, input_p: list, max_length: int): - # Check loading works for Python to Python and Rust to Rust - # Issue: https://github.com/huggingface/transformers/issues/3000 - # self.assertIsNotNone(tokenizer_p.__class__.from_pretrained('./')) - self.assertIsNotNone(tokenizer_r.__class__.from_pretrained("./")) + # Ensure we match max_length + self.assertEqual(len(input_r), max_length), self.assertEqual(len(input_p), max_length) - def test_distilbert(self): - for tokenizer_name in DistilBertTokenizer.pretrained_vocab_files_map["vocab_file"].keys(): - tokenizer_p = DistilBertTokenizer.from_pretrained(tokenizer_name) - tokenizer_r = DistilBertTokenizerFast.from_pretrained(tokenizer_name) - - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False)) - self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True)) + # Ensure the number of padded tokens is the same + padded_tokens_r = list(takewhile(lambda i: i == tokenizer_r.pad_token_id, reversed(input_r))) + padded_tokens_p = list(takewhile(lambda i: i == tokenizer_p.pad_token_id, reversed(input_p))) + self.assertSequenceEqual(padded_tokens_r, padded_tokens_p) - # Check we have the correct max_length for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) - self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) + def assert_batch_padded_input_match(input_r: dict, input_p: dict): + for i_r in input_r.values(): + self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), 15), self.assertEqual(len(i_r[1]), 15) + self.assertEqual(len(i_r), 2), self.assertEqual(len(i_r[0]), 15), self.assertEqual(len(i_r[1]), 15) - # DistilBert should match 100% - # Assert the set of special tokens match. 
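# --- Illustrative standalone sketch (not part of the patch) -----------------
# assert_padded_input_match (defined above) checks that both tokenizers pad
# to the requested length and append the same run of trailing pad ids. The
# core trick is itertools.takewhile over the reversed sequence. A
# self-contained toy version on made-up id lists (pad id 0 is an assumption):
from itertools import takewhile

PAD_ID = 0  # hypothetical pad token id, for the sake of the example


def trailing_padding(ids, pad_id=PAD_ID):
    """Return the run of pad ids at the end of an encoded sequence."""
    return list(takewhile(lambda i: i == pad_id, reversed(ids)))


ids_rust = [101, 2023, 2003, 102, 0, 0, 0]
ids_python = [101, 2023, 2003, 102, 0, 0, 0]

# Both encodings reach the requested max_length ...
assert len(ids_rust) == len(ids_python) == 7
# ... and end with the same amount of padding
assert trailing_padding(ids_rust) == trailing_padding(ids_python) == [0, 0, 0]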
- self.assertSequenceEqual( - tokenizer_p.special_tokens_map.items(), - tokenizer_r.special_tokens_map.items(), - "DistilBert tokenizers doesn't have the same set of special_tokens", - ) + for i_r, i_p in zip(input_r["input_ids"], input_p["input_ids"]): + assert_padded_input_match(i_r, i_p, max_length) - # Assure tokenization overlap between python and rust impl. - self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0) + for i_r, i_p in zip(input_r["attention_mask"], input_p["attention_mask"]): + self.assertSequenceEqual(i_r, i_p) - # Ensure add_tokens and add_special_tokens return the correct vocab size - self.assert_add_tokens(tokenizer_r) - - # Check for offsets mapping - self.assert_offsets_mapping(tokenizer_r) - - # Check for dynamic encoding sequence handling in batch_encode_plus - self.assert_batch_encode_dynamic_overflowing(tokenizer_r) - - # Check alignment for build_inputs_with_special_tokens - self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p) - - # Check the number of returned files for save_vocabulary - self.assert_save_pretrained(tokenizer_r, tokenizer_p) - - # Check for padding - self.assert_padding(tokenizer_r, tokenizer_p) - - def test_gpt2(self): - for tokenizer_name in GPT2Tokenizer.pretrained_vocab_files_map["vocab_file"].keys(): - tokenizer_p = GPT2Tokenizer.from_pretrained(tokenizer_name) - tokenizer_r = GPT2TokenizerFast.from_pretrained(tokenizer_name) + # Simple input + input_r = tokenizer_r.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode("This is a simple input", max_length=max_length, pad_to_max_length=True) + assert_padded_input_match(input_r, input_p, max_length) - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False)) - self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True)) + # Pair input + input_r = tokenizer_r.encode( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + assert_padded_input_match(input_r, input_p, max_length) - # Check we have the correct max_length for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) - self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) + # Simple input + input_r = tokenizer_r.encode_plus("This is a simple input", max_length=max_length, pad_to_max_length=True) + input_p = tokenizer_p.encode_plus("This is a simple input", max_length=max_length, pad_to_max_length=True) + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - # Assert the set of special tokens match. 
- self.assertSequenceEqual( - tokenizer_p.special_tokens_map.items(), - tokenizer_r.special_tokens_map.items(), - "GPT2 tokenizers doesn't have the same set of special_tokens", - ) + # Pair input + input_r = tokenizer_r.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.encode_plus( + "This is a simple input", "This is a pair", max_length=max_length, pad_to_max_length=True + ) + assert_padded_input_match(input_r["input_ids"], input_p["input_ids"], max_length) + self.assertSequenceEqual(input_r["attention_mask"], input_p["attention_mask"]) - # Assure tokenization overlap between python and rust impl. - self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0) + # Simple input + # TODO: Re-enable this test when batch_encode_plus with padding correctly handles padding + input_r = tokenizer_r.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, pad_to_max_length=True + ) + input_p = tokenizer_p.batch_encode_plus( + ["This is a simple input 1", "This is a simple input 2"], max_length=max_length, pad_to_max_length=True + ) + assert_batch_padded_input_match(input_r, input_p) - # Ensure add_tokens and add_special_tokens return the correct vocab size - self.assert_add_tokens(tokenizer_r) + # Pair input + # TODO: Re-enable this test when batch_encode_plus with padding correctly handles padding + input_r = tokenizer_r.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + max_length=15, + pad_to_max_length=True, + ) + input_p = tokenizer_p.batch_encode_plus( + [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ], + max_length=15, + pad_to_max_length=True, + ) + assert_batch_padded_input_match(input_r, input_p) - # Check for offsets mapping - self.assert_offsets_mapping(tokenizer_r) + def assert_save_pretrained(self, tokenizer_r, tokenizer_p): + # Checks it save with the same files + self.assertSequenceEqual(tokenizer_r.save_vocabulary("."), tokenizer_p.save_vocabulary(".")) - # Check for dynamic encoding sequence handling in batch_encode_plus - self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r) + # Checks everything loads correctly in the same way + tokenizer_rp, tokenizer_pp = tokenizer_r.from_pretrained("."), tokenizer_p.from_pretrained(".") - # Check alignment for build_inputs_with_special_tokens - self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p) + # Check special tokens are set accordingly on Rust and Python + for key in tokenizer_pp.special_tokens_map: + self.assertTrue(hasattr(tokenizer_rp, key)) + # self.assertEqual(getattr(tokenizer_rp, key), getattr(tokenizer_pp, key)) + # self.assertEqual(getattr(tokenizer_rp, key + "_id"), getattr(tokenizer_pp, key + "_id")) - # Check the number of returned files for save_vocabulary - self.assert_save_pretrained(tokenizer_r, tokenizer_p) + def assert_embeded_special_tokens(self, tokenizer_r, tokenizer_p): + sentence = "A, AllenNLP sentence." 
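# --- Illustrative standalone sketch (not part of the patch) -----------------
# assert_save_pretrained saves both tokenizers and reloads them, then checks
# the special tokens survive the round trip. A standalone sketch, assuming
# the ~v2.8 API and a temporary directory instead of the "." used in the test:
import tempfile

from transformers import BertTokenizer, BertTokenizerFast

tokenizer_p = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-uncased")

with tempfile.TemporaryDirectory() as tmp_dir:
    tokenizer_p.save_pretrained(tmp_dir)
    reloaded_p = BertTokenizer.from_pretrained(tmp_dir)

with tempfile.TemporaryDirectory() as tmp_dir:
    tokenizer_r.save_pretrained(tmp_dir)
    reloaded_r = BertTokenizerFast.from_pretrained(tmp_dir)

# Special tokens (cls_token, sep_token, pad_token, ...) should be preserved
# on both the Python and the Rust side after reloading
for key in tokenizer_p.special_tokens_map:
    assert getattr(reloaded_p, key) == getattr(tokenizer_p, key)
    assert hasattr(reloaded_r, key)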
+ tokens_r = tokenizer_r.encode_plus( + sentence, add_special_tokens=True, return_attention_mask=False, return_token_type_ids=True + ) + tokens_p = tokenizer_p.encode_plus( + sentence, add_special_tokens=True, return_attention_mask=False, return_token_type_ids=True + ) - # Check for padding - self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p) + for key in tokens_p.keys(): + self.assertEqual(tokens_r[key], tokens_p[key]) + + self.assertEqual(sum(tokens_r["token_type_ids"]), 0) + self.assertEqual(sum(tokens_p["token_type_ids"]), 0) + + tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + tokens_p = tokenizer_p.convert_ids_to_tokens(tokens_p["input_ids"]) + self.assertSequenceEqual(tokens_r, tokens_p) + + def assert_add_special_tokens(self, tokenizer_r): + simple_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=False) + # pair_num_special_tokens_to_add = tokenizer_r.num_special_tokens_to_add(pair=True) + + for text in ["", " "]: + # tokenize() + no_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.tokenize(text, add_special_tokens=True) + self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) + + # encode() + no_special_tokens = tokenizer_r.encode(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode(text, add_special_tokens=True) + self.assertEqual(len(no_special_tokens), len(with_special_tokens) - simple_num_special_tokens_to_add) + + # encode_plus() + no_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=False) + with_special_tokens = tokenizer_r.encode_plus(text, add_special_tokens=True) + for key in no_special_tokens.keys(): + self.assertEqual( + len(no_special_tokens[key]), len(with_special_tokens[key]) - simple_num_special_tokens_to_add + ) + + # # batch_encode_plus + no_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=False) + with_special_tokens = tokenizer_r.batch_encode_plus([text, text], add_special_tokens=True) + for key in no_special_tokens.keys(): + for i_no, i_with in zip(no_special_tokens[key], with_special_tokens[key]): + self.assertEqual(len(i_no), len(i_with) - simple_num_special_tokens_to_add) + + +class WordPieceFastTokenizerTest(CommonFastTokenizerTest): + """ + Override all the specific methods to test WordPiece behavior + """ + + TOKENIZERS_CLASSES = frozenset( + [ + Tokenizer("Bert", BertTokenizerFast, BertTokenizer, "vocab_file", filter_non_english), + Tokenizer("DistilBert", DistilBertTokenizerFast, DistilBertTokenizer, "vocab_file", filter_non_english), + ] + ) + + def fast_only(self, tokenizer_r): + super().fast_only(tokenizer_r) + self.assert_offsets_with_special_characters(tokenizer_r) + + def assert_add_special_tokens(self, tokenizer_r): + super().assert_add_special_tokens(tokenizer_r) + + def assert_offsets_with_special_characters(self, tokenizer_r): + sentence = "A, naïve [MASK] AllenNLP sentence." 
+ tokens = tokenizer_r.encode_plus( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) - def test_roberta(self): - for tokenizer_name in RobertaTokenizer.pretrained_vocab_files_map["vocab_file"].keys(): - tokenizer_p = RobertaTokenizer.from_pretrained(tokenizer_name) - tokenizer_r = RobertaTokenizerFast.from_pretrained(tokenizer_name) + expected_results = [ + ((0, 1), "A"), + ((1, 2), ","), + ((3, 8), "naive"), # BERT normalizes this away + # Append MASK here after lower-casing + ((16, 21), "Allen"), + ((22, 24), "##NL"), + ((24, 25), "##P"), + ((26, 34), "sentence"), + ((35, 36), "."), + ] - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False)) - self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True)) + # Check if the tokenizer is uncased + if tokenizer_r.init_kwargs.get("do_lower_case"): + expected_results = [(offset, token.lower()) for (offset, token) in expected_results] - # Check we have the correct max_length for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) - self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) + # Append the special tokens + expected_results.insert(3, ((9, 15), "[MASK]")) + expected_results.insert(0, (None, "[CLS]")) + expected_results.append((None, "[SEP]")) - # Assert the set of special tokens match. - self.assertSequenceEqual( - tokenizer_p.special_tokens_map.items(), - tokenizer_r.special_tokens_map.items(), - "Roberta tokenizers doesn't have the same set of special_tokens", - ) + self.assertEqual([e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])) + # self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) - # Assure tokenization overlap between python and rust impl. - self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.01) - # Ensure add_tokens and add_special_tokens return the correct vocab size - self.assert_add_tokens(tokenizer_r) +class RobertaFastTokenizerTest(CommonFastTokenizerTest): + TOKENIZERS_CLASSES = frozenset( + [Tokenizer("Roberta", RobertaTokenizerFast, RobertaTokenizer, "vocab_file", filter_roberta_detectors)] + ) - # Check for offsets mapping - self.assert_offsets_mapping(tokenizer_r) + def assert_embeded_special_tokens(self, tokenizer_r, tokenizer_p): + sentence = "A, AllenNLP sentence." 
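# --- Illustrative standalone sketch (not part of the patch) -----------------
# The expected offsets above index into the *original* string, even where the
# BERT normalizer rewrites it ("naïve" is normalized to "naive" but still maps
# to span (3, 8)). Slicing the input with each offset makes that visible.
# Assumes the ~v2.8 fast-tokenizer API; depending on the version, special
# tokens map to None or to an empty span.
from transformers import BertTokenizerFast

tokenizer_r = BertTokenizerFast.from_pretrained("bert-base-uncased")
sentence = "A, naïve [MASK] AllenNLP sentence."

enc = tokenizer_r.encode_plus(sentence, return_offsets_mapping=True, add_special_tokens=True)
tokens = tokenizer_r.convert_ids_to_tokens(enc["input_ids"])

for token, offset in zip(tokens, enc["offset_mapping"]):
    # Guard against None offsets for special tokens such as [CLS]/[SEP]
    span = sentence[offset[0]:offset[1]] if offset else ""
    print(token, repr(span))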
+ tokens_r = tokenizer_r.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) + tokens_p = tokenizer_p.encode_plus(sentence, add_special_tokens=True, return_token_type_ids=True) - # Check for dynamic encoding sequence handling in batch_encode_plus - self.assert_batch_encode_dynamic_overflowing(tokenizer_r) + # Rust correctly handles the space before the mask while python doesnt + self.assertSequenceEqual(tokens_r["input_ids"], [0, 83, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) + self.assertSequenceEqual(tokens_p["input_ids"], [0, 83, 6, 50264, 3823, 487, 21992, 3645, 4, 2]) - # Check alignment for build_inputs_with_special_tokens - self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p) + # token_type_ids should put 0 everywhere + self.assertEquals(sum(tokens_r["token_type_ids"]), sum(tokens_p["token_type_ids"])) - # Check the number of returned files for save_vocabulary - self.assert_save_pretrained(tokenizer_r, tokenizer_p) + # attention_mask should put 1 everywhere, so sum over length should be 1 + self.assertEquals( + sum(tokens_r["attention_mask"]) / len(tokens_r["attention_mask"]), + sum(tokens_p["attention_mask"]) / len(tokens_p["attention_mask"]), + ) - # Check for padding - # TODO: Re-enable this test as soon as Roberta align with the python tokenizer. - # self.assert_padding(tokenizer_r, tokenizer_p) + # Rust should have 'Ġ' before which should be left as an entire token + tokens_r = tokenizer_r.convert_ids_to_tokens(tokens_r["input_ids"]) + self.assertSequenceEqual(tokens_r, ["", "ĠA", ",", "", "ĠAllen", "N", "LP", "Ġsentence", ".", ""]) - def test_openai(self): - for tokenizer_name in OpenAIGPTTokenizer.pretrained_vocab_files_map["vocab_file"].keys(): - tokenizer_p = OpenAIGPTTokenizer.from_pretrained(tokenizer_name) - tokenizer_r = OpenAIGPTTokenizerFast.from_pretrained(tokenizer_name) - # Check we have the same number of added_tokens for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False)) - self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True)) +class NoPaddingTokenFastTokenizerMatchingTest(CommonFastTokenizerTest): + TOKENIZERS_CLASSES = [ + Tokenizer("OpenAI GPT", OpenAIGPTTokenizerFast, OpenAIGPTTokenizer, "vocab_file", None), + Tokenizer("GPT2", GPT2TokenizerFast, GPT2Tokenizer, "vocab_file", None), + ] - # Check we have the correct max_length for both pair and non-pair inputs. - self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence) - self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair) + def assert_padding(self, tokenizer_r, tokenizer_p, max_length=15): + # Simple input + s = "This is a simple input" + s2 = ["This is a simple input 1", "This is a simple input 2"] + p = ("This is a simple input", "This is a pair") + p2 = [ + ("This is a simple input 1", "This is a simple input 2"), + ("This is a simple pair 1", "This is a simple pair 2"), + ] - # Assert the set of special tokens match. - self.assertSequenceEqual( - tokenizer_p.special_tokens_map.items(), - tokenizer_r.special_tokens_map.items(), - "GPT tokenizers doesn't have the same set of special_tokens", - ) + # Simple input tests + self.assertRaises(ValueError, tokenizer_r.encode, s, max_length=max_length, pad_to_max_length=True) - # Assure tokenization overlap between python and rust impl. 
- self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0) + # Simple input + self.assertRaises(ValueError, tokenizer_r.encode_plus, s, max_length=max_length, pad_to_max_length=True) - # Ensure add_tokens and add_special_tokens return the correct vocab size - self.assert_add_tokens(tokenizer_r) + # Simple input + self.assertRaises(ValueError, tokenizer_r.batch_encode_plus, s2, max_length=max_length, pad_to_max_length=True) - # Check for offsets mapping - self.assert_offsets_mapping(tokenizer_r) + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode, p, max_length=max_length, pad_to_max_length=True) - # Check for dynamic encoding sequence handling in batch_encode_plus - self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r) + # Pair input + self.assertRaises(ValueError, tokenizer_r.encode_plus, p, max_length=max_length, pad_to_max_length=True) - # Check alignment for build_inputs_with_special_tokens - self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p) + # Pair input + self.assertRaises(ValueError, tokenizer_r.batch_encode_plus, p2, max_length=max_length, pad_to_max_length=True) - self.assertEqual(len(tokenizer_r.save_vocabulary(".")), len(tokenizer_p.save_vocabulary("."))) - # Check for padding - self.assertRaises(ValueError, self.assert_padding, tokenizer_r, tokenizer_p) +class TransfoXLFastTokenizerTest(NoPaddingTokenFastTokenizerMatchingTest): + TOKENIZERS_CLASSES = frozenset( + [Tokenizer("TransfoXL", TransfoXLTokenizerFast, TransfoXLTokenizer, "pretrained_vocab_file", None)] + ) - # Check the number of returned files for save_vocabulary - self.assert_save_pretrained(tokenizer_r, tokenizer_p) + @require_torch + def test_all_tokenizers(self): + super().test_all_tokenizers() diff --git a/tests/test_tokenization_gpt2.py b/tests/test_tokenization_gpt2.py index 12b7b0eeb167..c2e34e59d544 100644 --- a/tests/test_tokenization_gpt2.py +++ b/tests/test_tokenization_gpt2.py @@ -94,7 +94,7 @@ def test_rust_and_python_full_tokenizers(self): return tokenizer = self.get_tokenizer() - rust_tokenizer = self.get_rust_tokenizer(add_special_tokens=False, add_prefix_space=True) + rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True) sequence = "lower newer" @@ -105,7 +105,7 @@ def test_rust_and_python_full_tokenizers(self): # Testing conversion to ids without special tokens ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True) - rust_ids = rust_tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) self.assertListEqual(ids, rust_ids) # Testing conversion to ids with special tokens
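# --- Illustrative standalone sketch (not part of the patch) -----------------
# NoPaddingTokenFastTokenizerMatchingTest covers tokenizers (OpenAI GPT,
# GPT-2, TransfoXL) that ship without a padding token, so asking the fast
# tokenizer to pad is expected to fail, which is what the assertRaises calls
# above check. A standalone sketch, assuming the ~v2.8 API used in this patch
# and the "gpt2" checkpoint:
from transformers import GPT2TokenizerFast

tokenizer_r = GPT2TokenizerFast.from_pretrained("gpt2")
assert tokenizer_r.pad_token is None  # no [PAD]-style token is defined

try:
    tokenizer_r.encode("This is a simple input", max_length=15, pad_to_max_length=True)
except ValueError as err:
    # The behaviour the tests assert with assertRaises(ValueError, ...)
    print("padding rejected as expected:", err)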