From e68583f3c51b70806aa20b7773c8080987f8eac8 Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Thu, 26 Aug 2021 10:36:33 +0800
Subject: [PATCH 01/98] REALM initial commit

---
 docs/source/model_doc/realm.rst               |  102 ++
 src/transformers/__init__.py                  |   34 +
 src/transformers/models/__init__.py           |    1 +
 .../models/auto/configuration_auto.py         |    3 +
 src/transformers/models/auto/modeling_auto.py |    8 +
 src/transformers/models/realm/__init__.py     |   72 +
 .../models/realm/configuration_realm.py       |  131 ++
 .../models/realm/modeling_realm.py            | 1543 +++++++++++++++++
 .../models/realm/tokenization_realm.py        |   54 +
 .../models/realm/tokenization_realm_fast.py   |   56 +
 tests/test_modeling_realm.py                  |  481 +++++
 11 files changed, 2485 insertions(+)
 create mode 100644 docs/source/model_doc/realm.rst
 create mode 100644 src/transformers/models/realm/__init__.py
 create mode 100644 src/transformers/models/realm/configuration_realm.py
 create mode 100644 src/transformers/models/realm/modeling_realm.py
 create mode 100644 src/transformers/models/realm/tokenization_realm.py
 create mode 100644 src/transformers/models/realm/tokenization_realm_fast.py
 create mode 100644 tests/test_modeling_realm.py

diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst
new file mode 100644
index 000000000000..2a11b0bdaa70
--- /dev/null
+++ b/docs/source/model_doc/realm.rst
@@ -0,0 +1,102 @@
+..
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+REALM
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The REALM model was proposed in `REALM: Retrieval-Augmented Language Model Pre-Training
+<https://arxiv.org/abs/2002.08909>`__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
+
+The abstract from the paper is the following:
+
+*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks
+such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network,
+requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we
+augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend
+over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the
+first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language
+modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We
+demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the
+challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both
+explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous
+methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as
+interpretability and modularity.*
+
+This model was contributed by `qqaatw <https://huggingface.co/qqaatw>`__. The original code can be found `here
+<https://github.com/google-research/language/tree/master/language/realm>`__.
+
+REALMConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.REALMConfig
+    :members:
+
+
+REALMTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.REALMTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+REALMTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.REALMTokenizerFast
+    :members:
+
+
+REALMModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.REALMModel
+    :members: forward
+
+
+REALMForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. 
autoclass:: transformers.REALMForCausalLM + :members: forward + + +REALMForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.REALMForMaskedLM + :members: forward + + +REALMForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.REALMForSequenceClassification + :members: forward + + +REALMForMultipleChoice +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.REALMForMultipleChoice + :members: forward + + +REALMForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.REALMForTokenClassification + :members: forward + + +REALMForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.REALMForQuestionAnswering + :members: forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index c5e22b1c718d..6553e25d6ab1 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -133,6 +133,7 @@ "load_tf2_weights_in_pytorch_model", ], # Models + "models.realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "REALMConfig", "REALMTokenizer"], "models": [], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.auto": [ @@ -337,6 +338,7 @@ # tokenizers-backed objects if is_tokenizers_available(): # Fast tokenizers + _import_structure["models.realm"].append("REALMTokenizerFast") _import_structure["models.roformer"].append("RoFormerTokenizerFast") _import_structure["models.clip"].append("CLIPTokenizerFast") _import_structure["models.convbert"].append("ConvBertTokenizerFast") @@ -501,6 +503,22 @@ _import_structure["modeling_utils"] = ["Conv1D", "PreTrainedModel", "apply_chunking_to_forward", "prune_layer"] # PyTorch models structure + + _import_structure["models.realm"].extend( + [ + "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", + "REALMForMaskedLM", + "REALMForCausalLM", + "REALMForMultipleChoice", + "REALMForQuestionAnswering", + "REALMForSequenceClassification", + "REALMForTokenClassification", + "REALMLayer", + "REALMModel", + "REALMPreTrainedModel", + "load_tf_weights_in_realm", + ] + ) _import_structure["models.albert"].extend( [ "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -1836,6 +1854,7 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig + from .models.realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, REALMConfig, REALMTokenizer from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, @@ -2027,6 +2046,7 @@ from .utils.dummy_sentencepiece_objects import * if is_tokenizers_available(): + from .models.realm import REALMTokenizerFast from .models.albert import AlbertTokenizerFast from .models.bart import BartTokenizerFast from .models.barthez import BarthezTokenizerFast @@ -2107,6 +2127,20 @@ from .utils.dummy_timm_objects import * if is_torch_available(): + + from .models.realm import ( + REALM_PRETRAINED_MODEL_ARCHIVE_LIST, + REALMForMaskedLM, + REALMForCausalLM, + REALMForMultipleChoice, + REALMForQuestionAnswering, + REALMForSequenceClassification, + REALMForTokenClassification, + 
REALMLayer,
+        REALMModel,
+        REALMPreTrainedModel,
+        load_tf_weights_in_realm,
+    )
     # Benchmarks
     from .benchmark.benchmark import PyTorchBenchmark
     from .benchmark.benchmark_args import PyTorchBenchmarkArguments
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 7059fb539ac7..1d62e74bb2f1 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -17,6 +17,7 @@
 # limitations under the License.

 from . import (
+    realm,
     albert,
     auto,
     bart,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index fa42da76f517..6503de0ea099 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -26,6 +26,7 @@
 CONFIG_MAPPING_NAMES = OrderedDict(
     [
         # Add configs here
+        ("realm", "REALMConfig"),
         ("beit", "BeitConfig"),
         ("rembert", "RemBertConfig"),
         ("visual_bert", "VisualBertConfig"),
@@ -95,6 +96,7 @@
 CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
     [
         # Add archive maps here
+        ("realm", "REALM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("rembert", "REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -156,6 +158,7 @@
 MODEL_NAMES_MAPPING = OrderedDict(
     [
         # Add full (and cased) model names here
+        ("realm", "REALM"),
         ("beit", "BeiT"),
         ("rembert", "RemBERT"),
         ("visual_bert", "VisualBert"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index c851d70ceb0c..a36656ad34a2 100644
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -28,6 +28,7 @@
 MODEL_MAPPING_NAMES = OrderedDict(
     [
         # Base model mapping
+        ("realm", "REALMModel"),
         ("beit", "BeitModel"),
         ("rembert", "RemBertModel"),
         ("visual_bert", "VisualBertModel"),
@@ -134,6 +135,7 @@
 MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
     [
         # Model with LM heads mapping
+        ("realm", "REALMForMaskedLM"),
         ("rembert", "RemBertForMaskedLM"),
         ("roformer", "RoFormerForMaskedLM"),
         ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"),
@@ -182,6 +184,7 @@
 MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
     [
         # Model for Causal LM mapping
+        ("realm", "REALMForCausalLM"),
         ("rembert", "RemBertForCausalLM"),
         ("roformer", "RoFormerForCausalLM"),
         ("bigbird_pegasus", "BigBirdPegasusForCausalLM"),
@@ -223,6 +226,7 @@
 MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
     [
         # Model for Masked LM mapping
+        ("realm", "REALMForMaskedLM"),
         ("rembert", "RemBertForMaskedLM"),
         ("roformer", "RoFormerForMaskedLM"),
         ("big_bird", "BigBirdForMaskedLM"),
@@ -285,6 +289,7 @@
 MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Sequence Classification mapping
+        ("realm", "REALMForSequenceClassification"),
         ("rembert", "RemBertForSequenceClassification"),
         ("canine", "CanineForSequenceClassification"),
         ("roformer", "RoFormerForSequenceClassification"),
@@ -327,6 +332,7 @@
 MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
     [
         # Model for Question Answering mapping
+        ("realm", "REALMForQuestionAnswering"),
         ("rembert", "RemBertForQuestionAnswering"),
         ("canine", "CanineForQuestionAnswering"),
         ("roformer", "RoFormerForQuestionAnswering"),
@@ -371,6 +377,7 @@
 MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
     [
         # Model for Token Classification mapping
+        ("realm", "REALMForTokenClassification"),
         ("rembert", "RemBertForTokenClassification"),
         ("canine", 
"CanineForTokenClassification"), ("roformer", "RoFormerForTokenClassification"), @@ -402,6 +409,7 @@ MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( [ # Model for Multiple Choice mapping +("realm", "REALMForMultipleChoice"), ("rembert", "RemBertForMultipleChoice"), ("canine", "CanineForMultipleChoice"), ("roformer", "RoFormerForMultipleChoice"), diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py new file mode 100644 index 000000000000..cd591cb90bf0 --- /dev/null +++ b/src/transformers/models/realm/__init__.py @@ -0,0 +1,72 @@ +# flake8: noqa +# There's no way to ignore "F401 '...' imported but unused" warnings in this +# module, but to preserve other warnings. So, don't check this module at all. + +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING +from ...file_utils import _LazyModule, is_torch_available, is_tokenizers_available +_import_structure = { + "configuration_realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "REALMConfig"], + "tokenization_realm": ["REALMTokenizer"], +} + +if is_tokenizers_available(): + _import_structure["tokenization_realm_fast"] = ["REALMTokenizerFast"] + +if is_torch_available(): + _import_structure["modeling_realm"] = [ + "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", + "REALMForMaskedLM", + "REALMForCausalLM", + "REALMForMultipleChoice", + "REALMForQuestionAnswering", + "REALMForSequenceClassification", + "REALMForTokenClassification", + "REALMLayer", + "REALMModel", + "REALMPreTrainedModel", + "load_tf_weights_in_realm", + ] + + + + +if TYPE_CHECKING: + from .configuration_realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, REALMConfig + from .tokenization_realm import REALMTokenizer + + if is_tokenizers_available(): + from .tokenization_realm_fast import REALMTokenizerFast + + if is_torch_available(): + from .modeling_realm import ( + REALM_PRETRAINED_MODEL_ARCHIVE_LIST, + REALMForMaskedLM, + REALMForCausalLM, + REALMForMultipleChoice, + REALMForQuestionAnswering, + REALMForSequenceClassification, + REALMForTokenClassification, + REALMLayer, + REALMModel, + REALMPreTrainedModel, + load_tf_weights_in_realm, + ) + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py new file mode 100644 index 000000000000..dadccc721537 --- /dev/null +++ b/src/transformers/models/realm/configuration_realm.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" REALM model configuration """
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+REALM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "realm-cc-news": "https://huggingface.co/realm-cc-news/resolve/main/config.json",
+    # See all REALM models at https://huggingface.co/models?filter=realm
+}
+
+
+class REALMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.REALMModel`.
+    It is used to instantiate a REALM model according to the specified arguments, defining the model
+    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
+    the REALM `realm-cc-news <https://huggingface.co/realm-cc-news>`__ architecture.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
+    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
+    for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.REALMModel` or
+            :class:`~transformers.TFREALMModel`.
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler.
+            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with.
+            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
+            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.REALMModel` or
+            :class:`~transformers.TFREALMModel`.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``config.is_decoder=True``.
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If :obj:`True`, use gradient checkpointing to save memory at the expense of slower backward pass.
+
+    Example::
+
+        >>> from transformers import REALMModel, REALMConfig
+
+        >>> # Initializing a REALM realm-cc-news style configuration
+        >>> configuration = REALMConfig()
+
+        >>> # Initializing a model from the realm-cc-news style configuration
+        >>> model = REALMModel(configuration)

+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+    """
+    model_type = "realm"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        use_cache=True,
+        is_encoder_decoder=False,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.type_vocab_size = type_vocab_size
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
new file mode 100644
index 000000000000..5c2834a04fdd
--- /dev/null
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -0,0 +1,1543 @@
+# coding=utf-8
+# Copyright 2021 Google AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch REALM model. 
""" + + + + +import math +import os + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import ( + PreTrainedModel, + SequenceSummary, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import logging +from .configuration_realm import REALMConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "realm-cc-news" +_CONFIG_FOR_DOC = "REALMConfig" +_TOKENIZER_FOR_DOC = "REALMTokenizer" + +REALM_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "realm-cc-news", + # See all REALM models at https://huggingface.co/models?filter=realm +] + + +def load_tf_weights_in_realm(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + assert ( + pointer.shape == array.shape + ), f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched" + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return 
model
+
+
+class REALMEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if version.parse(torch.__version__) > version.parse("1.6.0"):
+            self.register_buffer(
+                "token_type_ids",
+                torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
+                persistent=False,
+            )
+
+    def forward(
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        # Set token_type_ids to the registered buffer defined in the constructor (all zeros), which usually occurs
+        # when it's auto-generated; the registered buffer helps users when tracing the model without passing
+        # token_type_ids and solves issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class REALMSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = 
nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. 
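+        # Shape note (added for clarity): query_layer and key_layer are both
+        # (batch_size, num_attention_heads, seq_length, attention_head_size), so the
+        # matmul below yields raw scores of shape (batch_size, num_attention_heads,
+        # query_length, key_length), later scaled by 1/sqrt(attention_head_size).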
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            seq_length = hidden_states.size()[1]
+            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in REALMModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+
+
+class REALMSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class REALMAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.self = REALMSelfAttention(config)
+        self.output = REALMSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = 
prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class REALMIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class REALMOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class REALMLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = REALMAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = REALMAttention(config) + self.intermediate = REALMIntermediate(config) + self.output = REALMOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting 
`config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class REALMEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([REALMLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " + "`use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class REALMPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class REALMLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = REALMPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class REALMOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = REALMLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class REALMPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. 
+    """
+
+    config_class = REALMConfig
+    load_tf_weights = load_tf_weights_in_realm
+    base_model_prefix = "realm"
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+    def _init_weights(self, module):
+        """ Initialize the weights """
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+REALM_START_DOCSTRING = r"""
+    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
+    usage and behavior.
+
+    Parameters:
+        config (:class:`~transformers.REALMConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+REALM_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`transformers.REALMTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+            1]``:
+
+            - 0 corresponds to a `sentence A` token,
+            - 1 corresponds to a `sentence B` token.
+
+            `What are token type IDs? <../glossary.html#token-type-ids>`_
+        position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+
+            `What are position IDs? <../glossary.html#position-ids>`_
+        head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+            Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        output_attentions (:obj:`bool`, `optional`):
+            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+            tensors for more detail.
+        output_hidden_states (:obj:`bool`, `optional`):
+            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+            more detail.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare REALM Model transformer outputting raw hidden-states without any specific head on top.",
+    REALM_START_DOCSTRING,
+)
+class REALMModel(REALMPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well
+    as a decoder, in which case a layer of cross-attention is added between
+    the self-attention layers, following the architecture described in `Attention is
+    all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani,
+    Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as a decoder the model needs to be initialized with the
+    :obj:`is_decoder` argument of the configuration set to :obj:`True`.
+    To be used in a Seq2Seq model, the model needs to be initialized with both the :obj:`is_decoder`
+    argument and :obj:`add_cross_attention` set to :obj:`True`; an
+    :obj:`encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = REALMEmbeddings(config)
+        self.encoder = REALMEncoder(config)
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPastAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+            if the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
+            is used in the cross-attention if the model is configured as a decoder.
+            Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
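+        # get_extended_attention_mask broadcasts the 2D padding mask to
+        # (batch_size, 1, 1, seq_length) and converts it to an additive mask:
+        # 0.0 for positions that may be attended to and a large negative value for
+        # masked positions, so it can be added directly to the raw attention scores
+        # (for decoder configs it also folds in the causal mask).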
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=sequence_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings("""REALM Model with a `language modeling` head on top. """, REALM_START_DOCSTRING) +class REALMForMaskedLM(REALMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `REALMForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.realm = REALMModel(config) + self.cls = REALMOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. 
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with
+            indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.realm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+
+        # add a dummy token
+        assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
+        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+        dummy_token = torch.full(
+            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+        )
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
+@add_start_docstrings(
+    """REALM Model with a `language modeling` head on top for CLM fine-tuning. """, REALM_START_DOCSTRING
+)
+class REALMForCausalLM(REALMPreTrainedModel):
+
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning("If you want to use `REALMForCausalLM` as a standalone, add `is_decoder=True`.")
+
+        self.realm = REALMModel(config)
+        self.cls = REALMOnlyMLMHead(config)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        labels=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. 
Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
+            Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2
+            tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional
+            tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two
+            additional tensors are only required when the model is used as a decoder in a Sequence to Sequence
+            model.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+            cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential
+            decoding.
+
+            If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
+            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+            instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
+            ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+ + Returns: + + Example:: + + >>> from transformers import REALMTokenizer, REALMForCausalLM, REALMConfig + >>> import torch + + >>> tokenizer = REALMTokenizer.from_pretrained('realm-cc-news') + >>> config = REALMConfig.from_pretrained("realm-cc-news") + >>> config.is_decoder = True + >>> model = REALMForCausalLM.from_pretrained('realm-cc-news', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.realm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],) + return reordered_past + +class REALMClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + self.config = config + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """REALM Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. 
""", + REALM_START_DOCSTRING, +) +class REALMForSequenceClassification(REALMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.realm = REALMModel(config) + self.classifier = REALMClassificationHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.realm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + +@add_start_docstrings( + """REALM Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + REALM_START_DOCSTRING, +) +class REALMForMultipleChoice(REALMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.realm = REALMModel(config) + self.sequence_summary = SequenceSummary(config) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension + of the input tensors. 
(See :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.realm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + pooled_output = self.sequence_summary(sequence_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """REALM Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + REALM_START_DOCSTRING, +) +class REALMForTokenClassification(REALMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.realm = REALMModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. 
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.realm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)
+                active_labels = torch.where(
+                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
+                )
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """REALM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layer on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
+    REALM_START_DOCSTRING,
+)
+class REALMForQuestionAnswering(REALMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        config.num_labels = 2
+        self.num_labels = config.num_labels
+
+        self.realm = REALMModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (:obj:`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (:obj:`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.realm( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py new file mode 100644 index 000000000000..fbfbca0862df --- /dev/null +++ b/src/transformers/models/realm/tokenization_realm.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for REALM.""" +from ...utils import logging +from ..bert.tokenization_bert import BertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "realm-cc-news": "https://huggingface.co/realm-cc-news/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "realm-cc-news": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "realm-cc-news": {"do_lower_case": False}, +} + + +class REALMTokenizer(BertTokenizer): + r""" + Construct a REALM tokenizer. + + :class:`~transformers.REALMTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end + tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning + parameters. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py new file mode 100644 index 000000000000..fdb921d29d88 --- /dev/null +++ b/src/transformers/models/realm/tokenization_realm_fast.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for REALM.""" +from ...utils import logging +from ..bert.tokenization_bert_fast import BertTokenizerFast +from .tokenization_realm import REALMTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "realm-cc-news": "https://huggingface.co/realm-cc-news/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "realm-cc-news": 512, +} + + +PRETRAINED_INIT_CONFIGURATION = { + "realm-cc-news": {"do_lower_case": False}, +} + + +class REALMTokenizerFast(BertTokenizerFast): + r""" + Construct a "fast" REALM tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.REALMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + end-to-end tokenization: punctuation splitting and wordpiece. + + Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning + parameters. + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + slow_tokenizer_class = REALMTokenizer diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py new file mode 100644 index 000000000000..3c0a4a3cbff5 --- /dev/null +++ b/tests/test_modeling_realm.py @@ -0,0 +1,481 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch REALM model. 
""" + + +import unittest + +from tests.test_modeling_common import floats_tensor +from transformers import is_torch_available +from transformers.testing_utils import require_torch, slow, torch_device + +from transformers import REALMConfig +from .test_configuration_common import ConfigTester +from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask + + +if is_torch_available(): + import torch + + from transformers import ( + REALMForCausalLM, + REALMForMaskedLM, + REALMForMultipleChoice, + REALMForQuestionAnswering, + REALMForSequenceClassification, + REALMForTokenClassification, + REALMModel, + ) + from transformers.models.realm.modeling_realm import ( + REALM_PRETRAINED_MODEL_ARCHIVE_LIST, + ) + + +class REALMModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = random_attention_mask([self.batch_size, self.seq_length]) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return REALMConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + is_decoder=False, + initializer_range=self.initializer_range, + ) + + def 
prepare_config_and_inputs_for_decoder(self): + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = self.prepare_config_and_inputs() + + config.is_decoder = True + encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) + encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = REALMModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = REALMModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = REALMForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = REALMForMaskedLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = REALMForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next token and extent to 
next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = REALMForQuestionAnswering(config=config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + start_positions=sequence_labels, + end_positions=sequence_labels, + ) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = REALMForSequenceClassification(config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = REALMForTokenClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_multiple_choice( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_choices = self.num_choices + model = REALMForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + result = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + 
token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class REALMModelTest(ModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + REALMModel, + REALMForMaskedLM, + REALMForCausalLM, + REALMForMultipleChoice, + REALMForQuestionAnswering, + REALMForSequenceClassification, + REALMForTokenClassification, + ) + if is_torch_available() + else () + ) + all_generative_model_classes = (REALMForCausalLM,) if is_torch_available() else () + + def setUp(self): + self.model_tester = REALMModelTester(self) + self.config_tester = ConfigTester(self, config_class=REALMConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_multiple_choice(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) + + def test_decoder_model_past_with_large_inputs(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + def test_model_as_decoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() + self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) + + def test_model_as_decoder_with_default_input_mask(self): + # This regression test was failing with PyTorch < 1.3 + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ) = self.model_tester.prepare_config_and_inputs_for_decoder() + + input_mask = None + + self.model_tester.create_and_check_model_as_decoder( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + 
encoder_attention_mask, + ) + + @slow + def test_model_from_pretrained(self): + for model_name in REALM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = REALMModel.from_pretrained(model_name) + self.assertIsNotNone(model) + + +@require_torch +class REALMModelIntegrationTest(unittest.TestCase): + @slow + def test_inference_masked_lm(self): + model = REALMForMaskedLM.from_pretrained("realm-cc-news") + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + # TODO Replace vocab size + vocab_size = 32000 + + expected_shape = torch.Size((1, 6, vocab_size)) + self.assertEqual(output.shape, expected_shape) + + # TODO Replace values below with what was printed above. + expected_slice = torch.tensor( + [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]] + ) + + self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + + From 4d8559644aeb7050e011898194651009dbde6364 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 29 Aug 2021 01:12:33 +0800 Subject: [PATCH 02/98] Retriever OK (Update new_gelu). --- docs/source/model_doc/realm.rst | 55 +- src/transformers/__init__.py | 43 +- .../models/auto/configuration_auto.py | 4 +- src/transformers/models/auto/modeling_auto.py | 11 +- src/transformers/models/realm/__init__.py | 38 +- .../models/realm/configuration_realm.py | 34 +- .../models/realm/modeling_realm.py | 810 ++++++------------ .../models/realm/tokenization_realm.py | 12 +- .../models/realm/tokenization_realm_fast.py | 16 +- tests/test_modeling_realm.py | 230 +---- 10 files changed, 366 insertions(+), 887 deletions(-) diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index 2a11b0bdaa70..9c33e89877cd 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -31,72 +31,37 @@ This model was contributed by ` >`__. The original code can be found `here <>`__. -REALMConfig +RealmConfig ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.REALMConfig +.. autoclass:: transformers.RealmConfig :members: -REALMTokenizer +RealmTokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.REALMTokenizer +.. autoclass:: transformers.RealmTokenizer :members: build_inputs_with_special_tokens, get_special_tokens_mask, create_token_type_ids_from_sequences, save_vocabulary -REALMTokenizerFast +RealmTokenizerFast ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.REALMTokenizerFast +.. autoclass:: transformers.RealmTokenizerFast :members: -REALMModel +RealmModel ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.REALMModel +.. autoclass:: transformers.RealmModel :members: forward -REALMForCausalLM +RealmForMaskedLM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.REALMForCausalLM - :members: forward - - -REALMForMaskedLM -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.REALMForMaskedLM - :members: forward - - -REALMForSequenceClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.REALMForSequenceClassification - :members: forward - - -REALMForMultipleChoice -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.REALMForMultipleChoice - :members: forward - - -REALMForTokenClassification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.REALMForTokenClassification - :members: forward - - -REALMForQuestionAnswering -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.REALMForQuestionAnswering +.. autoclass:: transformers.RealmForMaskedLM :members: forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6553e25d6ab1..0603478f4e0d 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -133,7 +133,7 @@ "load_tf2_weights_in_pytorch_model", ], # Models - "models.realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "REALMConfig", "REALMTokenizer"], + "models.realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig", "RealmTokenizer"], "models": [], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.auto": [ @@ -338,7 +338,7 @@ # tokenizers-backed objects if is_tokenizers_available(): # Fast tokenizers - _import_structure["models.realm"].append("REALMTokenizerFast") + _import_structure["models.realm"].append("RealmTokenizerFast") _import_structure["models.roformer"].append("RoFormerTokenizerFast") _import_structure["models.clip"].append("CLIPTokenizerFast") _import_structure["models.convbert"].append("ConvBertTokenizerFast") @@ -507,15 +507,10 @@ _import_structure["models.realm"].extend( [ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", - "REALMForMaskedLM", - "REALMForCausalLM", - "REALMForMultipleChoice", - "REALMForQuestionAnswering", - "REALMForSequenceClassification", - "REALMForTokenClassification", - "REALMLayer", - "REALMModel", - "REALMPreTrainedModel", + "RealmForMaskedLM", + "RealmLayer", + "RealmModel", + "RealmPreTrainedModel", "load_tf_weights_in_realm", ] ) @@ -1854,7 +1849,7 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig - from .models.realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, REALMConfig, REALMTokenizer + from .models.realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig, RealmTokenizer from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, @@ -2046,7 +2041,7 @@ from .utils.dummy_sentencepiece_objects import * if is_tokenizers_available(): - from .models.realm import REALMTokenizerFast + from .models.realm import RealmTokenizerFast from .models.albert import AlbertTokenizerFast from .models.bart import BartTokenizerFast from .models.barthez import BarthezTokenizerFast @@ -2127,20 +2122,6 @@ from .utils.dummy_timm_objects import * if is_torch_available(): - - from .models.realm import ( - REALM_PRETRAINED_MODEL_ARCHIVE_LIST, - REALMForMaskedLM, - REALMForCausalLM, - REALMForMultipleChoice, - REALMForQuestionAnswering, - REALMForSequenceClassification, - REALMForTokenClassification, - REALMLayer, - REALMModel, 
- REALMPreTrainedModel, - load_tf_weights_in_realm, - ) # Benchmarks from .benchmark.benchmark import PyTorchBenchmark from .benchmark.benchmark_args import PyTorchBenchmarkArguments @@ -2595,6 +2576,14 @@ ProphetNetPreTrainedModel, ) from .models.rag import RagModel, RagPreTrainedModel, RagSequenceForGeneration, RagTokenForGeneration + from .models.realm import ( + REALM_PRETRAINED_MODEL_ARCHIVE_LIST, + RealmForMaskedLM, + RealmLayer, + RealmModel, + RealmPreTrainedModel, + load_tf_weights_in_realm, + ) from .models.reformer import ( REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ReformerAttention, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 6503de0ea099..7230ba92d7bf 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -26,7 +26,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here - ("realm", "REALMConfig"), + ("realm", "RealmConfig"), ("beit", "BeitConfig"), ("rembert", "RemBertConfig"), ("visual_bert", "VisualBertConfig"), @@ -158,7 +158,7 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here - ("realm", "REALM"), + ("realm", "Realm"), ("beit", "BeiT"), ("rembert", "RemBERT"), ("visual_bert", "VisualBert"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index a36656ad34a2..baf5d89e5b28 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -28,7 +28,7 @@ MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping - ("realm", "REALMModel"), + ("realm", "RealmModel"), ("beit", "BeitModel"), ("rembert", "RemBertModel"), ("visual_bert", "VisualBertModel"), @@ -135,7 +135,7 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( [ # Model with LM heads mapping -("realm", "REALMForMaskedLM"), + ("realm", "RealmForMaskedLM"), ("rembert", "RemBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), @@ -184,7 +184,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Causal LM mapping - ("realm", "REALMForCausalLM"), ("rembert", "RemBertForCausalLM"), ("roformer", "RoFormerForCausalLM"), ("bigbird_pegasus", "BigBirdPegasusForCausalLM"), @@ -226,7 +225,7 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping -("realm", "REALMForMaskedLM"), + ("realm", "RealmForMaskedLM"), ("rembert", "RemBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), ("big_bird", "BigBirdForMaskedLM"), @@ -289,7 +288,6 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Sequence Classification mapping - ("realm", "REALMForSequenceClassification"), ("rembert", "RemBertForSequenceClassification"), ("canine", "CanineForSequenceClassification"), ("roformer", "RoFormerForSequenceClassification"), @@ -332,7 +330,6 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( [ # Model for Question Answering mapping - ("realm", "REALMForQuestionAnswering"), ("rembert", "RemBertForQuestionAnswering"), ("canine", "CanineForQuestionAnswering"), ("roformer", "RoFormerForQuestionAnswering"), @@ -377,7 +374,6 @@ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Token Classification mapping -("realm", "REALMForTokenClassification"), ("rembert", "RemBertForTokenClassification"), ("canine", "CanineForTokenClassification"), ("roformer", "RoFormerForTokenClassification"), @@ -409,7 +405,6 @@ 
MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( [ # Model for Multiple Choice mapping -("realm", "REALMForMultipleChoice"), ("rembert", "RemBertForMultipleChoice"), ("canine", "CanineForMultipleChoice"), ("roformer", "RoFormerForMultipleChoice"), diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index cd591cb90bf0..5d936bf01e40 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -18,25 +18,20 @@ from typing import TYPE_CHECKING from ...file_utils import _LazyModule, is_torch_available, is_tokenizers_available _import_structure = { - "configuration_realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "REALMConfig"], - "tokenization_realm": ["REALMTokenizer"], + "configuration_realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig"], + "tokenization_realm": ["RealmTokenizer"], } if is_tokenizers_available(): - _import_structure["tokenization_realm_fast"] = ["REALMTokenizerFast"] + _import_structure["tokenization_realm_fast"] = ["RealmTokenizerFast"] if is_torch_available(): _import_structure["modeling_realm"] = [ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", - "REALMForMaskedLM", - "REALMForCausalLM", - "REALMForMultipleChoice", - "REALMForQuestionAnswering", - "REALMForSequenceClassification", - "REALMForTokenClassification", - "REALMLayer", - "REALMModel", - "REALMPreTrainedModel", + "RealmForMaskedLM", + "RealmLayer", + "RealmModel", + "RealmPreTrainedModel", "load_tf_weights_in_realm", ] @@ -44,24 +39,19 @@ if TYPE_CHECKING: - from .configuration_realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, REALMConfig - from .tokenization_realm import REALMTokenizer + from .configuration_realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig + from .tokenization_realm import RealmTokenizer if is_tokenizers_available(): - from .tokenization_realm_fast import REALMTokenizerFast + from .tokenization_realm_fast import RealmTokenizerFast if is_torch_available(): from .modeling_realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, - REALMForMaskedLM, - REALMForCausalLM, - REALMForMultipleChoice, - REALMForQuestionAnswering, - REALMForSequenceClassification, - REALMForTokenClassification, - REALMLayer, - REALMModel, - REALMPreTrainedModel, + RealmForMaskedLM, + RealmLayer, + RealmModel, + RealmPreTrainedModel, load_tf_weights_in_realm, ) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index dadccc721537..41b51523e468 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -21,17 +21,17 @@ logger = logging.get_logger(__name__) REALM_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "realm-cc-news": "https://huggingface.co/realm-cc-news/resolve/main/config.json", + "realm-cc-news-pretrained": "https://huggingface.co/realm-cc-news-pretrained/resolve/main/config.json", # See all REALM models at https://huggingface.co/models?filter=realm } -class REALMConfig(PretrainedConfig): +class RealmConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.REALMModel`. 
+    This is the configuration class to store the configuration of a :class:`~transformers.RealmModel`.
     It is used to instantiate a REALM model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of
-    the REALM `realm-cc-news <https://huggingface.co/realm-cc-news>`__ architecture.
+    the REALM `realm-cc-news-pretrained <https://huggingface.co/realm-cc-news-pretrained>`__ architecture.

     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
     to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
@@ -41,8 +41,8 @@ class REALMConfig(PretrainedConfig):
     Args:
         vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.REALMModel` or
-            :class:`~transformers.TFREALMModel`.
+            :obj:`inputs_ids` passed when calling :class:`~transformers.RealmModel` or
+            :class:`~transformers.TFRealmModel`.
         hidden_size (:obj:`int`, `optional`, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
         num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
@@ -62,8 +62,8 @@ class REALMConfig(PretrainedConfig):
             The maximum sequence length that this model might ever be used with. Typically set this to something
             large just in case (e.g., 512 or 1024 or 2048).
         type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.REALMModel` or
-            :class:`~transformers.TFREALMModel`.
+            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RealmModel` or
+            :class:`~transformers.TFRealmModel`.
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
@@ -75,13 +75,13 @@ class REALMConfig(PretrainedConfig):
             If :obj:`True`, use gradient checkpointing to save memory at the expense of slower backward pass.
Example:: - >>> from transformers import REALMModel, REALMConfig + >>> from transformers import RealmModel, RealmConfig - >>> # Initializing a REALM realm-cc-news style configuration - >>> configuration = REALMConfig() + >>> # Initializing a REALM realm-cc-news-pretrained style configuration + >>> configuration = RealmConfig() - >>> # Initializing a model from the realm-cc-news style configuration - >>> model = REALMModel(configuration) + >>> # Initializing a model from the realm-cc-news-pretrained style configuration + >>> model = RealmModel(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -91,10 +91,12 @@ def __init__( self, vocab_size=30522, hidden_size=768, + retriever_proj_size=128, num_hidden_layers=12, num_attention_heads=12, + num_candidates=8, intermediate_size=3072, - hidden_act="gelu", + hidden_act="gelu_new", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, @@ -118,8 +120,10 @@ def __init__( self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size + self.retriever_proj_size = retriever_proj_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads + self.num_candidates = num_candidates self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 5c2834a04fdd..8c897e335ce5 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2021 Google AI The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The HuggingFace Team The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -50,17 +50,18 @@
     prune_linear_layer,
 )
 from ...utils import logging
-from .configuration_realm import REALMConfig
+from ..bert import BertModel
+from .configuration_realm import RealmConfig


 logger = logging.get_logger(__name__)

-_CHECKPOINT_FOR_DOC = "realm-cc-news"
-_CONFIG_FOR_DOC = "REALMConfig"
-_TOKENIZER_FOR_DOC = "REALMTokenizer"
+_CHECKPOINT_FOR_DOC = "realm-cc-news-pretrained"
+_CONFIG_FOR_DOC = "RealmConfig"
+_TOKENIZER_FOR_DOC = "RealmTokenizer"

 REALM_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "realm-cc-news",
+    "realm-cc-news-pretrained",
     # See all REALM models at https://huggingface.co/models?filter=realm
 ]

@@ -91,11 +92,27 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
         arrays.append(array)

     for name, array in zip(names, arrays):
+        original_name = name
+
+        name = name.replace("module/module/module/bert/", "embedder/")
+        name = name.replace("module/module/module/cls/predictions/", "cls/predictions/")
+        name = name.replace("module/module/LayerNorm/", "cls/LayerNorm/")
+        name = name.replace("module/module/dense/", "cls/dense/")
+
+        if "cls/predictions/output_bias" in name:
+            continue
+
         name = name.split("/")
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
         if any(
-            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            n in [
+                "adam_v",
+                "adam_m",
+                "AdamWeightDecayOptimizer",
+                "AdamWeightDecayOptimizer_1",
+                "global_step",
+            ]
             for n in name
         ):
             logger.info(f"Skipping {'/'.join(name)}")
@@ -112,8 +129,8 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
                 pointer = getattr(pointer, "bias")
             elif scope_names[0] == "output_weights":
                 pointer = getattr(pointer, "weight")
-            elif scope_names[0] == "squad":
-                pointer = getattr(pointer, "classifier")
+            # elif scope_names[0] == "squad":
+            #     pointer = getattr(pointer, "classifier")
             else:
                 try:
                     pointer = getattr(pointer, scope_names[0])
@@ -139,7 +156,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
     return model


-class REALMEmbeddings(nn.Module):
+class RealmEmbeddings(nn.Module):
     """Construct the embeddings from word, position and token_type embeddings."""

     def __init__(self, config):
@@ -200,7 +217,7 @@ def forward(
         return embeddings


-class REALMSelfAttention(nn.Module):
+class RealmSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
         if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
@@ -298,7 +315,7 @@ def forward(
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)

         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in REALMModel forward() function)
+            # Apply the attention mask (precomputed for all layers in RealmModel forward() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
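The prefix rewriting at the top of ``load_tf_weights_in_realm`` decides where each TensorFlow checkpoint variable lands in the PyTorch module tree before the name is split on ``/``. A minimal sketch of that mapping, with the ``replace`` calls copied from the hunk above (the sample variable path is hypothetical, purely for illustration)::

    # Sketch of the TF-to-PyTorch name remapping performed in load_tf_weights_in_realm.
    # The sample variable path below is illustrative, not taken from a real checkpoint.
    def remap_tf_variable_name(name):
        name = name.replace("module/module/module/bert/", "embedder/")
        name = name.replace("module/module/module/cls/predictions/", "cls/predictions/")
        name = name.replace("module/module/LayerNorm/", "cls/LayerNorm/")
        name = name.replace("module/module/dense/", "cls/dense/")
        return name.split("/")

    # ['embedder', 'encoder', 'layer_0', 'attention', 'self', 'query', 'kernel']
    print(remap_tf_variable_name("module/module/module/bert/encoder/layer_0/attention/self/query/kernel"))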
@@ -325,7 +342,7 @@ def forward( return outputs -class REALMSelfOutput(nn.Module): +class RealmSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -339,11 +356,11 @@ def forward(self, hidden_states, input_tensor): return hidden_states -class REALMAttention(nn.Module): +class RealmAttention(nn.Module): def __init__(self, config): super().__init__() - self.self = REALMSelfAttention(config) - self.output = REALMSelfOutput(config) + self.self = RealmSelfAttention(config) + self.output = RealmSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): @@ -388,7 +405,7 @@ def forward( return outputs -class REALMIntermediate(nn.Module): +class RealmIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) @@ -403,7 +420,7 @@ def forward(self, hidden_states): return hidden_states -class REALMOutput(nn.Module): +class RealmOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -417,19 +434,19 @@ def forward(self, hidden_states, input_tensor): return hidden_states -class REALMLayer(nn.Module): +class RealmLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = REALMAttention(config) + self.attention = RealmAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" - self.crossattention = REALMAttention(config) - self.intermediate = REALMIntermediate(config) - self.output = REALMOutput(config) + self.crossattention = RealmAttention(config) + self.intermediate = RealmIntermediate(config) + self.output = RealmOutput(config) def forward( self, @@ -500,11 +517,11 @@ def feed_forward_chunk(self, attention_output): return layer_output -class REALMEncoder(nn.Module): +class RealmEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([REALMLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([RealmLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, @@ -597,7 +614,7 @@ def custom_forward(*inputs): ) -class REALMPredictionHeadTransform(nn.Module): +class RealmPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -609,15 +626,15 @@ def __init__(self, config): def forward(self, hidden_states): hidden_states = self.dense(hidden_states) - hidden_states = self.transform_act_fn(hidden_states) + #hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states -class REALMLMPredictionHead(nn.Module): +class RealmLMPredictionHead(nn.Module): def __init__(self, config): super().__init__() - self.transform = REALMPredictionHeadTransform(config) + self.transform = RealmPredictionHeadTransform(config) # The output weights are the same as the input embeddings, but there is # an output-only bias for each token. 
@@ -634,23 +651,37 @@ def forward(self, hidden_states): return hidden_states -class REALMOnlyMLMHead(nn.Module): +class RealmOnlyMLMHead(nn.Module): def __init__(self, config): super().__init__() - self.predictions = REALMLMPredictionHead(config) + self.predictions = RealmLMPredictionHead(config) def forward(self, sequence_output): prediction_scores = self.predictions(sequence_output) return prediction_scores -class REALMPreTrainedModel(PreTrainedModel): +class RealmRetrieverProjection(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = RealmLMPredictionHead(config) + self.dense = nn.Linear(config.hidden_size, config.retriever_proj_size) + self.LayerNorm = nn.LayerNorm(config.retriever_proj_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + #hidden_states = self.predictions(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class RealmPreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ - config_class = REALMConfig + config_class = RealmConfig load_tf_weights = load_tf_weights_in_realm base_model_prefix = "realm" _keys_to_ignore_on_load_missing = [r"position_ids"] @@ -670,7 +701,21 @@ def _init_weights(self, module): elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - + + def _flatten_inputs(self, *inputs): + flattened_inputs = [] + for tensor in inputs: + input_shape = tensor.shape + if len(input_shape) > 2: + tensor = tensor.view((-1, input_shape[-1])) + def _unflatten(flat): + if len(input_shape) > 2: + flat = flat.view(input_shape + (-1,)) + return flat + flattened_inputs.append((tensor, _unflatten)) + return flattened_inputs + + REALM_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. @@ -678,7 +723,7 @@ def _init_weights(self, module): usage and behavior. Parameters: - config (:class:`~transformers.REALMConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.RealmConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -688,7 +733,7 @@ def _init_weights(self, module): input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using :class:`transformers.REALMTokenizer`. + Indices can be obtained using :class:`transformers.RealmTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for details. 
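
# A minimal sketch of what the _flatten_inputs helper above does to candidate
# tensors (shapes are illustrative, assuming batch_size=2, num_candidates=8,
# seq_len=30):
#
#     >>> import torch
#     >>> candidate_input_ids = torch.zeros(2, 8, 30, dtype=torch.long)
#     >>> flat = candidate_input_ids.view(-1, candidate_input_ids.shape[-1])
#     >>> flat.shape
#     torch.Size([16, 30])
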
@@ -738,7 +783,7 @@ def _init_weights(self, module): "The bare REALM Model transformer outputting raw hidden-states without any specific head on top.", REALM_START_DOCSTRING, ) -class REALMModel(REALMPreTrainedModel): +class RealmModel(RealmPreTrainedModel): """ The model can behave as an encoder (with only self-attention) as well @@ -758,8 +803,8 @@ def __init__(self, config): super().__init__(config) self.config = config - self.embeddings = REALMEmbeddings(config) - self.encoder = REALMEncoder(config) + self.embeddings = RealmEmbeddings(config) + self.encoder = RealmEncoder(config) self.init_weights() @@ -914,60 +959,42 @@ def forward( ) -@add_start_docstrings("""REALM Model with a `language modeling` head on top. """, REALM_START_DOCSTRING) -class REALMForMaskedLM(REALMPreTrainedModel): - def __init__(self, config): +class RealmRetriever(RealmPreTrainedModel): + def __init__(self, config, query_embedder=None, query_predictions=None): super().__init__(config) - if config.is_decoder: - logger.warning( - "If you want to use `REALMForMaskedLM` make sure `config.is_decoder=False` for " - "bi-directional self-attention." - ) - - self.realm = REALMModel(config) - self.cls = REALMOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder + self.embedder = BertModel(self.config) + if query_embedder: + self.query_embedder = query_embedder + else: + self.query_embedder = self.embedder + + self.cls = RealmRetrieverProjection(self.config) - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings + if query_predictions: + self.query_cls = query_predictions + else: + self.query_cls = self.cls - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, + candidate_input_ids=None, + candidate_attention_mask=None, + candidate_token_type_ids=None, head_mask=None, inputs_embeds=None, encoder_hidden_states=None, encoder_attention_mask=None, - labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]``. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.realm( + query_outputs = self.query_embedder( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -981,404 +1008,179 @@ def forward( return_dict=return_dict, ) - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, + # [batch_size * num_candidates, candidate_seq_len] + ( + (flattened_input_ids, unflatten), + (flattened_attention_mask, _), + (flattened_token_type_ids, _) + ) = self._flatten_inputs( + candidate_input_ids, + candidate_attention_mask, + candidate_token_type_ids ) - def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - effective_batch_size = input_shape[0] - - # add a dummy token - assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" - attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) - dummy_token = torch.full( - (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device - ) - input_ids = torch.cat([input_ids, dummy_token], dim=1) - - return {"input_ids": input_ids, "attention_mask": attention_mask} - - -@add_start_docstrings( - """REALM Model with a `language modeling` head on top for CLM fine-tuning. """, REALM_START_DOCSTRING -) -class REALMForCausalLM(REALMPreTrainedModel): - - _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] - - def __init__(self, config): - super().__init__(config) - - if not config.is_decoder: - logger.warning("If you want to use `REALMForCausalLM` as a standalone, add `is_decoder=True.`") - - self.realm = REALMModel(config) - self.cls = REALMOnlyMLMHead(config) - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - head_mask=None, - cross_attn_head_mask=None, - past_key_values=None, - labels=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if - the model is configured as a decoder. 
- encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in - the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): - Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 - tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional - tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two - additional tensors are only required when the model is used as a decoder in a Sequence to Sequence - model. - - Contains pre-computed hidden-states (key and values in the self-attention blocks and in the - cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential - decoding. - - If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids`` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in - ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are - ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]``. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- - Returns: - - Example:: - - >>> from transformers import REALMTokenizer, REALMForCausalLM, REALMConfig - >>> import torch - - >>> tokenizer = REALMTokenizer.from_pretrained('realm-cc-news') - >>> config = REALMConfig.from_pretrained("realm-cc-news") - >>> config.is_decoder = True - >>> model = REALMForCausalLM.from_pretrained('realm-cc-news', config=config) - - >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") - >>> outputs = model(**inputs) - - >>> prediction_logits = outputs.logits - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.realm( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, + candidate_outputs = self.embedder( + flattened_input_ids, + attention_mask=flattened_attention_mask, + token_type_ids=flattened_token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - sequence_output = outputs[0] - prediction_scores = self.cls(sequence_output) - - lm_loss = None - if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((lm_loss,) + output) if lm_loss is not None else output - - return CausalLMOutputWithCrossAttentions( - loss=lm_loss, - logits=prediction_scores, - past_key_values=outputs.past_key_values, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - cross_attentions=outputs.cross_attentions, - ) - - def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - - # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly - if attention_mask is None: - attention_mask = input_ids.new_ones(input_shape) - - # cut decoder_input_ids if past is used - if past is not None: - input_ids = input_ids[:, -1:] - - return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} - - def _reorder_cache(self, past, beam_idx): - reordered_past = () - for layer_past in past: - reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],) - return reordered_past - -class REALMClassificationHead(nn.Module): - """Head for sentence-level classification tasks.""" - + # [batch_size, query_seq_len, hidden_size] + query_output = query_outputs.pooler_output + # [batch_size, retriever_proj_size] + query_score = self.query_cls(query_output) + #print('query_score', query_score, query_score.shape) + # [batch_size * num_candidates, candidate_seq_len, hidden_size] + candidate_output = candidate_outputs.pooler_output + # [batch_size * num_candidates, candidate_seq_len, retriever_proj_size] + candidate_score = self.cls(candidate_output) + #print('candidate_score', candidate_score[0], candidate_score.shape) + # [batch_size, num_candidates, candidate_seq_len, retriever_proj_size] + candidate_score = 
candidate_score.view((candidate_input_ids.shape[0], self.config.num_candidates, -1)) + # [batch_size, num_candidates] + relevance_score = torch.einsum("BD,BND->BN", query_score, candidate_score) + #print('relevance_score', relevance_score[0], relevance_score.shape) + + return relevance_score, query_score, candidate_score + + +class RealmEncoder(RealmPreTrainedModel): def __init__(self, config): super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.out_proj = nn.Linear(config.hidden_size, config.num_labels) - self.config = config - def forward(self, features, **kwargs): - x = features[:, 0, :] # take token (equiv. to [CLS]) - x = self.dropout(x) - x = self.dense(x) - x = ACT2FN[self.config.hidden_act](x) - x = self.dropout(x) - x = self.out_proj(x) - return x + self.bert = BertModel(self.config) + + self.cls = RealmOnlyMLMHead(self.config) -@add_start_docstrings( - """REALM Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, - REALM_START_DOCSTRING, -) -class REALMForSequenceClassification(REALMPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels - self.realm = REALMModel(config) - self.classifier = REALMClassificationHead(config) - - self.init_weights() - - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=SequenceClassifierOutput, - config_class=_CONFIG_FOR_DOC, - ) def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + relevance_score=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the sequence classification/regression loss. - Indices should be in :obj:`[0, ..., config.num_labels - 1]`. - If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), - If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
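#            (``-100`` works here because ``torch.nn.CrossEntropyLoss`` ignores
#             targets equal to its default ``ignore_index=-100``, so masked-out
#             positions contribute nothing to the loss.)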
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + ( + (flattened_input_ids, unflatten), + (flattened_attention_mask, _), + (flattened_token_type_ids, _) + ) = self._flatten_inputs( + input_ids, + attention_mask, + token_type_ids + ) - outputs = self.realm( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, + joint_outpus = self.bert( + flattened_input_ids, + attention_mask=flattened_attention_mask, + token_type_ids=flattened_token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - sequence_output = outputs[0] - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - if self.num_labels == 1: - # We are doing regression - loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + # [batch_size * num_candidates, joint_seq_len, hidden_size] + joint_output = joint_outpus[0] + # [batch_size * num_candidates, joint_seq_len, vocab_size] + prediction_scores = self.cls(joint_output) - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output + candidate_score = relevance_score + # [batch_siZe, num_candidates] + candidate_log_probs = torch.log_softmax(candidate_score) - return SequenceClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) + masked_lm_loss = None + if labels is not None: + # Compute marginal log-likelihood + # [batch_size * num_candidates, joint_seq_len, vocab_size] + mlm_logits = prediction_scores + mlm_log_probs = torch.log_softmax(mlm_logits) + + # [batch_size, joint_seq_len] + mlm_targets = labels + # [batch_size, num_candidates, joint_seq_len] + tiled_mlm_targets = torch.tile(mlm_targets.unsequeeze(1), (1, self.config.num_candidate, 1)) + ## [batch_size, num_candidates, joint_seq_len, 1] + #tiled_mlm_targets = tiled_mlm_targets.unsqueeze(-1) + candidate_log_probs = candidate_log_probs.unsequeeze(-1) + joint_gold_log_probs = candidate_log_probs + mlm_log_probs -@add_start_docstrings( - """REALM Model with a multiple choice classification head on top (a linear layer on top of - the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", - REALM_START_DOCSTRING, -) -class REALMForMultipleChoice(REALMPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.realm = REALMModel(config) - self.sequence_summary = SequenceSummary(config) - self.classifier = nn.Linear(config.hidden_size, 1) - self.init_weights() + loss_fct = CrossEntropyLoss() # -100 index = padding token + #masked_lm_loss = loss_fct(, labels.view(-1)) - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MultipleChoiceModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices-1]`` where :obj:`num_choices` is the size of the second dimension - of the input tensors. (See :obj:`input_ids` above) - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] - - input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None - attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None - token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None - position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None - inputs_embeds = ( - inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) - if inputs_embeds is not None - else None - ) + if not return_dict: + output = (prediction_scores,) + joint_outpus[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - outputs = self.realm( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=joint_outpus.hidden_states, + attentions=joint_outpus.attentions, ) - sequence_output = outputs[0] - pooled_output = self.sequence_summary(sequence_output) - logits = self.classifier(pooled_output) - reshaped_logits = logits.view(-1, num_choices) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(reshaped_logits, labels) +@add_start_docstrings("""REALM Model with a `language modeling` head on top. """, REALM_START_DOCSTRING) +class RealmEncoderCopy(RealmPreTrainedModel): + def __init__(self, config, query_embedder=None, query_predictions=None): + super().__init__(config) - if not return_dict: - output = (reshaped_logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output + if config.is_decoder: + logger.warning( + "If you want to use `RealmForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) - return MultipleChoiceModelOutput( - loss=loss, - logits=reshaped_logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) + self.embedder = RealmModel(config) + if query_embedder: + self.query_embedder = query_embedder + else: + self.query_embedder = self.embedder + + self.cls = RealmOnlyMLMHead(config) + if query_predictions: + self.query_cls = query_predictions + else: + self.query_cls = self.cls -@add_start_docstrings( - """REALM Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - REALM_START_DOCSTRING, -) -class REALMForTokenClassification(REALMPreTrainedModel): - def __init__(self, config): - super().__init__(config) - self.num_labels = config.num_labels + self.init_weights() - self.realm = REALMModel(config) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - self.classifier = nn.Linear(config.hidden_size, config.num_labels) + def get_output_embeddings(self): + return self.cls.predictions.decoder - self.init_weights() + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint=_CHECKPOINT_FOR_DOC, - output_type=TokenClassifierOutput, + output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC, ) def forward( @@ -1387,8 +1189,13 @@ def forward( attention_mask=None, token_type_ids=None, position_ids=None, + candidate_input_ids=None, + candidate_attention_mask=None, + candidate_token_type_ids=None, head_mask=None, inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, labels=None, output_attentions=None, output_hidden_states=None, @@ -1396,148 +1203,73 @@ def forward( ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]``. 
""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict - outputs = self.realm( + query_outputs = self.query_embedder( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output) - logits = self.classifier(sequence_output) - - loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() - # Only keep active parts of the loss - if attention_mask is not None: - active_loss = attention_mask.view(-1) == 1 - active_logits = logits.view(-1, self.num_labels) - active_labels = torch.where( - active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) - ) - loss = loss_fct(active_logits, active_labels) - else: - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - - if not return_dict: - output = (logits,) + outputs[1:] - return ((loss,) + output) if loss is not None else output - - return TokenClassifierOutput( - loss=loss, - logits=logits, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - -@add_start_docstrings( - """REALM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear - layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, - REALM_START_DOCSTRING, -) -class REALMForQuestionAnswering(REALMPreTrainedModel): - def __init__(self, config): - super().__init__(config) - - config.num_labels = 2 - self.num_labels = config.num_labels - - self.realm = REALMModel(config) - self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) - - self.init_weights() - - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - start_positions=None, - end_positions=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (:obj:`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. 
- """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - outputs = self.realm( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, + candidate_outputs = self.embedder( + candidate_input_ids, + attention_mask=candidate_attention_mask, + token_type_ids=candidate_token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = logits.split(1, dim=-1) - start_logits = start_logits.squeeze(-1) - end_logits = end_logits.squeeze(-1) - - total_loss = None - if start_positions is not None and end_positions is not None: - # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) - # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) - - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss = loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 + + query_score = self.query_cls(query_outputs[0]) + candidate_score = self.cls(candidate_outputs[0]) + relevance_dist = torch.einsum("BD,BND->BN", query_score, candidate_score).softmax() + + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) if not return_dict: - output = (start_logits, end_logits) + outputs[1:] - return ((total_loss,) + output) if total_loss is not None else output + output = (prediction_scores,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - return QuestionAnsweringModelOutput( - loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} \ No newline at end of file diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index fbfbca0862df..69acc2945dfc 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Google AI and The HuggingFace Inc. 
team. All rights reserved. +# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -23,25 +23,25 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "realm-cc-news": "https://huggingface.co/realm-cc-news/resolve/main/vocab.txt", + "realm-cc-news-pretrained": "https://huggingface.co/realm-cc-news-pretrained/resolve/main/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "realm-cc-news": 512, + "realm-cc-news-pretrained": 512, } PRETRAINED_INIT_CONFIGURATION = { - "realm-cc-news": {"do_lower_case": False}, + "realm-cc-news-pretrained": {"do_lower_case": False}, } -class REALMTokenizer(BertTokenizer): +class RealmTokenizer(BertTokenizer): r""" Construct a REALM tokenizer. - :class:`~transformers.REALMTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end + :class:`~transformers.RealmTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end tokenization: punctuation splitting and wordpiece. Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py index fdb921d29d88..e60a6130ed17 100644 --- a/src/transformers/models/realm/tokenization_realm_fast.py +++ b/src/transformers/models/realm/tokenization_realm_fast.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ """Tokenization classes for REALM.""" from ...utils import logging from ..bert.tokenization_bert_fast import BertTokenizerFast -from .tokenization_realm import REALMTokenizer +from .tokenization_realm import RealmTokenizer logger = logging.get_logger(__name__) @@ -24,25 +24,25 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "realm-cc-news": "https://huggingface.co/realm-cc-news/resolve/main/vocab.txt", + "realm-cc-news-pretrained": "https://huggingface.co/realm-cc-news-pretrained/resolve/main/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "realm-cc-news": 512, + "realm-cc-news-pretrained": 512, } PRETRAINED_INIT_CONFIGURATION = { - "realm-cc-news": {"do_lower_case": False}, + "realm-cc-news-pretrained": {"do_lower_case": False}, } -class REALMTokenizerFast(BertTokenizerFast): +class RealmTokenizerFast(BertTokenizerFast): r""" Construct a "fast" REALM tokenizer (backed by HuggingFace's `tokenizers` library). - :class:`~transformers.REALMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs + :class:`~transformers.RealmTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs end-to-end tokenization: punctuation splitting and wordpiece. 
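
# A usage sketch for the renamed tokenizer classes; the checkpoint identifier
# follows this patch's naming and is assumed to exist on the Hub:
#
#     >>> from transformers import RealmTokenizerFast
#     >>> tokenizer = RealmTokenizerFast.from_pretrained("realm-cc-news-pretrained")
#     >>> inputs = tokenizer("Hello REALM", return_tensors="pt")
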
Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning @@ -53,4 +53,4 @@ class REALMTokenizerFast(BertTokenizerFast): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = REALMTokenizer + slow_tokenizer_class = RealmTokenizer diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 3c0a4a3cbff5..0846f81f7856 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -21,7 +21,7 @@ from transformers import is_torch_available from transformers.testing_utils import require_torch, slow, torch_device -from transformers import REALMConfig +from transformers import RealmConfig from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -30,20 +30,15 @@ import torch from transformers import ( - REALMForCausalLM, - REALMForMaskedLM, - REALMForMultipleChoice, - REALMForQuestionAnswering, - REALMForSequenceClassification, - REALMForTokenClassification, - REALMModel, + RealmForMaskedLM, + RealmModel, ) from transformers.models.realm.modeling_realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, ) -class REALMModelTester: +class RealmModelTester: def __init__( self, parent, @@ -116,7 +111,7 @@ def prepare_config_and_inputs(self): return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels def get_config(self): - return REALMConfig( + return RealmConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, @@ -161,7 +156,7 @@ def prepare_config_and_inputs_for_decoder(self): def create_and_check_model( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = REALMModel(config=config) + model = RealmModel(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) @@ -182,7 +177,7 @@ def create_and_check_model_as_decoder( encoder_attention_mask, ): config.add_cross_attention = True - model = REALMModel(config) + model = RealmModel(config) model.to(torch_device) model.eval() result = model( @@ -201,149 +196,15 @@ def create_and_check_model_as_decoder( result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - def create_and_check_for_causal_lm( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - model = REALMForCausalLM(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - def create_and_check_for_masked_lm( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = REALMForMaskedLM(config=config) + model = RealmForMaskedLM(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, 
self.vocab_size)) - def create_and_check_decoder_model_past_large_inputs( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ): - config.is_decoder = True - config.add_cross_attention = True - model = REALMForCausalLM(config=config) - model.to(torch_device) - model.eval() - - # first forward pass - outputs = model( - input_ids, - attention_mask=input_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - use_cache=True, - ) - past_key_values = outputs.past_key_values - - # create hypothetical multiple next token and extent to next_input_ids - next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) - next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) - - # append to next input_ids and - next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) - next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) - - output_from_no_past = model( - next_input_ids, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_hidden_states=True, - )["hidden_states"][0] - output_from_past = model( - next_tokens, - attention_mask=next_attention_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_hidden_states=True, - )["hidden_states"][0] - - # select random slice - random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() - output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() - output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() - - self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) - - # test that outputs are equal for slice - self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) - - def create_and_check_for_question_answering( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - model = REALMForQuestionAnswering(config=config) - model.to(torch_device) - model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - start_positions=sequence_labels, - end_positions=sequence_labels, - ) - self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) - self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) - - def create_and_check_for_sequence_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = REALMForSequenceClassification(config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) - - def create_and_check_for_token_classification( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_labels = self.num_labels - model = REALMForTokenClassification(config=config) - model.to(torch_device) - model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, 
self.num_labels)) - - def create_and_check_for_multiple_choice( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels - ): - config.num_choices = self.num_choices - model = REALMForMultipleChoice(config=config) - model.to(torch_device) - model.eval() - multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() - result = model( - multiple_choice_inputs_ids, - attention_mask=multiple_choice_input_mask, - token_type_ids=multiple_choice_token_type_ids, - labels=choice_labels, - ) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_choices)) - def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -360,26 +221,21 @@ def prepare_config_and_inputs_for_common(self): @require_torch -class REALMModelTest(ModelTesterMixin, unittest.TestCase): +class RealmModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( - REALMModel, - REALMForMaskedLM, - REALMForCausalLM, - REALMForMultipleChoice, - REALMForQuestionAnswering, - REALMForSequenceClassification, - REALMForTokenClassification, + RealmModel, + RealmForMaskedLM, ) if is_torch_available() else () ) - all_generative_model_classes = (REALMForCausalLM,) if is_torch_available() else () + all_generative_model_classes = () def setUp(self): - self.model_tester = REALMModelTester(self) - self.config_tester = ConfigTester(self, config_class=REALMConfig, hidden_size=37) + self.model_tester = RealmModelTester(self) + self.config_tester = ConfigTester(self, config_class=RealmConfig, hidden_size=37) def test_config(self): self.config_tester.run_common_tests() @@ -398,70 +254,18 @@ def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) - def test_for_multiple_choice(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) - - def test_decoder_model_past_with_large_inputs(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - - def test_for_question_answering(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - - def test_for_sequence_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - - def test_for_token_classification(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_token_classification(*config_and_inputs) - - def test_model_as_decoder(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - - def test_model_as_decoder_with_default_input_mask(self): - # This regression test was failing with PyTorch < 1.3 - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) = 
self.model_tester.prepare_config_and_inputs_for_decoder() - - input_mask = None - - self.model_tester.create_and_check_model_as_decoder( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - @slow def test_model_from_pretrained(self): for model_name in REALM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = REALMModel.from_pretrained(model_name) + model = RealmModel.from_pretrained(model_name) self.assertIsNotNone(model) @require_torch -class REALMModelIntegrationTest(unittest.TestCase): +class RealmModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): - model = REALMForMaskedLM.from_pretrained("realm-cc-news") + model = RealmForMaskedLM.from_pretrained("realm-cc-news-pretrained") input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) output = model(input_ids)[0] From 1ff43643da6df5820e04a9701be9057ac5c7316e Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 29 Aug 2021 15:07:19 +0800 Subject: [PATCH 03/98] Encoder prediction score OK --- .../models/realm/modeling_realm.py | 47 +++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 8c897e335ce5..e11084ec6f50 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -99,8 +99,8 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): name = name.replace("module/module/LayerNorm/", "cls/LayerNorm/") name = name.replace("module/module/dense/", "cls/dense/") - if "cls/predictions/output_bias" in name: - continue + #if "cls/predictions/output_bias" in name: + # continue name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v @@ -517,7 +517,7 @@ def feed_forward_chunk(self, attention_output): return layer_output -class RealmEncoder(nn.Module): +class RealmEncoderLegacy(nn.Module): def __init__(self, config): super().__init__() self.config = config @@ -626,7 +626,7 @@ def __init__(self, config): def forward(self, hidden_states): hidden_states = self.dense(hidden_states) - #hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states @@ -975,6 +975,8 @@ def __init__(self, config, query_embedder=None, query_predictions=None): self.query_cls = query_predictions else: self.query_cls = self.cls + + self.init_weights() def forward( self, @@ -1054,13 +1056,22 @@ def forward( class RealmEncoder(RealmPreTrainedModel): def __init__(self, config): - super().__init__() - self.config = config - + super().__init__(config) self.bert = BertModel(self.config) - - self.cls = RealmOnlyMLMHead(self.config) + self.init_weights() + + def get_input_embeddings(self): + return self.bert.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.bert.embeddings.word_embeddings = value + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings def forward( self, @@ -1088,7 +1099,7 @@ def forward( token_type_ids ) - joint_outpus = self.bert( + joint_outputs = self.bert( flattened_input_ids, attention_mask=flattened_attention_mask, token_type_ids=flattened_token_type_ids, @@ -1103,11 +1114,19 @@ def forward( ) # [batch_size * num_candidates, 
joint_seq_len, hidden_size] - joint_output = joint_outpus[0] + joint_output = joint_outputs[0] + print("joint_output", joint_output[0], joint_output.shape) + # [batch_size * num_candidates, joint_seq_len, vocab_size] prediction_scores = self.cls(joint_output) + print("prediction_scores", prediction_scores[0], prediction_scores.shape) + + # TODO: Complete loss fn. + # [batch_size, num_candidates] candidate_score = relevance_score + # [batch_size * num_candidates, 1] + candidate_score = candidate_score.view(-1, 1) # [batch_siZe, num_candidates] candidate_log_probs = torch.log_softmax(candidate_score) @@ -1133,14 +1152,14 @@ def forward( #masked_lm_loss = loss_fct(, labels.view(-1)) if not return_dict: - output = (prediction_scores,) + joint_outpus[1:] + output = (prediction_scores,) + joint_output[1:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, - hidden_states=joint_outpus.hidden_states, - attentions=joint_outpus.attentions, + hidden_states=joint_output.hidden_states, + attentions=joint_output.attentions, ) From baee376d6121fee173ccebb89d5c5b10ae92931c Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 29 Aug 2021 18:31:42 +0800 Subject: [PATCH 04/98] Encoder pretrained model OK --- .../models/realm/modeling_realm.py | 61 +++++++++---------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index e11084ec6f50..f37360cbe8ea 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1039,17 +1039,14 @@ def forward( query_output = query_outputs.pooler_output # [batch_size, retriever_proj_size] query_score = self.query_cls(query_output) - #print('query_score', query_score, query_score.shape) # [batch_size * num_candidates, candidate_seq_len, hidden_size] candidate_output = candidate_outputs.pooler_output # [batch_size * num_candidates, candidate_seq_len, retriever_proj_size] candidate_score = self.cls(candidate_output) - #print('candidate_score', candidate_score[0], candidate_score.shape) # [batch_size, num_candidates, candidate_seq_len, retriever_proj_size] - candidate_score = candidate_score.view((candidate_input_ids.shape[0], self.config.num_candidates, -1)) + candidate_score = candidate_score.view(-1, self.config.num_candidates, self.config.retriever_proj_size) # [batch_size, num_candidates] relevance_score = torch.einsum("BD,BND->BN", query_score, candidate_score) - #print('relevance_score', relevance_score[0], relevance_score.shape) return relevance_score, query_score, candidate_score @@ -1085,6 +1082,7 @@ def forward( encoder_hidden_states=None, encoder_attention_mask=None, labels=None, + mlm_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -1115,51 +1113,48 @@ def forward( # [batch_size * num_candidates, joint_seq_len, hidden_size] joint_output = joint_outputs[0] - print("joint_output", joint_output[0], joint_output.shape) - # [batch_size * num_candidates, joint_seq_len, vocab_size] prediction_scores = self.cls(joint_output) - print("prediction_scores", prediction_scores[0], prediction_scores.shape) - - - # TODO: Complete loss fn. 
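#        (The TODO above is completed in the next patch: REALM's MLM loss
#         marginalizes over the retrieved candidates,
#         log p(y|x) = logsumexp_c [ log p(c|x) + log p(y|x, c) ],
#         where log p(c|x) comes from the relevance score and log p(y|x, c)
#         from the joint encoder's prediction scores.)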
# [batch_size, num_candidates] candidate_score = relevance_score - # [batch_size * num_candidates, 1] - candidate_score = candidate_score.view(-1, 1) - # [batch_siZe, num_candidates] - candidate_log_probs = torch.log_softmax(candidate_score) masked_lm_loss = None if labels is not None: + if mlm_mask is None: + mlm_mask = torch.ones_like(labels, dtype=torch.float32) + else: + mlm_mask = mlm_mask.type(torch.float32) + # Compute marginal log-likelihood - # [batch_size * num_candidates, joint_seq_len, vocab_size] - mlm_logits = prediction_scores - mlm_log_probs = torch.log_softmax(mlm_logits) - - # [batch_size, joint_seq_len] - mlm_targets = labels + loss_fct = CrossEntropyLoss(reduction='none') # -100 index = padding token + + # [batch_size * num_candidates * joint_seq_len, vocab_size] + mlm_logits = prediction_scores.view(-1, self.config.vocab_size) + # [batch_size * num_candidates * joint_seq_len] + mlm_targets = labels.tile(1, self.config.num_candidates).view(-1) # [batch_size, num_candidates, joint_seq_len] - tiled_mlm_targets = torch.tile(mlm_targets.unsequeeze(1), (1, self.config.num_candidate, 1)) - ## [batch_size, num_candidates, joint_seq_len, 1] - #tiled_mlm_targets = tiled_mlm_targets.unsqueeze(-1) - candidate_log_probs = candidate_log_probs.unsequeeze(-1) - joint_gold_log_probs = candidate_log_probs + mlm_log_probs - - - - loss_fct = CrossEntropyLoss() # -100 index = padding token - #masked_lm_loss = loss_fct(, labels.view(-1)) + masked_lm_log_prob = -loss_fct(mlm_logits, mlm_targets).view_as(input_ids) + # [batch_size, num_candidates, 1] + candidate_log_prob = candidate_score.log_softmax(-1).unsqueeze(-1) + # [batch_size, num_candidates, joint_seq_len] + joint_gold_log_prob = candidate_log_prob + masked_lm_log_prob + # [batch_size, joint_seq_len] + marginal_gold_log_probs = joint_gold_log_prob.logsumexp(1) + # [] + masked_lm_loss = -torch.nansum( + torch.sum(marginal_gold_log_probs * mlm_mask) / + torch.sum(mlm_mask) + ) if not return_dict: - output = (prediction_scores,) + joint_output[1:] + output = (prediction_scores,) + joint_outputs[1:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( loss=masked_lm_loss, logits=prediction_scores, - hidden_states=joint_output.hidden_states, - attentions=joint_output.attentions, + hidden_states=joint_outputs.hidden_states, + attentions=joint_outputs.attentions, ) From 7ed72657862e2310166be20e785c760787294992 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 29 Aug 2021 20:09:48 +0800 Subject: [PATCH 05/98] Update retriever comments --- src/transformers/models/realm/modeling_realm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index f37360cbe8ea..9f929f68fbb6 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1035,15 +1035,15 @@ def forward( return_dict=return_dict, ) - # [batch_size, query_seq_len, hidden_size] + # [batch_size, hidden_size] query_output = query_outputs.pooler_output # [batch_size, retriever_proj_size] query_score = self.query_cls(query_output) - # [batch_size * num_candidates, candidate_seq_len, hidden_size] + # [batch_size * num_candidates, hidden_size] candidate_output = candidate_outputs.pooler_output - # [batch_size * num_candidates, candidate_seq_len, retriever_proj_size] + # [batch_size * num_candidates, retriever_proj_size] candidate_score = self.cls(candidate_output) - # 
[batch_size, num_candidates, candidate_seq_len, retriever_proj_size] + # [batch_size, num_candidates, retriever_proj_size] candidate_score = candidate_score.view(-1, self.config.num_candidates, self.config.retriever_proj_size) # [batch_size, num_candidates] relevance_score = torch.einsum("BD,BND->BN", query_score, candidate_score) From dd3fb739dcc41030d06307b8102ec3ae4bdb541e Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 29 Aug 2021 20:56:49 +0800 Subject: [PATCH 06/98] Update docs, tests, and imports --- README.md | 1 + docs/source/model_doc/realm.rst | 27 +++++-- src/transformers/__init__.py | 14 ++-- src/transformers/models/auto/modeling_auto.py | 3 - src/transformers/models/realm/__init__.py | 8 +- tests/test_modeling_realm.py | 79 ++++++++----------- 6 files changed, 65 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 42aa5995924d..e5514dfb41de 100644 --- a/README.md +++ b/README.md @@ -258,6 +258,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. 
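At this point the retriever can be exercised roughly as follows — a minimal sketch, assuming the
`qqaatw/realm-cc-news-pretrained-retriever` checkpoint exists on the hub with `config.num_candidates`
equal to the number of candidate documents passed; as of this patch the forward pass returns a plain
tuple (patch 09 later wraps it in `RealmRetrieverOutput`):

    from transformers import RealmRetriever, RealmTokenizer

    tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever")
    model = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever")

    question = tokenizer("what is the capital of france?", return_tensors="pt")
    # Candidate tensors must be [batch_size, num_candidates, seq_len]; the model
    # flattens them internally, so config.num_candidates has to equal 2 here.
    candidates = tokenizer(
        ["paris is the capital of france.", "london is a city."],
        padding=True,
        return_tensors="pt",
    )

    relevance_score, query_score, candidate_score = model(
        input_ids=question.input_ids,
        attention_mask=question.attention_mask,
        candidate_input_ids=candidates.input_ids.unsqueeze(0),
        candidate_attention_mask=candidates.attention_mask.unsqueeze(0),
    )
    # relevance_score: [batch_size, num_candidates], the dot product between the
    # projected query embedding and each projected candidate embedding.
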
diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst
index 9c33e89877cd..21696675c7f6 100644
--- a/docs/source/model_doc/realm.rst
+++ b/docs/source/model_doc/realm.rst
@@ -16,20 +16,24 @@ REALM
 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The REALM model was proposed in `
-<>`__ by .
+The REALM model was proposed in `REALM: Retrieval-Augmented Language Model Pre-Training
+<https://arxiv.org/abs/2002.08909>`__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's
+a retrieval-augmented language model that first retrieves documents from a textual knowledge corpus and then
+utilizes the retrieved documents to process question answering (Open-QA) tasks.
 
 The abstract from the paper is the following:
 
-**
+*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, requiring ever-larger networks to cover more facts.
+To capture knowledge in a more modular and interpretable way, we augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents.
+We demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as interpretability and modularity.*
 
 Tips:
 
 
 
-This model was contributed by `
->`__. The original code can be found `here
-<>`__.
+This model was contributed by `qqaatw
+<https://huggingface.co/qqaatw>`__. The original code can be found `here
+<https://github.com/google-research/language/tree/master/language/realm>`__.
 
 RealmConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -60,8 +64,15 @@ RealmModel
    :members: forward
 
 
-RealmForMaskedLM
+RealmRetriever
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-.. autoclass:: transformers.RealmForMaskedLM
+.. autoclass:: transformers.RealmRetriever
+   :members: forward
+
+
+RealmEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. 
autoclass:: transformers.RealmEncoder :members: forward \ No newline at end of file diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 0603478f4e0d..b525ffd753e9 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -133,7 +133,6 @@ "load_tf2_weights_in_pytorch_model", ], # Models - "models.realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig", "RealmTokenizer"], "models": [], "models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"], "models.auto": [ @@ -225,6 +224,7 @@ "models.phobert": ["PhobertTokenizer"], "models.prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig", "ProphetNetTokenizer"], "models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"], + "models.realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig", "RealmTokenizer"], "models.reformer": ["REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ReformerConfig"], "models.rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig"], "models.retribert": ["RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RetriBertConfig", "RetriBertTokenizer"], @@ -507,10 +507,10 @@ _import_structure["models.realm"].extend( [ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", - "RealmForMaskedLM", - "RealmLayer", + "RealmEncoder", "RealmModel", "RealmPreTrainedModel", + "RealmRetriever", "load_tf_weights_in_realm", ] ) @@ -1849,7 +1849,6 @@ load_tf2_weights_in_pytorch_model, ) from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig - from .models.realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig, RealmTokenizer from .models.auto import ( ALL_PRETRAINED_CONFIG_ARCHIVE_MAP, CONFIG_MAPPING, @@ -1934,6 +1933,7 @@ from .models.phobert import PhobertTokenizer from .models.prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig, ProphetNetTokenizer from .models.rag import RagConfig, RagRetriever, RagTokenizer + from .models.realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig, RealmTokenizer from .models.reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig from .models.rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig from .models.retribert import RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RetriBertConfig, RetriBertTokenizer @@ -2041,7 +2041,6 @@ from .utils.dummy_sentencepiece_objects import * if is_tokenizers_available(): - from .models.realm import RealmTokenizerFast from .models.albert import AlbertTokenizerFast from .models.bart import BartTokenizerFast from .models.barthez import BarthezTokenizerFast @@ -2068,6 +2067,7 @@ from .models.mt5 import MT5TokenizerFast from .models.openai import OpenAIGPTTokenizerFast from .models.pegasus import PegasusTokenizerFast + from .models.realm import RealmTokenizerFast from .models.reformer import ReformerTokenizerFast from .models.rembert import RemBertTokenizerFast from .models.retribert import RetriBertTokenizerFast @@ -2578,10 +2578,10 @@ from .models.rag import RagModel, RagPreTrainedModel, RagSequenceForGeneration, RagTokenForGeneration from .models.realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, - RealmForMaskedLM, - RealmLayer, + RealmEncoder, RealmModel, RealmPreTrainedModel, + RealmRetriever, load_tf_weights_in_realm, ) from .models.reformer import ( diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index baf5d89e5b28..c851d70ceb0c 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -28,7 +28,6 
@@ MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping - ("realm", "RealmModel"), ("beit", "BeitModel"), ("rembert", "RemBertModel"), ("visual_bert", "VisualBertModel"), @@ -135,7 +134,6 @@ MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( [ # Model with LM heads mapping - ("realm", "RealmForMaskedLM"), ("rembert", "RemBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), @@ -225,7 +223,6 @@ MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping - ("realm", "RealmForMaskedLM"), ("rembert", "RemBertForMaskedLM"), ("roformer", "RoFormerForMaskedLM"), ("big_bird", "BigBirdForMaskedLM"), diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index 5d936bf01e40..fc10699fcd66 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -28,10 +28,10 @@ if is_torch_available(): _import_structure["modeling_realm"] = [ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", - "RealmForMaskedLM", - "RealmLayer", + "RealmEncoder", "RealmModel", "RealmPreTrainedModel", + "RealmRetriever", "load_tf_weights_in_realm", ] @@ -48,10 +48,10 @@ if is_torch_available(): from .modeling_realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, - RealmForMaskedLM, - RealmLayer, + RealmEncoder, RealmModel, RealmPreTrainedModel, + RealmRetriever, load_tf_weights_in_realm, ) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 0846f81f7856..8eb648af1c25 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -30,8 +30,8 @@ import torch from transformers import ( - RealmForMaskedLM, - RealmModel, + RealmEncoder, + RealmRetriever, ) from transformers.models.realm.modeling_realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, @@ -153,53 +153,32 @@ def prepare_config_and_inputs_for_decoder(self): encoder_attention_mask, ) - def create_and_check_model( + def create_and_check_encoder( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = RealmModel(config=config) + model = RealmEncoder(config=config) model.to(torch_device) model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) - def create_and_check_model_as_decoder( - self, - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, + """ + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - config.add_cross_attention = True - model = RealmModel(config) + model = RealmModel(config=config) model.to(torch_device) model.eval() - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - ) - result = model( - input_ids, - attention_mask=input_mask, - token_type_ids=token_type_ids, - encoder_hidden_states=encoder_hidden_states, - ) result = model(input_ids, 
attention_mask=input_mask, token_type_ids=token_type_ids) + result = model(input_ids, token_type_ids=token_type_ids) + result = model(input_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + """ - def create_and_check_for_masked_lm( + def create_and_check_retriever( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): - model = RealmForMaskedLM(config=config) + model = RealmRetriever(config=config) model.to(torch_device) model.eval() result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) @@ -225,8 +204,8 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( - RealmModel, - RealmForMaskedLM, + RealmEncoder, + RealmRetriever, ) if is_torch_available() else () @@ -240,9 +219,15 @@ def setUp(self): def test_config(self): self.config_tester.run_common_tests() + def test_encoder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_encoder(*config_and_inputs) + + """ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + """ def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -250,27 +235,31 @@ def test_model_various_embeddings(self): config_and_inputs[0].position_embedding_type = type self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): + def test_retriever(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + self.model_tester.create_and_check_retriever(*config_and_inputs) @slow - def test_model_from_pretrained(self): + def test_encoder_from_pretrained(self): for model_name in REALM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = RealmModel.from_pretrained(model_name) + model = RealmEncoder.from_pretrained(model_name) self.assertIsNotNone(model) + @slow + def test_retriever_from_pretrained(self): + for model_name in REALM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = RealmRetriever.from_pretrained(model_name) + self.assertIsNotNone(model) @require_torch class RealmModelIntegrationTest(unittest.TestCase): @slow def test_inference_masked_lm(self): - model = RealmForMaskedLM.from_pretrained("realm-cc-news-pretrained") + model = RealmRetriever.from_pretrained("realm-cc-news-pretrained-embedder") input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) output = model(input_ids)[0] - # TODO Replace vocab size - vocab_size = 32000 + vocab_size = 30522 expected_shape = torch.Size((1, 6, vocab_size)) self.assertEqual(output.shape, expected_shape) From 927b106d4056d98cddb3c078ca93175fbe3772d7 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 29 Aug 2021 23:17:05 +0800 Subject: [PATCH 07/98] Prune unused models --- .../models/realm/modeling_realm.py | 798 +----------------- 1 file changed, 30 insertions(+), 768 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 9f929f68fbb6..34227a115e99 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -15,8 +15,6 @@ """ PyTorch REALM model. 
""" - - import math import os @@ -61,7 +59,8 @@ _TOKENIZER_FOR_DOC = "RealmTokenizer" REALM_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "realm-cc-news-pretrained", + "qqaatw/realm-cc-news-pretrained-embedder", + "qqaatw/realm-cc-news-pretrained-bert" # See all REALM models at https://huggingface.co/models?filter=realm ] @@ -155,463 +154,30 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): pointer.data = torch.from_numpy(array) return model +@dataclass +class BaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. -class RealmEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings.""" - - def __init__(self, config): - super().__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if version.parse(torch.__version__) > version.parse("1.6.0"): - self.register_buffer( - "token_type_ids", - torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), - persistent=False, - ) - - def forward( - self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 - ): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - - if position_ids is None: - position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] - - # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs - # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves - # issue #5664 - if token_type_ids is None: - if hasattr(self, "token_type_ids"): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) - - if inputs_embeds is None: - inputs_embeds = self.word_embeddings(input_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class RealmSelfAttention(nn.Module): - def __init__(self, config): - super().__init__() - if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): - raise ValueError( - f"The hidden 
size ({config.hidden_size}) is not a multiple of the number of attention " - f"heads ({config.num_attention_heads})" - ) - - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - - self.is_decoder = config.is_decoder - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - mixed_query_layer = self.query(hidden_states) - - # If this is instantiated as a cross-attention module, the keys - # and values come from an encoder; the attention mask needs to be - # such that the encoder's padding tokens are not attended to. - is_cross_attention = encoder_hidden_states is not None - - if is_cross_attention and past_key_value is not None: - # reuse k,v, cross_attentions - key_layer = past_key_value[0] - value_layer = past_key_value[1] - attention_mask = encoder_attention_mask - elif is_cross_attention: - key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) - value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) - attention_mask = encoder_attention_mask - elif past_key_value is not None: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - key_layer = torch.cat([past_key_value[0], key_layer], dim=2) - value_layer = torch.cat([past_key_value[1], value_layer], dim=2) - else: - key_layer = self.transpose_for_scores(self.key(hidden_states)) - value_layer = self.transpose_for_scores(self.value(hidden_states)) - - query_layer = self.transpose_for_scores(mixed_query_layer) - - if self.is_decoder: - # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. - # Further calls to cross_attention layer can then reuse all cross-attention - # key/value_states (first "if" case) - # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of - # all previous decoder key/value_states. Further calls to uni-directional self-attention - # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) - # if encoder bi-directional self-attention `past_key_value` is always `None` - past_key_value = (key_layer, value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. 
- attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in RealmModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) - - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - - context_layer = torch.matmul(attention_probs, value_layer) - - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) - - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) - - if self.is_decoder: - outputs = outputs + (past_key_value,) - return outputs - - -class RealmSelfOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class RealmAttention(nn.Module): - def __init__(self, config): - super().__init__() - self.self = RealmSelfAttention(config) - self.output = RealmSelfOutput(config) - self.pruned_heads = set() - - def prune_heads(self, heads): - if len(heads) == 0: - return - heads, index = find_pruneable_heads_and_indices( - heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads - ) - - # Prune linear layers - self.self.query = prune_linear_layer(self.self.query, index) - self.self.key = prune_linear_layer(self.self.key, index) - self.self.value = prune_linear_layer(self.self.value, index) - self.output.dense = 
prune_linear_layer(self.output.dense, index, dim=1) - - # Update hyper params and store pruned heads - self.self.num_attention_heads = self.self.num_attention_heads - len(heads) - self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads - self.pruned_heads = self.pruned_heads.union(heads) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - self_outputs = self.self( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs - - -class RealmIntermediate(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class RealmOutput(nn.Module): - def __init__(self, config): - super().__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class RealmLayer(nn.Module): - def __init__(self, config): - super().__init__() - self.chunk_size_feed_forward = config.chunk_size_feed_forward - self.seq_len_dim = 1 - self.attention = RealmAttention(config) - self.is_decoder = config.is_decoder - self.add_cross_attention = config.add_cross_attention - if self.add_cross_attention: - assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" - self.crossattention = RealmAttention(config) - self.intermediate = RealmIntermediate(config) - self.output = RealmOutput(config) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_value=None, - output_attentions=False, - ): - # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 - self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None - self_attention_outputs = self.attention( - hidden_states, - attention_mask, - head_mask, - output_attentions=output_attentions, - past_key_value=self_attn_past_key_value, - ) - attention_output = self_attention_outputs[0] - - # if decoder, the last output is tuple of self-attn cache - if self.is_decoder: - outputs = self_attention_outputs[1:-1] - present_key_value = self_attention_outputs[-1] - else: - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights - - cross_attn_present_key_value = None - if self.is_decoder and encoder_hidden_states is not None: - assert hasattr( - self, "crossattention" - ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting 
`config.add_cross_attention=True`" - - # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple - cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask, - head_mask, - encoder_hidden_states, - encoder_attention_mask, - cross_attn_past_key_value, - output_attentions, - ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights - - # add cross-attn cache to positions 3,4 of present_key_value tuple - cross_attn_present_key_value = cross_attention_outputs[-1] - present_key_value = present_key_value + cross_attn_present_key_value - - layer_output = apply_chunking_to_forward( - self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output - ) - outputs = (layer_output,) + outputs - - # if decoder, return the attn key/values as the last output - if self.is_decoder: - outputs = outputs + (present_key_value,) - - return outputs - - def feed_forward_chunk(self, attention_output): - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class RealmEncoderLegacy(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.layer = nn.ModuleList([RealmLayer(config) for _ in range(config.num_hidden_layers)]) - - def forward( - self, - hidden_states, - attention_mask=None, - head_mask=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=False, - output_hidden_states=False, - return_dict=True, - ): - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - - next_decoder_cache = () if use_cache else None - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - layer_head_mask = head_mask[i] if head_mask is not None else None - past_key_value = past_key_values[i] if past_key_values is not None else None - - if getattr(self.config, "gradient_checkpointing", False) and self.training: - - if use_cache: - logger.warning( - "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " - "`use_cache=False`..." 
- ) - use_cache = False - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs, past_key_value, output_attentions) - - return custom_forward - - layer_outputs = torch.utils.checkpoint.checkpoint( - create_custom_forward(layer_module), - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - ) - else: - layer_outputs = layer_module( - hidden_states, - attention_mask, - layer_head_mask, - encoder_hidden_states, - encoder_attention_mask, - past_key_value, - output_attentions, - ) - - hidden_states = layer_outputs[0] - if use_cache: - next_decoder_cache += (layer_outputs[-1],) - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - next_decoder_cache, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=hidden_states, - past_key_values=next_decoder_cache, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, - ) + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None class RealmPredictionHeadTransform(nn.Module): @@ -716,7 +282,6 @@ def _unflatten(flat): return flattened_inputs - REALM_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general @@ -780,185 +345,9 @@ def _unflatten(flat): @add_start_docstrings( - "The bare REALM Model transformer outputting raw hidden-states without any specific head on top.", + "The retriever of REALM outputting raw hidden-states without any specific head on top.", REALM_START_DOCSTRING, ) -class RealmModel(RealmPreTrainedModel): - """ - - The model can behave as an encoder (with only self-attention) as well - as a decoder, in which case a layer of cross-attention is added between - the self-attention layers, following the architecture described in `Attention is - all you need `__ by Ashish Vaswani, - Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. 
Gomez, Lukasz Kaiser and Illia Polosukhin. - - To behave as an decoder the model needs to be initialized with the - :obj:`is_decoder` argument of the configuration set to :obj:`True`. - To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` - argument and :obj:`add_cross_attention` set to :obj:`True`; an - :obj:`encoder_hidden_states` is then expected as an input to the forward pass. - """ - - def __init__(self, config): - super().__init__(config) - self.config = config - - self.embeddings = RealmEmbeddings(config) - self.encoder = RealmEncoder(config) - - self.init_weights() - - def get_input_embeddings(self): - return self.embeddings.word_embeddings - - def set_input_embeddings(self, value): - self.embeddings.word_embeddings = value - - def _prune_heads(self, heads_to_prune): - """Prunes heads of the model. - heads_to_prune: dict of {layer_num: list of heads to prune in this layer} - See base class PreTrainedModel - """ - for layer, heads in heads_to_prune.items(): - self.encoder.layer[layer].attention.prune_heads(heads) - - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=BaseModelOutputWithPastAndCrossAttentions, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - past_key_values=None, - use_cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): - Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention - if the model is configured as a decoder. - encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid performing attention on the padding token indices of the encoder input. This mask - is used in the cross-attention if the model is configured as a decoder. - Mask values selected in ``[0, 1]``: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): - Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. - If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` - (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` - instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. - use_cache (:obj:`bool`, `optional`): - If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up - decoding (see :obj:`past_key_values`). 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.config.is_decoder: - use_cache = use_cache if use_cache is not None else self.config.use_cache - else: - use_cache = False - - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - input_shape = input_ids.size() - batch_size, seq_length = input_shape - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - batch_size, seq_length = input_shape - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - device = input_ids.device if input_ids is not None else inputs_embeds.device - - # past_key_values_length - past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 - - - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) - - if token_type_ids is None: - if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) - - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. 
- extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - - embedding_output = self.embeddings( - input_ids=input_ids, - position_ids=position_ids, - token_type_ids=token_type_ids, - inputs_embeds=inputs_embeds, - past_key_values_length=past_key_values_length, - ) - encoder_outputs = self.encoder( - embedding_output, - attention_mask=extended_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, - past_key_values=past_key_values, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - sequence_output = encoder_outputs[0] - - if not return_dict: - return (sequence_output,) + encoder_outputs[1:] - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=sequence_output, - past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, - ) - - class RealmRetriever(RealmPreTrainedModel): def __init__(self, config, query_embedder=None, query_predictions=None): super().__init__(config) @@ -1051,6 +440,10 @@ def forward( return relevance_score, query_score, candidate_score +@add_start_docstrings( + "The encoder of REALM outputting raw hidden-states without any specific head on top.", + REALM_START_DOCSTRING, +) class RealmEncoder(RealmPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1155,135 +548,4 @@ def forward( logits=prediction_scores, hidden_states=joint_outputs.hidden_states, attentions=joint_outputs.attentions, - ) - - -@add_start_docstrings("""REALM Model with a `language modeling` head on top. """, REALM_START_DOCSTRING) -class RealmEncoderCopy(RealmPreTrainedModel): - def __init__(self, config, query_embedder=None, query_predictions=None): - super().__init__(config) - - if config.is_decoder: - logger.warning( - "If you want to use `RealmForMaskedLM` make sure `config.is_decoder=False` for " - "bi-directional self-attention." 
- ) - - self.embedder = RealmModel(config) - if query_embedder: - self.query_embedder = query_embedder - else: - self.query_embedder = self.embedder - - self.cls = RealmOnlyMLMHead(config) - - if query_predictions: - self.query_cls = query_predictions - else: - self.query_cls = self.cls - - self.init_weights() - - def get_output_embeddings(self): - return self.cls.predictions.decoder - - def set_output_embeddings(self, new_embeddings): - self.cls.predictions.decoder = new_embeddings - - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint=_CHECKPOINT_FOR_DOC, - output_type=MaskedLMOutput, - config_class=_CONFIG_FOR_DOC, - ) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - candidate_input_ids=None, - candidate_attention_mask=None, - candidate_token_type_ids=None, - head_mask=None, - inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, - labels=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]``. - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - query_outputs = self.query_embedder( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - candidate_outputs = self.embedder( - candidate_input_ids, - attention_mask=candidate_attention_mask, - token_type_ids=candidate_token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - - query_score = self.query_cls(query_outputs[0]) - candidate_score = self.cls(candidate_outputs[0]) - relevance_dist = torch.einsum("BD,BND->BN", query_score, candidate_score).softmax() - - - masked_lm_loss = None - if labels is not None: - loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - - if not return_dict: - output = (prediction_scores,) + outputs[1:] - return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output - - return MaskedLMOutput( - loss=masked_lm_loss, - logits=prediction_scores, - hidden_states=outputs.hidden_states, - attentions=outputs.attentions, - ) - - def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): - input_shape = input_ids.shape - effective_batch_size = input_shape[0] - - # add a dummy token - assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" - attention_mask = torch.cat([attention_mask, 
attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) - dummy_token = torch.full( - (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device - ) - input_ids = torch.cat([input_ids, dummy_token], dim=1) - - return {"input_ids": input_ids, "attention_mask": attention_mask} \ No newline at end of file + ) \ No newline at end of file From 96615bd90dc3bdcace5209a041356d71688fe87c Mon Sep 17 00:00:00 2001 From: qqaatw Date: Mon, 30 Aug 2021 00:46:54 +0800 Subject: [PATCH 08/98] Make embedder as a module `RealmEmbedder` --- src/transformers/__init__.py | 2 + src/transformers/models/realm/__init__.py | 2 + .../models/realm/modeling_realm.py | 137 +++++++++++++----- 3 files changed, 108 insertions(+), 33 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index b525ffd753e9..08e020c66b3d 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -507,6 +507,7 @@ _import_structure["models.realm"].extend( [ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", + "RealmEmbedder", "RealmEncoder", "RealmModel", "RealmPreTrainedModel", @@ -2578,6 +2579,7 @@ from .models.rag import RagModel, RagPreTrainedModel, RagSequenceForGeneration, RagTokenForGeneration from .models.realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, + RealmEmbedder, RealmEncoder, RealmModel, RealmPreTrainedModel, diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index fc10699fcd66..d3b21d865a8c 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -28,6 +28,7 @@ if is_torch_available(): _import_structure["modeling_realm"] = [ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", + "RealmEmbedder", "RealmEncoder", "RealmModel", "RealmPreTrainedModel", @@ -48,6 +49,7 @@ if is_torch_available(): from .modeling_realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, + RealmEmbedder, RealmEncoder, RealmModel, RealmPreTrainedModel, diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 34227a115e99..dc59ca1c2d8c 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -20,6 +20,8 @@ import torch import torch.utils.checkpoint +from typing import Optional, Tuple +from dataclasses import dataclass from packaging import version from torch import nn from torch.nn import CrossEntropyLoss, MSELoss @@ -39,6 +41,7 @@ QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, + ModelOutput ) from ...modeling_utils import ( PreTrainedModel, @@ -93,13 +96,12 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): for name, array in zip(names, arrays): original_name = name - name = name.replace("module/module/module/bert/", "embedder/") - name = name.replace("module/module/module/cls/predictions/", "cls/predictions/") - name = name.replace("module/module/LayerNorm/", "cls/LayerNorm/") - name = name.replace("module/module/dense/", "cls/dense/") - - #if "cls/predictions/output_bias" in name: - # continue + # embedder + embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" + name = name.replace("module/module/module/bert/", f"{embedder_prefix}bert/") + name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") + name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") + name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") name 
= name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v @@ -154,6 +156,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): pointer.data = torch.from_numpy(array) return model + @dataclass class BaseModelOutput(ModelOutput): """ @@ -180,6 +183,32 @@ class BaseModelOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class RealmEmbedderOutput(ModelOutput): + """ + Outputs of embedder models. + + Args: + projected_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.retriever_proj_size)`): + Projected scores. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + projected_score: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + class RealmPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() @@ -274,11 +303,7 @@ def _flatten_inputs(self, *inputs): input_shape = tensor.shape if len(input_shape) > 2: tensor = tensor.view((-1, input_shape[-1])) - def _unflatten(flat): - if len(input_shape) > 2: - flat = flat.view(input_shape + (-1,)) - return flat - flattened_inputs.append((tensor, _unflatten)) + flattened_inputs.append(tensor) return flattened_inputs @@ -344,27 +369,77 @@ def _unflatten(flat): """ +@add_start_docstrings( + "The embedder of REALM outputting raw hidden-states without any specific head on top.", + REALM_START_DOCSTRING, +) +class RealmEmbedder(RealmPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(self.config) + self.cls = RealmRetrieverProjection(self.config) + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + bert_outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # [batch_size, hidden_size] + pooler_output = bert_outputs.pooler_output + # [batch_size, retriever_proj_size] + projected_score = self.cls(pooler_output) + + if not return_dict: + return (projected_score,) + bert_outputs[1:] + else: + return RealmEmbedderOutput( + projected_score = projected_score, + hidden_states = bert_outputs.hidden_states, + 
attentions = bert_outputs.attentions, + ) + + @add_start_docstrings( "The retriever of REALM outputting raw hidden-states without any specific head on top.", REALM_START_DOCSTRING, ) class RealmRetriever(RealmPreTrainedModel): - def __init__(self, config, query_embedder=None, query_predictions=None): + def __init__(self, config, query_embedder=None): super().__init__(config) - self.embedder = BertModel(self.config) + self.embedder = RealmEmbedder(self.config) + if query_embedder: self.query_embedder = query_embedder else: self.query_embedder = self.embedder - self.cls = RealmRetrieverProjection(self.config) - - if query_predictions: - self.query_cls = query_predictions - else: - self.query_cls = self.cls - self.init_weights() def forward( @@ -401,9 +476,9 @@ def forward( # [batch_size * num_candidates, candidate_seq_len] ( - (flattened_input_ids, unflatten), - (flattened_attention_mask, _), - (flattened_token_type_ids, _) + flattened_input_ids, + flattened_attention_mask, + flattened_token_type_ids ) = self._flatten_inputs( candidate_input_ids, candidate_attention_mask, @@ -424,14 +499,10 @@ def forward( return_dict=return_dict, ) - # [batch_size, hidden_size] - query_output = query_outputs.pooler_output # [batch_size, retriever_proj_size] - query_score = self.query_cls(query_output) - # [batch_size * num_candidates, hidden_size] - candidate_output = candidate_outputs.pooler_output + query_score = query_outputs[0] # [batch_size * num_candidates, retriever_proj_size] - candidate_score = self.cls(candidate_output) + candidate_score = candidate_outputs[0] # [batch_size, num_candidates, retriever_proj_size] candidate_score = candidate_score.view(-1, self.config.num_candidates, self.config.retriever_proj_size) # [batch_size, num_candidates] @@ -481,9 +552,9 @@ def forward( return_dict=None, ): ( - (flattened_input_ids, unflatten), - (flattened_attention_mask, _), - (flattened_token_type_ids, _) + flattened_input_ids, + flattened_attention_mask, + flattened_token_type_ids ) = self._flatten_inputs( input_ids, attention_mask, From dea3b2fbcc05aa5918ab51f532e7d03b41f4b16b Mon Sep 17 00:00:00 2001 From: qqaatw Date: Mon, 30 Aug 2021 01:04:39 +0800 Subject: [PATCH 09/98] Add RealmRetrieverOutput --- .../models/realm/modeling_realm.py | 30 +++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index dc59ca1c2d8c..14f58ad28649 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -186,7 +186,7 @@ class BaseModelOutput(ModelOutput): @dataclass class RealmEmbedderOutput(ModelOutput): """ - Outputs of embedder models. + Outputs of RealmEmbedder models. Args: projected_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.retriever_proj_size)`): @@ -209,6 +209,25 @@ class RealmEmbedderOutput(ModelOutput): attentions: Optional[Tuple[torch.FloatTensor]] = None +@dataclass +class RealmRetrieverOutput(ModelOutput): + """ + Outputs of RealmRetriever models. + + Args: + relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_candidates)`): + Relevance score. + query_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.retriever_proj_size)`): + Query score. + candidate_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_candidates, config.retriever_proj_size)`): + Candidate score. 
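+            Together with :obj:`query_score`, it yields :obj:`relevance_score` via
+            ``torch.einsum("BD,BND->BN", query_score, candidate_score)``.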
+ """ + + relevance_score: torch.FloatTensor = None + query_score: torch.FloatTensor = None + candidate_score: torch.FloatTensor = None + + class RealmPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() @@ -508,7 +527,14 @@ def forward( # [batch_size, num_candidates] relevance_score = torch.einsum("BD,BND->BN", query_score, candidate_score) - return relevance_score, query_score, candidate_score + if not return_dict: + return relevance_score, query_score, candidate_score + else: + return RealmRetrieverOutput( + relevance_score = relevance_score, + query_score = query_score, + candidate_score = candidate_score + ) @add_start_docstrings( From 66859e1a3d525fc1aab1ca1af125ca6919e6108a Mon Sep 17 00:00:00 2001 From: qqaatw Date: Mon, 30 Aug 2021 23:57:31 +0800 Subject: [PATCH 10/98] Update tokenization --- src/transformers/models/realm/tokenization_realm.py | 12 +++++++++--- .../models/realm/tokenization_realm_fast.py | 12 +++++++++--- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index 69acc2945dfc..4d992dd1ed79 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -23,17 +23,23 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "realm-cc-news-pretrained": "https://huggingface.co/realm-cc-news-pretrained/resolve/main/vocab.txt", + "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/vocab.txt", + "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/vocab.txt", + "realm-cc-news-pretrained-encoder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-encoder/resolve/main/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "realm-cc-news-pretrained": 512, + "realm-cc-news-pretrained-embedder": 512, + "realm-cc-news-pretrained-retriever": 512, + "realm-cc-news-pretrained-encoder": 512 } PRETRAINED_INIT_CONFIGURATION = { - "realm-cc-news-pretrained": {"do_lower_case": False}, + "realm-cc-news-pretrained-embedder": {"do_lower_case": True}, + "realm-cc-news-pretrained-retriever": {"do_lower_case": True}, + "realm-cc-news-pretrained-encoder": {"do_lower_case": True}, } diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py index e60a6130ed17..853156c6e7b7 100644 --- a/src/transformers/models/realm/tokenization_realm_fast.py +++ b/src/transformers/models/realm/tokenization_realm_fast.py @@ -24,17 +24,23 @@ PRETRAINED_VOCAB_FILES_MAP = { "vocab_file": { - "realm-cc-news-pretrained": "https://huggingface.co/realm-cc-news-pretrained/resolve/main/vocab.txt", + "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/vocab.txt", + "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/vocab.txt", + "realm-cc-news-pretrained-encoder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-encoder/resolve/main/vocab.txt", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "realm-cc-news-pretrained": 512, + "realm-cc-news-pretrained-embedder": 512, + "realm-cc-news-pretrained-retriever": 512, + "realm-cc-news-pretrained-encoder": 512 } PRETRAINED_INIT_CONFIGURATION = { - "realm-cc-news-pretrained": {"do_lower_case": False}, + "realm-cc-news-pretrained-embedder": 
{"do_lower_case": True}, + "realm-cc-news-pretrained-retriever": {"do_lower_case": True}, + "realm-cc-news-pretrained-encoder": {"do_lower_case": True}, } From ae889ee189fc8ca5e45ae53845af60fa0f4eb423 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Tue, 31 Aug 2021 19:37:27 +0800 Subject: [PATCH 11/98] Pass all tests in test_modeling_realm.py --- src/transformers/models/realm/__init__.py | 3 +- .../models/realm/configuration_realm.py | 17 +- .../models/realm/modeling_realm.py | 153 ++++------- tests/test_modeling_realm.py | 255 ++++++++++++------ 4 files changed, 237 insertions(+), 191 deletions(-) diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index d3b21d865a8c..a99bb2b7d8c3 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -17,6 +17,7 @@ # limitations under the License. from typing import TYPE_CHECKING from ...file_utils import _LazyModule, is_torch_available, is_tokenizers_available + _import_structure = { "configuration_realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig"], "tokenization_realm": ["RealmTokenizer"], @@ -37,8 +38,6 @@ ] - - if TYPE_CHECKING: from .configuration_realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig from .tokenization_realm import RealmTokenizer diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 41b51523e468..739e431a24fb 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -21,7 +21,9 @@ logger = logging.get_logger(__name__) REALM_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "realm-cc-news-pretrained": "https://huggingface.co/realm-cc-news-pretrained/resolve/main/config.json", + "realm-cc-news-pretrained-bert": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-bert/resolve/main/config.json", + "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/config.json", + "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/config.json", # See all REALM models at https://huggingface.co/models?filter=realm } @@ -80,13 +82,14 @@ class RealmConfig(PretrainedConfig): >>> # Initializing a REALM realm-cc-news-pretrained style configuration >>> configuration = RealmConfig() - >>> # Initializing a model from the realm-cc-news-pretrained style configuration - >>> model = RealmModel(configuration) + >>> # Initializing a model from the qqaatw/realm-cc-news-pretrained-embedder style configuration + >>> model = RealmRetriever(configuration) >>> # Accessing the model configuration >>> configuration = model.config """ model_type = "realm" + def __init__( self, vocab_size=30522, @@ -110,12 +113,7 @@ def __init__( eos_token_id=2, **kwargs ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs - ) + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings @@ -132,4 +130,3 @@ def __init__( self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 14f58ad28649..4d20d46653ef 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ 
b/src/transformers/models/realm/modeling_realm.py @@ -15,7 +15,6 @@ """ PyTorch REALM model. """ -import math import os import torch @@ -33,16 +32,7 @@ add_start_docstrings_to_model_forward, replace_return_docstrings, ) -from ...modeling_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, - MaskedLMOutput, - MultipleChoiceModelOutput, - QuestionAnsweringModelOutput, - SequenceClassifierOutput, - TokenClassifierOutput, - ModelOutput -) +from ...modeling_outputs import MaskedLMOutput, ModelOutput from ...modeling_utils import ( PreTrainedModel, SequenceSummary, @@ -62,8 +52,9 @@ _TOKENIZER_FOR_DOC = "RealmTokenizer" REALM_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "qqaatw/realm-cc-news-pretrained-bert", "qqaatw/realm-cc-news-pretrained-embedder", - "qqaatw/realm-cc-news-pretrained-bert" + "qqaatw/realm-cc-news-pretrained-retriever", # See all REALM models at https://huggingface.co/models?filter=realm ] @@ -107,13 +98,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model if any( - n in [ - "adam_v", - "adam_m", - "AdamWeightDecayOptimizer", - "AdamWeightDecayOptimizer_1", - "global_step" - ] + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] for n in name ): logger.info(f"Skipping {'/'.join(name)}") @@ -130,7 +115,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): pointer = getattr(pointer, "bias") elif scope_names[0] == "output_weights": pointer = getattr(pointer, "weight") - #elif scope_names[0] == "squad": + # elif scope_names[0] == "squad": # pointer = getattr(pointer, "classifier") else: try: @@ -157,32 +142,6 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): return model -@dataclass -class BaseModelOutput(ModelOutput): - """ - Base class for model's outputs, with potential hidden states and attentions. - - Args: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - @dataclass class RealmEmbedderOutput(ModelOutput): """ @@ -190,6 +149,7 @@ class RealmEmbedderOutput(ModelOutput): Args: projected_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.retriever_proj_size)`): + Projected scores. 
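            (Editor's note, not part of the patch.) The projected score is produced by
            ``RealmRetrieverProjection``: a linear map from ``hidden_size`` down to
            ``retriever_proj_size`` followed by ``LayerNorm``, applied to BERT's
            pooled ``[CLS]`` output.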
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) @@ -283,7 +243,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.retriever_proj_size, eps=config.layer_norm_eps) def forward(self, hidden_states): - #hidden_states = self.predictions(hidden_states) hidden_states = self.dense(hidden_states) hidden_states = self.LayerNorm(hidden_states) return hidden_states @@ -301,7 +260,7 @@ class RealmPreTrainedModel(PreTrainedModel): _keys_to_ignore_on_load_missing = [r"position_ids"] def _init_weights(self, module): - """ Initialize the weights """ + """Initialize the weights""" if isinstance(module, nn.Linear): # Slightly different from the TF version which uses truncated_normal for initialization # cf https://github.com/pytorch/pytorch/pull/5617 @@ -315,17 +274,21 @@ def _init_weights(self, module): elif isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) - - def _flatten_inputs(self, *inputs): + + def _flatten_inputs(self, *inputs): + """Flatten inputs to (batch_size, ..., input_shape[-1])""" flattened_inputs = [] for tensor in inputs: - input_shape = tensor.shape - if len(input_shape) > 2: - tensor = tensor.view((-1, input_shape[-1])) - flattened_inputs.append(tensor) + if tensor is None: + flattened_inputs.append(None) + else: + input_shape = tensor.shape + if len(input_shape) > 2: + tensor = tensor.view((-1, input_shape[-1])) + flattened_inputs.append(tensor) return flattened_inputs - - + + REALM_START_DOCSTRING = r""" This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general @@ -389,7 +352,7 @@ def _flatten_inputs(self, *inputs): @add_start_docstrings( - "The embedder of REALM outputting raw hidden-states without any specific head on top.", + "The embedder of REALM outputting projected score that will be used to calculate relevance score.", REALM_START_DOCSTRING, ) class RealmEmbedder(RealmPreTrainedModel): @@ -400,6 +363,12 @@ def __init__(self, config): self.cls = RealmRetrieverProjection(self.config) self.init_weights() + def get_input_embeddings(self): + return self.bert.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.bert.embeddings.word_embeddings = value + def forward( self, input_ids=None, @@ -415,6 +384,8 @@ def forward( return_dict=None, ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + bert_outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -430,22 +401,22 @@ def forward( ) # [batch_size, hidden_size] - pooler_output = bert_outputs.pooler_output + pooler_output = bert_outputs[1] # [batch_size, retriever_proj_size] projected_score = self.cls(pooler_output) - + if not return_dict: - return (projected_score,) + bert_outputs[1:] + return (projected_score,) + bert_outputs[2:4] else: return RealmEmbedderOutput( - projected_score = projected_score, - hidden_states = bert_outputs.hidden_states, - attentions = bert_outputs.attentions, + projected_score=projected_score, + hidden_states=bert_outputs.hidden_states, + attentions=bert_outputs.attentions, ) @add_start_docstrings( - "The retriever of REALM outputting raw hidden-states without any specific head on top.", + "The retriever of REALM outputting relevance score representing 
the score of document candidates (before softmax)", REALM_START_DOCSTRING, ) class RealmRetriever(RealmPreTrainedModel): @@ -453,12 +424,12 @@ def __init__(self, config, query_embedder=None): super().__init__(config) self.embedder = RealmEmbedder(self.config) - + if query_embedder: self.query_embedder = query_embedder else: self.query_embedder = self.embedder - + self.init_weights() def forward( @@ -479,6 +450,8 @@ def forward( return_dict=None, ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + query_outputs = self.query_embedder( input_ids, attention_mask=attention_mask, @@ -494,14 +467,8 @@ def forward( ) # [batch_size * num_candidates, candidate_seq_len] - ( - flattened_input_ids, - flattened_attention_mask, - flattened_token_type_ids - ) = self._flatten_inputs( - candidate_input_ids, - candidate_attention_mask, - candidate_token_type_ids + (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs( + candidate_input_ids, candidate_attention_mask, candidate_token_type_ids ) candidate_outputs = self.embedder( @@ -529,16 +496,14 @@ def forward( if not return_dict: return relevance_score, query_score, candidate_score - else: - return RealmRetrieverOutput( - relevance_score = relevance_score, - query_score = query_score, - candidate_score = candidate_score - ) + + return RealmRetrieverOutput( + relevance_score=relevance_score, query_score=query_score, candidate_score=candidate_score + ) @add_start_docstrings( - "The encoder of REALM outputting raw hidden-states without any specific head on top.", + "The encoder of REALM outputting masked lm logits and marginal log-likelihood loss.", REALM_START_DOCSTRING, ) class RealmEncoder(RealmPreTrainedModel): @@ -577,14 +542,11 @@ def forward( output_hidden_states=None, return_dict=None, ): - ( - flattened_input_ids, - flattened_attention_mask, - flattened_token_type_ids - ) = self._flatten_inputs( - input_ids, - attention_mask, - token_type_ids + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs( + input_ids, attention_mask, token_type_ids ) joint_outputs = self.bert( @@ -616,8 +578,8 @@ def forward( mlm_mask = mlm_mask.type(torch.float32) # Compute marginal log-likelihood - loss_fct = CrossEntropyLoss(reduction='none') # -100 index = padding token - + loss_fct = CrossEntropyLoss(reduction="none") # -100 index = padding token + # [batch_size * num_candidates * joint_seq_len, vocab_size] mlm_logits = prediction_scores.view(-1, self.config.vocab_size) # [batch_size * num_candidates * joint_seq_len] @@ -631,13 +593,10 @@ def forward( # [batch_size, joint_seq_len] marginal_gold_log_probs = joint_gold_log_prob.logsumexp(1) # [] - masked_lm_loss = -torch.nansum( - torch.sum(marginal_gold_log_probs * mlm_mask) / - torch.sum(mlm_mask) - ) + masked_lm_loss = -torch.nansum(torch.sum(marginal_gold_log_probs * mlm_mask) / torch.sum(mlm_mask)) if not return_dict: - output = (prediction_scores,) + joint_outputs[1:] + output = (prediction_scores,) + joint_outputs[2:4] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output return MaskedLMOutput( @@ -645,4 +604,4 @@ def forward( logits=prediction_scores, hidden_states=joint_outputs.hidden_states, attentions=joint_outputs.attentions, - ) \ No newline at end of file + ) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 8eb648af1c25..9c51af8eaa2d 100644 
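Editor's note, not part of the patch: the marginal log-likelihood in
``RealmEncoder.forward`` above is easier to follow in isolation. Below is a minimal,
self-contained sketch with toy shapes. Repeating ``labels`` once per candidate is an
assumption of the sketch (the hunk elides that step); everything else mirrors the code
shown above.

    import torch
    from torch.nn import CrossEntropyLoss

    batch_size, num_candidates, seq_len, vocab_size = 2, 3, 5, 11
    relevance_score = torch.randn(batch_size, num_candidates)
    prediction_scores = torch.randn(batch_size * num_candidates, seq_len, vocab_size)
    labels = torch.randint(vocab_size, (batch_size, seq_len))
    mlm_mask = torch.ones_like(labels, dtype=torch.float32)

    # Per-token log-likelihood of the gold tokens under every candidate; the
    # labels are repeated once per candidate to match the flattened logits.
    loss_fct = CrossEntropyLoss(reduction="none")
    targets = labels.unsqueeze(1).expand(-1, num_candidates, -1).reshape(-1)
    gold_log_prob = -loss_fct(prediction_scores.view(-1, vocab_size), targets)
    gold_log_prob = gold_log_prob.view(batch_size, num_candidates, seq_len)

    # log p(candidate | query), broadcast over the sequence dimension.
    candidate_log_prob = relevance_score.log_softmax(-1).unsqueeze(-1)

    # Sum of logs = log of the joint; logsumexp then marginalizes candidates out.
    marginal_gold_log_probs = (candidate_log_prob + gold_log_prob).logsumexp(1)
    masked_lm_loss = -torch.nansum(
        torch.sum(marginal_gold_log_probs * mlm_mask) / torch.sum(mlm_mask)
    )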
--- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -30,6 +30,7 @@ import torch from transformers import ( + RealmEmbedder, RealmEncoder, RealmRetriever, ) @@ -40,32 +41,35 @@ class RealmModelTester: def __init__( - self, - parent, - batch_size=13, - seq_length=7, - is_training=True, - use_input_mask=True, - use_token_type_ids=True, - use_labels=True, - vocab_size=99, - hidden_size=32, - num_hidden_layers=5, - num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, - scope=None, + self, + parent, + batch_size=13, + retriever_proj_size=128, + seq_length=7, + is_training=False, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + num_candidates=10, + scope=None, ): self.parent = parent self.batch_size = batch_size + self.retriever_proj_size = retriever_proj_size self.seq_length = seq_length self.is_training = is_training self.use_input_mask = use_input_mask @@ -85,18 +89,26 @@ def __init__( self.initializer_range = initializer_range self.num_labels = num_labels self.num_choices = num_choices + self.num_candidates = num_candidates self.scope = scope def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + candiate_input_ids = ids_tensor([self.batch_size, self.num_candidates, self.seq_length], self.vocab_size) input_mask = None + candiate_input_mask = None if self.use_input_mask: input_mask = random_attention_mask([self.batch_size, self.seq_length]) + candiate_input_mask = random_attention_mask([self.batch_size, self.num_candidates, self.seq_length]) token_type_ids = None + candidate_token_type_ids = None if self.use_token_type_ids: token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + candidate_token_type_ids = ids_tensor( + [self.batch_size, self.num_candidates, self.seq_length], self.type_vocab_size + ) sequence_labels = None token_labels = None @@ -108,14 +120,28 @@ def prepare_config_and_inputs(self): config = self.get_config() - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + # inputs with additional num_candidates axis. 
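        # (Editor's note) For reference: each candidate tensor below has shape
        # [batch_size, num_candidates, seq_length]; RealmPreTrainedModel._flatten_inputs
        # collapses the leading two axes to [batch_size * num_candidates, seq_length]
        # before the tensors reach BertModel.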
+ candidate_inputs = (candiate_input_ids, candiate_input_mask, candidate_token_type_ids) + + return ( + config, + input_ids, + token_type_ids, + input_mask, + candidate_inputs, + sequence_labels, + token_labels, + choice_labels, + ) def get_config(self): return RealmConfig( vocab_size=self.vocab_size, hidden_size=self.hidden_size, + retriever_proj_size=self.retriever_proj_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, + num_candidates=self.num_candidates, intermediate_size=self.intermediate_size, hidden_act=self.hidden_act, hidden_dropout_prob=self.hidden_dropout_prob, @@ -126,63 +152,76 @@ def get_config(self): initializer_range=self.initializer_range, ) - def prepare_config_and_inputs_for_decoder(self): - ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - ) = self.prepare_config_and_inputs() - - config.is_decoder = True - encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size]) - encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - - return ( - config, - input_ids, - token_type_ids, - input_mask, - sequence_labels, - token_labels, - choice_labels, - encoder_hidden_states, - encoder_attention_mask, - ) - - def create_and_check_encoder( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + def create_and_check_embedder( + self, + config, + input_ids, + token_type_ids, + input_mask, + candidate_inputs, + sequence_labels, + token_labels, + choice_labels, ): - model = RealmEncoder(config=config) + model = RealmEmbedder(config=config) model.to(torch_device) model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) + self.parent.assertEqual(result.projected_score.shape, (self.batch_size, self.retriever_proj_size)) - """ - def create_and_check_model( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + def create_and_check_encoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + candidate_inputs, + sequence_labels, + token_labels, + choice_labels, ): - model = RealmModel(config=config) + model = RealmEncoder(config=config) model.to(torch_device) model.eval() - result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids) - result = model(input_ids, token_type_ids=token_type_ids) - result = model(input_ids) - self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) - """ + relevance_score = floats_tensor([self.batch_size, self.num_candidates]) + result = model( + candidate_inputs[0], + attention_mask=candidate_inputs[1], + token_type_ids=candidate_inputs[2], + relevance_score=relevance_score, + labels=token_labels, + ) + self.parent.assertEqual( + result.logits.shape, (self.batch_size * self.num_candidates, self.seq_length, self.vocab_size) + ) def create_and_check_retriever( - self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + self, + config, + input_ids, + token_type_ids, + input_mask, + candidate_inputs, + sequence_labels, + token_labels, + choice_labels, ): model = RealmRetriever(config=config) model.to(torch_device) model.eval() - result = 
model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels) - self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + result = model( + input_ids, + attention_mask=input_mask, + token_type_ids=token_type_ids, + candidate_input_ids=candidate_inputs[0], + candidate_attention_mask=candidate_inputs[1], + candidate_token_type_ids=candidate_inputs[2], + ) + self.parent.assertEqual(result.relevance_score.shape, (self.batch_size, self.num_candidates)) + self.parent.assertEqual(result.query_score.shape, (self.batch_size, self.retriever_proj_size)) + self.parent.assertEqual( + result.candidate_score.shape, (self.batch_size, self.num_candidates, self.retriever_proj_size) + ) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() @@ -191,6 +230,7 @@ def prepare_config_and_inputs_for_common(self): input_ids, token_type_ids, input_mask, + candidate_inputs, sequence_labels, token_labels, choice_labels, @@ -204,36 +244,41 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( + RealmEmbedder, RealmEncoder, - RealmRetriever, + # RealmRetriever, # need overrite test_attention_outputs ) if is_torch_available() else () ) all_generative_model_classes = () + # disable these tests because there is no base_model in Realm + test_save_load_fast_init_from_base = False + test_save_load_fast_init_to_base = False + def setUp(self): + self.test_pruning = False self.model_tester = RealmModelTester(self) - self.config_tester = ConfigTester(self, config_class=RealmConfig, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=RealmConfig) def test_config(self): self.config_tester.run_common_tests() + def test_embedder(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_embedder(*config_and_inputs) + def test_encoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_encoder(*config_and_inputs) - - """ - def test_model(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) - """ def test_model_various_embeddings(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() for type in ["absolute", "relative_key", "relative_key_query"]: config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) + self.model_tester.create_and_check_embedder(*config_and_inputs) + self.model_tester.create_and_check_encoder(*config_and_inputs) def test_retriever(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() @@ -251,24 +296,70 @@ def test_retriever_from_pretrained(self): model = RealmRetriever.from_pretrained(model_name) self.assertIsNotNone(model) + @require_torch class RealmModelIntegrationTest(unittest.TestCase): @slow - def test_inference_masked_lm(self): - model = RealmRetriever.from_pretrained("realm-cc-news-pretrained-embedder") + def test_inference_embedder(self): + retriever_projected_size = 128 + vocab_size = 30522 + + model = RealmEmbedder.from_pretrained("qqaatw/realm-cc-news-pretrained-embedder") input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) output = model(input_ids)[0] + expected_shape = torch.Size((1, retriever_projected_size)) + self.assertEqual(output.shape, expected_shape) + + print('embedder', output[:, :3]) + # TODO Replace values below with what was printed above. 
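        # (Editor's note) `output` is the [1, retriever_proj_size] projected score,
        # so the slice below pins its first three components.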
+ expected_slice = torch.tensor( + [[-0.0714, -0.0837, -0.1314]] + ) + + self.assertTrue(torch.allclose(output[:, :3], expected_slice, atol=1e-4)) + + @slow + def test_inference_encoder(self): + num_candidates = 2 vocab_size = 30522 - expected_shape = torch.Size((1, 6, vocab_size)) + model = RealmEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert", num_candidates=num_candidates) + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) + relevance_score = torch.tensor([[0.3, 0.7]], dtype=torch.float32) + output = model(input_ids, relevance_score=relevance_score)[0] + + expected_shape = torch.Size((2, 6, vocab_size)) self.assertEqual(output.shape, expected_shape) # TODO Replace values below with what was printed above. + + print('encoder', output[1, :2, :2]) expected_slice = torch.tensor( - [[[-0.0483, 0.1188, -0.0313], [-0.0606, 0.1435, 0.0199], [-0.0235, 0.1519, 0.0175]]] + [[[-11.0888, -11.2544], [-10.2170, -10.3874]]] ) - self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4)) + self.assertTrue(torch.allclose(output[1, :2, :2], expected_slice, atol=1e-4)) + @slow + def test_inference_retriever(self): + num_candidates = 2 + vocab_size = 30522 + + model = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates) + + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + candidate_input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) + output = model(input_ids, candidate_input_ids=candidate_input_ids)[0] + + + expected_shape = torch.Size((1, 2)) + self.assertEqual(output.shape, expected_shape) + + print('retriever', output) + # TODO Replace values below with what was printed above. + expected_slice = torch.tensor( + [[0.7410, 0.7170]] + ) + self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) From 1b3bba2baf1de28f9824f94b3d04fd64a559cc86 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Tue, 31 Aug 2021 23:34:22 +0800 Subject: [PATCH 12/98] Prune RealmModel --- src/transformers/models/realm/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index a99bb2b7d8c3..904b713859f7 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -31,7 +31,6 @@ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", "RealmEmbedder", "RealmEncoder", - "RealmModel", "RealmPreTrainedModel", "RealmRetriever", "load_tf_weights_in_realm", @@ -50,7 +49,6 @@ REALM_PRETRAINED_MODEL_ARCHIVE_LIST, RealmEmbedder, RealmEncoder, - RealmModel, RealmPreTrainedModel, RealmRetriever, load_tf_weights_in_realm, From 766d663b9e5f0e39490fb1a68af76cc4ebcf0a0a Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 00:46:17 +0800 Subject: [PATCH 13/98] Update docs --- docs/source/index.rst | 1 + docs/source/model_doc/realm.rst | 12 +-- .../models/realm/configuration_realm.py | 18 ++-- .../models/realm/modeling_realm.py | 95 +++++++++++++++---- 4 files changed, 90 insertions(+), 36 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index d1cce5aaa33a..6f658ff8461d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -570,6 +570,7 @@ Flax), PyTorch, and/or TensorFlow. 
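Editor's aside, not part of the patch: the integration tests above call the retriever
as sketched below. This is a shape-level illustration only, assuming the classes as
defined at this point in the series; the config is randomly initialized, so the scores
themselves are meaningless.

    import torch
    from transformers import RealmConfig, RealmRetriever

    config = RealmConfig(num_candidates=2)
    model = RealmRetriever(config)
    model.eval()

    input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])  # query: [1, 6]
    candidate_input_ids = torch.tensor(
        [[[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]]
    )  # candidates: [1, num_candidates, 6]

    with torch.no_grad():
        outputs = model(input_ids, candidate_input_ids=candidate_input_ids)
    print(outputs[0].shape)  # relevance_score: torch.Size([1, 2])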
model_doc/phobert model_doc/prophetnet model_doc/rag + model_doc/realm model_doc/reformer model_doc/rembert model_doc/retribert diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index 21696675c7f6..83a924f3f771 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -19,7 +19,7 @@ Overview The REALM model was proposed in `REALM: Retrieval-Augmented Language Model Pre-Training `__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a retrieval-augmented language model that firstly retrieves neural knowledge from a textual knowledge corpus and then -utilizes retrieved documents to process Open-QA tasks. +utilizes retrieved documents to process question answering tasks. The abstract from the paper is the following: @@ -27,12 +27,8 @@ The abstract from the paper is the following: To capture knowledge in a more modular and interpretable way, we augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as interpretability and modularity.* -Tips: - - - This model was contributed by `qqaatw ->`__. The original code can be found `here +`__. The original code can be found `here `__. RealmConfig @@ -57,10 +53,10 @@ RealmTokenizerFast :members: -RealmModel +RealmEmbedder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.RealmModel +.. autoclass:: transformers.RealmEmbedder :members: forward diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 739e431a24fb..c0a605333cf6 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -47,13 +47,17 @@ class RealmConfig(PretrainedConfig): :class:`~transformers.TFRealmModel`. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimension of the encoder layers and the pooler layer. + retriever_proj_size (:obj:`int`, `optional`, defaults to 128): + Dimension of the retriever(embedder) projection. num_hidden_layers (:obj:`int`, `optional`, defaults to 12): Number of hidden layers in the Transformer encoder. num_attention_heads (:obj:`int`, `optional`, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. + num_candidates (:obj:`int`, `optional`, defaults to 8): + Number of candidates inputted to the RealmRetriever or RealmEncoder. intermediate_size (:obj:`int`, `optional`, defaults to 3072): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
- hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`): The non-linear activation function (function or string) in the encoder and pooler. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): @@ -73,17 +77,16 @@ class RealmConfig(PretrainedConfig): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if ``config.is_decoder=True``. - gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`): - If :obj:`True`, use gradient checkpointing to save memory at the expense of slower backward pass. - Example:: + + Example:: - >>> from transformers import RealmModel, RealmConfig + >>> from transformers import RealmEmbedder, RealmConfig - >>> # Initializing a REALM realm-cc-news-pretrained style configuration + >>> # Initializing a REALM realm-cc-news-pretrained-* style configuration >>> configuration = RealmConfig() >>> # Initializing a model from the qqaatw/realm-cc-news-pretrained-embedder style configuration - >>> model = RealmRetriever(configuration) + >>> model = RealmEmbedder(configuration) >>> # Accessing the model configuration >>> configuration = model.config @@ -107,7 +110,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, use_cache=True, - is_encoder_decoder=False, pad_token_id=1, bos_token_id=0, eos_token_id=2, diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 4d20d46653ef..f27c038e2a00 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -150,7 +150,7 @@ class RealmEmbedderOutput(ModelOutput): Args: projected_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.retriever_proj_size)`): - Projected scores. + Projected score. hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. @@ -176,11 +176,11 @@ class RealmRetrieverOutput(ModelOutput): Args: relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_candidates)`): - Relevance score. + The relevance score of document candidates (before softmax). query_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.retriever_proj_size)`): - Query score. + Query score derived from the query embedder. candidate_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_candidates, config.retriever_proj_size)`): - Candidate score. + Candidate score derived from the embedder. 
""" relevance_score: torch.FloatTensor = None @@ -369,6 +369,8 @@ def get_input_embeddings(self): def set_input_embeddings(self, value): self.bert.embeddings.word_embeddings = value + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=RealmEmbedderOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -377,12 +379,13 @@ def forward( position_ids=None, head_mask=None, inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): + r""" + Returns: + """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -393,8 +396,6 @@ def forward( position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -420,6 +421,11 @@ def forward( REALM_START_DOCSTRING, ) class RealmRetriever(RealmPreTrainedModel): + r""" + Parameters: + query_embedder (:class:`~transformers.RealmEmbedder`): + Embedder for input sequences. If not specified, it will use the same embedder as candidate sequences. + """ def __init__(self, config, query_embedder=None): super().__init__(config) @@ -432,6 +438,8 @@ def __init__(self, config, query_embedder=None): self.init_weights() + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=RealmRetrieverOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -441,17 +449,50 @@ def forward( candidate_input_ids=None, candidate_attention_mask=None, candidate_token_type_ids=None, + candidate_inputs_embeds=None, head_mask=None, inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): + r""" + candidate_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_candidates, sequence_length)`): + Indices of candidate input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.RealmTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.__call__` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + candidate_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_candidates, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + candidate_token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_candidates, sequence_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + candidate_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_candidates, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`candidate_input_ids` you can choose to directly pass an embedded representation. 
+ This is useful if you want more control over how to convert `candidate_input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + + Returns: + """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if not (any((input_ids, inputs_embeds)) and any((input_ids, inputs_embeds))): + raise ValueError("You have to specify both inputs and candidate inputs") + query_outputs = self.query_embedder( input_ids, attention_mask=attention_mask, @@ -459,8 +500,6 @@ def forward( position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -477,9 +516,7 @@ def forward( token_type_ids=flattened_token_type_ids, position_ids=position_ids, head_mask=head_mask, - inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, + inputs_embeds=candidate_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -525,23 +562,40 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, - relevance_score=None, head_mask=None, inputs_embeds=None, - encoder_hidden_states=None, - encoder_attention_mask=None, + relevance_score=None, labels=None, mlm_mask=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): + r""" + relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_candidates)`, `optional`): + Relevance score derived from RealmRetriever. + + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + + mlm_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid calculating joint loss on certain positions. If not specified, the loss will not be masked. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -556,8 +610,6 @@ def forward( position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, - encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, @@ -572,6 +624,9 @@ def forward( masked_lm_loss = None if labels is not None: + if candidate_score is None: + raise ValueError("You have to specify relevance_score when `labels` is specified in order to calculate loss.") + if mlm_mask is None: mlm_mask = torch.ones_like(labels, dtype=torch.float32) else: From eb1837be5f5379c85141801af4f5ce37a8566125 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 00:56:02 +0800 Subject: [PATCH 14/98] Add training test. --- src/transformers/models/realm/modeling_realm.py | 2 +- tests/test_modeling_realm.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index f27c038e2a00..c4fda1e97b7f 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -490,7 +490,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not (any((input_ids, inputs_embeds)) and any((input_ids, inputs_embeds))): + if not (any((input_ids is not None, inputs_embeds is not None)) and any((candidate_input_ids is not None, candidate_inputs_embeds is not None))): raise ValueError("You have to specify both inputs and candidate inputs") query_outputs = self.query_embedder( diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 9c51af8eaa2d..1763c40ab12f 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -284,6 +284,21 @@ def test_retriever(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_retriever(*config_and_inputs) + def test_training(self): + if not self.model_tester.is_training: + return + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in [RealmEncoder]: + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + @slow def test_encoder_from_pretrained(self): for model_name in REALM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: From b3161499345b11c0a5a31ee57f6cd80e6fb11d90 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 01:00:27 +0800 Subject: [PATCH 15/98] Remove completed TODO --- tests/test_modeling_realm.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 1763c40ab12f..1c0c0d9213eb 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -326,14 +326,9 @@ def test_inference_embedder(self): expected_shape = torch.Size((1, retriever_projected_size)) self.assertEqual(output.shape, expected_shape) - print('embedder', output[:, :3]) - # TODO Replace values below with what was printed above. 
- expected_slice = torch.tensor( - [[-0.0714, -0.0837, -0.1314]] - ) - + expected_slice = torch.tensor([[-0.0714, -0.0837, -0.1314]]) self.assertTrue(torch.allclose(output[:, :3], expected_slice, atol=1e-4)) - + @slow def test_inference_encoder(self): num_candidates = 2 @@ -347,12 +342,7 @@ def test_inference_encoder(self): expected_shape = torch.Size((2, 6, vocab_size)) self.assertEqual(output.shape, expected_shape) - # TODO Replace values below with what was printed above. - - print('encoder', output[1, :2, :2]) - expected_slice = torch.tensor( - [[[-11.0888, -11.2544], [-10.2170, -10.3874]]] - ) + expected_slice = torch.tensor([[[-11.0888, -11.2544], [-10.2170, -10.3874]]]) self.assertTrue(torch.allclose(output[1, :2, :2], expected_slice, atol=1e-4)) @@ -361,20 +351,16 @@ def test_inference_retriever(self): num_candidates = 2 vocab_size = 30522 - model = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates) - + model = RealmRetriever.from_pretrained( + "qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates + ) + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) candidate_input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) output = model(input_ids, candidate_input_ids=candidate_input_ids)[0] - expected_shape = torch.Size((1, 2)) self.assertEqual(output.shape, expected_shape) - print('retriever', output) - # TODO Replace values below with what was printed above. - expected_slice = torch.tensor( - [[0.7410, 0.7170]] - ) - + expected_slice = torch.tensor([[0.7410, 0.7170]]) self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) From 1b14c70f249950bfe0a677fd0a76b7d2cdfa494d Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 01:14:11 +0800 Subject: [PATCH 16/98] Style & Quality --- src/transformers/models/__init__.py | 2 +- src/transformers/models/realm/__init__.py | 4 +- .../models/realm/configuration_realm.py | 2 +- .../models/realm/modeling_realm.py | 39 ++++++++----------- .../models/realm/tokenization_realm.py | 2 +- .../models/realm/tokenization_realm_fast.py | 2 +- tests/test_modeling_realm.py | 15 ++----- 7 files changed, 26 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 1d62e74bb2f1..25cffdfe8a62 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -17,7 +17,6 @@ # limitations under the License. from . import ( - realm, albert, auto, bart, @@ -73,6 +72,7 @@ phobert, prophetnet, rag, + realm, reformer, rembert, retribert, diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index 904b713859f7..dc8d5684df34 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -16,7 +16,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from typing import TYPE_CHECKING -from ...file_utils import _LazyModule, is_torch_available, is_tokenizers_available + +from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available + _import_structure = { "configuration_realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig"], diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index c0a605333cf6..4e3c2afbd1e7 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -77,7 +77,7 @@ class RealmConfig(PretrainedConfig): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if ``config.is_decoder=True``. - + Example:: >>> from transformers import RealmEmbedder, RealmConfig diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index c4fda1e97b7f..1d893af32ed1 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -16,30 +16,18 @@ import os +from dataclasses import dataclass +from typing import Optional, Tuple import torch import torch.utils.checkpoint -from typing import Optional, Tuple -from dataclasses import dataclass -from packaging import version from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...file_utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings from ...modeling_outputs import MaskedLMOutput, ModelOutput -from ...modeling_utils import ( - PreTrainedModel, - SequenceSummary, - apply_chunking_to_forward, - find_pruneable_heads_and_indices, - prune_linear_layer, -) +from ...modeling_utils import PreTrainedModel from ...utils import logging from ..bert import BertModel from .configuration_realm import RealmConfig @@ -85,7 +73,6 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): arrays.append(array) for name, array in zip(names, arrays): - original_name = name # embedder embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" @@ -423,9 +410,10 @@ def forward( class RealmRetriever(RealmPreTrainedModel): r""" Parameters: - query_embedder (:class:`~transformers.RealmEmbedder`): + query_embedder (:class:`~transformers.RealmEmbedder`): Embedder for input sequences. If not specified, it will use the same embedder as candidate sequences. """ + def __init__(self, config, query_embedder=None): super().__init__(config) @@ -484,13 +472,16 @@ def forward( Optionally, instead of passing :obj:`candidate_input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `candidate_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. 
- + Returns: """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if not (any((input_ids is not None, inputs_embeds is not None)) and any((candidate_input_ids is not None, candidate_inputs_embeds is not None))): + if not ( + any((input_ids is not None, inputs_embeds is not None)) + and any((candidate_input_ids is not None, candidate_inputs_embeds is not None)) + ): raise ValueError("You have to specify both inputs and candidate inputs") query_outputs = self.query_embedder( @@ -582,7 +573,7 @@ def forward( r""" relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_candidates)`, `optional`): Relevance score derived from RealmRetriever. - + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored @@ -625,7 +616,9 @@ def forward( masked_lm_loss = None if labels is not None: if candidate_score is None: - raise ValueError("You have to specify relevance_score when `labels` is specified in order to calculate loss.") + raise ValueError( + "You have to specify relevance_score when `labels` is specified in order to calculate loss." + ) if mlm_mask is None: mlm_mask = torch.ones_like(labels, dtype=torch.float32) diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index 4d992dd1ed79..2b4ad1f95678 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -32,7 +32,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "realm-cc-news-pretrained-embedder": 512, "realm-cc-news-pretrained-retriever": 512, - "realm-cc-news-pretrained-encoder": 512 + "realm-cc-news-pretrained-encoder": 512, } diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py index 853156c6e7b7..14e9349a982d 100644 --- a/src/transformers/models/realm/tokenization_realm_fast.py +++ b/src/transformers/models/realm/tokenization_realm_fast.py @@ -33,7 +33,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "realm-cc-news-pretrained-embedder": 512, "realm-cc-news-pretrained-retriever": 512, - "realm-cc-news-pretrained-encoder": 512 + "realm-cc-news-pretrained-encoder": 512, } diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 1c0c0d9213eb..8c680b57edde 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -18,10 +18,9 @@ import unittest from tests.test_modeling_common import floats_tensor -from transformers import is_torch_available +from transformers import RealmConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device -from transformers import RealmConfig from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -29,14 +28,8 @@ if is_torch_available(): import torch - from transformers import ( - RealmEmbedder, - RealmEncoder, - RealmRetriever, - ) - from transformers.models.realm.modeling_realm import ( - REALM_PRETRAINED_MODEL_ARCHIVE_LIST, - ) + from transformers import RealmEmbedder, RealmEncoder, RealmRetriever + from transformers.models.realm.modeling_realm import REALM_PRETRAINED_MODEL_ARCHIVE_LIST class RealmModelTester: @@ -317,7 +310,6 @@ class 
RealmModelIntegrationTest(unittest.TestCase): @slow def test_inference_embedder(self): retriever_projected_size = 128 - vocab_size = 30522 model = RealmEmbedder.from_pretrained("qqaatw/realm-cc-news-pretrained-embedder") input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) @@ -349,7 +341,6 @@ def test_inference_encoder(self): @slow def test_inference_retriever(self): num_candidates = 2 - vocab_size = 30522 model = RealmRetriever.from_pretrained( "qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates From ce0ef7067099fe04b9f7a0af56c644bf0ef32a5f Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 01:26:20 +0800 Subject: [PATCH 17/98] Prune `RealmModel` --- src/transformers/__init__.py | 2 -- src/transformers/models/realm/configuration_realm.py | 9 ++++----- tests/test_modeling_realm.py | 2 +- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 08e020c66b3d..d66548c23537 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -509,7 +509,6 @@ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", "RealmEmbedder", "RealmEncoder", - "RealmModel", "RealmPreTrainedModel", "RealmRetriever", "load_tf_weights_in_realm", @@ -2581,7 +2580,6 @@ REALM_PRETRAINED_MODEL_ARCHIVE_LIST, RealmEmbedder, RealmEncoder, - RealmModel, RealmPreTrainedModel, RealmRetriever, load_tf_weights_in_realm, diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 4e3c2afbd1e7..e4676c06be8a 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -30,7 +30,7 @@ class RealmConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of a :class:`~transformers.RealmModel`. + This is the configuration class to store the configuration of :class:`~transformers.RealmEmbedder`, :class:`~transformers.RealmRetriever`, and :class:`~transformers.RealmEncoder`. It is used to instantiate an REALM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM `realm-cc-news-pretrained `__ architecture. @@ -43,8 +43,7 @@ class RealmConfig(PretrainedConfig): Args: vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the - :obj:`inputs_ids` passed when calling :class:`~transformers.RealmModel` or - :class:`~transformers.TFRealmModel`. + :obj:`inputs_ids` passed when calling :class:`~transformers.RealmEmbedder`, :class:`~transformers.RealmRetriever`, or :class:`~transformers.RealmEncoder`. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimension of the encoder layers and the pooler layer. retriever_proj_size (:obj:`int`, `optional`, defaults to 128): @@ -68,8 +67,8 @@ class RealmConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RealmModel` or - :class:`~transformers.TFRealmModel`. + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RealmEmbedder`, + :class:`~transformers.RealmRetriever`, or :class:`~transformers.RealmEncoder`. 
initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 8c680b57edde..ac48a5ffb189 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -239,7 +239,7 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): ( RealmEmbedder, RealmEncoder, - # RealmRetriever, # need overrite test_attention_outputs + # RealmRetriever, # needs override test_attention_outputs ) if is_torch_available() else () From 92a6a5be2eb4022c4cc3f4a069690f0f9cd07a26 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 11:30:13 +0800 Subject: [PATCH 18/98] Fixup --- docs/source/index.rst | 41 +++++++++++-------- docs/source/model_doc/realm.rst | 26 ++++++++---- src/transformers/__init__.py | 20 ++++----- .../models/realm/configuration_realm.py | 25 +++++------ .../models/realm/modeling_realm.py | 40 +++++++++--------- src/transformers/utils/dummy_pt_objects.py | 31 ++++++++++++++ .../utils/dummy_tokenizers_objects.py | 9 ++++ tests/test_modeling_realm.py | 2 +- utils/check_repo.py | 4 ++ 9 files changed, 129 insertions(+), 69 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 086fd3d12d22..8a6fc056d417 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -251,58 +251,61 @@ Supported models 51. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -52. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient +52. :doc:`REALM ` (from Google Research) released with the paper `REALM: Retrieval-Augmented Language + Model Pre-Training `__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and + Ming-Wei Chang. +53. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -53. :doc:`RemBERT ` (from Google Research) released with the paper `Rethinking embedding coupling in +54. :doc:`RemBERT ` (from Google Research) released with the paper `Rethinking embedding coupling in pre-trained language models `__ by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -54. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT +55. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -55. :doc:`RoFormer ` (from ZhuiyiTechnology), released together with the paper a `RoFormer: +56. :doc:`RoFormer ` (from ZhuiyiTechnology), released together with the paper a `RoFormer: Enhanced Transformer with Rotary Position Embedding `__ by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -56. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper +57. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper `fairseq S2T: Fast Speech-to-Text Modeling with fairseq `__ by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -57. 
:doc:`Splinter ` (from Tel Aviv University), released together with the paper `Few-Shot +58. :doc:`Splinter ` (from Tel Aviv University), released together with the paper `Few-Shot Question Answering by Pretraining Span Selection `__ by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -58. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP +59. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -59. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a +60. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -60. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via +61. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -61. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: +62. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -62. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 +63. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -63. :doc:`VisualBERT ` (from UCLA NLP) released with the paper `VisualBERT: A Simple and +64. :doc:`VisualBERT ` (from UCLA NLP) released with the paper `VisualBERT: A Simple and Performant Baseline for Vision and Language `__ by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -64. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for +65. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations `__ by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -65. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model +66. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining `__ by Guillaume Lample and Alexis Conneau. -66. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: +67. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -67. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised +68. 
:doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -68. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive +69. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -69. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised +70. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. @@ -416,6 +419,8 @@ Flax), PyTorch, and/or TensorFlow. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ +| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | ++-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index 83a924f3f771..a602c7828b67 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -17,18 +17,26 @@ Overview ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The REALM model was proposed in `REALM: Retrieval-Augmented Language Model Pre-Training -`__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's -a retrieval-augmented language model that firstly retrieves neural knowledge from a textual knowledge corpus and then +`__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a +retrieval-augmented language model that firstly retrieves neural knowledge from a textual knowledge corpus and then utilizes retrieved documents to process question answering tasks. The abstract from the paper is the following: -*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, requiring ever-larger networks to cover more facts. -To capture knowledge in a more modular and interpretable way, we augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. 
-We demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as interpretability and modularity.* - -This model was contributed by `qqaatw -`__. The original code can be found `here +*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks +such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, +requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we +augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend +over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the +first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language +modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We +demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the +challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both +explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous +methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as +interpretability and modularity.* + +This model was contributed by `qqaatw `__. The original code can be found `here `__. RealmConfig @@ -71,4 +79,4 @@ RealmEncoder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
autoclass:: transformers.RealmEncoder
-    :members: forward
\ No newline at end of file
+    :members: forward
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 506d5db5a689..8f92164c35d8 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -514,16 +514,6 @@

     # PyTorch models structure

-    _import_structure["models.realm"].extend(
-        [
-            "REALM_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "RealmEmbedder",
-            "RealmEncoder",
-            "RealmPreTrainedModel",
-            "RealmRetriever",
-            "load_tf_weights_in_realm",
-        ]
-    )
     _import_structure["models.albert"].extend(
         [
             "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -1025,6 +1015,16 @@
     _import_structure["models.rag"].extend(
         ["RagModel", "RagPreTrainedModel", "RagSequenceForGeneration", "RagTokenForGeneration"]
     )
+    _import_structure["models.realm"].extend(
+        [
+            "REALM_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "RealmEmbedder",
+            "RealmEncoder",
+            "RealmPreTrainedModel",
+            "RealmRetriever",
+            "load_tf_weights_in_realm",
+        ]
+    )
     _import_structure["models.reformer"].extend(
         [
             "REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py
index e4676c06be8a..e12f8e4e3f52 100644
--- a/src/transformers/models/realm/configuration_realm.py
+++ b/src/transformers/models/realm/configuration_realm.py
@@ -30,20 +30,21 @@

 class RealmConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of :class:`~transformers.RealmEmbedder`, :class:`~transformers.RealmRetriever`, and :class:`~transformers.RealmEncoder`.
-    It is used to instantiate a REALM model according to the specified arguments, defining the model
-    architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-    the REALM `realm-cc-news-pretrained `__ architecture.
+    This is the configuration class to store the configuration of :class:`~transformers.RealmEmbedder`,
+    :class:`~transformers.RealmRetriever`, and :class:`~transformers.RealmEncoder`. It is used to instantiate a REALM
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the REALM `realm-cc-news-pretrained
+    `__ architecture.

-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
-    to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
-    for more information.
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.


     Args:
         vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.RealmEmbedder`, :class:`~transformers.RealmRetriever`, or :class:`~transformers.RealmEncoder`.
+            :obj:`inputs_ids` passed when calling :class:`~transformers.RealmEmbedder`,
+            :class:`~transformers.RealmRetriever`, or :class:`~transformers.RealmEncoder`.
         hidden_size (:obj:`int`, `optional`, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
retriever_proj_size (:obj:`int`, `optional`, defaults to 128):
@@ -57,15 +58,15 @@ class RealmConfig(PretrainedConfig):
         intermediate_size (:obj:`int`, `optional`, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
-            The non-linear activation function (function or string) in the encoder and pooler.
-            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
         max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
-            The maximum sequence length that this model might ever be used with.
-            Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
         type_vocab_size (:obj:`int`, `optional`, defaults to 2):
             The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RealmEmbedder`,
             :class:`~transformers.RealmRetriever`, or :class:`~transformers.RealmEncoder`.
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 1d893af32ed1..025c213f2042 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -237,8 +237,8 @@ def forward(self, hidden_states):

 class RealmPreTrainedModel(PreTrainedModel):
     """
-    An abstract class to handle weights initialization and
-    a simple interface for downloading and loading pretrained models.
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
     """

     config_class = RealmConfig
@@ -277,14 +277,15 @@ def _flatten_inputs(self, *inputs):


 REALM_START_DOCSTRING = r"""
-    This model is a PyTorch `torch.nn.Module `_ sub-class.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general
-    usage and behavior.
+    This model is a PyTorch `torch.nn.Module `_ sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.

     Parameters:
         config (:class:`~transformers.RealmConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the configuration.
-            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
 """

 REALM_INPUTS_DOCSTRING = r"""
@@ -292,9 +293,9 @@ def _flatten_inputs(self, *inputs):
         input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`):
             Indices of input sequence tokens in the vocabulary.

-            Indices can be obtained using :class:`transformers.RealmTokenizer`. 
- See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.__call__` for details. + Indices can be obtained using :class:`transformers.RealmTokenizer`. See + :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for + details. `What are input IDs? <../glossary.html#input-ids>`__ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): @@ -313,8 +314,8 @@ def _flatten_inputs(self, *inputs): `What are token type IDs? <../glossary.html#token-type-ids>`_ position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. `What are position IDs? <../glossary.html#position-ids>`_ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): @@ -448,9 +449,9 @@ def forward( candidate_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_candidates, sequence_length)`): Indices of candidate input sequence tokens in the vocabulary. - Indices can be obtained using :class:`transformers.RealmTokenizer`. - See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.__call__` for details. + Indices can be obtained using :class:`transformers.RealmTokenizer`. See + :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for + details. `What are input IDs? <../glossary.html#input-ids>`__ candidate_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_candidates, sequence_length)`, `optional`): @@ -469,9 +470,9 @@ def forward( `What are token type IDs? <../glossary.html#token-type-ids>`_ candidate_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_candidates, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`candidate_input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `candidate_input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. + Optionally, instead of passing :obj:`candidate_input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert `candidate_input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. Returns: """ @@ -580,7 +581,8 @@ def forward( (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` mlm_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Mask to avoid calculating joint loss on certain positions. If not specified, the loss will not be masked. Mask values selected in ``[0, 1]``: + Mask to avoid calculating joint loss on certain positions. If not specified, the loss will not be masked. + Mask values selected in ``[0, 1]``: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. 
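
A minimal sketch of the marginalized masked-LM loss that the `relevance_score`, `labels`, and `mlm_mask` arguments
documented above feed into; the shapes, variable names, and final mean reduction here are illustrative assumptions,
not the patch's exact code::

    import torch
    import torch.nn.functional as F

    batch_size, num_candidates, seq_len, vocab_size = 2, 2, 10, 30522
    # Hypothetical inputs: per-candidate MLM logits, retrieval relevance scores,
    # gold token ids, and a mask selecting the positions that count toward the loss.
    logits = torch.randn(batch_size, num_candidates, seq_len, vocab_size)
    relevance_score = torch.randn(batch_size, num_candidates)
    labels = torch.randint(0, vocab_size, (batch_size, seq_len))
    mlm_mask = torch.ones(batch_size, seq_len)

    # Marginalize each token's probability over the retrieved candidates,
    # weighting candidates by their softmaxed relevance score.
    candidate_score = F.softmax(relevance_score, dim=-1)                  # (B, C)
    token_prob = F.softmax(logits, dim=-1)                                # (B, C, L, V)
    gold = labels[:, None, :, None].expand(-1, num_candidates, -1, 1)
    gold_prob = token_prob.gather(-1, gold).squeeze(-1)                   # (B, C, L)
    joint_prob = (candidate_score[:, :, None] * gold_prob).sum(dim=1)     # (B, L)

    # Average the negative log-likelihood over the unmasked positions only.
    masked_lm_loss = -(joint_prob.log() * mlm_mask).sum() / mlm_mask.sum()
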
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 86de6778ca1f..c37d1c6cb02b 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2809,6 +2809,37 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +REALM_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class RealmEmbedder: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RealmEncoder: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class RealmPreTrainedModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["torch"]) + + +class RealmRetriever: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +def load_tf_weights_in_realm(*args, **kwargs): + requires_backends(load_tf_weights_in_realm, ["torch"]) + + REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index 382f0314bbfe..1fddcce2caed 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -263,6 +263,15 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["tokenizers"]) +class RealmTokenizerFast: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tokenizers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tokenizers"]) + + class ReformerTokenizerFast: def __init__(self, *args, **kwargs): requires_backends(self, ["tokenizers"]) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index ac48a5ffb189..6a8ee27a3dea 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -239,7 +239,7 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): ( RealmEmbedder, RealmEncoder, - # RealmRetriever, # needs override test_attention_outputs + RealmRetriever, ) if is_torch_available() else () diff --git a/utils/check_repo.py b/utils/check_repo.py index 088d760aa9b7..e9786752d162 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -71,6 +71,7 @@ "PegasusDecoderWrapper", # Building part of bigger (tested) model. "DPREncoder", # Building part of bigger (tested) model. "ProphetNetDecoderWrapper", # Building part of bigger (tested) model. + "RealmRetriever", # Submodels have been tested. "ReformerForMaskedLM", # Needs to be setup as decoder. "TFDPREncoder", # Building part of bigger (tested) model. "TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?) @@ -114,6 +115,9 @@ "RagModel", "RagSequenceForGeneration", "RagTokenForGeneration", + "RealmEmbedder", + "RealmEncoder", + "RealmRetriever", "TFDPRReader", "TFGPT2DoubleHeadsModel", "TFOpenAIGPTDoubleHeadsModel", From 28b8dac5f3970795ec3d504ea98687a5d4579f80 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 16:29:34 +0800 Subject: [PATCH 19/98] Changes: 1. Remove RealmTokenizerFast 2. Update docstrings 3. Add a method to RealmTokenizer to handle candidates tokenization. 
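
For orientation, the shape contract behind point 3 can be sketched by hand before reading the diff; the
`bert-base-uncased` stand-in checkpoint and the exact option names below are illustrative assumptions, not part of
this patch::

    import torch
    from transformers import BertTokenizer  # RealmTokenizer is built on BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # stand-in vocab

    # batch_size = 2, num_candidates = 2: one list of candidate texts per example.
    text = [["Hello world!", "Nice to meet you!"],
            ["The cute cat.", "The adorable dog."]]

    # Encoding every candidate to the same fixed max_length is what allows the
    # per-example (num_candidates, max_length) blocks to be stacked into one tensor.
    encoded = [tokenizer(cands, padding="max_length", max_length=10, truncation=True,
                         return_tensors="pt") for cands in text]
    input_ids = torch.stack([e["input_ids"] for e in encoded])
    print(input_ids.shape)  # torch.Size([2, 2, 10])
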
--- docs/source/index.rst | 2 +- docs/source/model_doc/realm.rst | 7 -- src/transformers/__init__.py | 2 - src/transformers/models/realm/__init__.py | 5 -- .../models/realm/modeling_realm.py | 36 +++++++-- .../models/realm/tokenization_realm.py | 76 +++++++++++++++++++ .../models/realm/tokenization_realm_fast.py | 62 --------------- .../utils/dummy_tokenizers_objects.py | 9 --- tests/test_modeling_realm.py | 4 +- 9 files changed, 109 insertions(+), 94 deletions(-) delete mode 100644 src/transformers/models/realm/tokenization_realm_fast.py diff --git a/docs/source/index.rst b/docs/source/index.rst index 8a6fc056d417..b23755778066 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -419,7 +419,7 @@ Flax), PyTorch, and/or TensorFlow. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Realm | ✅ | ✅ | ✅ | ❌ | ❌ | +| Realm | ✅ | ❌ | ✅ | ❌ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index a602c7828b67..c3da94bd6356 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -54,13 +54,6 @@ RealmTokenizer create_token_type_ids_from_sequences, save_vocabulary -RealmTokenizerFast -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.RealmTokenizerFast - :members: - - RealmEmbedder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 8f92164c35d8..3b484f37b2c1 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -357,7 +357,6 @@ # tokenizers-backed objects if is_tokenizers_available(): # Fast tokenizers - _import_structure["models.realm"].append("RealmTokenizerFast") _import_structure["models.roformer"].append("RoFormerTokenizerFast") _import_structure["models.clip"].append("CLIPTokenizerFast") _import_structure["models.convbert"].append("ConvBertTokenizerFast") @@ -2153,7 +2152,6 @@ from .models.mt5 import MT5TokenizerFast from .models.openai import OpenAIGPTTokenizerFast from .models.pegasus import PegasusTokenizerFast - from .models.realm import RealmTokenizerFast from .models.reformer import ReformerTokenizerFast from .models.rembert import RemBertTokenizerFast from .models.retribert import RetriBertTokenizerFast diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index dc8d5684df34..293ae5e55812 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -25,8 +25,6 @@ "tokenization_realm": ["RealmTokenizer"], } -if is_tokenizers_available(): - _import_structure["tokenization_realm_fast"] = ["RealmTokenizerFast"] if is_torch_available(): _import_structure["modeling_realm"] = [ @@ -43,9 +41,6 @@ from .configuration_realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig from .tokenization_realm import RealmTokenizer - if is_tokenizers_available(): - from .tokenization_realm_fast import RealmTokenizerFast - if is_torch_available(): from .modeling_realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 025c213f2042..b14b2cabb8bb 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -34,8 +34,9 @@ logger = logging.get_logger(__name__) - -_CHECKPOINT_FOR_DOC = "realm-cc-news-pretrained" +_BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert" +_EMBEDDER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-embedder" +_RETRIEVER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-retriever" _CONFIG_FOR_DOC = "RealmConfig" _TOKENIZER_FOR_DOC = "RealmTokenizer" @@ -405,7 +406,7 @@ def forward( @add_start_docstrings( - "The retriever of REALM outputting relevance score representing the score of document candidates (before softmax)", + "The retriever of REALM outputting relevance score representing the score of document candidates (before softmax).", REALM_START_DOCSTRING, ) class RealmRetriever(RealmPreTrainedModel): @@ -532,7 +533,7 @@ def forward( @add_start_docstrings( - "The encoder of REALM outputting masked lm logits and marginal log-likelihood loss.", + "The encoder of REALM outputting masked language model logits and marginal log-likelihood loss.", REALM_START_DOCSTRING, ) class RealmEncoder(RealmPreTrainedModel): @@ -554,7 +555,9 @@ def get_output_embeddings(self): def set_output_embeddings(self, new_embeddings): self.cls.predictions.decoder = new_embeddings - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_start_docstrings_to_model_forward( + 
REALM_INPUTS_DOCSTRING.format("batch_size, num_candidates, sequence_length") + ) @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -573,7 +576,8 @@ def forward( ): r""" relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_candidates)`, `optional`): - Relevance score derived from RealmRetriever. + Relevance score derived from RealmRetriever, must be specified if you want to compute the masked language + modeling loss. labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., @@ -588,6 +592,24 @@ def forward( - 0 for tokens that are **masked**. Returns: + + Example: + + >>> import torch + >>> from transformers import RealmTokenizer, RealmEncoder + + >>> tokenizer = RealmTokenizer.from_pretrained('qqaatw/realm-cc-news-pretrained-bert') + >>> model = RealmEncoder.from_pretrained('qqaatw/realm-cc-news-pretrained-bert', num_candidates=2) + + >>> # batch_size = 2, num_candidates = 2 + >>> text = [ + >>> ["Hello world!", "Nice to meet you!"], + >>> ["The cute cat.", "The adorable dog."] + >>> ] + + >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10) + >>> outputs = model(**inputs) + >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -619,7 +641,7 @@ def forward( if labels is not None: if candidate_score is None: raise ValueError( - "You have to specify relevance_score when `labels` is specified in order to calculate loss." + "You have to specify `relevance_score` when `labels` is specified in order to compute loss." ) if mlm_mask is None: diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index 2b4ad1f95678..aa74877ce524 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -13,6 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for REALM.""" +import torch + +from ...file_utils import PaddingStrategy +from ...tokenization_utils_base import BatchEncoding from ...utils import logging from ..bert.tokenization_bert import BertTokenizer @@ -58,3 +62,75 @@ class RealmTokenizer(BertTokenizer): pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + + def batch_encode_candidates(self, text, **kwargs): + r""" + Encode a batch of text or text pair. This method is similar to regular __call__ method but has the following + differences: + + 1. Handle additional num_candidate axis. (batch_size, num_candidates, text) + 2. Always pad the sequences to `max_length` and always return PyTorch tensors.. + 3. Must specify `max_length` in order to stack packs of candidates into a batch. + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + text (:obj:`List[List[str]]`): + The batch of sequences to be encoded. Each sequence must be in this format: (batch_size, + num_candidates, text). + text_pair (:obj:`List[List[str]]`, `optional`): + The batch of sequences to be encoded. Each sequence must be in this format: (batch_size, + num_candidates, text). + **kwargs: + Keyword arguments of the __call__ method. 
+ + Returns: + :class:`~transformers.BatchEncoding`: Encoded text or text pair. + + Example: >>> from transformers import RealmTokenizer + + >>> # batch_size = 2, num_candidates = 2 >>> text = [ >>> ["Hello world!", "Nice to meet you!"], >>> ["The cute + cat.", "The adorable dog."] >>> ] + + >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") + + >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10) + """ + + # Always return PyTorch tensor. + kwargs["return_tensors"] = "pt" + # Always using a fixed sequence length to encode in order to stack candidates into a batch. + kwargs["padding"] = PaddingStrategy.MAX_LENGTH + + batch_text = text + batch_text_pair = kwargs.pop("text_pair", None) + + output_data = { + "input_ids": [], + "attention_mask": [], + "token_type_ids": [], + } + + for idx, candidate_text in enumerate(batch_text): + if batch_text_pair is not None: + candidate_text_pair = batch_text_pair[idx] + else: + candidate_text_pair = None + + encoded_candidates = super().__call__(candidate_text, candidate_text_pair, **kwargs) + + encoded_input_ids = encoded_candidates.get("input_ids") + encoded_attention_mask = encoded_candidates.get("attention_mask") + encoded_token_type_ids = encoded_candidates.get("token_type_ids") + + if encoded_input_ids is not None: + output_data["input_ids"].append(encoded_input_ids) + if encoded_attention_mask is not None: + output_data["attention_mask"].append(encoded_attention_mask) + if encoded_token_type_ids is not None: + output_data["token_type_ids"].append(encoded_token_type_ids) + + output_data = dict((key, torch.stack(item)) for key, item in output_data.items() if len(item) != 0) + + return BatchEncoding(output_data, tensor_type=kwargs["return_tensors"]) diff --git a/src/transformers/models/realm/tokenization_realm_fast.py b/src/transformers/models/realm/tokenization_realm_fast.py deleted file mode 100644 index 14e9349a982d..000000000000 --- a/src/transformers/models/realm/tokenization_realm_fast.py +++ /dev/null @@ -1,62 +0,0 @@ -# coding=utf-8 -# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Tokenization classes for REALM.""" -from ...utils import logging -from ..bert.tokenization_bert_fast import BertTokenizerFast -from .tokenization_realm import RealmTokenizer - - -logger = logging.get_logger(__name__) - -VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} - -PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": { - "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/vocab.txt", - "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/vocab.txt", - "realm-cc-news-pretrained-encoder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-encoder/resolve/main/vocab.txt", - } -} - -PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "realm-cc-news-pretrained-embedder": 512, - "realm-cc-news-pretrained-retriever": 512, - "realm-cc-news-pretrained-encoder": 512, -} - - -PRETRAINED_INIT_CONFIGURATION = { - "realm-cc-news-pretrained-embedder": {"do_lower_case": True}, - "realm-cc-news-pretrained-retriever": {"do_lower_case": True}, - "realm-cc-news-pretrained-encoder": {"do_lower_case": True}, -} - - -class RealmTokenizerFast(BertTokenizerFast): - r""" - Construct a "fast" REALM tokenizer (backed by HuggingFace's `tokenizers` library). - - :class:`~transformers.RealmTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs - end-to-end tokenization: punctuation splitting and wordpiece. - - Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning - parameters. - """ - - vocab_files_names = VOCAB_FILES_NAMES - pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION - slow_tokenizer_class = RealmTokenizer diff --git a/src/transformers/utils/dummy_tokenizers_objects.py b/src/transformers/utils/dummy_tokenizers_objects.py index 1fddcce2caed..382f0314bbfe 100644 --- a/src/transformers/utils/dummy_tokenizers_objects.py +++ b/src/transformers/utils/dummy_tokenizers_objects.py @@ -263,15 +263,6 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["tokenizers"]) -class RealmTokenizerFast: - def __init__(self, *args, **kwargs): - requires_backends(self, ["tokenizers"]) - - @classmethod - def from_pretrained(cls, *args, **kwargs): - requires_backends(cls, ["tokenizers"]) - - class ReformerTokenizerFast: def __init__(self, *args, **kwargs): requires_backends(self, ["tokenizers"]) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 6a8ee27a3dea..94b388a5e41e 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -239,7 +239,9 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): ( RealmEmbedder, RealmEncoder, - RealmRetriever, + # RealmRetriever is excluded from common tests as it is a container model + # consisting of two RealmEmbedders & simple inner product calculation. 
+ # RealmRetriever ) if is_torch_available() else () From bd6d2eb58f3616c14dd97280f2c76ee8513c89d3 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 18:53:35 +0800 Subject: [PATCH 20/98] Fix up --- docs/source/model_doc/realm.rst | 2 +- .../models/realm/modeling_realm.py | 26 +++++++++--------- .../models/realm/tokenization_realm.py | 27 ++++++++++--------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index c3da94bd6356..a524690d24b1 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -51,7 +51,7 @@ RealmTokenizer .. autoclass:: transformers.RealmTokenizer :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary + create_token_type_ids_from_sequences, save_vocabulary, batch_encode_candidates RealmEmbedder diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index b14b2cabb8bb..03168825e3ea 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -593,23 +593,23 @@ def forward( Returns: - Example: + Example:: - >>> import torch - >>> from transformers import RealmTokenizer, RealmEncoder + >>> import torch + >>> from transformers import RealmTokenizer, RealmEncoder - >>> tokenizer = RealmTokenizer.from_pretrained('qqaatw/realm-cc-news-pretrained-bert') - >>> model = RealmEncoder.from_pretrained('qqaatw/realm-cc-news-pretrained-bert', num_candidates=2) + >>> tokenizer = RealmTokenizer.from_pretrained('qqaatw/realm-cc-news-pretrained-bert') + >>> model = RealmEncoder.from_pretrained('qqaatw/realm-cc-news-pretrained-bert', num_candidates=2) - >>> # batch_size = 2, num_candidates = 2 - >>> text = [ - >>> ["Hello world!", "Nice to meet you!"], - >>> ["The cute cat.", "The adorable dog."] - >>> ] + >>> # batch_size = 2, num_candidates = 2 + >>> text = [ + >>> ["Hello world!", "Nice to meet you!"], + >>> ["The cute cat.", "The adorable dog."] + >>> ] - >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10) - >>> outputs = model(**inputs) - >>> logits = outputs.logits + >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") + >>> outputs = model(**inputs) + >>> logits = outputs.logits """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index aa74877ce524..f2f00aff013a 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Tokenization classes for REALM.""" -import torch from ...file_utils import PaddingStrategy from ...tokenization_utils_base import BatchEncoding @@ -69,7 +68,7 @@ def batch_encode_candidates(self, text, **kwargs): differences: 1. Handle additional num_candidate axis. (batch_size, num_candidates, text) - 2. Always pad the sequences to `max_length` and always return PyTorch tensors.. + 2. Always pad the sequences to `max_length`. 3. Must specify `max_length` in order to stack packs of candidates into a batch. 
- single sequence: ``[CLS] X [SEP]`` @@ -88,23 +87,27 @@ def batch_encode_candidates(self, text, **kwargs): Returns: :class:`~transformers.BatchEncoding`: Encoded text or text pair. - Example: >>> from transformers import RealmTokenizer + Example:: - >>> # batch_size = 2, num_candidates = 2 >>> text = [ >>> ["Hello world!", "Nice to meet you!"], >>> ["The cute - cat.", "The adorable dog."] >>> ] + >>> from transformers import RealmTokenizer - >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") + >>> # batch_size = 2, num_candidates = 2 + >>> text = [ + >>> ["Hello world!", "Nice to meet you!"], + >>> ["The cute cat.", "The adorable dog."] + >>> ] - >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10) + >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") + + >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") """ - # Always return PyTorch tensor. - kwargs["return_tensors"] = "pt" # Always using a fixed sequence length to encode in order to stack candidates into a batch. kwargs["padding"] = PaddingStrategy.MAX_LENGTH batch_text = text batch_text_pair = kwargs.pop("text_pair", None) + return_tensors = kwargs.pop("return_tensors", None) output_data = { "input_ids": [], @@ -118,7 +121,7 @@ def batch_encode_candidates(self, text, **kwargs): else: candidate_text_pair = None - encoded_candidates = super().__call__(candidate_text, candidate_text_pair, **kwargs) + encoded_candidates = super().__call__(candidate_text, candidate_text_pair, return_tensors=None, **kwargs) encoded_input_ids = encoded_candidates.get("input_ids") encoded_attention_mask = encoded_candidates.get("attention_mask") @@ -131,6 +134,6 @@ def batch_encode_candidates(self, text, **kwargs): if encoded_token_type_ids is not None: output_data["token_type_ids"].append(encoded_token_type_ids) - output_data = dict((key, torch.stack(item)) for key, item in output_data.items() if len(item) != 0) + output_data = dict((key, item) for key, item in output_data.items() if len(item) != 0) - return BatchEncoding(output_data, tensor_type=kwargs["return_tensors"]) + return BatchEncoding(output_data, tensor_type=return_tensors) From 633e452297c9a934d2262337fbd8ccefd94b79e4 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 21:38:57 +0800 Subject: [PATCH 21/98] Style --- docs/source/index.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index eab0c2cfde60..8a44afc5c278 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -176,8 +176,7 @@ Supported models 25. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -26. :doc:`EncoderDecoder ` (from Google Research) released with the paper `Leve -ing +26. :doc:`EncoderDecoder ` (from Google Research) released with the paper `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks `__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 27. 
:doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: From 609d7f38538366196ff5a9760331f203a596ee6b Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 1 Sep 2021 22:28:33 +0800 Subject: [PATCH 22/98] Add tokenization tests --- tests/test_tokenization_realm.py | 315 +++++++++++++++++++++++++++++++ 1 file changed, 315 insertions(+) create mode 100644 tests/test_tokenization_realm.py diff --git a/tests/test_tokenization_realm.py b/tests/test_tokenization_realm.py new file mode 100644 index 000000000000..2934c1e2309f --- /dev/null +++ b/tests/test_tokenization_realm.py @@ -0,0 +1,315 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import os +import unittest + +from transformers.models.bert.tokenization_bert import ( + VOCAB_FILES_NAMES, + BasicTokenizer, + WordpieceTokenizer, + _is_control, + _is_punctuation, + _is_whitespace, +) +from transformers.models.realm.tokenization_realm import RealmTokenizer +from transformers.testing_utils import require_tokenizers, slow + +from .test_tokenization_common import TokenizerTesterMixin, filter_non_english + + +@require_tokenizers +class RealmTokenizationTest(TokenizerTesterMixin, unittest.TestCase): + + tokenizer_class = RealmTokenizer + rust_tokenizer_class = None + test_rust_tokenizer = False + space_between_special_tokens = True + from_pretrained_filter = filter_non_english + + def setUp(self): + super().setUp() + + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_input_output_texts(self, tokenizer): + input_text = "UNwant\u00E9d,running" + output_text = "unwanted, running" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize("UNwant\u00E9d,running") + self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11]) + + def test_rust_and_python_full_tokenizers(self): + if not self.test_rust_tokenizer: + return + + tokenizer = self.get_tokenizer() + rust_tokenizer = self.get_rust_tokenizer() + + sequence = "UNwant\u00E9d,running" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + 
+ # With lower casing + tokenizer = self.get_tokenizer(do_lower_case=True) + rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True) + + sequence = "UNwant\u00E9d,running" + + tokens = tokenizer.tokenize(sequence) + rust_tokens = rust_tokenizer.tokenize(sequence) + self.assertListEqual(tokens, rust_tokens) + + ids = tokenizer.encode(sequence, add_special_tokens=False) + rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False) + self.assertListEqual(ids, rust_ids) + + rust_tokenizer = self.get_rust_tokenizer() + ids = tokenizer.encode(sequence) + rust_ids = rust_tokenizer.encode(sequence) + self.assertListEqual(ids, rust_ids) + + def test_chinese(self): + tokenizer = BasicTokenizer() + + self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["hello", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hällo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"]) + + def test_basic_tokenizer_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_lower_strip_accents_default(self): + tokenizer = BasicTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["hallo", "!", "how", "are", "you", "?"] + ) + self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = BasicTokenizer(do_lower_case=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? "), ["HeLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_false(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HäLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_no_lower_strip_accents_true(self): + tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True) + + self.assertListEqual( + tokenizer.tokenize(" \tHäLLo!how \n Are yoU? "), ["HaLLo", "!", "how", "Are", "yoU", "?"] + ) + + def test_basic_tokenizer_respects_never_split_tokens(self): + tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"]) + + self.assertListEqual( + tokenizer.tokenize(" \tHeLLo!how \n Are yoU? 
[UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"] + ) + + def test_wordpiece_tokenizer(self): + vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") + + self.assertListEqual(tokenizer.tokenize(""), []) + + self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_is_whitespace(self): + self.assertTrue(_is_whitespace(" ")) + self.assertTrue(_is_whitespace("\t")) + self.assertTrue(_is_whitespace("\r")) + self.assertTrue(_is_whitespace("\n")) + self.assertTrue(_is_whitespace("\u00A0")) + + self.assertFalse(_is_whitespace("A")) + self.assertFalse(_is_whitespace("-")) + + def test_is_control(self): + self.assertTrue(_is_control("\u0005")) + + self.assertFalse(_is_control("A")) + self.assertFalse(_is_control(" ")) + self.assertFalse(_is_control("\t")) + self.assertFalse(_is_control("\r")) + + def test_is_punctuation(self): + self.assertTrue(_is_punctuation("-")) + self.assertTrue(_is_punctuation("$")) + self.assertTrue(_is_punctuation("`")) + self.assertTrue(_is_punctuation(".")) + + self.assertFalse(_is_punctuation("A")) + self.assertFalse(_is_punctuation(" ")) + + def test_clean_text(self): + tokenizer = self.get_tokenizer() + + # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340 + self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]) + + if self.test_rust_tokenizer: + rust_tokenizer = self.get_rust_tokenizer() + self.assertListEqual( + [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]] + ) + + @slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") + + text = tokenizer.encode("sequence builders", add_special_tokens=False) + text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + assert encoded_sentence == [101] + text + [102] + assert encoded_pair == [101] + text + [102] + text_2 + [102] + + def test_offsets_with_special_characters(self): + for tokenizer, pretrained_name, kwargs in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs) + + sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence." 
+ tokens = tokenizer_r.encode_plus( + sentence, + return_attention_mask=False, + return_token_type_ids=False, + return_offsets_mapping=True, + add_special_tokens=True, + ) + + do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False + expected_results = ( + [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "A"), + ((1, 2), ","), + ((3, 5), "na"), + ((5, 6), "##ï"), + ((6, 8), "##ve"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "Allen"), + ((21, 23), "##NL"), + ((23, 24), "##P"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + if not do_lower_case + else [ + ((0, 0), tokenizer_r.cls_token), + ((0, 1), "a"), + ((1, 2), ","), + ((3, 8), "naive"), + ((9, 15), tokenizer_r.mask_token), + ((16, 21), "allen"), + ((21, 23), "##nl"), + ((23, 24), "##p"), + ((25, 33), "sentence"), + ((33, 34), "."), + ((0, 0), tokenizer_r.sep_token), + ] + ) + + self.assertEqual( + [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"]) + ) + self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"]) + + @slow + def test_batch_encode_candidates(self): + tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") + + text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]] + + encoded_sentence = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") + + expected_shape = (2, 2, 10) + + assert encoded_sentence["input_ids"].shape == expected_shape + assert encoded_sentence["attention_mask"].shape == expected_shape + assert encoded_sentence["token_type_ids"].shape == expected_shape From 8066399567a22316a450e0ce69f7f6c9124c9037 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 4 Sep 2021 00:23:00 +0800 Subject: [PATCH 23/98] Update `from_pretrained` tests --- tests/test_modeling_realm.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 94b388a5e41e..b1ee43582b07 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -294,17 +294,20 @@ def test_training(self): loss = model(**inputs).loss loss.backward() + @slow + def test_embedder_from_pretrained(self): + model = RealmEmbedder.from_pretrained("qqaatw/realm-cc-news-pretrained-embedder") + self.assertIsNotNone(model) + @slow def test_encoder_from_pretrained(self): - for model_name in REALM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = RealmEncoder.from_pretrained(model_name) - self.assertIsNotNone(model) + model = RealmEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") + self.assertIsNotNone(model) @slow def test_retriever_from_pretrained(self): - for model_name in REALM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - model = RealmRetriever.from_pretrained(model_name) - self.assertIsNotNone(model) + model = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever") + self.assertIsNotNone(model) @require_torch From 09a280cf83cc3a2875d6bb2d06653db05c700250 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 4 Sep 2021 00:55:24 +0800 Subject: [PATCH 24/98] Apply suggestions --- README.md | 2 +- src/transformers/__init__.py | 1 - src/transformers/models/realm/modeling_realm.py | 11 ++--------- src/transformers/models/realm/tokenization_realm.py | 7 +------ utils/check_repo.py | 1 - 5 files changed, 4 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 6a6dc285fea3..e25e67390016 100644 --- a/README.md +++ b/README.md @@ -262,7 +262,7 @@ 
Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
+1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d6d3f71c933d..33e1fae6110d 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -520,7 +520,6 @@ _import_structure["modeling_utils"] = ["Conv1D", "PreTrainedModel", "apply_chunking_to_forward", "prune_layer"] # PyTorch models structure - _import_structure["models.albert"].extend( [ "ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST", diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 03168825e3ea..7282e55b88dd 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -101,10 +101,6 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): pointer = getattr(pointer, "weight") elif scope_names[0] == "output_bias" or scope_names[0] == "beta": pointer = getattr(pointer, "bias") - elif scope_names[0] == "output_weights": - pointer = getattr(pointer, "weight") - # elif scope_names[0] == "squad": - # pointer = getattr(pointer, "classifier") else: try: pointer = getattr(pointer, scope_names[0]) @@ -278,7 +274,7 @@ def _flatten_inputs(self, *inputs): REALM_START_DOCSTRING = r""" - This model is a PyTorch `torch.nn.Module `_ sub-class. Use + This model is a PyTorch `torch.nn.Module `__ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. @@ -421,10 +417,7 @@ def __init__(self, config, query_embedder=None): self.embedder = RealmEmbedder(self.config) - if query_embedder: - self.query_embedder = query_embedder - else: - self.query_embedder = self.embedder + self.query_embedder = query_embedder if query_embedder is not None else self.embedder self.init_weights() diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index f2f00aff013a..aab83c517cd0 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -98,7 +98,6 @@ def batch_encode_candidates(self, text, **kwargs): >>> ] >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") - >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") """ @@ -109,11 +108,7 @@ def batch_encode_candidates(self, text, **kwargs): batch_text_pair = kwargs.pop("text_pair", None) return_tensors = kwargs.pop("return_tensors", None) - output_data = { - "input_ids": [], - "attention_mask": [], - "token_type_ids": [], - } + output_data = { "input_ids": [], "attention_mask": [],"token_type_ids": [],} for idx, candidate_text in enumerate(batch_text): if batch_text_pair is not None: diff --git a/utils/check_repo.py b/utils/check_repo.py index edef29740cd3..6eee03beca8b 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -116,7 +116,6 @@ "RagSequenceForGeneration", "RagTokenForGeneration", "RealmEmbedder", - "RealmEncoder", "RealmRetriever", "TFDPRReader", "TFGPT2DoubleHeadsModel", From 769e8fb1a928dc7f54e26eac5d4e6d0904340184 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 5 Sep 2021 00:10:12 +0800 Subject: [PATCH 25/98] Style & Quality --- docs/source/index.rst | 6 +++--- src/transformers/models/realm/modeling_realm.py | 6 +++--- src/transformers/models/realm/tokenization_realm.py | 6 +++++- tests/test_modeling_realm.py | 1 - 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 
7b1073ee569d..a9fb8070dfd1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -254,9 +254,9 @@ Supported models 52. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -53. :doc:`REALM ` (from Google Research) released with the paper `REALM: Retrieval-Augmented Language - Model Pre-Training `__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and - Ming-Wei Chang. +53. `REALM `__ (from Google Research) released with + the paper `REALM: Retrieval-Augmented Language Model Pre-Training `__ by Kelvin + Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 54. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 55. :doc:`RemBERT ` (from Google Research) released with the paper `Rethinking embedding coupling in diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 7282e55b88dd..7238305531cb 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -274,9 +274,9 @@ def _flatten_inputs(self, *inputs): REALM_START_DOCSTRING = r""" - This model is a PyTorch `torch.nn.Module `__ sub-class. Use - it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and - behavior. + This model is a PyTorch `torch.nn.Module `__ sub-class. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. Parameters: config (:class:`~transformers.RealmConfig`): Model configuration class with all the parameters of the model. diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index aab83c517cd0..ebdd1d40ff9a 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -108,7 +108,11 @@ def batch_encode_candidates(self, text, **kwargs): batch_text_pair = kwargs.pop("text_pair", None) return_tensors = kwargs.pop("return_tensors", None) - output_data = { "input_ids": [], "attention_mask": [],"token_type_ids": [],} + output_data = { + "input_ids": [], + "attention_mask": [], + "token_type_ids": [], + } for idx, candidate_text in enumerate(batch_text): if batch_text_pair is not None: diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index b1ee43582b07..231ffec8e7b2 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -29,7 +29,6 @@ import torch from transformers import RealmEmbedder, RealmEncoder, RealmRetriever - from transformers.models.realm.modeling_realm import REALM_PRETRAINED_MODEL_ARCHIVE_LIST class RealmModelTester: From 9d5175b09df0a9daaecc7a7da2f8745d08b8a5e0 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 9 Sep 2021 15:53:29 +0800 Subject: [PATCH 26/98] Copy BERT model --- README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/index.rst | 4 +- .../models/realm/modeling_realm.py | 640 +++++++++++++++++- 4 files changed, 641 insertions(+), 5 deletions(-) diff --git a/README_zh-hans.md b/README_zh-hans.md index 62e0a48a0683..32f7d9954a68 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -284,6 +284,7 @@ conda install -c huggingface transformers 1. 
**[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 +1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。 1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 8230813e34c6..e851b5cd011e 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -296,6 +296,7 @@ conda install -c huggingface transformers 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. +1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. 
**[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. diff --git a/docs/source/index.rst b/docs/source/index.rst index 108b558a1087..3dd4e5a6f473 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -279,10 +279,10 @@ Supported models 61. :doc:`Splinter ` (from Tel Aviv University), released together with the paper `Few-Shot Question Answering by Pretraining Span Selection `__ by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -61. :doc:`SqueezeBert ` (from Berkeley) released with the paper `SqueezeBERT: What can computer +62. :doc:`SqueezeBert ` (from Berkeley) released with the paper `SqueezeBERT: What can computer vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -62. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a +63. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. 64. :doc:`T5v1.1 ` (from Google AI) released in the repository diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 7238305531cb..167dd9787d1a 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -15,21 +15,32 @@ """ PyTorch REALM model. 
""" +import math import os from dataclasses import dataclass from typing import Optional, Tuple import torch import torch.utils.checkpoint +from packaging import version from torch import nn from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings -from ...modeling_outputs import MaskedLMOutput, ModelOutput -from ...modeling_utils import PreTrainedModel +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + ModelOutput, +) +from ...modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) from ...utils import logging -from ..bert import BertModel from .configuration_realm import RealmConfig @@ -126,6 +137,629 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): return model +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if 
self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention +class BertSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention +class BertAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = 
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer +class BertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() 
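+        # The encoder is a stack of `config.num_hidden_layers` identical Transformer layers
+        # that `forward` applies sequentially.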
+        self.config = config
+        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            if getattr(self.config, "gradient_checkpointing", False) and self.training:
+
+                if use_cache:
+                    logger.warning(
+                        "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
+                        "`use_cache=False`..."
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs, past_key_value, output_attentions)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(layer_module),
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                )
+            else:
+                layer_outputs = layer_module(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    past_key_value,
+                    output_attentions,
+                )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler
+class BertPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+# Copied from transformers.models.bert.modeling_bert.BertModel # Modified
+class BertModel(PreTrainedModel):
+    """
+    Same as the original BertModel but removes docstrings and inherits from PreTrainedModel directly.
+ """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + # Weight initialization is managed by Realm models. + # self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
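+        # `get_extended_attention_mask` (inherited from `PreTrainedModel`) also converts the 1/0 mask
+        # into additive 0.0 / -10000.0 scores that are summed directly onto the raw attention scores.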
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
 @dataclass
 class RealmEmbedderOutput(ModelOutput):
     """

From 6f640293048b0192422dec9535086596c660c52c Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Thu, 9 Sep 2021 15:57:18 +0800
Subject: [PATCH 27/98] Fix comment to avoid docstring copying

---
 src/transformers/models/realm/modeling_realm.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 167dd9787d1a..57690252ae83 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -618,7 +618,6 @@ def forward(self, hidden_states):
         return pooled_output
 
 
-# Copied from transformers.models.bert.modeling_bert.BertModel # Modified
 class BertModel(PreTrainedModel):
     """
     Same as the original BertModel but removes docstrings and inherits from PreTrainedModel directly.
From dc3695b877eec9f64718cb6cc60a96e2d5088431 Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Thu, 9 Sep 2021 16:33:29 +0800
Subject: [PATCH 28/98] Make RealmBertModel private

---
 src/transformers/models/realm/modeling_realm.py | 6 +++---
 utils/check_repo.py                             | 2 ++
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 57690252ae83..08ea0bbaa81f 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -618,7 +618,7 @@ def forward(self, hidden_states):
         return pooled_output
 
 
-class BertModel(PreTrainedModel):
+class RealmBertModel(PreTrainedModel):
     """
     Same as the original BertModel but removes docstrings and inherits from PreTrainedModel directly.
     """
@@ -977,7 +977,7 @@ class RealmEmbedder(RealmPreTrainedModel):
 
     def __init__(self, config):
         super().__init__(config)
-        self.bert = BertModel(self.config)
+        self.bert = RealmBertModel(self.config)
         self.cls = RealmRetrieverProjection(self.config)
 
         self.init_weights()
@@ -1165,7 +1165,7 @@ def forward(
 class RealmEncoder(RealmPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.bert = BertModel(self.config)
+        self.bert = RealmBertModel(self.config)
         self.cls = RealmOnlyMLMHead(self.config)
 
         self.init_weights()
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 6eee03beca8b..84b658d152d9 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -34,6 +34,7 @@
 # Update this list with models that are supposed to be private.
 PRIVATE_MODELS = [
     "DPRSpanPredictor",
+    "RealmBertModel",
     "T5Stack",
     "TFDPRSpanPredictor",
 ]
@@ -71,6 +72,7 @@
     "PegasusDecoderWrapper",  # Building part of bigger (tested) model.
     "DPREncoder",  # Building part of bigger (tested) model.
     "ProphetNetDecoderWrapper",  # Building part of bigger (tested) model.
+    "RealmBertModel",  # Building part of bigger (tested) model.
     "RealmRetriever",  # Submodels have been tested.
     "ReformerForMaskedLM",  # Needs to be setup as decoder.
     "TFDPREncoder",  # Building part of bigger (tested) model.

From 850c38c37708573147088b0e5bb4a7430a9692a4 Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Mon, 13 Sep 2021 01:30:47 +0800
Subject: [PATCH 29/98] Fix bug

---
 src/transformers/models/realm/modeling_realm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 08ea0bbaa81f..db2b17e04eee 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -893,7 +893,7 @@ def _init_weights(self, module):
             module.weight.data.fill_(1.0)
 
     def _flatten_inputs(self, *inputs):
-        """Flatten inputs to (batch_size, ..., input_shape[-1])"""
+        """Flatten inputs' shape to (-1, input_shape[-1])"""
         flattened_inputs = []
         for tensor in inputs:
             if tensor is None:
@@ -1269,6 +1269,8 @@ def forward(
             raise ValueError(
                 "You have to specify `relevance_score` when `labels` is specified in order to compute loss."
) + + batch_size, seq_length = labels.size() if mlm_mask is None: mlm_mask = torch.ones_like(labels, dtype=torch.float32) @@ -1283,7 +1285,7 @@ def forward( # [batch_size * num_candidates * joint_seq_len] mlm_targets = labels.tile(1, self.config.num_candidates).view(-1) # [batch_size, num_candidates, joint_seq_len] - masked_lm_log_prob = -loss_fct(mlm_logits, mlm_targets).view_as(input_ids) + masked_lm_log_prob = -loss_fct(mlm_logits, mlm_targets).view(batch_size, self.config.num_candidates, seq_length) # [batch_size, num_candidates, 1] candidate_log_prob = candidate_score.log_softmax(-1).unsqueeze(-1) # [batch_size, num_candidates, joint_seq_len] From a11d5c5da02a8da335f367e08a5c8437fd8d1fa2 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Mon, 13 Sep 2021 15:37:16 +0800 Subject: [PATCH 30/98] Style --- src/transformers/models/realm/modeling_realm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index db2b17e04eee..3cac78454927 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1269,7 +1269,7 @@ def forward( raise ValueError( "You have to specify `relevance_score` when `labels` is specified in order to compute loss." ) - + batch_size, seq_length = labels.size() if mlm_mask is None: @@ -1285,7 +1285,9 @@ def forward( # [batch_size * num_candidates * joint_seq_len] mlm_targets = labels.tile(1, self.config.num_candidates).view(-1) # [batch_size, num_candidates, joint_seq_len] - masked_lm_log_prob = -loss_fct(mlm_logits, mlm_targets).view(batch_size, self.config.num_candidates, seq_length) + masked_lm_log_prob = -loss_fct(mlm_logits, mlm_targets).view( + batch_size, self.config.num_candidates, seq_length + ) # [batch_size, num_candidates, 1] candidate_log_prob = candidate_score.log_softmax(-1).unsqueeze(-1) # [batch_size, num_candidates, joint_seq_len] From 831a230c1b87fdba0ff0e928d063c0630aac2da1 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Fri, 3 Sep 2021 17:59:18 +0800 Subject: [PATCH 31/98] Basic QA --- .../models/realm/configuration_realm.py | 2 + .../models/realm/modeling_realm.py | 126 +++++++++++++++++- 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index e12f8e4e3f52..ce79bcd6db6e 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -98,6 +98,7 @@ def __init__( vocab_size=30522, hidden_size=768, retriever_proj_size=128, + span_hidden_size=256, num_hidden_layers=12, num_attention_heads=12, num_candidates=8, @@ -121,6 +122,7 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.retriever_proj_size = retriever_proj_size + self.span_hidden_size = span_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.num_candidates = num_candidates diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 3cac78454927..88ce55fa48b3 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -27,12 +27,13 @@ from torch.nn import CrossEntropyLoss from ...activations import ACT2FN -from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from 
...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     BaseModelOutputWithPoolingAndCrossAttentions,
     MaskedLMOutput,
     ModelOutput,
+    QuestionAnsweringModelOutput
 )
 from ...modeling_utils import (
     PreTrainedModel,
@@ -865,6 +866,35 @@ def forward(self, hidden_states):
         return hidden_states
 
 
+
+class RealmQuestionAnsweringProjection(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.span_hidden_size * 2)
+        self.dense_1 = nn.Linear(config.span_hidden_size, 1)
+        self.layer_normalization = nn.LayerNorm(config.span_hidden_size, eps=config.layer_norm_eps)
+
+        self.relu = nn.ReLU()
+
+    def forward(self, hidden_states):
+        # [reader_beam_size, max_sequence_len, span_hidden_size * 2]
+        hidden_states = self.dense(hidden_states)
+
+        # [reader_beam_size, max_sequence_len, span_hidden_size] each
+        start_projection, end_projection = hidden_states.chunk(2, dim=-1)
+
+        # [reader_beam_size, max_sequence_len, 1]
+        start_logits = self.dense_1(self.layer_normalization(self.relu(start_projection)))
+        end_logits = self.dense_1(self.layer_normalization(self.relu(end_projection)))
+
+        # [reader_beam_size, max_sequence_len]
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+        return start_logits, end_logits
+
+
 class RealmPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -1307,3 +1337,97 @@ def forward(
             hidden_states=joint_outputs.hidden_states,
             attentions=joint_outputs.attentions,
         )
+
+
+class RealmForQuestionAnswering(RealmPreTrainedModel):
+
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.reader = RealmEncoder(config)
+        self.qa_outputs = RealmQuestionAnsweringProjection(config)
+
+        self.init_weights()
+
+    @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_BERT_CHECKPOINT_FOR_DOC,
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        start_positions=None,
+        end_positions=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+            sequence are not taken into account for computing the loss.
+        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+            sequence are not taken into account for computing the loss.
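+
+        Example (an illustrative sketch only: it assumes this head is exported from ``transformers`` and that a
+        suitable question-answering checkpoint exists under the name used below)::
+
+            >>> import torch
+            >>> from transformers import RealmTokenizer, RealmForQuestionAnswering
+
+            >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert")
+            >>> model = RealmForQuestionAnswering.from_pretrained("qqaatw/realm-cc-news-pretrained-bert")
+
+            >>> question, text = "Who proposed REALM?", "REALM was proposed by Kelvin Guu et al."
+            >>> inputs = tokenizer(question, text, return_tensors="pt")
+            >>> with torch.no_grad():
+            ...     outputs = model(**inputs)
+            >>> start_logits, end_logits = outputs.start_logits, outputs.end_logits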
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # `self.reader` is a `RealmEncoder`; run its underlying BERT backbone to encode the inputs.
+        outputs = self.reader.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # [batch_size * num_candidates, joint_seq_len, hidden_size]
+        sequence_output = outputs[0]
+
+        # [batch_size * num_candidates, joint_seq_len]
+        start_logits, end_logits = self.qa_outputs(sequence_output)
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )

From 81985e641eb06cd3ee3edabef5302c9aba507d3f Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Sun, 19 Sep 2021 16:41:26 +0800
Subject: [PATCH 32/98] Save

---
 .../models/realm/modeling_realm.py            | 57 +++++++++++++------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 88ce55fa48b3..48aaeb25baae 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -79,21 +79,35 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
     init_vars = tf.train.list_variables(tf_path)
     names = []
     arrays = []
+    is_reader_checkpoint = False
+
     for name, shape in init_vars:
         logger.info(f"Loading TF weight {name} with shape {shape}")
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
         arrays.append(array)
-
+    for name, array in zip(names, arrays):
+        if "reader" in name:
+            is_reader_checkpoint = True
+
     for name, array in zip(names, arrays):
-        # embedder
+        # For embedder and retriever
         embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/"
         name = name.replace("module/module/module/bert/", f"{embedder_prefix}bert/")
         name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/")
         name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/")
         name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/")
 
+        # For reader
+        if is_reader_checkpoint and isinstance(model, RealmReader) and "reader" not in name:
+            continue
+        name = name.replace("reader/module/bert/", "bert/")
+        name = name.replace("reader/module/cls/", "cls/")
+        name = name.replace("reader/dense/", "qa_outputs/dense_intermediate/")
+        name = name.replace("reader/dense_1/", "qa_outputs/dense_output/")
+        name = name.replace("reader/layer_normalization", "qa_outputs/layer_normalization")
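+
+        # With the TF variable name remapped onto the PyTorch module hierarchy, the "/"-separated
+        # path is walked below to locate the matching parameter.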
name.replace("reader/layer_normalization", "qa_outputs/layer_normalization") + name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model @@ -120,6 +134,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): logger.info(f"Skipping {'/'.join(name)}") continue if len(scope_names) >= 2: + print(scope_names) num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": @@ -866,29 +881,28 @@ def forward(self, hidden_states): return hidden_states - -class RealmQuestionAnsweringProjection(nn.Module): +class RealmReaderProjection(nn.Module): def __init__(self, config): super().__init__() - self.dense = nn.Linear(config.hidden_size, config.span_hidden_size * 2) - self.dense_1 = nn.Linear(config.span_hidden_size, 1) + self.dense_intermediate = nn.Linear(config.hidden_size, config.span_hidden_size * 2) + self.dense_output = nn.Linear(config.span_hidden_size, 1) self.layer_normalization = nn.LayerNorm(config.span_hidden_size, eps=config.layer_norm_eps) self.relu = nn.ReLU() - def forward(self, hidden_states, start_positions, end_positions): - hidden_states = self.dense(hidden_states) - - + def forward(self, hidden_states, start_positions=None, end_positions=None): + # [reader_beam_size, max_sequence_len, span_hidden_size * 2] + hidden_states = self.dense_intermediate(hidden_states) + print('dense', hidden_states, hidden_states.shape) # [reader_beam_size, max_sequence_len, span_hidden_size] - start_projection, end_projection = hidden_states.split(1, dim=-1) - + start_projection, end_projection = hidden_states.split(2, dim=-1) + print(start_projection) candidate_hidden = start_projection + end_projection # [reader_beam_size, max_sequence_len, 1] hidden_states = self.relu(hidden_states) hidden_states = self.layer_normalization(hidden_states) - hidden_states = self.dense_1(hidden_states) + hidden_states = self.dense_output(hidden_states) start_logits = start_logits.squeeze(-1).contiguous() end_logits = end_logits.squeeze(-1).contiguous() @@ -1339,19 +1353,21 @@ def forward( ) -class RealmForQuestionAnswering(RealmPreTrainedModel): +class RealmReader(RealmPreTrainedModel): - _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.reader = RealmEncoder(config) - self.qa_outputs = RealmQuestionAnsweringProjection(config) + self.bert = RealmBertModel(config) + self.cls = RealmOnlyMLMHead(config) + self.qa_outputs = RealmReaderProjection(config) self.init_weights() + """ @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, @@ -1359,6 +1375,7 @@ def __init__(self, config): output_type=QuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC, ) + """ def forward( self, input_ids=None, @@ -1367,6 +1384,7 @@ def forward( position_ids=None, head_mask=None, inputs_embeds=None, + relevance_score=None, start_positions=None, end_positions=None, output_attentions=None, @@ -1382,6 +1400,9 @@ def forward( Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
+ + Returns: + """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1399,7 +1420,7 @@ def forward( # [batch_size * num_candidates, joint_seq_len, hidden_size] sequence_output = outputs[0] - + # [batch_size * num_candidates, 1] start_logitsm, end_logits = self.qa_outputs(sequence_output) From dbd925dc8df193595279ce34c0edf8578ab84e3c Mon Sep 17 00:00:00 2001 From: qqaatw Date: Tue, 21 Sep 2021 12:35:55 +0800 Subject: [PATCH 33/98] Complete reader logits --- .../models/realm/configuration_realm.py | 11 ++- .../models/realm/modeling_realm.py | 95 +++++++++++++++---- 2 files changed, 85 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index ce79bcd6db6e..5b881a871efa 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -98,7 +98,6 @@ def __init__( vocab_size=30522, hidden_size=768, retriever_proj_size=128, - span_hidden_size=256, num_hidden_layers=12, num_attention_heads=12, num_candidates=8, @@ -111,6 +110,9 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, use_cache=True, + span_hidden_size=256, + max_span_width=10, + reader_layer_norm_eps=1e-3, pad_token_id=1, bos_token_id=0, eos_token_id=2, @@ -118,11 +120,11 @@ def __init__( ): super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + # Common config self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.retriever_proj_size = retriever_proj_size - self.span_hidden_size = span_hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads self.num_candidates = num_candidates @@ -134,3 +136,8 @@ def __init__( self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache + + # Reader config + self.span_hidden_size = span_hidden_size + self.max_span_width = max_span_width + self.reader_layer_norm_eps = reader_layer_norm_eps diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 48aaeb25baae..e6e9c9dd334b 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -101,6 +101,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): # For reader if is_reader_checkpoint and isinstance(model, RealmReader) and "reader" not in name: + logger.info(f"Skipping {name} as the it is not reader's parameter") continue name = name.replace("reader/module/bert/", "bert/") name = name.replace("reader/module/cls/", "cls/") @@ -884,29 +885,70 @@ def forward(self, hidden_states): class RealmReaderProjection(nn.Module): def __init__(self, config): super().__init__() + self.config = config self.dense_intermediate = nn.Linear(config.hidden_size, config.span_hidden_size * 2) self.dense_output = nn.Linear(config.span_hidden_size, 1) - self.layer_normalization = nn.LayerNorm(config.span_hidden_size, eps=config.layer_norm_eps) - + self.layer_normalization = nn.LayerNorm(config.span_hidden_size, eps=config.reader_layer_norm_eps) self.relu = nn.ReLU() - def forward(self, hidden_states, start_positions=None, end_positions=None): + def forward(self, hidden_states, token_type_ids): + def span_candidates(masks): + """Generate span candidates. 
+ + Args: + masks: [num_retrievals, max_sequence_len] + + Returns: + starts: [num_spans] + ends: [num_spans] + span_masks: [num_retrievals, num_spans] + """ + _, max_sequence_len = masks.shape + def _spans_given_width(width): + current_starts = torch.arange(max_sequence_len - width + 1) + current_ends = torch.arange(width - 1, max_sequence_len) + return current_starts, current_ends + + starts, ends = zip(*(_spans_given_width(w + 1) + for w in range(self.config.max_span_width))) + + # [num_spans] + starts = torch.cat(starts, 0) + ends = torch.cat(ends, 0) + + # [num_retrievals, num_spans] + start_masks = torch.index_select(masks, dim=-1, index=starts) + end_masks = torch.index_select(masks, dim=-1, index=ends) + span_masks = start_masks * end_masks + + return starts, ends, span_masks + + def mask_to_score(mask): + return (1.0 - mask.type(torch.float32)) * -10000.0 + + # [reader_beam_size, max_sequence_len, span_hidden_size * 2] hidden_states = self.dense_intermediate(hidden_states) - print('dense', hidden_states, hidden_states.shape) # [reader_beam_size, max_sequence_len, span_hidden_size] - start_projection, end_projection = hidden_states.split(2, dim=-1) - print(start_projection) - candidate_hidden = start_projection + end_projection - - # [reader_beam_size, max_sequence_len, 1] - hidden_states = self.relu(hidden_states) - hidden_states = self.layer_normalization(hidden_states) - hidden_states = self.dense_output(hidden_states) - - start_logits = start_logits.squeeze(-1).contiguous() - end_logits = end_logits.squeeze(-1).contiguous() - return hidden_states + start_projection, end_projection = hidden_states.chunk(2, dim=-1) + block_mask = token_type_ids.detach().clone() + block_mask[:, -1] = 0 + candidates_starts, candidates_ends, candidate_mask = span_candidates(block_mask) + + candidate_start_projections = torch.index_select(start_projection, dim=1, index=candidates_starts) + candidate_end_projections = torch.index_select(end_projection, dim=1, index=candidates_ends) + candidate_hidden = candidate_start_projections + candidate_end_projections + + # [reader_beam_size, num_candidates, span_hidden_size] + candidate_hidden = self.relu(candidate_hidden) + # [reader_beam_size, num_candidates, span_hidden_size] + candidate_hidden = self.layer_normalization(candidate_hidden) + # [reader_beam_size, num_candidates] + reader_logits = self.dense_output(candidate_hidden).squeeze(-1) + # [reader_beam_size, num_candidates] + reader_logits += mask_to_score(candidate_mask) + + return reader_logits class RealmPreTrainedModel(PreTrainedModel): @@ -1406,6 +1448,11 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if token_type_ids is None: + raise ValueError( + "You have to specify `token_type_ids` for separating question block and evidence block." 
+ ) + outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1418,14 +1465,24 @@ def forward( return_dict=return_dict, ) - # [batch_size * num_candidates, joint_seq_len, hidden_size] + # [reader_beam_size, joint_seq_len, hidden_size] sequence_output = outputs[0] - # [batch_size * num_candidates, 1] - start_logitsm, end_logits = self.qa_outputs(sequence_output) + # [reader_beam_size, num_candidates] + reader_logits = self.qa_outputs(sequence_output, token_type_ids) + # [retriever_beam_size, 1] + retriever_logits = torch.unsequeeze(relevance_score, -1) + # [reader_beam_size, num_candidates] + reader_logits += retriever_logits + + predicted_block_index = tf.argmax(tf.reduce_max(reader_outputs.logits, 1)) + predicted_candidate = tf.argmax(tf.reduce_max(reader_outputs.logits, 0)) total_loss = None if start_positions is not None and end_positions is not None: + + + # If we are on multi-GPU, split add a dimension if len(start_positions.size()) > 1: start_positions = start_positions.squeeze(-1) From f6ffc1e4ba4cc69850a183f92f19f4ed570bdcb7 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 23 Sep 2021 20:06:35 +0800 Subject: [PATCH 34/98] Add searcher --- .../models/realm/configuration_realm.py | 8 ++ .../models/realm/modeling_realm.py | 75 ++++++++++++++++++- src/transformers/models/realm/utils_realm.py | 26 +++++++ 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 src/transformers/models/realm/utils_realm.py diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 5b881a871efa..cfe4fbaa5083 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -113,6 +113,9 @@ def __init__( span_hidden_size=256, max_span_width=10, reader_layer_norm_eps=1e-3, + reader_beam_size=5, + num_block_records=13353718, + searcher_beam_size=5000, pad_token_id=1, bos_token_id=0, eos_token_id=2, @@ -141,3 +144,8 @@ def __init__( self.span_hidden_size = span_hidden_size self.max_span_width = max_span_width self.reader_layer_norm_eps = reader_layer_norm_eps + self.reader_beam_size = reader_beam_size + + # Searcher config + self.num_block_records = num_block_records + self.searcher_beam_size = searcher_beam_size \ No newline at end of file diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index e6e9c9dd334b..d5da632cbba0 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -43,7 +43,7 @@ ) from ...utils import logging from .configuration_realm import RealmConfig - +from .utils_realm import load_scann_searcher logger = logging.get_logger(__name__) _BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert" @@ -109,6 +109,9 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): name = name.replace("reader/dense_1/", "qa_outputs/dense_output/") name = name.replace("reader/layer_normalization", "qa_outputs/layer_normalization") + ## For block_emb + #name = name.replace("block_emb", "block_emb") + name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model @@ -1395,6 +1398,76 @@ def forward( ) +class RealmSearcher(RealmPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] + + def __init__(self, config): + super().__init__(config) + self.embedder = RealmEmbedder(config) + self.block_emb = 
torch.zeros(()).new_empty( + size=(config.num_block_records, config.retriever_proj_size), + dtype=torch.float32, + device=torch.device('cpu') + ) + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is not None and input_ids.shape[0] != 1) or (inputs_embeds is not None and inputs_embeds.shape[0] != 1): + raise ValueError( + "The batch_size of inputs should be 1." + ) + + if self.training: + beam_size = self.config.searcher_beam_size + else: + beam_size = self.config.reader_beam_size + + question_outputs = self.embedder( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # [1, projection_size] + question_projection = question_outputs[0] + + searcher = load_scann_searcher( + db = self.block_emb, + num_neighbors=beam_size) + + retrieved_block_ids, _ = searcher.search_batched(question_projection) + # [retriever_beam_size] + retrieved_block_ids = torch.tensor(retrieved_block_ids.astype('int64').squeeze()) + + # [retriever_beam_size, projection_size] + retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids) + + retrieved_logits = torch.einsum("D,BD->B", question_projection.squeeze(), retrieved_block_emb) + + print(retrieved_logits) + + + class RealmReader(RealmPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py new file mode 100644 index 000000000000..84e05f4a6ab1 --- /dev/null +++ b/src/transformers/models/realm/utils_realm.py @@ -0,0 +1,26 @@ +import torch +import numpy as np + +def load_scann_searcher(db, + num_neighbors, + dimensions_per_block=2, + num_leaves=1000, + num_leaves_to_search=100, + training_sample_size=100000): + """Load scann searcher from checkpoint.""" + + from scann.scann_ops.py.scann_ops_pybind import builder as Builder + + + builder = Builder( + db=db, + num_neighbors=num_neighbors, + distance_measure="dot_product") + builder = builder.tree( + num_leaves=num_leaves, + num_leaves_to_search=num_leaves_to_search, + training_sample_size=training_sample_size) + builder = builder.score_ah(dimensions_per_block=dimensions_per_block) + + searcher = builder.build() + return searcher \ No newline at end of file From 8ee98d7aade289d7668ad091fba8c74f5f39069e Mon Sep 17 00:00:00 2001 From: qqaatw Date: Mon, 27 Sep 2021 00:43:12 +0800 Subject: [PATCH 35/98] Complete searcher & reader --- .../models/realm/modeling_realm.py | 143 ++++++++++++++---- src/transformers/models/realm/utils_realm.py | 14 +- 2 files changed, 125 insertions(+), 32 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index d5da632cbba0..75747a9cb22e 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -25,6 +25,8 @@ from packaging import version from torch import nn from torch.nn import CrossEntropyLoss +import numpy as np + from ...activations import ACT2FN from ...file_utils import 
add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings @@ -33,7 +35,6 @@ BaseModelOutputWithPoolingAndCrossAttentions, MaskedLMOutput, ModelOutput, - QuestionAnsweringModelOutput ) from ...modeling_utils import ( PreTrainedModel, @@ -43,7 +44,7 @@ ) from ...utils import logging from .configuration_realm import RealmConfig -from .utils_realm import load_scann_searcher +from .utils_realm import load_scann_searcher, convert_tfrecord_to_np logger = logging.get_logger(__name__) _BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert" @@ -825,6 +826,23 @@ class RealmRetrieverOutput(ModelOutput): candidate_score: torch.FloatTensor = None +@dataclass +class RealmSearcherOutput(ModelOutput): + retrieved_logits: torch.FloatTensor = None + retrieved_blocks: np.ndarray = None + retrieved_block_ids: torch.int64 = None + + +@dataclass +class RealmReaderOutput(ModelOutput): + loss: torch.FloatTensor = None + block_idx: torch.int64 = None + start_pos: torch.int32 = None + end_pos: torch.int32 = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + class RealmPredictionHeadTransform(nn.Module): def __init__(self, config): super().__init__() @@ -936,10 +954,10 @@ def mask_to_score(mask): start_projection, end_projection = hidden_states.chunk(2, dim=-1) block_mask = token_type_ids.detach().clone() block_mask[:, -1] = 0 - candidates_starts, candidates_ends, candidate_mask = span_candidates(block_mask) + candidate_starts, candidate_ends, candidate_mask = span_candidates(block_mask) - candidate_start_projections = torch.index_select(start_projection, dim=1, index=candidates_starts) - candidate_end_projections = torch.index_select(end_projection, dim=1, index=candidates_ends) + candidate_start_projections = torch.index_select(start_projection, dim=1, index=candidate_starts) + candidate_end_projections = torch.index_select(end_projection, dim=1, index=candidate_ends) candidate_hidden = candidate_start_projections + candidate_end_projections # [reader_beam_size, num_candidates, span_hidden_size] @@ -951,7 +969,7 @@ def mask_to_score(mask): # [reader_beam_size, num_candidates] reader_logits += mask_to_score(candidate_mask) - return reader_logits + return reader_logits, candidate_starts, candidate_ends class RealmPreTrainedModel(PreTrainedModel): @@ -1422,6 +1440,7 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, + block_records_path=None, return_dict=None, ): @@ -1454,17 +1473,35 @@ def forward( searcher = load_scann_searcher( db = self.block_emb, num_neighbors=beam_size) + + retrieved_blocks = convert_tfrecord_to_np( + block_records_path = block_records_path, + num_block_records = self.config.num_block_records, + ) + retrieved_block_ids, _ = searcher.search_batched(question_projection) + # [retriever_beam_size] retrieved_block_ids = torch.tensor(retrieved_block_ids.astype('int64').squeeze()) + # [retriever_beam_size] + retrieved_blocks = np.take(retrieved_blocks, indices=retrieved_block_ids, axis=0) + # [retriever_beam_size, projection_size] retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids) + #retrieved_block_emb = np.take(self.block_emb, indices=retrieved_block_ids, axis=0) retrieved_logits = torch.einsum("D,BD->B", question_projection.squeeze(), retrieved_block_emb) - - print(retrieved_logits) + + if not return_dict: + return (retrieved_logits, retrieved_blocks, retrieved_block_ids) + + return 
RealmSearcherOutput( + retrieved_logits=retrieved_logits, + retrieved_blocks=retrieved_blocks, + retrieved_block_ids=retrieved_block_ids, + ) @@ -1500,8 +1537,10 @@ def forward( head_mask=None, inputs_embeds=None, relevance_score=None, + retrieved_blocks=None, start_positions=None, end_positions=None, + has_answers=None, output_attentions=None, output_hidden_states=None, return_dict=None, @@ -1541,44 +1580,90 @@ def forward( # [reader_beam_size, joint_seq_len, hidden_size] sequence_output = outputs[0] - # [reader_beam_size, num_candidates] - reader_logits = self.qa_outputs(sequence_output, token_type_ids) + # [reader_beam_size, num_candidates], [num_candidates], [num_candidates] + reader_logits, candidate_starts, candidate_ends = self.qa_outputs(sequence_output, token_type_ids) # [retriever_beam_size, 1] - retriever_logits = torch.unsequeeze(relevance_score, -1) + retriever_logits = torch.unsqueeze(relevance_score, -1) # [reader_beam_size, num_candidates] reader_logits += retriever_logits + # [] + predicted_block_index = torch.argmax(torch.max(reader_logits, dim=1).values) + # [] + predicted_candidate = torch.argmax(torch.max(reader_logits, dim=0).values) + + #predicted_block = torch.index_select(retrieved_blocks, dim=0, index=predicted_block_index) + + #predicted_token_ids = tf.gather(reader_outputs.token_ids, + # predicted_block_index) - predicted_block_index = tf.argmax(tf.reduce_max(reader_outputs.logits, 1)) - predicted_candidate = tf.argmax(tf.reduce_max(reader_outputs.logits, 0)) + predicted_start = torch.index_select(candidate_starts, dim=0, index=predicted_candidate) + predicted_end = torch.index_select(candidate_ends, dim=0, index=predicted_candidate) total_loss = None - if start_positions is not None and end_positions is not None: - - + if start_positions is not None and end_positions is not None and has_answers is not None: + def compute_correct_candidates(candidate_starts, candidate_ends, gold_starts, + gold_ends): + """Compute correct span.""" + # [reader_beam_size, num_answers, num_candidates] + is_gold_start = torch.eq( + torch.unsqueeze(torch.unsqueeze(candidate_starts, 0), 0), + torch.unsqueeze(gold_starts, -1)) + is_gold_end = torch.eq( + torch.unsqueeze(torch.unsqueeze(candidate_ends, 0), 0), + torch.unsqueeze(gold_ends, -1)) + + # [reader_beam_size, num_candidates] + return torch.any(torch.logical_and(is_gold_start, is_gold_end), 1) + + def marginal_log_loss(logits, is_correct): + """Loss based on the negative marginal log-likelihood.""" + + def mask_to_score(mask): + return (1.0 - mask.type(torch.float32)) * -10000.0 + + # [] + log_numerator = torch.logsumexp(logits + mask_to_score(is_correct), dim=-1) + log_denominator = torch.logsumexp(logits, -1) + return log_denominator - log_numerator + # If we are on multi-GPU, split add a dimension - if len(start_positions.size()) > 1: - start_positions = start_positions.squeeze(-1) - if len(end_positions.size()) > 1: - end_positions = end_positions.squeeze(-1) + #if len(start_positions.size()) > 1: + # start_positions = start_positions.squeeze(-1) + #if len(end_positions.size()) > 1: + # end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms - ignored_index = start_logits.size(1) + ignored_index = sequence_output.size(1) start_positions = start_positions.clamp(0, ignored_index) end_positions = end_positions.clamp(0, ignored_index) - loss_fct = CrossEntropyLoss(ignore_index=ignored_index) - start_loss = loss_fct(start_logits, start_positions) - end_loss 
= loss_fct(end_logits, end_positions) - total_loss = (start_loss + end_loss) / 2 + retriever_correct = has_answers + any_retriever_correct = torch.any(retriever_correct) + + reader_correct = compute_correct_candidates( + candidate_starts=candidate_starts, + candidate_ends=candidate_ends, + gold_starts=start_positions, + gold_ends=end_positions, + ) + any_reader_correct = torch.any(reader_correct) + + retriever_loss = marginal_log_loss(retriever_logits, retriever_correct) + reader_loss = marginal_log_loss(reader_logits, reader_correct) + retriever_loss *= any_retriever_correct.type(torch.float32) + reader_loss *= any_reader_correct.type(torch.float32) + + total_loss = torch.mean(retriever_loss + reader_loss) if not return_dict: - output = (start_logits, end_logits) + outputs[2:] + output = (predicted_block_index, predicted_start, predicted_end) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output - return QuestionAnsweringModelOutput( + return RealmReaderOutput( loss=total_loss, - start_logits=start_logits, - end_logits=end_logits, + block_idx=predicted_block_index, + start_pos=predicted_start, + end_pos=predicted_end, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py index 84e05f4a6ab1..81ffbbff4ca1 100644 --- a/src/transformers/models/realm/utils_realm.py +++ b/src/transformers/models/realm/utils_realm.py @@ -1,5 +1,4 @@ -import torch -import numpy as np +import tensorflow.compat.v1 as tf def load_scann_searcher(db, num_neighbors, @@ -23,4 +22,13 @@ def load_scann_searcher(db, builder = builder.score_ah(dimensions_per_block=dimensions_per_block) searcher = builder.build() - return searcher \ No newline at end of file + return searcher + +def convert_tfrecord_to_np(block_records_path, num_block_records): + blocks_dataset = tf.data.TFRecordDataset( + block_records_path, buffer_size=512 * 1024 * 1024) + blocks_dataset = blocks_dataset.batch( + num_block_records, drop_remainder=True) + np_record = [raw_record.numpy() for raw_record in blocks_dataset.take(1)][0] + + return np_record \ No newline at end of file From 7158fe855681eaeff806f2c8637961f1be176608 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Mon, 27 Sep 2021 09:26:48 +0800 Subject: [PATCH 36/98] Move block records init to constructor --- src/transformers/models/realm/modeling_realm.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 75747a9cb22e..58f72fd824f1 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1420,7 +1420,7 @@ class RealmSearcher(RealmPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] - def __init__(self, config): + def __init__(self, config, block_records_path): super().__init__(config) self.embedder = RealmEmbedder(config) self.block_emb = torch.zeros(()).new_empty( @@ -1428,6 +1428,10 @@ def __init__(self, config): dtype=torch.float32, device=torch.device('cpu') ) + self.block_records = convert_tfrecord_to_np( + block_records_path = block_records_path, + num_block_records = self.config.num_block_records, + ) self.init_weights() def forward( @@ -1440,7 +1444,6 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - block_records_path=None, return_dict=None, ): @@ -1474,19 +1477,13 @@ def forward( db = 
self.block_emb, num_neighbors=beam_size) - retrieved_blocks = convert_tfrecord_to_np( - block_records_path = block_records_path, - num_block_records = self.config.num_block_records, - ) - - retrieved_block_ids, _ = searcher.search_batched(question_projection) # [retriever_beam_size] retrieved_block_ids = torch.tensor(retrieved_block_ids.astype('int64').squeeze()) # [retriever_beam_size] - retrieved_blocks = np.take(retrieved_blocks, indices=retrieved_block_ids, axis=0) + retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) # [retriever_beam_size, projection_size] retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids) From 938ad0a2d10bb2c95891596a4a0af7bfdf88a5ee Mon Sep 17 00:00:00 2001 From: qqaatw Date: Tue, 28 Sep 2021 18:42:22 +0800 Subject: [PATCH 37/98] Fix training bug --- .../models/realm/configuration_realm.py | 2 + .../models/realm/modeling_realm.py | 60 +++++++++---------- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index cfe4fbaa5083..9a9cfef67888 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -114,6 +114,7 @@ def __init__( max_span_width=10, reader_layer_norm_eps=1e-3, reader_beam_size=5, + reader_seq_len=288+32, num_block_records=13353718, searcher_beam_size=5000, pad_token_id=1, @@ -145,6 +146,7 @@ def __init__( self.max_span_width = max_span_width self.reader_layer_norm_eps = reader_layer_norm_eps self.reader_beam_size = reader_beam_size + self.reader_seq_len = reader_seq_len # Searcher config self.num_block_records = num_block_records diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 58f72fd824f1..c04dce8bfdf0 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -139,7 +139,6 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): logger.info(f"Skipping {'/'.join(name)}") continue if len(scope_names) >= 2: - print(scope_names) num = int(scope_names[1]) pointer = pointer[num] if m_name[-11:] == "_embeddings": @@ -836,6 +835,8 @@ class RealmSearcherOutput(ModelOutput): @dataclass class RealmReaderOutput(ModelOutput): loss: torch.FloatTensor = None + retriever_loss: torch.FloatTensor = None + reader_loss:torch.FloatTensor = None block_idx: torch.int64 = None start_pos: torch.int32 = None end_pos: torch.int32 = None @@ -926,8 +927,8 @@ def span_candidates(masks): """ _, max_sequence_len = masks.shape def _spans_given_width(width): - current_starts = torch.arange(max_sequence_len - width + 1) - current_ends = torch.arange(width - 1, max_sequence_len) + current_starts = torch.arange(max_sequence_len - width + 1, device=masks.device) + current_ends = torch.arange(width - 1, max_sequence_len, device=masks.device) return current_starts, current_ends starts, ends = zip(*(_spans_given_width(w + 1) @@ -1428,6 +1429,7 @@ def __init__(self, config, block_records_path): dtype=torch.float32, device=torch.device('cpu') ) + self.searcher = None self.block_records = convert_tfrecord_to_np( block_records_path = block_records_path, num_block_records = self.config.num_block_records, @@ -1459,6 +1461,12 @@ def forward( else: beam_size = self.config.reader_beam_size + if self.searcher is None: + self.searcher = load_scann_searcher( + db = self.block_emb, + num_neighbors=beam_size + ) 
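+        # Editorial note (illustrative sketch, not part of the original patch):
+        # ScaNN performs approximate maximum inner-product search over `block_emb`.
+        # Given the question projection computed below, an exact brute-force
+        # equivalent for a small corpus would be:
+        #
+        #     scores = torch.matmul(question_projection, self.block_emb.T)
+        #     _, retrieved_block_ids = torch.topk(scores, k=beam_size, dim=-1)
+        #
+        # ScaNN trades this exactness for speed over the ~13M block embeddings
+        # (`config.num_block_records`).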
+ question_outputs = self.embedder( input_ids, attention_mask=attention_mask, @@ -1473,11 +1481,8 @@ def forward( # [1, projection_size] question_projection = question_outputs[0] - searcher = load_scann_searcher( - db = self.block_emb, - num_neighbors=beam_size) - retrieved_block_ids, _ = searcher.search_batched(question_projection) + retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) # [retriever_beam_size] retrieved_block_ids = torch.tensor(retrieved_block_ids.astype('int64').squeeze()) @@ -1487,9 +1492,9 @@ def forward( # [retriever_beam_size, projection_size] retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids) - #retrieved_block_emb = np.take(self.block_emb, indices=retrieved_block_ids, axis=0) - retrieved_logits = torch.einsum("D,BD->B", question_projection.squeeze(), retrieved_block_emb) + # [retriever_beam_size] + retrieved_logits = torch.einsum("D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device)) if not return_dict: return (retrieved_logits, retrieved_blocks, retrieved_block_ids) @@ -1580,23 +1585,21 @@ def forward( # [reader_beam_size, num_candidates], [num_candidates], [num_candidates] reader_logits, candidate_starts, candidate_ends = self.qa_outputs(sequence_output, token_type_ids) # [retriever_beam_size, 1] - retriever_logits = torch.unsqueeze(relevance_score, -1) + retriever_logits = torch.unsqueeze(relevance_score[0: self.config.reader_beam_size], -1) # [reader_beam_size, num_candidates] reader_logits += retriever_logits # [] predicted_block_index = torch.argmax(torch.max(reader_logits, dim=1).values) # [] predicted_candidate = torch.argmax(torch.max(reader_logits, dim=0).values) - - #predicted_block = torch.index_select(retrieved_blocks, dim=0, index=predicted_block_index) - - #predicted_token_ids = tf.gather(reader_outputs.token_ids, - # predicted_block_index) - + # [1] predicted_start = torch.index_select(candidate_starts, dim=0, index=predicted_candidate) + # [1] predicted_end = torch.index_select(candidate_ends, dim=0, index=predicted_candidate) total_loss = None + retriever_loss = None + reader_loss = None if start_positions is not None and end_positions is not None and has_answers is not None: def compute_correct_candidates(candidate_starts, candidate_ends, gold_starts, gold_ends): @@ -1620,19 +1623,14 @@ def mask_to_score(mask): # [] log_numerator = torch.logsumexp(logits + mask_to_score(is_correct), dim=-1) - log_denominator = torch.logsumexp(logits, -1) + log_denominator = torch.logsumexp(logits, dim=-1) return log_denominator - log_numerator - - # If we are on multi-GPU, split add a dimension - #if len(start_positions.size()) > 1: - # start_positions = start_positions.squeeze(-1) - #if len(end_positions.size()) > 1: - # end_positions = end_positions.squeeze(-1) # sometimes the start/end positions are outside our model inputs, we ignore these terms + # `-1` is reserved for no answer. 
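+            # Editorial note (not part of the original patch): clamping to
+            # [-1, ignored_index] keeps the `-1` padding slots intact, and since
+            # `candidate_starts`/`candidate_ends` are always >= 0, a padded slot
+            # can never match a span candidate, so it simply drops out of the
+            # marginal log-likelihood computed below.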
ignored_index = sequence_output.size(1) - start_positions = start_positions.clamp(0, ignored_index) - end_positions = end_positions.clamp(0, ignored_index) + start_positions = start_positions.clamp(-1, ignored_index) + end_positions = end_positions.clamp(-1, ignored_index) retriever_correct = has_answers any_retriever_correct = torch.any(retriever_correct) @@ -1640,24 +1638,26 @@ def mask_to_score(mask): reader_correct = compute_correct_candidates( candidate_starts=candidate_starts, candidate_ends=candidate_ends, - gold_starts=start_positions, - gold_ends=end_positions, + gold_starts=start_positions[0: self.config.reader_beam_size], + gold_ends=end_positions[0: self.config.reader_beam_size], ) any_reader_correct = torch.any(reader_correct) - retriever_loss = marginal_log_loss(retriever_logits, retriever_correct) + retriever_loss = marginal_log_loss(relevance_score, retriever_correct) reader_loss = marginal_log_loss(reader_logits, reader_correct) retriever_loss *= any_retriever_correct.type(torch.float32) reader_loss *= any_reader_correct.type(torch.float32) - total_loss = torch.mean(retriever_loss + reader_loss) + total_loss = (retriever_loss + reader_loss).mean() if not return_dict: output = (predicted_block_index, predicted_start, predicted_end) + outputs[2:] - return ((total_loss,) + output) if total_loss is not None else output + return ((total_loss, retriever_loss, reader_loss) + output) if total_loss is not None else output return RealmReaderOutput( loss=total_loss, + retriever_loss=retriever_loss, + reader_loss=reader_loss, block_idx=predicted_block_index, start_pos=predicted_start, end_pos=predicted_end, From 55f6531917fb82eba6adc5a67c7d672608a2b4fb Mon Sep 17 00:00:00 2001 From: qqaatw Date: Tue, 28 Sep 2021 23:43:20 +0800 Subject: [PATCH 38/98] Add some outputs to RealmReader --- src/transformers/models/realm/modeling_realm.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index c04dce8bfdf0..3a3394de0839 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -837,7 +837,10 @@ class RealmReaderOutput(ModelOutput): loss: torch.FloatTensor = None retriever_loss: torch.FloatTensor = None reader_loss:torch.FloatTensor = None + retriever_correct: torch.BoolTensor = None + reader_correct: torch.BoolTensor = None block_idx: torch.int64 = None + candidate: torch.int32 = None start_pos: torch.int32 = None end_pos: torch.int32 = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None @@ -1651,14 +1654,17 @@ def mask_to_score(mask): total_loss = (retriever_loss + reader_loss).mean() if not return_dict: - output = (predicted_block_index, predicted_start, predicted_end) + outputs[2:] - return ((total_loss, retriever_loss, reader_loss) + output) if total_loss is not None else output + output = (predicted_block_index, predicted_candidate, predicted_start, predicted_end) + outputs[2:] + return ((total_loss, retriever_loss, reader_loss, retriever_correct, reader_correct) + output) if total_loss is not None else output return RealmReaderOutput( loss=total_loss, retriever_loss=retriever_loss, reader_loss=reader_loss, + retriever_correct=retriever_correct, + reader_correct=reader_correct, block_idx=predicted_block_index, + candidate=predicted_candidate, start_pos=predicted_start, end_pos=predicted_end, hidden_states=outputs.hidden_states, From 89fd9c79c430b0044272f9957362b9dec02e0628 Mon Sep 17 00:00:00 
2001 From: qqaatw Date: Thu, 30 Sep 2021 01:21:19 +0800 Subject: [PATCH 39/98] Add finetuned checkpoint variable names parsing --- .../models/realm/modeling_realm.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 3a3394de0839..b8196a85abb8 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -89,29 +89,32 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): arrays.append(array) for name, array in zip(names, arrays): - if "reader" in name: - is_reader_checkpoint=True - - for name, array in zip(names, arrays): - # For embedder and retriever - embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" - name = name.replace("module/module/module/bert/", f"{embedder_prefix}bert/") - name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") - name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") - name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") - # For reader - if is_reader_checkpoint and isinstance(model, RealmReader) and "reader" not in name: - logger.info(f"Skipping {name} as the it is not reader's parameter") + if isinstance(model, RealmReader) and "reader" not in name: + logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter") + continue + elif not isinstance(model, RealmReader) and "reader" in name: + logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter") continue name = name.replace("reader/module/bert/", "bert/") name = name.replace("reader/module/cls/", "cls/") name = name.replace("reader/dense/", "qa_outputs/dense_intermediate/") name = name.replace("reader/dense_1/", "qa_outputs/dense_output/") name = name.replace("reader/layer_normalization", "qa_outputs/layer_normalization") - - ## For block_emb - #name = name.replace("block_emb", "block_emb") + + + # For embedder and retriever + embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" + name = name.replace("module/module/module/bert/", f"{embedder_prefix}bert/") + name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") + name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") + name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") + + # Fine-tuned checkpoints + name = name.replace("module/module/module/module/bert/", f"{embedder_prefix}bert/") + name = name.replace("module/module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") + name = name.replace("module/module/module/dense/", f"{embedder_prefix}cls/dense/") + name = name.replace("module/module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v From 3e57b529768e54efbb5457434214a6e4c28bc54a Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 30 Sep 2021 01:56:28 +0800 Subject: [PATCH 40/98] Fix bug --- src/transformers/models/realm/modeling_realm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index b8196a85abb8..2db75e502310 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ 
-1606,6 +1606,8 @@ def forward(
         total_loss = None
         retriever_loss = None
         reader_loss = None
+        retriever_correct = None
+        reader_correct = None
         if start_positions is not None and end_positions is not None and has_answers is not None:
             def compute_correct_candidates(candidate_starts, candidate_ends, gold_starts,
                 gold_ends):

From 136b3ff0c05eb8006cd320066c1476f2c8341f67 Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Fri, 1 Oct 2021 21:09:04 +0800
Subject: [PATCH 41/98] Update REALM config

---
 src/transformers/models/realm/configuration_realm.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py
index 9a9cfef67888..a591cf12952d 100644
--- a/src/transformers/models/realm/configuration_realm.py
+++ b/src/transformers/models/realm/configuration_realm.py
@@ -109,6 +109,7 @@ def __init__(
         type_vocab_size=2,
         initializer_range=0.02,
         layer_norm_eps=1e-12,
+        use_scann=True,
         use_cache=True,
         span_hidden_size=256,
         max_span_width=10,
@@ -117,6 +118,7 @@ def __init__(
         reader_layer_norm_eps=1e-3,
         reader_beam_size=5,
         reader_seq_len=288+32,
         num_block_records=13353718,
         searcher_beam_size=5000,
+        searcher_seq_len=64,
         pad_token_id=1,
         bos_token_id=0,
         eos_token_id=2,
@@ -139,6 +141,7 @@ def __init__(
         self.initializer_range = initializer_range
         self.type_vocab_size = type_vocab_size
         self.layer_norm_eps = layer_norm_eps
+        self.use_scann = use_scann
         self.use_cache = use_cache
 
         # Reader config
@@ -150,4 +153,5 @@ def __init__(
         # Searcher config
         self.num_block_records = num_block_records
-        self.searcher_beam_size = searcher_beam_size
\ No newline at end of file
+        self.searcher_beam_size = searcher_beam_size
+        self.searcher_seq_len = searcher_seq_len
\ No newline at end of file

From 9f629611d27404c363ea812d8aaa5eb14761afc6 Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Sat, 2 Oct 2021 00:28:17 +0800
Subject: [PATCH 42/98] Add RealmForOpenQA

---
 .../models/realm/modeling_realm.py            | 150 +++++++++++++++--
 src/transformers/models/realm/utils_realm.py |   5 +-
 2 files changed, 145 insertions(+), 10 deletions(-)

diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 2db75e502310..ff2b907a8d39 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -26,6 +26,7 @@
 from torch import nn
 from torch.nn import CrossEntropyLoss
 import numpy as np
+from transformers.models.realm.tokenization_realm import RealmTokenizer
 
 
 from ...activations import ACT2FN
@@ -929,7 +930,7 @@ def span_candidates(masks):
                 Returns:
                     starts: [num_spans]
                     ends: [num_spans]
-                    span_masks: [num_retrievals, num_spans]
+                    span_masks: [num_retrievals, num_spans] whether each span lies within the evidence block.
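+
+            Example (editorial sketch, not part of the original patch): with
+            max_span_width=2 and max_sequence_len=3, `_spans_given_width` yields
+            widths 1 and 2, so starts = [0, 1, 2, 0, 1] and ends = [0, 1, 2, 1, 2].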
""" _, max_sequence_len = masks.shape def _spans_given_width(width): @@ -1430,17 +1431,30 @@ class RealmSearcher(RealmPreTrainedModel): def __init__(self, config, block_records_path): super().__init__(config) self.embedder = RealmEmbedder(config) - self.block_emb = torch.zeros(()).new_empty( - size=(config.num_block_records, config.retriever_proj_size), - dtype=torch.float32, - device=torch.device('cpu') - ) self.searcher = None self.block_records = convert_tfrecord_to_np( block_records_path = block_records_path, - num_block_records = self.config.num_block_records, + num_block_records = config.num_block_records, + ) + self.register_buffer("block_emb", + torch.zeros(()).new_empty( + size=(config.num_block_records, config.retriever_proj_size), + dtype=torch.float32, + device=torch.device('cpu') + ) ) self.init_weights() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + try: + import scann + except ImportError: + raise ImportError( + "RealmSearcher requires ScaNN to retrieve documents from the corpus." + "Please install it through `pip install scann`." + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) def forward( self, @@ -1512,7 +1526,6 @@ def forward( ) - class RealmReader(RealmPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] @@ -1675,3 +1688,124 @@ def mask_to_score(mask): hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +class RealmForOpenQA(RealmPreTrainedModel): + def __init__(self, config, searcher, reader, tokenizer): + super().__init__(config) + self.searcher = searcher + self.reader = reader + self.tokenizer = tokenizer + + @classmethod + def from_pretrained(cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, *args, **kwargs): + config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) + searcher = RealmSearcher.from_pretrained(searcher_pretrained_name_or_path, *args, **kwargs) + reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, *args, **kwargs) + tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path, **kwargs) + return cls(config, searcher, reader, tokenizer) + + def save_pretrained(self, save_directory): + self.searcher.save_pretrained(save_directory) + self.reader.save_pretrained(save_directory) + + def retrieve(self, input_ids, **kwargs): + output = self.searcher( + input_ids, + return_dict=True, + **kwargs) + return output + + def read(self, searcher_output, question, answers): + def block_has_answer(concat_inputs, answers): + """check if retrieved_blocks has answers.""" + has_answers = [] + start_pos = [] + end_pos = [] + max_answers = 0 + + for input_id in concat_inputs.input_ids: + pass_sep = False + answer_pos = 0 + start=-1 + start_pos.append([]) + end_pos.append([]) + for answer in answers: + for idx, id in enumerate(input_id): + if id == self.tokenizer.sep_token_id: + pass_sep = True + if not pass_sep: + continue + if answer[answer_pos] == id: + if start == -1: + start = idx + if answer_pos == len(answer) - 1: + start_pos[-1].append(start) + end_pos[-1].append(idx) + answer_pos = 0 + start = -1 + break + else: + answer_pos += 1 + else: + answer_pos = 0 + start = -1 + + if len(start_pos[-1]) == 0: + has_answers.append(False) + else: + has_answers.append(True) + if len(start_pos[-1]) > max_answers: + max_answers = len(start_pos[-1]) + + # Pad -1 to max_answers + for start_pos_, end_pos_ in zip(start_pos, end_pos): + while 
len(start_pos_) < max_answers: + start_pos_.append(-1) + while len(end_pos_) < max_answers: + end_pos_.append(-1) + + assert len(has_answers) == len(start_pos) == len(end_pos) + + return ( + torch.tensor(has_answers, dtype=torch.bool, device=concat_inputs.input_ids.device), + torch.tensor(start_pos, dtype=torch.int64, device=concat_inputs.input_ids.device), + torch.tensor(end_pos, dtype=torch.int64, device=concat_inputs.input_ids.device), + ) + + text = [] + text_pair = [] + for retrieved_block in searcher_output.retrieved_blocks: + text.append(question) + text_pair.append(retrieved_block.decode()) + + concat_inputs = self.tokenizer(text, text_pair, return_tensors='pt', padding=True, truncation=True, max_length=self.config.reader_seq_len) + + if answers is not None: + has_answers, start_positions, end_positions = block_has_answer(concat_inputs.to(searcher_output.retrieved_logits.device), answers) + else: + has_answers, start_positions, end_positions = (None, None, None) + + output = self.reader( + input_ids=concat_inputs.input_ids[0: self.config.reader_beam_size], + attention_mask=concat_inputs.attention_mask[0: self.config.reader_beam_size], + token_type_ids=concat_inputs.token_type_ids[0: self.config.reader_beam_size], + relevance_score=searcher_output.retrieved_logits, + has_answers=has_answers, + start_positions=start_positions, + end_positions=end_positions, + return_dict=True, + ) + + answer = self.tokenizer.decode(concat_inputs.input_ids[output.block_idx][output.start_pos: output.end_pos + 1]) + + return output, answer + + def forward(self, question, answers=None): + question_ids = self.tokenizer([question], max_length=self.config.searcher_seq_len) + + searcher_output = self.retrieve(**question_ids) + + reader_output, predicted_answer = self.read(searcher_output, question, answers) + + return searcher_output, reader_output, predicted_answer \ No newline at end of file diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py index 81ffbbff4ca1..dfffb57f501a 100644 --- a/src/transformers/models/realm/utils_realm.py +++ b/src/transformers/models/realm/utils_realm.py @@ -1,5 +1,3 @@ -import tensorflow.compat.v1 as tf - def load_scann_searcher(db, num_neighbors, dimensions_per_block=2, @@ -25,6 +23,9 @@ def load_scann_searcher(db, return searcher def convert_tfrecord_to_np(block_records_path, num_block_records): + + import tensorflow.compat.v1 as tf + blocks_dataset = tf.data.TFRecordDataset( block_records_path, buffer_size=512 * 1024 * 1024) blocks_dataset = blocks_dataset.batch( From de1f3f00227b346cd55f1f88896a9862d957905a Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 2 Oct 2021 13:06:33 +0800 Subject: [PATCH 43/98] Update convert_tfrecord logits --- src/transformers/models/realm/utils_realm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py index dfffb57f501a..45358744fd44 100644 --- a/src/transformers/models/realm/utils_realm.py +++ b/src/transformers/models/realm/utils_realm.py @@ -30,6 +30,6 @@ def convert_tfrecord_to_np(block_records_path, num_block_records): block_records_path, buffer_size=512 * 1024 * 1024) blocks_dataset = blocks_dataset.batch( num_block_records, drop_remainder=True) - np_record = [raw_record.numpy() for raw_record in blocks_dataset.take(1)][0] + np_record = next(blocks_dataset.take(1).as_numpy_iterator()) return np_record \ No newline at end of file From bd16314b6a6dfd6e244b9bf8778e6a29ca442d8b Mon 
Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 2 Oct 2021 16:05:32 +0800 Subject: [PATCH 44/98] Fix bugs --- .../models/realm/modeling_realm.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index ff2b907a8d39..80443022812c 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1436,7 +1436,7 @@ def __init__(self, config, block_records_path): block_records_path = block_records_path, num_block_records = config.num_block_records, ) - self.register_buffer("block_emb", + self.register_buffer("block_emb", torch.zeros(()).new_empty( size=(config.num_block_records, config.retriever_proj_size), dtype=torch.float32, @@ -1481,6 +1481,8 @@ def forward( else: beam_size = self.config.reader_beam_size + if self.block_emb.device != torch.device("cpu"): + self.block_emb = self.block_emb.cpu() if self.searcher is None: self.searcher = load_scann_searcher( db = self.block_emb, @@ -1501,7 +1503,7 @@ def forward( # [1, projection_size] question_projection = question_outputs[0] - + # [1, retriever_beam_size] retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) # [retriever_beam_size] @@ -1665,7 +1667,7 @@ def mask_to_score(mask): any_reader_correct = torch.any(reader_correct) retriever_loss = marginal_log_loss(relevance_score, retriever_correct) - reader_loss = marginal_log_loss(reader_logits, reader_correct) + reader_loss = marginal_log_loss(reader_logits.view(-1), reader_correct.view(-1)) retriever_loss *= any_retriever_correct.type(torch.float32) reader_loss *= any_reader_correct.type(torch.float32) @@ -1698,11 +1700,11 @@ def __init__(self, config, searcher, reader, tokenizer): self.tokenizer = tokenizer @classmethod - def from_pretrained(cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, *args, **kwargs): + def from_pretrained(cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, block_records_path, **kwargs): config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) - searcher = RealmSearcher.from_pretrained(searcher_pretrained_name_or_path, *args, **kwargs) - reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, *args, **kwargs) - tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path, **kwargs) + searcher = RealmSearcher.from_pretrained(searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs) + reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) + tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path) return cls(config, searcher, reader, tokenizer) def save_pretrained(self, save_directory): @@ -1779,7 +1781,7 @@ def block_has_answer(concat_inputs, answers): text.append(question) text_pair.append(retrieved_block.decode()) - concat_inputs = self.tokenizer(text, text_pair, return_tensors='pt', padding=True, truncation=True, max_length=self.config.reader_seq_len) + concat_inputs = self.tokenizer(text, text_pair, padding=True, truncation=True, max_length=self.config.reader_seq_len, return_tensors='pt') if answers is not None: has_answers, start_positions, end_positions = block_has_answer(concat_inputs.to(searcher_output.retrieved_logits.device), answers) @@ -1801,11 +1803,11 @@ def block_has_answer(concat_inputs, answers): return output, answer - def forward(self, 
question, answers=None):
-        question_ids = self.tokenizer([question], max_length=self.config.searcher_seq_len)
-
+    def forward(self, question, answer_ids=None):
+        question_ids = self.tokenizer([question], padding=True, truncation=True, max_length=self.config.searcher_seq_len, return_tensors='pt')
+
         searcher_output = self.retrieve(**question_ids)
 
-        reader_output, predicted_answer = self.read(searcher_output, question, answers)
+        reader_output, predicted_answer = self.read(searcher_output, question, answer_ids)
 
         return searcher_output, reader_output, predicted_answer
\ No newline at end of file

From c917ef9aab10ea7107f4b231a5d5a3446ba0ea8a Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Sat, 2 Oct 2021 16:38:56 +0800
Subject: [PATCH 45/98] Complete imports

---
 src/transformers/__init__.py                  |  6 +++
 src/transformers/models/realm/__init__.py     |  6 +++
 .../models/realm/configuration_realm.py       | 38 +++++++++++++++----
 utils/check_repo.py                           |  3 ++
 4 files changed, 46 insertions(+), 7 deletions(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 1e94911936cd..28e13c98e037 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1031,8 +1031,11 @@
             "REALM_PRETRAINED_MODEL_ARCHIVE_LIST",
             "RealmEmbedder",
             "RealmEncoder",
+            "RealmForOpenQA",
             "RealmPreTrainedModel",
+            "RealmReader",
             "RealmRetriever",
+            "RealmSearcher",
             "load_tf_weights_in_realm",
         ]
     )
@@ -2704,6 +2707,9 @@
         RealmEncoder,
         RealmPreTrainedModel,
         RealmRetriever,
+        RealmSearcher,
+        RealmReader,
+        RealmForOpenQA,
         load_tf_weights_in_realm,
     )
     from .models.reformer import (
diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py
index 293ae5e55812..b17804f24153 100644
--- a/src/transformers/models/realm/__init__.py
+++ b/src/transformers/models/realm/__init__.py
@@ -31,8 +31,11 @@
         "REALM_PRETRAINED_MODEL_ARCHIVE_LIST",
         "RealmEmbedder",
         "RealmEncoder",
+        "RealmForOpenQA",
         "RealmPreTrainedModel",
+        "RealmReader",
         "RealmRetriever",
+        "RealmSearcher",
         "load_tf_weights_in_realm",
     ]
 
@@ -46,8 +49,11 @@
         REALM_PRETRAINED_MODEL_ARCHIVE_LIST,
         RealmEmbedder,
         RealmEncoder,
+        RealmForOpenQA,
         RealmPreTrainedModel,
+        RealmReader,
         RealmRetriever,
+        RealmSearcher,
         load_tf_weights_in_realm,
     )
 
diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py
index a591cf12952d..38a28e36b862 100644
--- a/src/transformers/models/realm/configuration_realm.py
+++ b/src/transformers/models/realm/configuration_realm.py
@@ -30,9 +30,14 @@
 
 class RealmConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of :class:`~transformers.RealmEmbedder`,
-    :class:`~transformers.RealmRetriever`, and :class:`~transformers.RealmEncoder`. It is used to instantiate an REALM
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    This is the configuration class to store the configuration of
+    1. :class:`~transformers.RealmEmbedder`
+    2. :class:`~transformers.RealmRetriever`
+    3. :class:`~transformers.RealmEncoder`
+    4. :class:`~transformers.RealmSearcher`
+    5. :class:`~transformers.RealmReader`
+    6. :class:`~transformers.RealmForOpenQA`
+    It is used to instantiate a REALM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
     defaults will yield a similar configuration to that of the REALM `realm-cc-news-pretrained
     `__ architecture.
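
# Editorial sketch (not part of the patch series): with PATCH 44's interfaces in
# place, end-to-end usage is expected to look roughly like the following. The
# checkpoint names come from this series' pretrained archive lists; the local
# `block_records_path` value is a hypothetical placeholder.
#
#     model = RealmForOpenQA.from_pretrained(
#         "qqaatw/realm-orqa-nq-searcher",
#         "qqaatw/realm-orqa-nq-reader",
#         block_records_path="./blocks.tfr",  # hypothetical path to the TFRecord corpus
#     )
#     searcher_output, reader_output, predicted_answer = model("who wrote the declaration of independence?")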
@@ -44,7 +49,7 @@ class RealmConfig(PretrainedConfig): vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.RealmEmbedder`, - :class:`~transformers.RealmRetriever`, or :class:`~transformers.RealmEncoder`. + :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmEncoder`, :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimension of the encoder layers and the pooler layer. retriever_proj_size (:obj:`int`, `optional`, defaults to 128): @@ -69,7 +74,7 @@ class RealmConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RealmEmbedder`, - :class:`~transformers.RealmRetriever`, or :class:`~transformers.RealmEncoder`. + :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmEncoder`, :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): @@ -77,6 +82,25 @@ class RealmConfig(PretrainedConfig): use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if ``config.is_decoder=True``. + use_scann (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not :class:`~transformers.RealmSearcher` uses `ScaNN` as the vector similarity searcher. + This option has no effect and is reserved for future development. + span_hidden_size (:obj:`int`, `optional`, defaults to 256): + Dimension of the reader's spans. + max_span_width (:obj:`int`, `optional`, defaults to 10): + Max span width of the reader. + reader_layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-3): + The epsilon used by the reader's layer normalization layers. + reader_beam_size (:obj:`int`, `optional`, defaults to 5): + Beam size of the reader. + reader_seq_len (:obj:`int`, `optional`, defaults to 288+32): + Maximum sequence length of the reader. + num_block_records (:obj:`int`, `optional`, defaults to 13353718): + Number of block records. + searcher_beam_size (:obj:`int`, `optional`, defaults to 5000): + Beam size of the searcher. Note that when eval mode is enabled, `searcher_beam_size` will be the same as `reader_beam_size`. + searcher_seq_len (:obj:`int`, `optional`, defaults to 64): + Maximum sequence length of the searcher. 
Example:: @@ -109,8 +133,8 @@ def __init__( type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, - use_scann=True, use_cache=True, + use_scann=True, span_hidden_size=256, max_span_width=10, reader_layer_norm_eps=1e-3, @@ -141,8 +165,8 @@ def __init__( self.initializer_range = initializer_range self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps - self.use_scann = use_scann self.use_cache = use_cache + self.use_scann = use_scann # Reader config self.span_hidden_size = span_hidden_size diff --git a/utils/check_repo.py b/utils/check_repo.py index 84b658d152d9..e7e04e8e2af2 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -118,7 +118,10 @@ "RagSequenceForGeneration", "RagTokenForGeneration", "RealmEmbedder", + "RealmForOpenQA", "RealmRetriever", + "RealmReader", + "RealmSearcher", "TFDPRReader", "TFGPT2DoubleHeadsModel", "TFOpenAIGPTDoubleHeadsModel", From 113807f840138bd9b6485d75228ca8ab3ef3b1b8 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 2 Oct 2021 22:01:47 +0800 Subject: [PATCH 46/98] Update docs --- docs/source/model_doc/realm.rst | 21 ++ .../models/realm/configuration_realm.py | 4 +- .../models/realm/modeling_realm.py | 195 +++++++++++++++--- .../models/realm/tokenization_realm.py | 8 +- src/transformers/models/realm/utils_realm.py | 16 ++ tests/test_modeling_realm.py | 50 ++++- 6 files changed, 262 insertions(+), 32 deletions(-) diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index a524690d24b1..88e564958629 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -73,3 +73,24 @@ RealmEncoder .. autoclass:: transformers.RealmEncoder :members: forward + + +RealmSearcher +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.RealmSearcher + :members: forward + + +RealmReader +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.RealmReader + :members: forward + + +RealmForOpenQA +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.RealmForOpenQA + :members: from_pretrained, forward \ No newline at end of file diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 38a28e36b862..4170d7e69575 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. +# Copyright 2021 The REALM authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
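The searcher consumes questions capped at ``searcher_seq_len`` (64 by default, per the docstring above); a minimal sketch of preparing such an input outside the model, assuming the ``qqaatw/realm-cc-news-pretrained-embedder`` vocabulary from the earlier patches::

    >>> from transformers import RealmTokenizer

    >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-embedder")
    >>> question_ids = tokenizer(
    ...     ["Who is the pioneer in modern computer science?"],
    ...     padding=True, truncation=True, max_length=64, return_tensors="pt",
    ... )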
@@ -24,6 +24,8 @@
     "realm-cc-news-pretrained-bert": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-bert/resolve/main/config.json",
     "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/config.json",
     "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/config.json",
+    "realm-orqa-nq-searcher": "https://huggingface.co/qqaatw/realm-orqa-nq-searcher/resolve/main/config.json",
+    "realm-orqa-nq-reader": "https://huggingface.co/qqaatw/realm-orqa-nq-reader/resolve/main/config.json",
     # See all REALM models at https://huggingface.co/models?filter=realm
 }
 
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 80443022812c..bbfc0e02942b 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2021 The HuggingFace Team The HuggingFace Inc. team. All rights reserved.
+# Copyright 2021 The REALM authors and The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -58,6 +58,8 @@
     "qqaatw/realm-cc-news-pretrained-bert",
     "qqaatw/realm-cc-news-pretrained-embedder",
     "qqaatw/realm-cc-news-pretrained-retriever",
+    "qqaatw/realm-orqa-nq-searcher",
+    "qqaatw/realm-orqa-nq-reader",
     # See all REALM models at https://huggingface.co/models?filter=realm
 ]
 
@@ -831,26 +833,93 @@ class RealmRetrieverOutput(ModelOutput):
 
 @dataclass
 class RealmSearcherOutput(ModelOutput):
+    """
+    Outputs of RealmSearcher models.
+
+    Args:
+        retrieved_logits (:obj:`torch.FloatTensor` of shape :obj:`(config.searcher_beam_size,)`):
+            The relevance score of document candidates (before softmax).
+        retrieved_blocks (:obj:`np.ndarray` of shape :obj:`(config.searcher_beam_size,)`):
+            Retrieved document blocks.
+        retrieved_block_ids (:obj:`torch.LongTensor` of shape :obj:`(config.searcher_beam_size,)`):
+            IDs of retrieved blocks.
+    """
+
    retrieved_logits: torch.FloatTensor = None
    retrieved_blocks: np.ndarray = None
-    retrieved_block_ids: torch.int64 = None
+    retrieved_block_ids: torch.LongTensor = None
 
 
 @dataclass
 class RealmReaderOutput(ModelOutput):
+    """
+    Outputs of RealmReader models.
+
+    Args:
+        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`start_positions`, :obj:`end_positions`, :obj:`has_answers` are provided):
+            Total loss.
+        retriever_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`start_positions`, :obj:`end_positions`, :obj:`has_answers` are provided):
+            Retriever loss.
+        reader_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`start_positions`, :obj:`end_positions`, :obj:`has_answers` are provided):
+            Reader loss.
+        retriever_correct (:obj:`torch.BoolTensor` of shape :obj:`(config.searcher_beam_size,)`, `optional`):
+            Whether or not an evidence block derived from `RealmSearcher` contains the answer.
+        reader_correct (:obj:`torch.BoolTensor` of shape :obj:`(config.reader_beam_size, num_candidates)`, `optional`):
+            Whether or not a span candidate contains the answer.
+        block_idx (:obj:`torch.LongTensor` of shape :obj:`()`):
+            The index of the retrieved evidence block in which the predicted answer is most likely.
+        candidate (:obj:`torch.LongTensor` of shape :obj:`()`):
+            The index of the predicted answer span candidate within the retrieved evidence block.
+        start_pos (:obj:`torch.IntTensor` of shape :obj:`()`):
+            Predicted answer starting position in `RealmReader`'s inputs.
+        end_pos (:obj:`torch.IntTensor` of shape :obj:`()`):
+            Predicted answer ending position in `RealmReader`'s inputs.
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
+            sequence_length, sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
     loss: torch.FloatTensor = None
     retriever_loss: torch.FloatTensor = None
-    reader_loss:torch.FloatTensor = None
+    reader_loss: torch.FloatTensor = None
     retriever_correct: torch.BoolTensor = None
     reader_correct: torch.BoolTensor = None
-    block_idx: torch.int64 = None
-    candidate: torch.int32 = None
-    start_pos: torch.int32 = None
-    end_pos: torch.int32 = None
+    block_idx: torch.LongTensor = None
+    candidate: torch.LongTensor = None
+    start_pos: torch.IntTensor = None
+    end_pos: torch.IntTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
 
 
+@dataclass
+class RealmForOpenQAOutput(ModelOutput):
+    """
+    Outputs of RealmForOpenQA models.
+
+    Args:
+        searcher_output (:obj:`dict`):
+            Searcher output.
+        reader_output (:obj:`dict`):
+            Reader output.
+        predicted_answer (:obj:`str`):
+            Predicted answer.
+    """
+
+    searcher_output: dict = None
+    reader_output: dict = None
+    predicted_answer: str = None
+
+
 class RealmPredictionHeadTransform(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -1155,7 +1224,7 @@ def forward(
 )
 class RealmRetriever(RealmPreTrainedModel):
     r"""
-    Parameters:
+    Args:
         query_embedder (:class:`~transformers.RealmEmbedder`):
             Embedder for input sequences. If not specified, it will use the same embedder as candidate sequences.
     """
@@ -1423,11 +1492,16 @@ def forward(
             attentions=joint_outputs.attentions,
         )
 
-
+@add_start_docstrings(
+    "The searcher of REALM outputting relevance scores (before softmax) and the corresponding document blocks.",
+    REALM_START_DOCSTRING,
+)
 class RealmSearcher(RealmPreTrainedModel):
-
-    _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"]
-
+    r"""
+    Args:
+        block_records_path (:obj:`str`):
+            Path to the block records file.
+ """ def __init__(self, config, block_records_path): super().__init__(config) self.embedder = RealmEmbedder(config) @@ -1456,6 +1530,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + @add_start_docstrings_to_model_forward( + REALM_INPUTS_DOCSTRING.format("1, searcher_seq_len") + ) + @replace_return_docstrings(output_type=RealmSearcherOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -1468,12 +1546,14 @@ def forward( output_hidden_states=None, return_dict=None, ): - + r""" + Returns: + """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if (input_ids is not None and input_ids.shape[0] != 1) or (inputs_embeds is not None and inputs_embeds.shape[0] != 1): raise ValueError( - "The batch_size of inputs should be 1." + "The batch_size of the inputs must be 1." ) if self.training: @@ -1503,19 +1583,19 @@ def forward( # [1, projection_size] question_projection = question_outputs[0] - # [1, retriever_beam_size] + # [1, searcher_beam_size] retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) - # [retriever_beam_size] + # [searcher_beam_size] retrieved_block_ids = torch.tensor(retrieved_block_ids.astype('int64').squeeze()) - # [retriever_beam_size] + # [searcher_beam_size] retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) - # [retriever_beam_size, projection_size] + # [searcher_beam_size, projection_size] retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids) - # [retriever_beam_size] + # [searcher_beam_size] retrieved_logits = torch.einsum("D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device)) if not return_dict: @@ -1528,6 +1608,10 @@ def forward( ) +@add_start_docstrings( + "The reader of REALM.", + REALM_START_DOCSTRING, +) class RealmReader(RealmPreTrainedModel): _keys_to_ignore_on_load_unexpected = [r"pooler", "cls"] @@ -1542,15 +1626,11 @@ def __init__(self, config): self.init_weights() - """ - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @add_code_sample_docstrings( - tokenizer_class=_TOKENIZER_FOR_DOC, - checkpoint=_BERT_CHECKPOINT_FOR_DOC, - output_type=QuestionAnsweringModelOutput, - config_class=_CONFIG_FOR_DOC, + + @add_start_docstrings_to_model_forward( + REALM_INPUTS_DOCSTRING.format("reader_beam_size, sequence_length") ) - """ + @replace_return_docstrings(output_type=RealmReaderOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -1569,6 +1649,8 @@ def forward( return_dict=None, ): r""" + relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(searcher_beam_size,)`, `optional`): + Relevance score derived from `RealmSearcher`, must be specified if you want to compute the marginal log loss. start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the @@ -1577,9 +1659,10 @@ def forward( Labels for position (index) of the end of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
-
+        has_answers (:obj:`torch.BoolTensor` of shape :obj:`(searcher_beam_size,)`, `optional`):
+            Whether or not the evidence blocks derived from `RealmSearcher` have answer(s).
+
         Returns:
-
         """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1605,7 +1688,7 @@ def forward(
 
         # [reader_beam_size, num_candidates], [num_candidates], [num_candidates]
         reader_logits, candidate_starts, candidate_ends = self.qa_outputs(sequence_output, token_type_ids)
-        # [retriever_beam_size, 1]
+        # [searcher_beam_size, 1]
         retriever_logits = torch.unsqueeze(relevance_score[0: self.config.reader_beam_size], -1)
         # [reader_beam_size, num_candidates]
         reader_logits += retriever_logits
@@ -1692,6 +1775,21 @@ def mask_to_score(mask):
         )
 
 
+REALM_FOR_OPEN_QA_DOCSTRING = r"""
+    Args:
+        question (:obj:`str`):
+            Open-QA question.
+        answer_ids (:obj:`torch.LongTensor` of shape :obj:`(num_answers, answer_length)`, `optional`):
+            Answer ids for computing the marginal log-likelihood loss. Indices should be in ``[-1, 0, ...,
+            config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-1`` are ignored
+            (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+"""
+@add_start_docstrings(
+    "A wrapper around `RealmSearcher` and `RealmReader` providing end-to-end open-domain question answering.",
+    REALM_START_DOCSTRING,
+)
 class RealmForOpenQA(RealmPreTrainedModel):
     def __init__(self, config, searcher, reader, tokenizer):
         super().__init__(config)
@@ -1701,6 +1799,16 @@ def __init__(self, config, searcher, reader, tokenizer):
 
     @classmethod
     def from_pretrained(cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, block_records_path, **kwargs):
+        """
+        Args:
+            searcher_pretrained_name_or_path (:obj:`str`):
+                Name or path of the pretrained searcher checkpoint.
+            reader_pretrained_name_or_path (:obj:`str`):
+                Name or path of the pretrained reader checkpoint.
+            block_records_path (:obj:`str`):
+                Path to the block records file.
+
+        """
         config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs)
         searcher = RealmSearcher.from_pretrained(searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs)
         reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs)
@@ -1803,11 +1911,40 @@ def block_has_answer(concat_inputs, answers):
 
         return output, answer
 
-    def forward(self, question, answer_ids=None):
+    @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING)
+    @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(self, question, answer_ids=None, return_dict=None):
+        r"""
+        Returns:
+
+        Example::
+
+            >>> import torch
+            >>> from transformers import RealmForOpenQA, RealmTokenizer
+
+            >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-searcher")
+            >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", "blocks.tfr")
+
+            >>> question = "Who is the pioneer in modern computer science?"
+            >>> answer_ids = tokenizer(["alan mathison turing"], add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False).input_ids
+
+            >>> outputs = model(question, answer_ids, return_dict=True)
+            >>> predicted_answer = outputs.predicted_answer
+            >>> loss = outputs.reader_output.loss
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
         question_ids = self.tokenizer([question], padding=True, truncation=True, max_length=self.config.searcher_seq_len, return_tensors='pt')
         searcher_output = self.retrieve(**question_ids)
         reader_output, predicted_answer = self.read(searcher_output, question, answer_ids)
-        return searcher_output, reader_output, predicted_answer
\ No newline at end of file
+        if not return_dict:
+            return searcher_output, reader_output, predicted_answer
+
+        return RealmForOpenQAOutput(
+            searcher_output=searcher_output,
+            reader_output=reader_output,
+            predicted_answer=predicted_answer,
+        )
\ No newline at end of file
diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py
index ebdd1d40ff9a..64848f7795bd 100644
--- a/src/transformers/models/realm/tokenization_realm.py
+++ b/src/transformers/models/realm/tokenization_realm.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2021 The REALM authors and The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
         "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/vocab.txt",
         "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/vocab.txt",
         "realm-cc-news-pretrained-encoder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-encoder/resolve/main/vocab.txt",
+        "realm-orqa-nq-searcher": "https://huggingface.co/qqaatw/realm-orqa-nq-searcher/resolve/main/vocab.txt",
+        "realm-orqa-nq-reader": "https://huggingface.co/qqaatw/realm-orqa-nq-reader/resolve/main/vocab.txt",
     }
 }
 
@@ -36,6 +38,8 @@
     "realm-cc-news-pretrained-embedder": 512,
     "realm-cc-news-pretrained-retriever": 512,
     "realm-cc-news-pretrained-encoder": 512,
+    "realm-orqa-nq-searcher": 512,
+    "realm-orqa-nq-reader": 512,
 }
 
 
@@ -43,6 +47,8 @@
     "realm-cc-news-pretrained-embedder": {"do_lower_case": True},
     "realm-cc-news-pretrained-retriever": {"do_lower_case": True},
     "realm-cc-news-pretrained-encoder": {"do_lower_case": True},
+    "realm-orqa-nq-searcher": {"do_lower_case": True},
+    "realm-orqa-nq-reader": {"do_lower_case": True},
 }
 
 
diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py
index 45358744fd44..2e128d5db56c 100644
--- a/src/transformers/models/realm/utils_realm.py
+++ b/src/transformers/models/realm/utils_realm.py
@@ -1,3 +1,19 @@
+# coding=utf-8
+# Copyright 2021 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Utilities for REALM.""" + def load_scann_searcher(db, num_neighbors, dimensions_per_block=2, diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 231ffec8e7b2..52053b11fdb3 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -28,7 +28,7 @@ if is_torch_available(): import torch - from transformers import RealmEmbedder, RealmEncoder, RealmRetriever + from transformers import RealmEmbedder, RealmEncoder, RealmRetriever, RealmSearcher, RealmReader, RealmForOpenQA class RealmModelTester: @@ -342,6 +342,39 @@ def test_inference_encoder(self): self.assertTrue(torch.allclose(output[1, :2, :2], expected_slice, atol=1e-4)) + @slow + def test_inference_open_qa(self): + model = RealmForOpenQA.from_pretrained( + r"/mnt/sda1/testing/pytorch-realm-orqa/export/searcher", + r"/mnt/sda1/testing/pytorch-realm-orqa/export/reader", + r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" + ) + + question = "Who is the pioneer in modern computer science?" + searcher_output, reader_output, predicted_answer = model(question) + + self.assertEqual(predicted_answer, "alan mathison turing") + + @slow + def test_inference_reader(self): + config = RealmConfig(searcher_beam_size=5) + model = RealmReader.from_pretrained("qqaatw/realm-orqa-nq-reader", config) + + concat_inputs = torch.arange(25).view((5, 5)) + output = model( + concat_inputs, + return_dict=True) + + block_idx_expected_shape = torch.Size(()) + start_pos_expected_shape = torch.Size((1)) + end_pos_expected_shape = torch.Size((1)) + self.assertEqual(output.block_idx.shape, block_idx_expected_shape) + self.assertEqual(output.start_pos.shape, start_pos_expected_shape) + self.assertEqual(output.end_pos.shape, end_pos_expected_shape) + + expected_slice = torch.tensor([[0.7410, 0.7170]]) + self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) + @slow def test_inference_retriever(self): num_candidates = 2 @@ -359,3 +392,18 @@ def test_inference_retriever(self): expected_slice = torch.tensor([[0.7410, 0.7170]]) self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) + + @slow + def test_inference_searcher(self): + config = RealmConfig(searcher_beam_size=5) + model = RealmSearcher.from_pretrained("qqaatw/realm-orqa-nq-searcher", config=config) + + input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) + output = model(input_ids)[0] + + expected_shape = torch.Size((5)) + self.assertEqual(output.shape, expected_shape) + + expected_slice = torch.tensor([[0.7410, 0.7170]]) + self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) + From f46b43e44097c6eda0fa1da8bde0febcf09f440c Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 2 Oct 2021 23:28:15 +0800 Subject: [PATCH 47/98] Update naming --- docs/source/model_doc/realm.rst | 4 +- src/transformers/__init__.py | 4 +- src/transformers/models/realm/__init__.py | 4 +- .../models/realm/configuration_realm.py | 10 ++-- .../models/realm/modeling_realm.py | 46 +++++++++---------- src/transformers/utils/dummy_pt_objects.py | 2 +- tests/test_modeling_realm.py | 32 ++++++++++--- 7 files changed, 60 insertions(+), 42 deletions(-) diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index 88e564958629..338771e1948b 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -68,10 +68,10 @@ RealmRetriever :members: forward -RealmEncoder +RealmKnowledgeAugEncoder 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.RealmEncoder +.. autoclass:: transformers.RealmKnowledgeAugEncoder :members: forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 28e13c98e037..fee1589a4993 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1030,8 +1030,8 @@ [ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", "RealmEmbedder", - "RealmEncoder", "RealmForOpenQA", + "RealmKnowledgeAugEncoder", "RealmPreTrainedModel", "RealmReader", "RealmRetriever", @@ -2704,7 +2704,7 @@ from .models.realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, RealmEmbedder, - RealmEncoder, + RealmKnowledgeAugEncoder, RealmPreTrainedModel, RealmRetriever, RealmSearcher, diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index b17804f24153..df8fd2376fc7 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -30,8 +30,8 @@ _import_structure["modeling_realm"] = [ "REALM_PRETRAINED_MODEL_ARCHIVE_LIST", "RealmEmbedder", - "RealmEncoder", "RealmForOpenQA", + "RealmKnowledgeAugEncoder", "RealmPreTrainedModel", "RealmReader", "RealmRetriever", @@ -48,7 +48,7 @@ from .modeling_realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, RealmEmbedder, - RealmEncoder, + RealmKnowledgeAugEncoder, RealmForOpenQA, RealmPreTrainedModel, RealmReader, diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 4170d7e69575..03455483b59e 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -35,13 +35,13 @@ class RealmConfig(PretrainedConfig): This is the configuration class to store the configuration of 1. :class:`~transformers.RealmEmbedder` 2. :class:`~transformers.RealmRetriever` - 3. :class:`~transformers.RealmEncoder` + 3. :class:`~transformers.RealmKnowledgeAugEncoder` 4. :class:`~transformers.RealmSearcher` 5. :class:`~transformers.RealmReader` 6. :class:`~transformers.RealmForOpenQA` It is used to instantiate an REALM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM `realm-cc-news-pretrained - `__ architecture. + `__ architecture. Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. @@ -51,7 +51,7 @@ class RealmConfig(PretrainedConfig): vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.RealmEmbedder`, - :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmEncoder`, :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`. + :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmKnowledgeAugEncoder`, :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`. hidden_size (:obj:`int`, `optional`, defaults to 768): Dimension of the encoder layers and the pooler layer. 
retriever_proj_size (:obj:`int`, `optional`, defaults to 128): @@ -61,7 +61,7 @@ class RealmConfig(PretrainedConfig): num_attention_heads (:obj:`int`, `optional`, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. num_candidates (:obj:`int`, `optional`, defaults to 8): - Number of candidates inputted to the RealmRetriever or RealmEncoder. + Number of candidates inputted to the RealmRetriever or RealmKnowledgeAugEncoder. intermediate_size (:obj:`int`, `optional`, defaults to 3072): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`): @@ -76,7 +76,7 @@ class RealmConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). type_vocab_size (:obj:`int`, `optional`, defaults to 2): The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RealmEmbedder`, - :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmEncoder`, :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`. + :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmKnowledgeAugEncoder`, :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index bbfc0e02942b..3188640cbf54 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -164,7 +164,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): # Copied from transformers.models.bert.modeling_bert.BertEmbeddings -class BertEmbeddings(nn.Module): +class RealmEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" def __init__(self, config): @@ -225,7 +225,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertSelfAttention -class BertSelfAttention(nn.Module): +class RealmSelfAttention(nn.Module): def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): @@ -351,7 +351,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertSelfOutput -class BertSelfOutput(nn.Module): +class RealmSelfOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -366,11 +366,11 @@ def forward(self, hidden_states, input_tensor): # Copied from transformers.models.bert.modeling_bert.BertAttention -class BertAttention(nn.Module): +class RealmAttention(nn.Module): def __init__(self, config): super().__init__() - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) + self.self = RealmSelfAttention(config) + self.output = RealmSelfOutput(config) self.pruned_heads = set() def prune_heads(self, heads): @@ -416,7 +416,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertIntermediate -class BertIntermediate(nn.Module): +class RealmIntermediate(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) @@ -432,7 +432,7 @@ def forward(self, hidden_states): # Copied from 
transformers.models.bert.modeling_bert.BertOutput -class BertOutput(nn.Module): +class RealmOutput(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) @@ -447,19 +447,19 @@ def forward(self, hidden_states, input_tensor): # Copied from transformers.models.bert.modeling_bert.BertLayer -class BertLayer(nn.Module): +class RealmLayer(nn.Module): def __init__(self, config): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = BertAttention(config) + self.attention = RealmAttention(config) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" - self.crossattention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) + self.crossattention = RealmAttention(config) + self.intermediate = RealmIntermediate(config) + self.output = RealmOutput(config) def forward( self, @@ -531,11 +531,11 @@ def feed_forward_chunk(self, attention_output): # Copied from transformers.models.bert.modeling_bert.BertEncoder -class BertEncoder(nn.Module): +class RealmEncoder(nn.Module): def __init__(self, config): super().__init__() self.config = config - self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer = nn.ModuleList([RealmLayer(config) for _ in range(config.num_hidden_layers)]) def forward( self, @@ -629,7 +629,7 @@ def custom_forward(*inputs): # Copied from transformers.models.bert.modeling_bert.BertPooler -class BertPooler(nn.Module): +class RealmPooler(nn.Module): def __init__(self, config): super().__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) @@ -653,10 +653,10 @@ def __init__(self, config, add_pooling_layer=True): super().__init__(config) self.config = config - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) + self.embeddings = RealmEmbeddings(config) + self.encoder = RealmEncoder(config) - self.pooler = BertPooler(config) if add_pooling_layer else None + self.pooler = RealmPooler(config) if add_pooling_layer else None # Weight initialization is managed by Realm models. 
# self.init_weights() @@ -1343,10 +1343,10 @@ def forward( @add_start_docstrings( - "The encoder of REALM outputting masked language model logits and marginal log-likelihood loss.", + "The knowledge-augmented encoder of REALM outputting masked language model logits and marginal log-likelihood loss.", REALM_START_DOCSTRING, ) -class RealmEncoder(RealmPreTrainedModel): +class RealmKnowledgeAugEncoder(RealmPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = RealmBertModel(self.config) @@ -1406,10 +1406,10 @@ def forward( Example:: >>> import torch - >>> from transformers import RealmTokenizer, RealmEncoder + >>> from transformers import RealmTokenizer, RealmKnowledgeAugEncoder >>> tokenizer = RealmTokenizer.from_pretrained('qqaatw/realm-cc-news-pretrained-bert') - >>> model = RealmEncoder.from_pretrained('qqaatw/realm-cc-news-pretrained-bert', num_candidates=2) + >>> model = RealmKnowledgeAugEncoder.from_pretrained('qqaatw/realm-cc-news-pretrained-bert', num_candidates=2) >>> # batch_size = 2, num_candidates = 2 >>> text = [ diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 19145ea1cfa8..299b28f9baa9 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2838,7 +2838,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RealmEncoder: +class RealmKnowledgeAugEncoder: def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 52053b11fdb3..a8f1c19159d5 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -28,7 +28,7 @@ if is_torch_available(): import torch - from transformers import RealmEmbedder, RealmEncoder, RealmRetriever, RealmSearcher, RealmReader, RealmForOpenQA + from transformers import RealmEmbedder, RealmKnowledgeAugEncoder, RealmRetriever, RealmSearcher, RealmReader, RealmForOpenQA class RealmModelTester: @@ -172,7 +172,7 @@ def create_and_check_encoder( token_labels, choice_labels, ): - model = RealmEncoder(config=config) + model = RealmKnowledgeAugEncoder(config=config) model.to(torch_device) model.eval() relevance_score = floats_tensor([self.batch_size, self.num_candidates]) @@ -237,7 +237,7 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): all_model_classes = ( ( RealmEmbedder, - RealmEncoder, + RealmKnowledgeAugEncoder, # RealmRetriever is excluded from common tests as it is a container model # consisting of two RealmEmbedders & simple inner product calculation. 
# RealmRetriever @@ -285,7 +285,7 @@ def test_training(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() config.return_dict = True - for model_class in [RealmEncoder]: + for model_class in [RealmKnowledgeAugEncoder]: model = model_class(config) model.to(torch_device) model.train() @@ -300,14 +300,30 @@ def test_embedder_from_pretrained(self): @slow def test_encoder_from_pretrained(self): - model = RealmEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") + model = RealmKnowledgeAugEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") + self.assertIsNotNone(model) + + @slow + def test_open_qa_from_pretrained(self): + #TODO: TF record dataset + model = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader") + self.assertIsNotNone(model) + + @slow + def test_reader_from_pretrained(self): + model = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-reader") self.assertIsNotNone(model) @slow def test_retriever_from_pretrained(self): model = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever") self.assertIsNotNone(model) - + + @slow + def test_searcher_from_pretrained(self): + #TODO: TF record dataset + model = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-searcher") + self.assertIsNotNone(model) @require_torch class RealmModelIntegrationTest(unittest.TestCase): @@ -330,7 +346,7 @@ def test_inference_encoder(self): num_candidates = 2 vocab_size = 30522 - model = RealmEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert", num_candidates=num_candidates) + model = RealmKnowledgeAugEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert", num_candidates=num_candidates) input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) relevance_score = torch.tensor([[0.3, 0.7]], dtype=torch.float32) output = model(input_ids, relevance_score=relevance_score)[0] @@ -344,6 +360,7 @@ def test_inference_encoder(self): @slow def test_inference_open_qa(self): + #TODO: TF record dataset model = RealmForOpenQA.from_pretrained( r"/mnt/sda1/testing/pytorch-realm-orqa/export/searcher", r"/mnt/sda1/testing/pytorch-realm-orqa/export/reader", @@ -395,6 +412,7 @@ def test_inference_retriever(self): @slow def test_inference_searcher(self): + #TODO: TF record dataset config = RealmConfig(searcher_beam_size=5) model = RealmSearcher.from_pretrained("qqaatw/realm-orqa-nq-searcher", config=config) From a7b727dbc579b849b94f021852cc85cfd1eacbe2 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Mon, 4 Oct 2021 13:48:37 +0800 Subject: [PATCH 48/98] Add brute-force searcher --- .../models/realm/modeling_realm.py | 62 +++++++++++-------- src/transformers/models/realm/utils_realm.py | 62 +++++++++++++------ 2 files changed, 77 insertions(+), 47 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 3188640cbf54..7bc691f3384d 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -45,7 +45,7 @@ ) from ...utils import logging from .configuration_realm import RealmConfig -from .utils_realm import load_scann_searcher, convert_tfrecord_to_np +from .utils_realm import BruteForceSearcher, ScaNNSearcher, convert_tfrecord_to_np logger = logging.get_logger(__name__) _BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert" @@ -163,7 +163,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): return model -# Copied from 
transformers.models.bert.modeling_bert.BertEmbeddings +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings->RealmEmbeddings class RealmEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -224,7 +224,7 @@ def forward( return embeddings -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention->RealmSelfAttention class RealmSelfAttention(nn.Module): def __init__(self, config): super().__init__() @@ -350,7 +350,7 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput->RealmSelfOutput class RealmSelfOutput(nn.Module): def __init__(self, config): super().__init__() @@ -365,7 +365,7 @@ def forward(self, hidden_states, input_tensor): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention +# Copied from transformers.models.bert.modeling_bert.BertAttention->RealmAttention class RealmAttention(nn.Module): def __init__(self, config): super().__init__() @@ -415,7 +415,7 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertIntermediate +# Copied from transformers.models.bert.modeling_bert.BertIntermediate->RealmIntermediate class RealmIntermediate(nn.Module): def __init__(self, config): super().__init__() @@ -431,7 +431,7 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertOutput +# Copied from transformers.models.bert.modeling_bert.BertOutput->RealmOutput class RealmOutput(nn.Module): def __init__(self, config): super().__init__() @@ -446,7 +446,7 @@ def forward(self, hidden_states, input_tensor): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLayer +# Copied from transformers.models.bert.modeling_bert.BertLayer->RealmLayer class RealmLayer(nn.Module): def __init__(self, config): super().__init__() @@ -530,7 +530,7 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder +# Copied from transformers.models.bert.modeling_bert.BertEncoder->RealmEncoder class RealmEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -628,7 +628,7 @@ def custom_forward(*inputs): ) -# Copied from transformers.models.bert.modeling_bert.BertPooler +# Copied from transformers.models.bert.modeling_bert.BertPooler->RealmPooler class RealmPooler(nn.Module): def __init__(self, config): super().__init__() @@ -1521,15 +1521,17 @@ def __init__(self, config, block_records_path): @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - try: - import scann - except ImportError: - raise ImportError( - "RealmSearcher requires ScaNN to retrieve documents from the corpus." - "Please install it through `pip install scann`." - ) - return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - + realm_searcher = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + if realm_searcher.config.use_scann: + try: + import scann + except ImportError: + raise ImportError( + "RealmSearcher requires ScaNN to retrieve documents from the corpus." + "Please install it through `pip install scann`." 
+ ) + return realm_searcher + @add_start_docstrings_to_model_forward( REALM_INPUTS_DOCSTRING.format("1, searcher_seq_len") ) @@ -1561,13 +1563,19 @@ def forward( else: beam_size = self.config.reader_beam_size - if self.block_emb.device != torch.device("cpu"): + if self.config.use_scann and self.block_emb.device != torch.device("cpu"): self.block_emb = self.block_emb.cpu() if self.searcher is None: - self.searcher = load_scann_searcher( - db = self.block_emb, - num_neighbors=beam_size - ) + if self.config.use_scann: + self.searcher = ScaNNSearcher( + db = self.block_emb, + num_neighbors = beam_size, + ) + else: + self.searcher = BruteForceSearcher( + db = self.block_emb, + num_neighbors = beam_size, + ) question_outputs = self.embedder( input_ids, @@ -1584,16 +1592,16 @@ def forward( question_projection = question_outputs[0] # [1, searcher_beam_size] - retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) + retrieved_block_ids = self.searcher.search_batched(question_projection) # [searcher_beam_size] - retrieved_block_ids = torch.tensor(retrieved_block_ids.astype('int64').squeeze()) + retrieved_block_ids = retrieved_block_ids.squeeze() # [searcher_beam_size] retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) # [searcher_beam_size, projection_size] - retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids) + retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device)) # [searcher_beam_size] retrieved_logits = torch.einsum("D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device)) diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py index 2e128d5db56c..c0748002fc81 100644 --- a/src/transformers/models/realm/utils_realm.py +++ b/src/transformers/models/realm/utils_realm.py @@ -14,29 +14,51 @@ # limitations under the License. 
"""Utilities for REALM.""" -def load_scann_searcher(db, - num_neighbors, - dimensions_per_block=2, - num_leaves=1000, - num_leaves_to_search=100, - training_sample_size=100000): - """Load scann searcher from checkpoint.""" +import torch + + +class BruteForceSearcher(): + def __init__(self, db, num_neighbors): + """Build brute force searcher.""" + self.db = db + self.num_neighbors = num_neighbors - from scann.scann_ops.py.scann_ops_pybind import builder as Builder + def search_batched(self, question_projection): + batch_scores = torch.einsum("BD,QD->QB", self.db, question_projection) + _, retrieved_block_ids = torch.topk(batch_scores, k=self.num_neighbors, dim=-1) + # Must return cpu tensor for subsequent numpy operations + return retrieved_block_ids.cpu() + + +class ScaNNSearcher(): + def __init__(self, db, + num_neighbors, + dimensions_per_block=2, + num_leaves=1000, + num_leaves_to_search=100, + training_sample_size=100000): + """Build scann searcher.""" + from scann.scann_ops.py.scann_ops_pybind import builder as Builder + + + builder = Builder( + db=db, + num_neighbors=num_neighbors, + distance_measure="dot_product") + builder = builder.tree( + num_leaves=num_leaves, + num_leaves_to_search=num_leaves_to_search, + training_sample_size=training_sample_size) + builder = builder.score_ah(dimensions_per_block=dimensions_per_block) + + self.searcher = builder.build() + + def search_batched(self, question_projection): + retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) + # Must return cpu tensor for subsequent numpy operations + return torch.tensor(retrieved_block_ids.astype('int64'), device=torch.device("cpu")) - builder = Builder( - db=db, - num_neighbors=num_neighbors, - distance_measure="dot_product") - builder = builder.tree( - num_leaves=num_leaves, - num_leaves_to_search=num_leaves_to_search, - training_sample_size=training_sample_size) - builder = builder.score_ah(dimensions_per_block=dimensions_per_block) - - searcher = builder.build() - return searcher def convert_tfrecord_to_np(block_records_path, num_block_records): From 426121c2f673b2ad6fb72022794871cea993f618 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Fri, 8 Oct 2021 22:42:37 +0800 Subject: [PATCH 49/98] Pass realm model tests --- src/transformers/modeling_utils.py | 1 - .../models/realm/configuration_realm.py | 4 +- .../models/realm/modeling_realm.py | 21 +- tests/test_modeling_realm.py | 186 +++++++++++++++--- utils/check_repo.py | 4 +- 5 files changed, 172 insertions(+), 44 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 531fd373ff70..355623ff1278 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1186,7 +1186,6 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( config_path, - *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 03455483b59e..4ebb220620b1 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -180,4 +180,6 @@ def __init__( # Searcher config self.num_block_records = num_block_records self.searcher_beam_size = searcher_beam_size - self.searcher_seq_len = searcher_seq_len \ No 
newline at end of file + self.searcher_seq_len = searcher_seq_len + + # TODO: Remove use_cache \ No newline at end of file diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 7bc691f3384d..a046e18b2aa8 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1517,12 +1517,7 @@ def __init__(self, config, block_records_path): device=torch.device('cpu') ) ) - self.init_weights() - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - realm_searcher = super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - if realm_searcher.config.use_scann: + if config.use_scann: try: import scann except ImportError: @@ -1530,7 +1525,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "RealmSearcher requires ScaNN to retrieve documents from the corpus." "Please install it through `pip install scann`." ) - return realm_searcher + self.init_weights() @add_start_docstrings_to_model_forward( REALM_INPUTS_DOCSTRING.format("1, searcher_seq_len") @@ -1648,7 +1643,6 @@ def forward( head_mask=None, inputs_embeds=None, relevance_score=None, - retrieved_blocks=None, start_positions=None, end_positions=None, has_answers=None, @@ -1674,11 +1668,18 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if relevance_score is None: + raise ValueError( + "You have to specify `relevance_score` to calculate logits and loss." + ) if token_type_ids is None: raise ValueError( - "You have to specify `token_type_ids` for separating question block and evidence block." + "You have to specify `token_type_ids` to separate question block and evidence block." + ) + if token_type_ids.size(1) < self.config.max_span_width: + raise ValueError( + "The input sequence length must be greater than or equal to config.max_span_width." 
) - outputs = self.bert( input_ids, attention_mask=attention_mask, diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index a8f1c19159d5..86f9c84a2e2a 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -30,6 +30,9 @@ from transformers import RealmEmbedder, RealmKnowledgeAugEncoder, RealmRetriever, RealmSearcher, RealmReader, RealmForOpenQA +# Direct download link +# https://storage.cloud.google.com/orqa-data/enwiki-20181220/blocks.tfr +BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" class RealmModelTester: def __init__( @@ -54,11 +57,22 @@ def __init__( type_vocab_size=16, type_sequence_label_size=2, initializer_range=0.02, + layer_norm_eps=1e-12, + use_scann=True, + span_hidden_size=50, + max_span_width=10, + reader_layer_norm_eps=1e-3, + reader_beam_size=4, + reader_seq_len=288+32, + num_block_records=13353718, + searcher_beam_size=8, + searcher_seq_len=64, num_labels=3, num_choices=4, num_candidates=10, scope=None, ): + # General config self.parent = parent self.batch_size = batch_size self.retriever_proj_size = retriever_proj_size @@ -79,6 +93,21 @@ def __init__( self.type_vocab_size = type_vocab_size self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.use_scann = use_scann + + # Reader config + self.span_hidden_size = span_hidden_size + self.max_span_width = max_span_width + self.reader_layer_norm_eps = reader_layer_norm_eps + self.reader_beam_size = reader_beam_size + self.reader_seq_len = reader_seq_len + + # Searcher config + self.num_block_records = num_block_records + self.searcher_beam_size = searcher_beam_size + self.searcher_seq_len = searcher_seq_len + self.num_labels = num_labels self.num_choices = num_choices self.num_candidates = num_candidates @@ -87,20 +116,28 @@ def __init__( def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) candiate_input_ids = ids_tensor([self.batch_size, self.num_candidates, self.seq_length], self.vocab_size) + reader_input_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.vocab_size) + searcher_input_ids = ids_tensor([1, self.searcher_seq_len], self.vocab_size) input_mask = None candiate_input_mask = None + reader_input_mask = None if self.use_input_mask: input_mask = random_attention_mask([self.batch_size, self.seq_length]) candiate_input_mask = random_attention_mask([self.batch_size, self.num_candidates, self.seq_length]) + reader_input_mask = random_attention_mask([self.reader_beam_size, self.reader_seq_len]) + searcher_input_mask = random_attention_mask([1, self.reader_seq_len]) token_type_ids = None candidate_token_type_ids = None + reader_token_type_ids = None if self.use_token_type_ids: token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) candidate_token_type_ids = ids_tensor( [self.batch_size, self.num_candidates, self.seq_length], self.type_vocab_size ) + reader_token_type_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.type_vocab_size) + searcher_token_type_ids = ids_tensor([1, self.searcher_seq_len], self.type_vocab_size) sequence_labels = None token_labels = None @@ -113,14 +150,19 @@ def prepare_config_and_inputs(self): config = self.get_config() # inputs with additional num_candidates axis. 
- candidate_inputs = (candiate_input_ids, candiate_input_mask, candidate_token_type_ids) + retriever_encoder_inputs = (candiate_input_ids, candiate_input_mask, candidate_token_type_ids) + # reader inputs + reader_inputs = (reader_input_ids, reader_input_mask, reader_token_type_ids) + searcher_inputs = (searcher_input_ids, searcher_input_mask, searcher_token_type_ids) return ( config, input_ids, token_type_ids, input_mask, - candidate_inputs, + retriever_encoder_inputs, + reader_inputs, + searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -140,7 +182,6 @@ def get_config(self): attention_probs_dropout_prob=self.attention_probs_dropout_prob, max_position_embeddings=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, - is_decoder=False, initializer_range=self.initializer_range, ) @@ -150,7 +191,9 @@ def create_and_check_embedder( input_ids, token_type_ids, input_mask, - candidate_inputs, + retriever_encoder_inputs, + reader_inputs, + searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -167,7 +210,9 @@ def create_and_check_encoder( input_ids, token_type_ids, input_mask, - candidate_inputs, + retriever_encoder_inputs, + reader_inputs, + searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -177,9 +222,9 @@ def create_and_check_encoder( model.eval() relevance_score = floats_tensor([self.batch_size, self.num_candidates]) result = model( - candidate_inputs[0], - attention_mask=candidate_inputs[1], - token_type_ids=candidate_inputs[2], + retriever_encoder_inputs[0], + attention_mask=retriever_encoder_inputs[1], + token_type_ids=retriever_encoder_inputs[2], relevance_score=relevance_score, labels=token_labels, ) @@ -187,13 +232,43 @@ def create_and_check_encoder( result.logits.shape, (self.batch_size * self.num_candidates, self.seq_length, self.vocab_size) ) + def create_and_check_reader( + self, + config, + input_ids, + token_type_ids, + input_mask, + retriever_encoder_inputs, + reader_inputs, + searcher_inputs, + sequence_labels, + token_labels, + choice_labels, + ): + model = RealmReader(config=config) + model.to(torch_device) + model.eval() + relevance_score = floats_tensor([self.reader_beam_size]) + result = model( + reader_inputs[0], + attention_mask=reader_inputs[1], + token_type_ids=reader_inputs[2], + relevance_score=relevance_score, + ) + self.parent.assertEqual(result.block_idx.shape, ()) + self.parent.assertEqual(result.candidate.shape, ()) + self.parent.assertEqual(result.start_pos.shape, ()) + self.parent.assertEqual(result.end_pos.shape, ()) + def create_and_check_retriever( self, config, input_ids, token_type_ids, input_mask, - candidate_inputs, + retriever_encoder_inputs, + reader_inputs, + searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -205,9 +280,9 @@ def create_and_check_retriever( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - candidate_input_ids=candidate_inputs[0], - candidate_attention_mask=candidate_inputs[1], - candidate_token_type_ids=candidate_inputs[2], + candidate_input_ids=retriever_encoder_inputs[0], + candidate_attention_mask=retriever_encoder_inputs[1], + candidate_token_type_ids=retriever_encoder_inputs[2], ) self.parent.assertEqual(result.relevance_score.shape, (self.batch_size, self.num_candidates)) self.parent.assertEqual(result.query_score.shape, (self.batch_size, self.retriever_proj_size)) @@ -215,6 +290,31 @@ def create_and_check_retriever( result.candidate_score.shape, (self.batch_size, self.num_candidates, self.retriever_proj_size) ) + def 
create_and_check_searcher( + self, + config, + input_ids, + token_type_ids, + input_mask, + retriever_encoder_inputs, + reader_inputs, + searcher_inputs, + sequence_labels, + token_labels, + choice_labels, + ): + model = RealmSearcher(config=config) + model.to(torch_device) + model.eval() + result = model( + searcher_inputs[0], + attention_mask=searcher_inputs[1], + token_type_ids=searcher_inputs[2], + ) + self.parent.assertEqual(result.retrieved_logits.shape, (self.searcher_beam_size,)) + self.parent.assertEqual(result.retrieved_blocks.shape, (self.searcher_beam_size,)) + self.parent.assertEqual(result.retrieved_block_ids.shape, (self.searcher_beam_size,)) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -222,7 +322,9 @@ def prepare_config_and_inputs_for_common(self): input_ids, token_type_ids, input_mask, - candidate_inputs, + retriever_encoder_inputs, + reader_inputs, + searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -238,6 +340,7 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): ( RealmEmbedder, RealmKnowledgeAugEncoder, + RealmReader, # RealmRetriever is excluded from common tests as it is a container model # consisting of two RealmEmbedders & simple inner product calculation. # RealmRetriever @@ -306,12 +409,12 @@ def test_encoder_from_pretrained(self): @slow def test_open_qa_from_pretrained(self): #TODO: TF record dataset - model = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader") + model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", BLOCK_RECORDS_PATH) self.assertIsNotNone(model) @slow def test_reader_from_pretrained(self): - model = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-reader") + model = RealmReader.from_pretrained("qqaatw/realm-orqa-nq-reader") self.assertIsNotNone(model) @slow @@ -322,7 +425,7 @@ def test_retriever_from_pretrained(self): @slow def test_searcher_from_pretrained(self): #TODO: TF record dataset - model = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-searcher") + model = RealmSearcher.from_pretrained("qqaatw/realm-orqa-nq-searcher", BLOCK_RECORDS_PATH) self.assertIsNotNone(model) @require_torch @@ -362,9 +465,9 @@ def test_inference_encoder(self): def test_inference_open_qa(self): #TODO: TF record dataset model = RealmForOpenQA.from_pretrained( - r"/mnt/sda1/testing/pytorch-realm-orqa/export/searcher", - r"/mnt/sda1/testing/pytorch-realm-orqa/export/reader", - r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" + r"qqaatw/realm-orqa-nq-searcher", + r"qqaatw/realm-orqa-nq-reader", + BLOCK_RECORDS_PATH, ) question = "Who is the pioneer in modern computer science?" 
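These slow tests assume a local copy of the ORQA block records at ``BLOCK_RECORDS_PATH``; a minimal sketch of loading that file directly with the helper from ``utils_realm.py`` (the path below is a placeholder, and TensorFlow is assumed to be available for TFRecord parsing)::

    >>> from transformers.models.realm.utils_realm import convert_tfrecord_to_np

    >>> blocks = convert_tfrecord_to_np(
    ...     block_records_path="/path/to/blocks.tfr",  # placeholder local path
    ...     num_block_records=13353718,  # default from RealmConfig
    ... )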
@@ -374,23 +477,40 @@ def test_inference_open_qa(self): @slow def test_inference_reader(self): - config = RealmConfig(searcher_beam_size=5) - model = RealmReader.from_pretrained("qqaatw/realm-orqa-nq-reader", config) + config = RealmConfig(reader_beam_size=2, max_span_width=3) + model = RealmReader.from_pretrained("qqaatw/realm-orqa-nq-reader", config=config) + + concat_input_ids = torch.arange(10).view((2, 5)) + concat_token_type_ids = torch.tensor( + [ + [0, 0, 1, 1, 1], + [0, 0, 1, 1, 1] + ], + dtype=torch.int64 + ) + relevance_score = torch.tensor([0.3, 0.7], dtype=torch.float32) - concat_inputs = torch.arange(25).view((5, 5)) output = model( - concat_inputs, + concat_input_ids, + token_type_ids=concat_token_type_ids, + relevance_score=relevance_score, return_dict=True) block_idx_expected_shape = torch.Size(()) - start_pos_expected_shape = torch.Size((1)) - end_pos_expected_shape = torch.Size((1)) + start_pos_expected_shape = torch.Size((1,)) + end_pos_expected_shape = torch.Size((1,)) self.assertEqual(output.block_idx.shape, block_idx_expected_shape) self.assertEqual(output.start_pos.shape, start_pos_expected_shape) self.assertEqual(output.end_pos.shape, end_pos_expected_shape) - expected_slice = torch.tensor([[0.7410, 0.7170]]) - self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) + + expected_block_idx = torch.tensor(1) + expected_start_pos = torch.tensor(3) + expected_end_pos = torch.tensor(3) + + self.assertTrue(torch.allclose(output.block_idx, expected_block_idx, atol=1e-4)) + self.assertTrue(torch.allclose(output.start_pos, expected_start_pos, atol=1e-4)) + self.assertTrue(torch.allclose(output.end_pos, expected_end_pos, atol=1e-4)) @slow def test_inference_retriever(self): @@ -414,14 +534,18 @@ def test_inference_retriever(self): def test_inference_searcher(self): #TODO: TF record dataset config = RealmConfig(searcher_beam_size=5) - model = RealmSearcher.from_pretrained("qqaatw/realm-orqa-nq-searcher", config=config) + model = RealmSearcher.from_pretrained( + "qqaatw/realm-orqa-nq-searcher", + BLOCK_RECORDS_PATH, + config=config + ) input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) output = model(input_ids)[0] - expected_shape = torch.Size((5)) + expected_shape = torch.Size((5,)) self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor([[0.7410, 0.7170]]) - self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) + expected_slice = torch.tensor([[5.2747, 4.3768, 5.0444, 5.4152, 5.2922]]) + self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4), output) diff --git a/utils/check_repo.py b/utils/check_repo.py index e7e04e8e2af2..29f82dd8ccff 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -73,7 +73,9 @@ "DPREncoder", # Building part of bigger (tested) model. "ProphetNetDecoderWrapper", # Building part of bigger (tested) model. "RealmBertModel", # Building part of bigger (tested) model. - "RealmRetriever", # Submodels have been tested. + "RealmRetriever", # Not regular model. + "RealmSearcher", # Not regular model. + "RealmForOpenQA" # Not regular model. "ReformerForMaskedLM", # Needs to be setup as decoder. "TFDPREncoder", # Building part of bigger (tested) model. "TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?) 
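
For reference, the open-QA path exercised by `test_open_qa_from_pretrained` and `test_inference_open_qa` above reduces to a short usage sketch. This is a minimal sketch, assuming the two ORQA checkpoints are reachable and the `blocks.tfr` evidence corpus linked in the tests has been downloaded locally (the path below is a placeholder, not a real location):

    from transformers import RealmForOpenQA

    # Placeholder path to the downloaded ORQA evidence blocks (blocks.tfr).
    BLOCK_RECORDS_PATH = "/path/to/enwiki-20181220/blocks.tfr"

    # from_pretrained takes the searcher checkpoint, the reader checkpoint, and
    # the TFRecord of evidence blocks, matching RealmForOpenQA.from_pretrained.
    model = RealmForOpenQA.from_pretrained(
        "qqaatw/realm-orqa-nq-searcher",
        "qqaatw/realm-orqa-nq-reader",
        BLOCK_RECORDS_PATH,
    )

    question = "Who is the pioneer in modern computer science?"
    searcher_output, reader_output, predicted_answer = model(question)

The three-way unpacking mirrors the integration test; `predicted_answer` is the decoded answer span selected by the reader.
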
From 225b2e6abca41773de9646586554c900bc468e3c Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 9 Oct 2021 00:07:45 +0800 Subject: [PATCH 50/98] Style --- docs/source/model_doc/realm.rst | 2 +- src/transformers/__init__.py | 6 +- src/transformers/models/realm/__init__.py | 2 +- .../models/realm/configuration_realm.py | 32 +-- .../models/realm/modeling_realm.py | 195 ++++++++++-------- src/transformers/models/realm/utils_realm.py | 40 ++-- tests/test_modeling_realm.py | 62 +++--- utils/check_repo.py | 4 +- 8 files changed, 178 insertions(+), 165 deletions(-) diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index 338771e1948b..b8fcf5b2d53f 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -93,4 +93,4 @@ RealmForOpenQA ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.RealmForOpenQA - :members: from_pretrained, forward \ No newline at end of file + :members: from_pretrained, forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2a7fab3403a3..4e4ac3a5e342 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -125,7 +125,6 @@ "is_vision_available", ], "hf_argparser": ["HfArgumentParser"], - "keras_callbacks": [], "integrations": [ "is_comet_available", "is_optuna_available", @@ -135,6 +134,7 @@ "is_tensorboard_available", "is_wandb_available", ], + "keras_callbacks": [], "modelcard": ["ModelCard"], "modeling_tf_pytorch_utils": [ "convert_tf_weight_name_to_pt_weight_name", @@ -2768,12 +2768,12 @@ from .models.realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, RealmEmbedder, + RealmForOpenQA, RealmKnowledgeAugEncoder, RealmPreTrainedModel, + RealmReader, RealmRetriever, RealmSearcher, - RealmReader, - RealmForOpenQA, load_tf_weights_in_realm, ) from .models.reformer import ( diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index df8fd2376fc7..94dcc2e1e5fa 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -48,8 +48,8 @@ from .modeling_realm import ( REALM_PRETRAINED_MODEL_ARCHIVE_LIST, RealmEmbedder, - RealmKnowledgeAugEncoder, RealmForOpenQA, + RealmKnowledgeAugEncoder, RealmPreTrainedModel, RealmReader, RealmRetriever, diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 4ebb220620b1..0e5fe55a9195 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -32,16 +32,17 @@ class RealmConfig(PretrainedConfig): r""" - This is the configuration class to store the configuration of + This is the configuration class to store the configuration of + 1. :class:`~transformers.RealmEmbedder` - 2. :class:`~transformers.RealmRetriever` - 3. :class:`~transformers.RealmKnowledgeAugEncoder` + 2. :class:`~transformers.RealmRetriever` + 3. :class:`~transformers.RealmKnowledgeAugEncoder` 4. :class:`~transformers.RealmSearcher` 5. :class:`~transformers.RealmReader` - 6. :class:`~transformers.RealmForOpenQA` - It is used to instantiate an REALM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the - defaults will yield a similar configuration to that of the REALM `realm-cc-news-pretrained - `__ architecture. + 6. 
:class:`~transformers.RealmForOpenQA`
+    It is used to instantiate a REALM model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM
+    `realm-cc-news-pretrained <>`__ architecture.

     Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
     outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
@@ -51,7 +52,8 @@ class RealmConfig(PretrainedConfig):
         vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the
             :obj:`inputs_ids` passed when calling :class:`~transformers.RealmEmbedder`,
-            :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmKnowledgeAugEncoder`, :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`.
+            :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmKnowledgeAugEncoder`,
+            :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`.
         hidden_size (:obj:`int`, `optional`, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
         retriever_proj_size (:obj:`int`, `optional`, defaults to 128):
@@ -76,7 +78,8 @@ class RealmConfig(PretrainedConfig):
             just in case (e.g., 512 or 1024 or 2048).
         type_vocab_size (:obj:`int`, `optional`, defaults to 2):
             The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RealmEmbedder`,
-            :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmKnowledgeAugEncoder`, :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`.
+            :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmKnowledgeAugEncoder`,
+            :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`.
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
@@ -85,8 +88,8 @@ class RealmConfig(PretrainedConfig):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if ``config.is_decoder=True``.
         use_scann (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not :class:`~transformers.RealmSearcher` uses `ScaNN` as the vector similarity searcher.
-            This option has no effect and is reserved for future development.
+            Whether or not :class:`~transformers.RealmSearcher` uses `ScaNN` as the vector similarity searcher. This
+            option has no effect and is reserved for future development.
         span_hidden_size (:obj:`int`, `optional`, defaults to 256):
             Dimension of the reader's spans.
         max_span_width (:obj:`int`, `optional`, defaults to 10):
@@ -100,7 +103,8 @@ class RealmConfig(PretrainedConfig):
         num_block_records (:obj:`int`, `optional`, defaults to 13353718):
             Number of block records.
         searcher_beam_size (:obj:`int`, `optional`, defaults to 5000):
-            Beam size of the searcher. Note that when eval mode is enabled, `searcher_beam_size` will be the same as `reader_beam_size`.
+            Beam size of the searcher. Note that when eval mode is enabled, `searcher_beam_size` will be the same as
+            `reader_beam_size`.
         searcher_seq_len (:obj:`int`, `optional`, defaults to 64):
             Maximum sequence length of the searcher.
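
# Illustrative sketch, not part of the diff: constructing a RealmConfig with
# the searcher/reader defaults documented above. Assumes a transformers build
# that includes this patch.
from transformers import RealmConfig

config = RealmConfig(
    searcher_beam_size=5000,  # equals reader_beam_size when eval mode is enabled
    reader_beam_size=5,       # number of retrieved blocks the reader scores
    reader_seq_len=288 + 32,  # maximum reader length (question + evidence block)
    searcher_seq_len=64,      # maximum searcher (question) length
)
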
@@ -141,7 +145,7 @@ def __init__( max_span_width=10, reader_layer_norm_eps=1e-3, reader_beam_size=5, - reader_seq_len=288+32, + reader_seq_len=288 + 32, num_block_records=13353718, searcher_beam_size=5000, searcher_seq_len=64, @@ -182,4 +186,4 @@ def __init__( self.searcher_beam_size = searcher_beam_size self.searcher_seq_len = searcher_seq_len - # TODO: Remove use_cache \ No newline at end of file + # TODO: Remove use_cache diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index a046e18b2aa8..ed12f85e2212 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -20,17 +20,22 @@ from dataclasses import dataclass from typing import Optional, Tuple +import numpy as np import torch import torch.utils.checkpoint from packaging import version from torch import nn from torch.nn import CrossEntropyLoss -import numpy as np -from transformers.models.realm.tokenization_realm import RealmTokenizer +from transformers.models.realm.tokenization_realm import RealmTokenizer from ...activations import ACT2FN -from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from ...file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, @@ -47,6 +52,7 @@ from .configuration_realm import RealmConfig from .utils_realm import BruteForceSearcher, ScaNNSearcher, convert_tfrecord_to_np + logger = logging.get_logger(__name__) _BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert" _EMBEDDER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-embedder" @@ -90,7 +96,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): array = tf.train.load_variable(tf_path, name) names.append(name) arrays.append(array) - + for name, array in zip(names, arrays): # For reader if isinstance(model, RealmReader) and "reader" not in name: @@ -104,15 +110,14 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): name = name.replace("reader/dense/", "qa_outputs/dense_intermediate/") name = name.replace("reader/dense_1/", "qa_outputs/dense_output/") name = name.replace("reader/layer_normalization", "qa_outputs/layer_normalization") - - + # For embedder and retriever embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" name = name.replace("module/module/module/bert/", f"{embedder_prefix}bert/") name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") - + # Fine-tuned checkpoints name = name.replace("module/module/module/module/bert/", f"{embedder_prefix}bert/") name = name.replace("module/module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") @@ -889,7 +894,7 @@ class RealmReaderOutput(ModelOutput): loss: torch.FloatTensor = None retriever_loss: torch.FloatTensor = None - reader_loss:torch.FloatTensor = None + reader_loss: torch.FloatTensor = None retriever_correct: torch.BoolTensor = None reader_correct: torch.BoolTensor = None block_idx: torch.LongTensor = None @@ -991,24 +996,24 @@ def __init__(self, config): def forward(self, hidden_states, token_type_ids): def 
span_candidates(masks): - """Generate span candidates. + """ + Generate span candidates. - Args: - masks: [num_retrievals, max_sequence_len] + Args: + masks: [num_retrievals, max_sequence_len] - Returns: - starts: [num_spans] - ends: [num_spans] - span_masks: [num_retrievals, num_spans] whether spans locate in evidence block. + Returns: + starts: [num_spans] ends: [num_spans] span_masks: [num_retrievals, num_spans] + whether spans locate in evidence block. """ _, max_sequence_len = masks.shape + def _spans_given_width(width): current_starts = torch.arange(max_sequence_len - width + 1, device=masks.device) current_ends = torch.arange(width - 1, max_sequence_len, device=masks.device) return current_starts, current_ends - starts, ends = zip(*(_spans_given_width(w + 1) - for w in range(self.config.max_span_width))) + starts, ends = zip(*(_spans_given_width(w + 1) for w in range(self.config.max_span_width))) # [num_spans] starts = torch.cat(starts, 0) @@ -1024,7 +1029,6 @@ def _spans_given_width(width): def mask_to_score(mask): return (1.0 - mask.type(torch.float32)) * -10000.0 - # [reader_beam_size, max_sequence_len, span_hidden_size * 2] hidden_states = self.dense_intermediate(hidden_states) # [reader_beam_size, max_sequence_len, span_hidden_size] @@ -1032,7 +1036,7 @@ def mask_to_score(mask): block_mask = token_type_ids.detach().clone() block_mask[:, -1] = 0 candidate_starts, candidate_ends, candidate_mask = span_candidates(block_mask) - + candidate_start_projections = torch.index_select(start_projection, dim=1, index=candidate_starts) candidate_end_projections = torch.index_select(end_projection, dim=1, index=candidate_ends) candidate_hidden = candidate_start_projections + candidate_end_projections @@ -1045,7 +1049,7 @@ def mask_to_score(mask): reader_logits = self.dense_output(candidate_hidden).squeeze(-1) # [reader_beam_size, num_candidates] reader_logits += mask_to_score(candidate_mask) - + return reader_logits, candidate_starts, candidate_ends @@ -1492,6 +1496,7 @@ def forward( attentions=joint_outputs.attentions, ) + @add_start_docstrings( "The searcher of REALM outputting relevance score (before softmax) and corresponding document blocks.", REALM_START_DOCSTRING, @@ -1502,20 +1507,22 @@ class RealmSearcher(RealmPreTrainedModel): block_records_path (:obj:`str`): Block records path. 
""" + def __init__(self, config, block_records_path): super().__init__(config) self.embedder = RealmEmbedder(config) self.searcher = None self.block_records = convert_tfrecord_to_np( - block_records_path = block_records_path, - num_block_records = config.num_block_records, + block_records_path=block_records_path, + num_block_records=config.num_block_records, ) - self.register_buffer("block_emb", + self.register_buffer( + "block_emb", torch.zeros(()).new_empty( size=(config.num_block_records, config.retriever_proj_size), dtype=torch.float32, - device=torch.device('cpu') - ) + device=torch.device("cpu"), + ), ) if config.use_scann: try: @@ -1527,9 +1534,7 @@ def __init__(self, config, block_records_path): ) self.init_weights() - @add_start_docstrings_to_model_forward( - REALM_INPUTS_DOCSTRING.format("1, searcher_seq_len") - ) + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("1, searcher_seq_len")) @replace_return_docstrings(output_type=RealmSearcherOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1548,28 +1553,28 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if (input_ids is not None and input_ids.shape[0] != 1) or (inputs_embeds is not None and inputs_embeds.shape[0] != 1): - raise ValueError( - "The batch_size of the inputs must be 1." - ) + if (input_ids is not None and input_ids.shape[0] != 1) or ( + inputs_embeds is not None and inputs_embeds.shape[0] != 1 + ): + raise ValueError("The batch_size of the inputs must be 1.") if self.training: beam_size = self.config.searcher_beam_size else: beam_size = self.config.reader_beam_size - + if self.config.use_scann and self.block_emb.device != torch.device("cpu"): self.block_emb = self.block_emb.cpu() if self.searcher is None: if self.config.use_scann: self.searcher = ScaNNSearcher( - db = self.block_emb, - num_neighbors = beam_size, + db=self.block_emb, + num_neighbors=beam_size, ) else: self.searcher = BruteForceSearcher( - db = self.block_emb, - num_neighbors = beam_size, + db=self.block_emb, + num_neighbors=beam_size, ) question_outputs = self.embedder( @@ -1596,14 +1601,18 @@ def forward( retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) # [searcher_beam_size, projection_size] - retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device)) + retrieved_block_emb = torch.index_select( + self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) + ) # [searcher_beam_size] - retrieved_logits = torch.einsum("D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device)) - + retrieved_logits = torch.einsum( + "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device) + ) + if not return_dict: return (retrieved_logits, retrieved_blocks, retrieved_block_ids) - + return RealmSearcherOutput( retrieved_logits=retrieved_logits, retrieved_blocks=retrieved_blocks, @@ -1629,10 +1638,7 @@ def __init__(self, config): self.init_weights() - - @add_start_docstrings_to_model_forward( - REALM_INPUTS_DOCSTRING.format("reader_beam_size, sequence_length") - ) + @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("reader_beam_size, sequence_length")) @replace_return_docstrings(output_type=RealmReaderOutput, config_class=_CONFIG_FOR_DOC) def forward( self, @@ -1652,7 +1658,8 @@ def forward( ): r""" relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(searcher_beam_size,)`, `optional`): 
- Relevance score derived from `RealmSearcher`, must be specified if you want to compute the marginal log loss. + Relevance score derived from `RealmSearcher`, must be specified if you want to compute the marginal log + loss. start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): Labels for position (index) of the start of the labelled span for computing the token classification loss. Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the @@ -1663,23 +1670,17 @@ def forward( sequence are not taken into account for computing the loss. has_answers (:obj:`torch.BoolTensor` of shape :obj:`(searcher_beam_size,)`, `optional`): Whether or not the evidence blocks derived from `RealmSearcher` have answer(s). - + Returns: """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict if relevance_score is None: - raise ValueError( - "You have to specify `relevance_score` to calculate logits and loss." - ) + raise ValueError("You have to specify `relevance_score` to calculate logits and loss.") if token_type_ids is None: - raise ValueError( - "You have to specify `token_type_ids` to separate question block and evidence block." - ) + raise ValueError("You have to specify `token_type_ids` to separate question block and evidence block.") if token_type_ids.size(1) < self.config.max_span_width: - raise ValueError( - "The input sequence length must be greater than or equal to config.max_span_width." - ) + raise ValueError("The input sequence length must be greater than or equal to config.max_span_width.") outputs = self.bert( input_ids, attention_mask=attention_mask, @@ -1694,11 +1695,11 @@ def forward( # [reader_beam_size, joint_seq_len, hidden_size] sequence_output = outputs[0] - + # [reader_beam_size, num_candidates], [num_candidates], [num_candidates] reader_logits, candidate_starts, candidate_ends = self.qa_outputs(sequence_output, token_type_ids) # [searcher_beam_size, 1] - retriever_logits = torch.unsqueeze(relevance_score[0: self.config.reader_beam_size], -1) + retriever_logits = torch.unsqueeze(relevance_score[0 : self.config.reader_beam_size], -1) # [reader_beam_size, num_candidates] reader_logits += retriever_logits # [] @@ -1716,31 +1717,31 @@ def forward( retriever_correct = None reader_correct = None if start_positions is not None and end_positions is not None and has_answers is not None: - def compute_correct_candidates(candidate_starts, candidate_ends, gold_starts, - gold_ends): + + def compute_correct_candidates(candidate_starts, candidate_ends, gold_starts, gold_ends): """Compute correct span.""" # [reader_beam_size, num_answers, num_candidates] is_gold_start = torch.eq( - torch.unsqueeze(torch.unsqueeze(candidate_starts, 0), 0), - torch.unsqueeze(gold_starts, -1)) + torch.unsqueeze(torch.unsqueeze(candidate_starts, 0), 0), torch.unsqueeze(gold_starts, -1) + ) is_gold_end = torch.eq( - torch.unsqueeze(torch.unsqueeze(candidate_ends, 0), 0), - torch.unsqueeze(gold_ends, -1)) + torch.unsqueeze(torch.unsqueeze(candidate_ends, 0), 0), torch.unsqueeze(gold_ends, -1) + ) # [reader_beam_size, num_candidates] return torch.any(torch.logical_and(is_gold_start, is_gold_end), 1) def marginal_log_loss(logits, is_correct): """Loss based on the negative marginal log-likelihood.""" - + def mask_to_score(mask): return (1.0 - mask.type(torch.float32)) * -10000.0 - + # [] log_numerator = torch.logsumexp(logits + mask_to_score(is_correct), dim=-1) log_denominator = torch.logsumexp(logits, dim=-1) return 
log_denominator - log_numerator - + # sometimes the start/end positions are outside our model inputs, we ignore these terms # `-1` is reserved for no answer. ignored_index = sequence_output.size(1) @@ -1753,8 +1754,8 @@ def mask_to_score(mask): reader_correct = compute_correct_candidates( candidate_starts=candidate_starts, candidate_ends=candidate_ends, - gold_starts=start_positions[0: self.config.reader_beam_size], - gold_ends=end_positions[0: self.config.reader_beam_size], + gold_starts=start_positions[0 : self.config.reader_beam_size], + gold_ends=end_positions[0 : self.config.reader_beam_size], ) any_reader_correct = torch.any(reader_correct) @@ -1767,7 +1768,11 @@ def mask_to_score(mask): if not return_dict: output = (predicted_block_index, predicted_candidate, predicted_start, predicted_end) + outputs[2:] - return ((total_loss, retriever_loss, reader_loss, retriever_correct, reader_correct) + output) if total_loss is not None else output + return ( + ((total_loss, retriever_loss, reader_loss, retriever_correct, reader_correct) + output) + if total_loss is not None + else output + ) return RealmReaderOutput( loss=total_loss, @@ -1790,11 +1795,13 @@ def mask_to_score(mask): OpenQA Question. answer_ids (:obj:`torch.LongTensor` of shape :obj:`(num_answers, answer_length)`, `optional`): Answer ids for computing the marginal log-likelihood loss. Indices should be in ``[-1, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-1`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-1`` are ignored (masked), + the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` return_dict (:obj:`bool`, `optional`): Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
""" + + @add_start_docstrings( "A wrapper of `RealmSearcher` and `RealmReader` providing end-to-end open domain question answering.", REALM_START_DOCSTRING, @@ -1807,7 +1814,9 @@ def __init__(self, config, searcher, reader, tokenizer): self.tokenizer = tokenizer @classmethod - def from_pretrained(cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, block_records_path, **kwargs): + def from_pretrained( + cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, block_records_path, **kwargs + ): """ Args: searcher_pretrained_name_or_path (:obj:`str`): @@ -1819,20 +1828,19 @@ def from_pretrained(cls, searcher_pretrained_name_or_path, reader_pretrained_nam """ config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) - searcher = RealmSearcher.from_pretrained(searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs) + searcher = RealmSearcher.from_pretrained( + searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs + ) reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path) return cls(config, searcher, reader, tokenizer) - + def save_pretrained(self, save_directory): self.searcher.save_pretrained(save_directory) self.reader.save_pretrained(save_directory) def retrieve(self, input_ids, **kwargs): - output = self.searcher( - input_ids, - return_dict=True, - **kwargs) + output = self.searcher(input_ids, return_dict=True, **kwargs) return output def read(self, searcher_output, question, answers): @@ -1846,7 +1854,7 @@ def block_has_answer(concat_inputs, answers): for input_id in concat_inputs.input_ids: pass_sep = False answer_pos = 0 - start=-1 + start = -1 start_pos.append([]) end_pos.append([]) for answer in answers: @@ -1869,7 +1877,7 @@ def block_has_answer(concat_inputs, answers): else: answer_pos = 0 start = -1 - + if len(start_pos[-1]) == 0: has_answers.append(False) else: @@ -1898,17 +1906,21 @@ def block_has_answer(concat_inputs, answers): text.append(question) text_pair.append(retrieved_block.decode()) - concat_inputs = self.tokenizer(text, text_pair, padding=True, truncation=True, max_length=self.config.reader_seq_len, return_tensors='pt') + concat_inputs = self.tokenizer( + text, text_pair, padding=True, truncation=True, max_length=self.config.reader_seq_len, return_tensors="pt" + ) if answers is not None: - has_answers, start_positions, end_positions = block_has_answer(concat_inputs.to(searcher_output.retrieved_logits.device), answers) + has_answers, start_positions, end_positions = block_has_answer( + concat_inputs.to(searcher_output.retrieved_logits.device), answers + ) else: has_answers, start_positions, end_positions = (None, None, None) output = self.reader( - input_ids=concat_inputs.input_ids[0: self.config.reader_beam_size], - attention_mask=concat_inputs.attention_mask[0: self.config.reader_beam_size], - token_type_ids=concat_inputs.token_type_ids[0: self.config.reader_beam_size], + input_ids=concat_inputs.input_ids[0 : self.config.reader_beam_size], + attention_mask=concat_inputs.attention_mask[0 : self.config.reader_beam_size], + token_type_ids=concat_inputs.token_type_ids[0 : self.config.reader_beam_size], relevance_score=searcher_output.retrieved_logits, has_answers=has_answers, start_positions=start_positions, @@ -1916,7 +1928,9 @@ def block_has_answer(concat_inputs, answers): return_dict=True, ) - answer = 
self.tokenizer.decode(concat_inputs.input_ids[output.block_idx][output.start_pos: output.end_pos + 1]) + answer = self.tokenizer.decode( + concat_inputs.input_ids[output.block_idx][output.start_pos : output.end_pos + 1] + ) return output, answer @@ -1942,7 +1956,9 @@ def forward(self, question, answer_ids=None, return_dict=None): return_dict = return_dict if return_dict is not None else self.config.use_return_dict - question_ids = self.tokenizer([question], padding=True, truncation=True, max_length=self.config.searcher_seq_len, return_tensors='pt') + question_ids = self.tokenizer( + [question], padding=True, truncation=True, max_length=self.config.searcher_seq_len, return_tensors="pt" + ) searcher_output = self.retrieve(**question_ids) @@ -1950,10 +1966,9 @@ def forward(self, question, answer_ids=None, return_dict=None): if return_dict: return searcher_output, reader_output, predicted_answer - + return RealmForOpenQAOutput( searcher_output=searcher_output, reader_output=reader_output, predicted_answer=predicted_answer, ) - \ No newline at end of file diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py index c0748002fc81..24c927b8ecd8 100644 --- a/src/transformers/models/realm/utils_realm.py +++ b/src/transformers/models/realm/utils_realm.py @@ -17,57 +17,53 @@ import torch -class BruteForceSearcher(): +class BruteForceSearcher: def __init__(self, db, num_neighbors): """Build brute force searcher.""" self.db = db self.num_neighbors = num_neighbors - + def search_batched(self, question_projection): batch_scores = torch.einsum("BD,QD->QB", self.db, question_projection) _, retrieved_block_ids = torch.topk(batch_scores, k=self.num_neighbors, dim=-1) - # Must return cpu tensor for subsequent numpy operations + # Must return cpu tensor for subsequent numpy operations return retrieved_block_ids.cpu() -class ScaNNSearcher(): - def __init__(self, db, +class ScaNNSearcher: + def __init__( + self, + db, num_neighbors, dimensions_per_block=2, num_leaves=1000, num_leaves_to_search=100, - training_sample_size=100000): + training_sample_size=100000, + ): """Build scann searcher.""" - + from scann.scann_ops.py.scann_ops_pybind import builder as Builder - - builder = Builder( - db=db, - num_neighbors=num_neighbors, - distance_measure="dot_product") + builder = Builder(db=db, num_neighbors=num_neighbors, distance_measure="dot_product") builder = builder.tree( - num_leaves=num_leaves, - num_leaves_to_search=num_leaves_to_search, - training_sample_size=training_sample_size) + num_leaves=num_leaves, num_leaves_to_search=num_leaves_to_search, training_sample_size=training_sample_size + ) builder = builder.score_ah(dimensions_per_block=dimensions_per_block) self.searcher = builder.build() def search_batched(self, question_projection): retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) - # Must return cpu tensor for subsequent numpy operations - return torch.tensor(retrieved_block_ids.astype('int64'), device=torch.device("cpu")) + # Must return cpu tensor for subsequent numpy operations + return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) def convert_tfrecord_to_np(block_records_path, num_block_records): import tensorflow.compat.v1 as tf - blocks_dataset = tf.data.TFRecordDataset( - block_records_path, buffer_size=512 * 1024 * 1024) - blocks_dataset = blocks_dataset.batch( - num_block_records, drop_remainder=True) + blocks_dataset = tf.data.TFRecordDataset(block_records_path, 
buffer_size=512 * 1024 * 1024) + blocks_dataset = blocks_dataset.batch(num_block_records, drop_remainder=True) np_record = next(blocks_dataset.take(1).as_numpy_iterator()) - return np_record \ No newline at end of file + return np_record diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 86f9c84a2e2a..928eb0294f73 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -28,12 +28,20 @@ if is_torch_available(): import torch - from transformers import RealmEmbedder, RealmKnowledgeAugEncoder, RealmRetriever, RealmSearcher, RealmReader, RealmForOpenQA + from transformers import ( + RealmEmbedder, + RealmForOpenQA, + RealmKnowledgeAugEncoder, + RealmReader, + RealmRetriever, + RealmSearcher, + ) # Direct download link # https://storage.cloud.google.com/orqa-data/enwiki-20181220/blocks.tfr BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" + class RealmModelTester: def __init__( self, @@ -63,7 +71,7 @@ def __init__( max_span_width=10, reader_layer_norm_eps=1e-3, reader_beam_size=4, - reader_seq_len=288+32, + reader_seq_len=288 + 32, num_block_records=13353718, searcher_beam_size=8, searcher_seq_len=64, @@ -107,7 +115,7 @@ def __init__( self.num_block_records = num_block_records self.searcher_beam_size = searcher_beam_size self.searcher_seq_len = searcher_seq_len - + self.num_labels = num_labels self.num_choices = num_choices self.num_candidates = num_candidates @@ -340,7 +348,6 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): ( RealmEmbedder, RealmKnowledgeAugEncoder, - RealmReader, # RealmRetriever is excluded from common tests as it is a container model # consisting of two RealmEmbedders & simple inner product calculation. # RealmRetriever @@ -405,11 +412,13 @@ def test_embedder_from_pretrained(self): def test_encoder_from_pretrained(self): model = RealmKnowledgeAugEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") self.assertIsNotNone(model) - + @slow def test_open_qa_from_pretrained(self): - #TODO: TF record dataset - model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", BLOCK_RECORDS_PATH) + # TODO: TF record dataset + model = RealmForOpenQA.from_pretrained( + "qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", BLOCK_RECORDS_PATH + ) self.assertIsNotNone(model) @slow @@ -421,13 +430,14 @@ def test_reader_from_pretrained(self): def test_retriever_from_pretrained(self): model = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever") self.assertIsNotNone(model) - + @slow def test_searcher_from_pretrained(self): - #TODO: TF record dataset + # TODO: TF record dataset model = RealmSearcher.from_pretrained("qqaatw/realm-orqa-nq-searcher", BLOCK_RECORDS_PATH) self.assertIsNotNone(model) + @require_torch class RealmModelIntegrationTest(unittest.TestCase): @slow @@ -449,7 +459,9 @@ def test_inference_encoder(self): num_candidates = 2 vocab_size = 30522 - model = RealmKnowledgeAugEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert", num_candidates=num_candidates) + model = RealmKnowledgeAugEncoder.from_pretrained( + "qqaatw/realm-cc-news-pretrained-bert", num_candidates=num_candidates + ) input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) relevance_score = torch.tensor([[0.3, 0.7]], dtype=torch.float32) output = model(input_ids, relevance_score=relevance_score)[0] @@ -463,13 +475,13 @@ def test_inference_encoder(self): @slow def test_inference_open_qa(self): - #TODO: TF record 
dataset + # TODO: TF record dataset model = RealmForOpenQA.from_pretrained( r"qqaatw/realm-orqa-nq-searcher", r"qqaatw/realm-orqa-nq-reader", BLOCK_RECORDS_PATH, ) - + question = "Who is the pioneer in modern computer science?" searcher_output, reader_output, predicted_answer = model(question) @@ -481,20 +493,12 @@ def test_inference_reader(self): model = RealmReader.from_pretrained("qqaatw/realm-orqa-nq-reader", config=config) concat_input_ids = torch.arange(10).view((2, 5)) - concat_token_type_ids = torch.tensor( - [ - [0, 0, 1, 1, 1], - [0, 0, 1, 1, 1] - ], - dtype=torch.int64 - ) + concat_token_type_ids = torch.tensor([[0, 0, 1, 1, 1], [0, 0, 1, 1, 1]], dtype=torch.int64) relevance_score = torch.tensor([0.3, 0.7], dtype=torch.float32) output = model( - concat_input_ids, - token_type_ids=concat_token_type_ids, - relevance_score=relevance_score, - return_dict=True) + concat_input_ids, token_type_ids=concat_token_type_ids, relevance_score=relevance_score, return_dict=True + ) block_idx_expected_shape = torch.Size(()) start_pos_expected_shape = torch.Size((1,)) @@ -503,7 +507,6 @@ def test_inference_reader(self): self.assertEqual(output.start_pos.shape, start_pos_expected_shape) self.assertEqual(output.end_pos.shape, end_pos_expected_shape) - expected_block_idx = torch.tensor(1) expected_start_pos = torch.tensor(3) expected_end_pos = torch.tensor(3) @@ -529,16 +532,12 @@ def test_inference_retriever(self): expected_slice = torch.tensor([[0.7410, 0.7170]]) self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) - + @slow def test_inference_searcher(self): - #TODO: TF record dataset + # TODO: TF record dataset config = RealmConfig(searcher_beam_size=5) - model = RealmSearcher.from_pretrained( - "qqaatw/realm-orqa-nq-searcher", - BLOCK_RECORDS_PATH, - config=config - ) + model = RealmSearcher.from_pretrained("qqaatw/realm-orqa-nq-searcher", BLOCK_RECORDS_PATH, config=config) input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) output = model(input_ids)[0] @@ -548,4 +547,3 @@ def test_inference_searcher(self): expected_slice = torch.tensor([[5.2747, 4.3768, 5.0444, 5.4152, 5.2922]]) self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4), output) - diff --git a/utils/check_repo.py b/utils/check_repo.py index 65642570f333..19be1677465c 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -75,8 +75,8 @@ "RealmBertModel", # Building part of bigger (tested) model. "RealmRetriever", # Not regular model. "RealmSearcher", # Not regular model. - "RealmForOpenQA" # Not regular model. - "ReformerForMaskedLM", # Needs to be setup as decoder. + "RealmForOpenQA", + "ReformerForMaskedLM", # Not regular model. # Needs to be setup as decoder. "TFDPREncoder", # Building part of bigger (tested) model. "TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?) "TFRobertaForMultipleChoice", # TODO: fix From 93f315a5ebdce73e1908dacb1736cd32fb8d038a Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 9 Oct 2021 00:09:55 +0800 Subject: [PATCH 51/98] Exclude RealmReader from common tests --- utils/check_repo.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utils/check_repo.py b/utils/check_repo.py index 19be1677465c..61025cbd1c61 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -73,10 +73,11 @@ "DPREncoder", # Building part of bigger (tested) model. "ProphetNetDecoderWrapper", # Building part of bigger (tested) model. "RealmBertModel", # Building part of bigger (tested) model. + "RealmReader", # Not regular model. 
"RealmRetriever", # Not regular model. "RealmSearcher", # Not regular model. - "RealmForOpenQA", - "ReformerForMaskedLM", # Not regular model. # Needs to be setup as decoder. + "RealmForOpenQA", # Not regular model. + "ReformerForMaskedLM", # Needs to be setup as decoder. "TFDPREncoder", # Building part of bigger (tested) model. "TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?) "TFRobertaForMultipleChoice", # TODO: fix From 4cad343d2fc1949c3612ea1cf47e4a43c71211a6 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 9 Oct 2021 00:39:03 +0800 Subject: [PATCH 52/98] Fix --- .../models/realm/configuration_realm.py | 1 + .../models/realm/modeling_realm.py | 29 +++++++------------ src/transformers/utils/dummy_pt_objects.py | 15 ++++++++++ 3 files changed, 27 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 0e5fe55a9195..7727a1fdc63a 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -40,6 +40,7 @@ class RealmConfig(PretrainedConfig): 4. :class:`~transformers.RealmSearcher` 5. :class:`~transformers.RealmReader` 6. :class:`~transformers.RealmForOpenQA` + It is used to instantiate an REALM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM `realm-cc-news-pretrained `__ architecture. diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index ed12f85e2212..93f9a4cefbbb 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -22,7 +22,6 @@ import numpy as np import torch -import torch.utils.checkpoint from packaging import version from torch import nn from torch.nn import CrossEntropyLoss @@ -30,12 +29,7 @@ from transformers.models.realm.tokenization_realm import RealmTokenizer from ...activations import ACT2FN -from ...file_utils import ( - add_code_sample_docstrings, - add_start_docstrings, - add_start_docstrings_to_model_forward, - replace_return_docstrings, -) +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings from ...modeling_outputs import ( BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions, @@ -89,7 +83,6 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): init_vars = tf.train.list_variables(tf_path) names = [] arrays = [] - is_reader_checkpoint = False for name, shape in init_vars: logger.info(f"Loading TF weight {name} with shape {shape}") @@ -168,7 +161,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): return model -# Copied from transformers.models.bert.modeling_bert.BertEmbeddings->RealmEmbeddings +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->Realm class RealmEmbeddings(nn.Module): """Construct the embeddings from word, position and token_type embeddings.""" @@ -229,7 +222,7 @@ def forward( return embeddings -# Copied from transformers.models.bert.modeling_bert.BertSelfAttention->RealmSelfAttention +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Realm class RealmSelfAttention(nn.Module): def __init__(self, config): super().__init__() @@ -355,7 +348,7 @@ def forward( return outputs -# Copied from 
transformers.models.bert.modeling_bert.BertSelfOutput->RealmSelfOutput +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Realm class RealmSelfOutput(nn.Module): def __init__(self, config): super().__init__() @@ -370,7 +363,7 @@ def forward(self, hidden_states, input_tensor): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertAttention->RealmAttention +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Realm class RealmAttention(nn.Module): def __init__(self, config): super().__init__() @@ -420,7 +413,7 @@ def forward( return outputs -# Copied from transformers.models.bert.modeling_bert.BertIntermediate->RealmIntermediate +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Realm class RealmIntermediate(nn.Module): def __init__(self, config): super().__init__() @@ -436,7 +429,7 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertOutput->RealmOutput +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Realm class RealmOutput(nn.Module): def __init__(self, config): super().__init__() @@ -451,7 +444,7 @@ def forward(self, hidden_states, input_tensor): return hidden_states -# Copied from transformers.models.bert.modeling_bert.BertLayer->RealmLayer +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Realm class RealmLayer(nn.Module): def __init__(self, config): super().__init__() @@ -535,7 +528,7 @@ def feed_forward_chunk(self, attention_output): return layer_output -# Copied from transformers.models.bert.modeling_bert.BertEncoder->RealmEncoder +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Realm class RealmEncoder(nn.Module): def __init__(self, config): super().__init__() @@ -633,7 +626,7 @@ def custom_forward(*inputs): ) -# Copied from transformers.models.bert.modeling_bert.BertPooler->RealmPooler +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Realm class RealmPooler(nn.Module): def __init__(self, config): super().__init__() @@ -1526,7 +1519,7 @@ def __init__(self, config, block_records_path): ) if config.use_scann: try: - import scann + import scann # noqa: F401 except ImportError: raise ImportError( "RealmSearcher requires ScaNN to retrieve documents from the corpus." 
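
When `config.use_scann` is disabled, `RealmSearcher` builds the `BruteForceSearcher` from `utils_realm.py` (reformatted in the previous patch) instead of ScaNN. A toy sketch of the maximum inner-product search its `search_batched` performs, using random tensors and made-up sizes:

    import torch

    num_blocks, proj_size, beam_size = 100, 128, 5
    block_emb = torch.randn(num_blocks, proj_size)   # analogous to the block_emb buffer ("BD")
    question_projection = torch.randn(1, proj_size)  # one embedded question ("QD")

    # Score every block against the question, then keep the top beam_size ids,
    # as BruteForceSearcher.search_batched does.
    batch_scores = torch.einsum("BD,QD->QB", block_emb, question_projection)
    _, retrieved_block_ids = torch.topk(batch_scores, k=beam_size, dim=-1)
    assert retrieved_block_ids.shape == (1, beam_size)

The retrieved ids are then used to gather the corresponding rows of `block_records` and `block_emb`, which yields the `retrieved_blocks` and `retrieved_logits` seen in `RealmSearcherOutput`.
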
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index c13aaf8c497d..cf1651d55a40 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -2958,6 +2958,11 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class RealmForOpenQA: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class RealmKnowledgeAugEncoder: def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) @@ -2972,11 +2977,21 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["torch"]) +class RealmReader: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class RealmRetriever: def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class RealmSearcher: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + def load_tf_weights_in_realm(*args, **kwargs): requires_backends(load_tf_weights_in_realm, ["torch"]) From dd815913d98691808362e5ec90ae4b52c17eff04 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sat, 9 Oct 2021 00:40:24 +0800 Subject: [PATCH 53/98] Fix --- src/transformers/models/realm/modeling_realm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 93f9a4cefbbb..45cb6ec09645 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -321,7 +321,7 @@ def forward( attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + # Apply the attention mask is (precomputed for all layers in RealmModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. @@ -534,6 +534,7 @@ def __init__(self, config): super().__init__() self.config = config self.layer = nn.ModuleList([RealmLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False def forward( self, @@ -560,12 +561,11 @@ def forward( layer_head_mask = head_mask[i] if head_mask is not None else None past_key_value = past_key_values[i] if past_key_values is not None else None - if getattr(self.config, "gradient_checkpointing", False) and self.training: + if self.gradient_checkpointing and self.training: if use_cache: logger.warning( - "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " - "`use_cache=False`..." + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." ) use_cache = False From 36e7f1a0334b4b4247b4848e8e0ca55c6a4f5cc7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 24 Dec 2021 17:41:48 +0000 Subject: [PATCH 54/98] convert docs --- .../models/realm/modeling_realm.py | 259 +++++++++--------- 1 file changed, 128 insertions(+), 131 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 45cb6ec09645..04607ffd73bf 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -789,17 +789,16 @@ class RealmEmbedderOutput(ModelOutput): Outputs of RealmEmbedder models. 
Args: - projected_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.retriever_proj_size)`): + projected_score (`torch.FloatTensor` of shape `(batch_size, config.retriever_proj_size)`): Projected score. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, - sequence_length, sequence_length)`. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -816,11 +815,11 @@ class RealmRetrieverOutput(ModelOutput): Outputs of RealmRetriever models. Args: - relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_candidates)`): + relevance_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates)`): The relevance score of document candidates (before softmax). - query_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.retriever_proj_size)`): + query_score (`torch.FloatTensor` of shape `(batch_size, config.retriever_proj_size)`): Query score derived from the query embedder. - candidate_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_candidates, config.retriever_proj_size)`): + candidate_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates, config.retriever_proj_size)`): Candidate score derived from the embedder. """ @@ -835,11 +834,11 @@ class RealmSearcherOutput(ModelOutput): Outputs of RealmSearcher models. Args: - retrieved_logits (:obj:`torch.FloatTensor` of shape :obj:`(config.searcher_beam_size,)`): + retrieved_logits (`torch.FloatTensor` of shape `(config.searcher_beam_size,)`): The relevance score of document candidates (before softmax). - retrieved_blocks (:obj:`np.ndarray` of shape :obj:`(config.searcher_beam_size,)`): + retrieved_blocks (`np.ndarray` of shape `(config.searcher_beam_size,)`): Retrieved document blocks. - retrieved_block_ids (:obj:`torch.LongTensor` of shape :obj:`(config.searcher_beam_size,)`): + retrieved_block_ids (`torch.LongTensor` of shape `(config.searcher_beam_size,)`): IDs of retrieved blocks. """ @@ -854,32 +853,31 @@ class RealmReaderOutput(ModelOutput): Outputs of RealmReader models. 
Args:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`start_positions`, :obj:`end_positions`, :obj:`has_answers` are provided):
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
             Total loss.
-        retriever_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`start_positions`, :obj:`end_positions`, :obj:`has_answers` are provided):
+        retriever_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
             Retriever loss.
-        reader_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`start_positions`, :obj:`end_positions`, :obj:`has_answers` are provided):
+        reader_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
             Reader loss.
-        retriever_correct (:obj:`torch.BoolTensor` of shape :obj:`(config.searcher_beam_size,)`, `optional`):
-            Whether or not a evidence block derived from `RealmSearcher` contains answer.
-        reader_correct (:obj:`torch.BoolTensor` of shape :obj:`(config.reader_beam_size, num_candidates)`, `optional`):
+        retriever_correct (`torch.BoolTensor` of shape `(config.searcher_beam_size,)`, *optional*):
+            Whether or not an evidence block derived from *RealmSearcher* contains the answer.
+        reader_correct (`torch.BoolTensor` of shape `(config.reader_beam_size, num_candidates)`, *optional*):
             Whether or not a span candidate contains the answer.
-        block_idx (:obj:`torch.LongTensor` of shape :obj:`()`):
+        block_idx (`torch.LongTensor` of shape `()`):
             The index of the retrieved evidence block in which the predicted answer is most likely.
-        candidate (:obj:`torch.LongTensor` of shape :obj:`()`):
+        candidate (`torch.LongTensor` of shape `()`):
             Index of the predicted answer span candidate.
-        start_pos (:obj:`torch.IntTensor` of shape :obj:`()`):
-            Predicted answer starting position in `RealmReader`'s inputs.
-        end_pos: (:obj:`torch.IntTensor` of shape :obj:`()`):
-            Predicted answer ending position in `RealmReader`'s inputs.
-        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+        start_pos (`torch.IntTensor` of shape `()`):
+            Predicted answer starting position in *RealmReader*'s inputs.
+        end_pos (`torch.IntTensor` of shape `()`):
+            Predicted answer ending position in *RealmReader*'s inputs.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape `(batch_size, sequence_length, hidden_size)`.
             Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads,
-            sequence_length, sequence_length)`.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -905,11 +903,11 @@ class RealmForOpenQAOutput(ModelOutput): Outputs of RealmReader models. Args: - searcher_output (:obj:`dict`): + searcher_output (`dict`): Searcher output. - reader_output (:obj:`dict`): + reader_output (`dict`): Reader output. - predicted_answer (:obj:`str`): + predicted_answer (`str`): Predicted answer. """ @@ -1088,65 +1086,63 @@ def _flatten_inputs(self, *inputs): REALM_START_DOCSTRING = r""" - This model is a PyTorch `torch.nn.Module `__ sub-class. + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: - config (:class:`~transformers.RealmConfig`): Model configuration class with all the parameters of the model. + config ([`RealmConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ REALM_INPUTS_DOCSTRING = r""" Args: - input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using :class:`transformers.RealmTokenizer`. See - :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for + Indices can be obtained using [`RealmTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. - `What are input IDs? <../glossary.html#input-ids>`__ - attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. - `What are attention masks? <../glossary.html#attention-mask>`__ - token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, - 1]``: + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. - `What are token type IDs? 
<../glossary.html#token-type-ids>`_ - position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - config.max_position_embeddings - 1]``. + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. - `What are position IDs? <../glossary.html#position-ids>`_ - head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): - Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: - 1 indicates the head is **not masked**, - 0 indicates the head is **masked**. - inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert *input_ids* indices into associated vectors than the model's internal embedding lookup matrix. - output_attentions (:obj:`bool`, `optional`): - Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`): - Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. - return_dict (:obj:`bool`, `optional`): - Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. """ @@ -1222,7 +1218,7 @@ def forward( class RealmRetriever(RealmPreTrainedModel): r""" Args: - query_embedder (:class:`~transformers.RealmEmbedder`): + query_embedder ([`RealmEmbedder`]): Embedder for input sequences. If not specified, it will use the same embedder as candidate sequences. """ @@ -1254,32 +1250,31 @@ def forward( return_dict=None, ): r""" - candidate_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_candidates, sequence_length)`): + candidate_input_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`): Indices of candidate input sequence tokens in the vocabulary. - Indices can be obtained using :class:`transformers.RealmTokenizer`. 
See - :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for + Indices can be obtained using [`RealmTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. - `What are input IDs? <../glossary.html#input-ids>`__ - candidate_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_candidates, sequence_length)`, `optional`): - Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + [What are input IDs?](../glossary#input-ids) + candidate_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. - `What are attention masks? <../glossary.html#attention-mask>`__ - candidate_token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_candidates, sequence_length)`, `optional`): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, - 1]``: + [What are attention masks?](../glossary#attention-mask) + candidate_token_type_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: - - 0 corresponds to a `sentence A` token, - - 1 corresponds to a `sentence B` token. + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. - `What are token type IDs? <../glossary.html#token-type-ids>`_ - candidate_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_candidates, sequence_length, hidden_size)`, `optional`): - Optionally, instead of passing :obj:`candidate_input_ids` you can choose to directly pass an embedded - representation. This is useful if you want more control over how to convert `candidate_input_ids` indices + [What are token type IDs?](../glossary#token-type-ids) + candidate_inputs_embeds (`torch.FloatTensor` of shape `(batch_size * num_candidates, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `candidate_input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert *candidate_input_ids* indices into associated vectors than the model's internal embedding lookup matrix. Returns: @@ -1382,42 +1377,43 @@ def forward( return_dict=None, ): r""" - relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_candidates)`, `optional`): + relevance_score (`torch.FloatTensor` of shape `(batch_size, num_candidates)`, *optional*): Relevance score derived from RealmRetriever, must be specified if you want to compute the masked language modeling loss. - labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): - Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., - config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored - (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. 
Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

-        mlm_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+        mlm_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid calculating joint loss on certain positions. If not specified, the loss will not be masked.
-            Mask values selected in ``[0, 1]``:
+            Mask values selected in `[0, 1]`:

             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.

         Returns:

-        Example::
+        Example:

-            >>> import torch
-            >>> from transformers import RealmTokenizer, RealmKnowledgeAugEncoder
+        ```python
+        >>> import torch
+        >>> from transformers import RealmTokenizer, RealmKnowledgeAugEncoder

-            >>> tokenizer = RealmTokenizer.from_pretrained('qqaatw/realm-cc-news-pretrained-bert')
-            >>> model = RealmKnowledgeAugEncoder.from_pretrained('qqaatw/realm-cc-news-pretrained-bert', num_candidates=2)
+        >>> tokenizer = RealmTokenizer.from_pretrained('qqaatw/realm-cc-news-pretrained-bert')
+        >>> model = RealmKnowledgeAugEncoder.from_pretrained('qqaatw/realm-cc-news-pretrained-bert', num_candidates=2)

-            >>> # batch_size = 2, num_candidates = 2
-            >>> text = [
-            >>>     ["Hello world!", "Nice to meet you!"],
-            >>>     ["The cute cat.", "The adorable dog."]
-            >>> ]
+        >>> # batch_size = 2, num_candidates = 2
+        >>> text = [
+        ...     ["Hello world!", "Nice to meet you!"],
+        ...     ["The cute cat.", "The adorable dog."]
+        ... ]

-            >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
-            >>> outputs = model(**inputs)
-            >>> logits = outputs.logits
-        """
+        >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        ```
+"""

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

@@ -1497,7 +1493,7 @@ class RealmSearcher(RealmPreTrainedModel):
     r"""
     Args:
-        block_records_path (:obj:`str`):
+        block_records_path (`str`):
             Block records path.
     """

@@ -1650,19 +1646,19 @@ def forward(
         return_dict=None,
     ):
         r"""
-        relevance_score (:obj:`torch.FloatTensor` of shape :obj:`(searcher_beam_size,)`, `optional`):
-            Relevance score derived from `RealmSearcher`, must be specified if you want to compute the marginal log
+        relevance_score (`torch.FloatTensor` of shape `(searcher_beam_size,)`, *optional*):
+            Relevance score derived from *RealmSearcher*, must be specified if you want to compute the marginal log
             loss.
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
             sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
             Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
+            Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
             sequence are not taken into account for computing the loss.
-        has_answers (:obj:`torch.BoolTensor` of shape :obj:`(searcher_beam_size,)`, `optional`):
-            Whether or not the evidence blocks derived from `RealmSearcher` have answer(s).
+        has_answers (`torch.BoolTensor` of shape `(searcher_beam_size,)`, *optional*):
+            Whether or not the evidence blocks derived from *RealmSearcher* have answer(s).

         Returns:
         """

@@ -1784,14 +1780,13 @@ def mask_to_score(mask):

 REALM_FOR_OPEN_QA_DOCSTRING = r"""
     Args:
-        question (:obj:`str`):
+        question (`str`):
             OpenQA Question.
-        answer_ids (:obj:`torch.LongTensor` of shape :obj:`(num_answers, answer_length)`, `optional`):
-            Answer ids for computing the marginal log-likelihood loss. Indices should be in ``[-1, 0, ...,
-            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-1`` are ignored (masked),
-            the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
-        return_dict (:obj:`bool`, `optional`):
-            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+        answer_ids (`torch.LongTensor` of shape `(num_answers, answer_length)`, *optional*):
+            Answer ids for computing the marginal log-likelihood loss. Indices should be in `[-1, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-1` are ignored (masked),
+            the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
 """


@@ -1812,11 +1807,11 @@ def from_pretrained(
     ):
         """
         Args:
-            searcher_pretrained_name_or_path (:obj:`str`):
+            searcher_pretrained_name_or_path (`str`):
                 Searcher pretrained name or path.
-            reader_pretrained_name_or_path (:obj:`str`):
+            reader_pretrained_name_or_path (`str`):
                 Reader pretrained name or path.
-            block_records_path (:obj:`str`):
+            block_records_path (`str`):
                 Block records path.
         """

@@ -1933,19 +1928,21 @@ def forward(self, question, answer_ids=None, return_dict=None):
         r"""
         Returns:

-        Example::
+        Example:

-            >>> import torch
-            >>> from transformers import RealmForOpenQA, RealmTokenizer
+        ```python
+        >>> import torch
+        >>> from transformers import RealmForOpenQA, RealmTokenizer

-            >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", blocks.tfr)
+        >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", "blocks.tfr")
+        >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-reader")  # assumed tokenizer checkpoint; the original example used `tokenizer` without defining it

-            >>> question = "Who is the pioneer in modern computer science?"
-            >>> answer_ids = tokenizer(["alan mathison turing"], add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False).input_ids
+        >>> question = "Who is the pioneer in modern computer science?"
+ >>> answer_ids = tokenizer(["alan mathison turing"], add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False).input_ids - >>> searcher_output, reader_output, predicted_answer = model(question, answer_ids) - >>> loss = reader_output.loss - """ + >>> searcher_output, reader_output, predicted_answer = model(question, answer_ids) + >>> loss = reader_output.loss + ``` +""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict From a41734ceca367a7c38b7f78a08db035385133601 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 24 Dec 2021 17:43:37 +0000 Subject: [PATCH 55/98] up --- README.md | 50 -------------------------------------------------- 1 file changed, 50 deletions(-) diff --git a/README.md b/README.md index 1cc78425022a..4ec9f6b50386 100644 --- a/README.md +++ b/README.md @@ -257,57 +257,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -1. 
**[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. -1. 
**[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. -1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. 
**[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. -1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/model_doc/speechencoderdecoder.html)** -1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. 
**[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. -1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -1. 
**[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. -======= 1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoderdecoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. From fb53dad415fc603d6de9ed720d1757de0a035d75 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 24 Dec 2021 17:44:15 +0000 Subject: [PATCH 56/98] up --- README.md | 1 - README_ko.md | 1 + README_zh-hans.md | 4 +-- README_zh-hant.md | 4 +-- docs/source/index.mdx | 1 + .../models/realm/modeling_realm.py | 26 +++++++++++-------- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 4ec9f6b50386..5b55f6cabfb3 100644 --- a/README.md +++ b/README.md @@ -321,7 +321,6 @@ AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Ch 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. 1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. ->>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR. 
To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks). diff --git a/README_ko.md b/README_ko.md index 5d001c6c1112..20e55e95fe1f 100644 --- a/README_ko.md +++ b/README_ko.md @@ -269,6 +269,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. diff --git a/README_zh-hans.md b/README_zh-hans.md index 0a9fa246df47..8fbf78761030 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -369,6 +369,7 @@ conda install -c huggingface transformers 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. 
**[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 +1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 @@ -397,8 +398,7 @@ conda install -c huggingface transformers 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (来自 Facebook AI), 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。 1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (来自 Facebook AI) 伴随论文 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 由 Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 发布。 -1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。 ->>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 +1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。 >>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 1. 
想要贡献新的模型?我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到他们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。 要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现,或其是否在 🤗 Tokenizers 库中有对应词符化器(tokenizer),敬请参阅[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 65c8f5d6dd4f..6aa912edb30e 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -381,6 +381,7 @@ conda install -c huggingface transformers 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. +1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. @@ -409,8 +410,7 @@ conda install -c huggingface transformers 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. 
**[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. -1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. ->>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 +1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. >>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 1. 想要貢獻新的模型?我們這裡有一份**詳細指引和模板**來引導你加入新的模型。你可以在 [`templates`](./templates) 目錄中找到它們。記得查看[貢獻指引](./CONTRIBUTING.md)並在開始寫 PR 前聯繫維護人員或開一個新的 issue 來獲得 feedbacks。 要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作,或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer,敬請參閱[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。 diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 112e84b82571..69d9dbf540b8 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -117,6 +117,7 @@ conversion utilities for the following models. 1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. 1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation) and a German version of DistilBERT. 1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. +1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. 1. 
**[EncoderDecoder](model_doc/encoderdecoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. 1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. 1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 04607ffd73bf..b6a5e07e63e2 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -181,7 +181,7 @@ def __init__(self, config): if version.parse(torch.__version__) > version.parse("1.6.0"): self.register_buffer( "token_type_ids", - torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False, ) @@ -224,7 +224,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Realm class RealmSelfAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, position_embedding_type=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -241,7 +241,9 @@ def __init__(self, config): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": self.max_position_embeddings = config.max_position_embeddings self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) @@ -325,7 +327,7 @@ def forward( attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
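[Editor's note — illustration only, not part of the patch. The hunk above makes two behavior-preserving changes: attention probabilities now come from `nn.functional.softmax` instead of instantiating a fresh `nn.Softmax` module on every forward pass, and `RealmSelfAttention` now accepts an explicit `position_embedding_type` argument that takes precedence over the config value. A minimal sketch of both, assuming only a standard PyTorch install; the variable names are illustrative:]

```python
import torch
from torch import nn

# The functional softmax is numerically identical to the module form the
# patch removes; it just avoids building a throwaway nn.Softmax per call.
scores = torch.randn(2, 4, 8, 8)  # (batch, num_heads, seq_len, seq_len)
assert torch.allclose(nn.Softmax(dim=-1)(scores), nn.functional.softmax(scores, dim=-1))

# Precedence rule introduced by the new constructor argument: an explicit
# value wins, then the config attribute, then the "absolute" default.
explicit = None                      # what the caller passed
config_value = "relative_key"        # stands in for getattr(config, "position_embedding_type", "absolute")
resolved = explicit or config_value  # falls back to the config value here
assert resolved == "relative_key"
```

[The cross-attention hunk further below relies on this same precedence rule when it forces `position_embedding_type="absolute"` for decoder cross-attention layers.]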
@@ -365,9 +367,9 @@ def forward(self, hidden_states, input_tensor): # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Realm class RealmAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = RealmSelfAttention(config) + self.self = RealmSelfAttention(config, position_embedding_type=position_embedding_type) self.output = RealmSelfOutput(config) self.pruned_heads = set() @@ -454,8 +456,9 @@ def __init__(self, config): self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: - assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" - self.crossattention = RealmAttention(config) + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = RealmAttention(config, position_embedding_type="absolute") self.intermediate = RealmIntermediate(config) self.output = RealmOutput(config) @@ -489,9 +492,10 @@ def forward( cross_attn_present_key_value = None if self.is_decoder and encoder_hidden_states is not None: - assert hasattr( - self, "crossattention" - ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + ) # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None From 02bae05d0d0c961ab35dfaa02bb0e8d10947d403 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 27 Dec 2021 12:11:18 +0000 Subject: [PATCH 57/98] more make style --- src/transformers/models/auto/configuration_auto.py | 9 --------- src/transformers/models/realm/modeling_realm.py | 6 ++---- tests/test_modeling_realm.py | 3 ++- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 8ae3dc0bd591..ea6394bb56b5 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -30,14 +30,11 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ # Add configs here -<<<<<<< HEAD ("realm", "RealmConfig"), -======= ("imagegpt", "ImageGPTConfig"), ("qdqbert", "QDQBertConfig"), ("vision-encoder-decoder", "VisionEncoderDecoderConfig"), ("trocr", "TrOCRConfig"), ->>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 ("fnet", "FNetConfig"), ("segformer", "SegformerConfig"), ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"), @@ -120,12 +117,9 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( [ # Add archive maps here -<<<<<<< HEAD ("realm", "REALM_PRETRAINED_CONFIG_ARCHIVE_MAP"), -======= ("imagegpt", "IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("qdqbert", "QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), ->>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 ("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("pegasus", "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), ("segformer", "SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), @@ -198,14 +192,11 @@ MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here -<<<<<<< HEAD ("realm", "Realm"), -======= ("imagegpt", "ImageGPT"), ("qdqbert", 
"QDQBert"), ("vision-encoder-decoder", "Vision Encoder decoder"), ("trocr", "TrOCR"), ->>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 ("fnet", "FNet"), ("segformer", "SegFormer"), ("vision-text-dual-encoder", "VisionTextDualEncoder"), diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index b6a5e07e63e2..4c6a1a7509d3 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1416,8 +1416,7 @@ def forward( >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") >>> outputs = model(**inputs) >>> logits = outputs.logits - ``` -""" + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -1945,8 +1944,7 @@ def forward(self, question, answer_ids=None, return_dict=None): >>> searcher_output, reader_output, predicted_answer = model(question, answer_ids) >>> loss = reader_output.loss - ``` -""" + ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 928eb0294f73..2592f6ffe478 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -39,7 +39,8 @@ # Direct download link # https://storage.cloud.google.com/orqa-data/enwiki-20181220/blocks.tfr -BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" +# BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" +BLOCK_RECORDS_PATH = "/home/patrick/realm/blocks.tfr" class RealmModelTester: From 8f50e8c3e8aa5a400a64a010ecab4583405d0912 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 27 Dec 2021 12:15:30 +0000 Subject: [PATCH 58/98] up --- README_zh-hans.md | 78 +---------------------------------------------- README_zh-hant.md | 78 +---------------------------------------------- 2 files changed, 2 insertions(+), 154 deletions(-) diff --git a/README_zh-hans.md b/README_zh-hans.md index 8fbf78761030..559ca3502bf3 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -231,82 +231,6 @@ conda install -c huggingface transformers 目前的检查点数量: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen) -1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。 -1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。 -1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。 -1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (来自 Microsoft) 伴随论文 [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) 由 Hangbo Bao, Li Dong, Furu Wei 发布。 -1. 
**[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (来自 Google) 伴随论文 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) 由 Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova 发布。 -1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (来自 Google) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 -1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 -1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。 -1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 -1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。 -1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。 -1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (来自 Google Research) 伴随论文 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 由 Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 发布。 -1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。 -1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。 -1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。 -1. 
**[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。 -1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。 -1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。 -1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 -1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。 -1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。 -1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。 -1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。 -1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) 和德语版 DistilBERT。 -1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。 -1. 
**[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。 -1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。 -1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。 -1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。 -1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。 -1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。 -1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。 -1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。 -1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (来自 EleutherAI) 伴随论文 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 由 Ben Wang and Aran Komatsuzaki 发布。 -1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 -1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。 -1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。 -1. 
**[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。 -1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。 -1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。 -1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。 -1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。 -1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (来自 UNC Chapel Hill) 伴随论文 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 由 Hao Tan and Mohit Bansal 发布。 -1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。 -1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。 -1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。 -1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。 -1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。 -1. 
**[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。 -1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。 -1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。 -1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。 -1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 -1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。 -1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。 -1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 -1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 -1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。 -1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/model_doc/speechencoderdecoder.html)** -1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。 -1. 
**[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。 -1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。 -1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。 -1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 -1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (来自 Google AI) 伴随论文 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。 -1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。 -1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。 -1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。 -1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。 -1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。 -1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。 -1. 
**[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 -1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (来自 Facebook AI), 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。 -1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。 -1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。 -======= 🤗 Transformers 目前支持如下的架构(模型概述请阅[这里](https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。 @@ -398,7 +322,7 @@ conda install -c huggingface transformers 1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (来自 Facebook AI), 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。 1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (来自 Facebook AI) 伴随论文 [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) 由 Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli 发布。 -1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。 >>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 +1. 
**[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。 1. 想要贡献新的模型?我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到他们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。 要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现,或其是否在 🤗 Tokenizers 库中有对应词符化器(tokenizer),敬请参阅[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 6aa912edb30e..e7a141ef43ea 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -243,82 +243,6 @@ conda install -c huggingface transformers 目前的檢查點數量: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen) -1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. -1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis. -1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei. -1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. -1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. 
**[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry. -1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. -1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. -1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever. -1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -1. 
**[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun. -1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. -1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko. -1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT. -1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -1. 
**[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. -1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. -1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. -1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. -1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki. -1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -1. 
**[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. -1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan. -1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal. -1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team. -1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -1. 
**[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. -1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. -1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -1. **[REALM](https://huggingface.co/transformers/master/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. -1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. -1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -1. 
**[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/model_doc/speechencoderdecoder.html)** -1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. -1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos. -1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -1. 
**[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli. -1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau. -1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. -1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. -======= 🤗 Transformers 目前支援以下的架構(模型概覽請參閱[這裡](https://huggingface.co/docs/transformers/model_summary)): 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. @@ -410,7 +334,7 @@ conda install -c huggingface transformers 1. 
**[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlmroberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. 1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. 1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli. -1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. >>>>>>> 705ca7f21b2b557e0cfd5d0853b297fa53489d20 +1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. 1. 想要貢獻新的模型?我們這裡有一份**詳細指引和模板**來引導你加入新的模型。你可以在 [`templates`](./templates) 目錄中找到它們。記得查看[貢獻指引](./CONTRIBUTING.md)並在開始寫 PR 前聯繫維護人員或開一個新的 issue 來獲得 feedbacks。 要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作,或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer,敬請參閱[此表](https://huggingface.co/docs/transformers/index#supported-frameworks)。 From fef8cf31ab1585100bd612d41e7fd06e9219d14a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 27 Dec 2021 12:16:06 +0000 Subject: [PATCH 59/98] upload --- docs/source/index.rst | 650 ------------------------------------------ 1 file changed, 650 deletions(-) delete mode 100644 docs/source/index.rst diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index a4cb860bbb48..000000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,650 +0,0 @@ -Transformers -======================================================================================================================= - -State-of-the-art Natural Language Processing for Jax, Pytorch and TensorFlow - -🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose -architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural -Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between Jax, -PyTorch and TensorFlow. - -This is the documentation of our repository `transformers `__. You can -also follow our `online course `__ that teaches how to use this library, as well as the -other libraries developed by Hugging Face and the Hub. 
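The page deleted here described the library's core workflow: pretrained checkpoints behind a shared API. As a minimal sketch of that workflow, assuming only the public `Auto*` classes and a generic example checkpoint (neither is specific to this patch):

```python
from transformers import AutoModel, AutoTokenizer

# Any hub checkpoint name works here; "bert-base-uncased" is just an example.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 5, 768])
```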
-
-If you are looking for custom support from the Hugging Face team
------------------------------------------------------------------------------------------------------------------------
-
-.. raw:: html
-
-    HuggingFace Expert Acceleration Program
- -Features ------------------------------------------------------------------------------------------------------------------------ - -- High performance on NLU and NLG tasks -- Low barrier to entry for educators and practitioners - -State-of-the-art NLP for everyone: - -- Deep learning researchers -- Hands-on practitioners -- AI/ML/NLP teachers and educators - -.. - Copyright 2020 The HuggingFace Team. All rights reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the - specific language governing permissions and limitations under the License. - -Lower compute costs, smaller carbon footprint: - -- Researchers can share trained models instead of always retraining -- Practitioners can reduce compute time and production costs -- 8 architectures with over 30 pretrained models, some in more than 100 languages - -Choose the right framework for every part of a model's lifetime: - -- Train state-of-the-art models in 3 lines of code -- Deep interoperability between Jax, Pytorch and TensorFlow models -- Move a single model between Jax/PyTorch/TensorFlow frameworks at will -- Seamlessly pick the right framework for training, evaluation, production - -The support for Jax is still experimental (with a few models right now), expect to see it grow in the coming months! - -`All the model checkpoints `__ are seamlessly integrated from the huggingface.co `model -hub `__ where they are uploaded directly by `users `__ and -`organizations `__. - -Current number of checkpoints: |checkpoints| - -.. |checkpoints| image:: https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen - -Contents ------------------------------------------------------------------------------------------------------------------------ - -The documentation is organized in five parts: - -- **GET STARTED** contains a quick tour, the installation instructions and some useful information about our philosophy - and a glossary. -- **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library. -- **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library. -- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in - transformers model -- The three last section contain the documentation of each public class and function, grouped in: - - - **MAIN CLASSES** for the main classes exposing the important APIs of the library. - - **MODELS** for the classes and functions related to each model implemented in the library. - - **INTERNAL HELPERS** for the classes and functions we use internally. - -The library currently contains Jax, PyTorch and Tensorflow implementations, pretrained model weights, usage scripts and -conversion utilities for the following models. - -Supported models -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. - This list is updated automatically from the README with `make fix-copies`. Do not update manually! - -1. 
:doc:`ALBERT ` (from Google Research and the Toyota Technological Institute at Chicago) released - with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations - `__, by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush - Sharma, Radu Soricut. -2. :doc:`BART ` (from Facebook) released with the paper `BART: Denoising Sequence-to-Sequence - Pre-training for Natural Language Generation, Translation, and Comprehension - `__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman - Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer. -3. :doc:`BARThez ` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained - French Sequence-to-Sequence Model `__ by Moussa Kamal Eddine, Antoine J.-P. - Tixier, Michalis Vazirgiannis. -4. :doc:`BEiT ` (from Microsoft) released with the paper `BEiT: BERT Pre-Training of Image Transformers - `__ by Hangbo Bao, Li Dong, Furu Wei. -5. :doc:`BERT ` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional - Transformers for Language Understanding `__ by Jacob Devlin, Ming-Wei Chang, - Kenton Lee and Kristina Toutanova. -6. :doc:`BERT For Sequence Generation ` (from Google) released with the paper `Leveraging - Pre-trained Checkpoints for Sequence Generation Tasks `__ by Sascha Rothe, Shashi - Narayan, Aliaksei Severyn. -7. :doc:`BigBird-RoBERTa ` (from Google Research) released with the paper `Big Bird: Transformers - for Longer Sequences `__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua - Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -8. :doc:`BigBird-Pegasus ` (from Google Research) released with the paper `Big Bird: - Transformers for Longer Sequences `__ by Manzil Zaheer, Guru Guruganesh, Avinava - Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed. -9. :doc:`Blenderbot ` (from Facebook) released with the paper `Recipes for building an - open-domain chatbot `__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary - Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -10. :doc:`BlenderbotSmall ` (from Facebook) released with the paper `Recipes for building - an open-domain chatbot `__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, - Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston. -11. :doc:`BORT ` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT - `__ by Adrian de Wynter and Daniel J. Perry. -12. :doc:`ByT5 ` (from Google Research) released with the paper `ByT5: Towards a token-free future with - pre-trained byte-to-byte models `__ by Linting Xue, Aditya Barua, Noah Constant, - Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel. -13. :doc:`CamemBERT ` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty - French Language Model `__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz - Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. -14. :doc:`CANINE ` (from Google Research) released with the paper `CANINE: Pre-training an Efficient - Tokenization-Free Encoder for Language Representation `__ by Jonathan H. Clark, - Dan Garrette, Iulia Turc, John Wieting. -15. 
:doc:`CLIP ` (from OpenAI) released with the paper `Learning Transferable Visual Models From - Natural Language Supervision `__ by Alec Radford, Jong Wook Kim, Chris Hallacy, - Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen - Krueger, Ilya Sutskever. -16. :doc:`ConvBERT ` (from YituTech) released with the paper `ConvBERT: Improving BERT with - Span-based Dynamic Convolution `__ by Zihang Jiang, Weihao Yu, Daquan Zhou, - Yunpeng Chen, Jiashi Feng, Shuicheng Yan. -17. :doc:`CPM ` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative - Chinese Pre-trained Language Model `__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei - Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, - Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, - Juanzi Li, Xiaoyan Zhu, Maosong Sun. -18. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language - Model for Controllable Generation `__ by Nitish Shirish Keskar*, Bryan McCann*, - Lav R. Varshney, Caiming Xiong and Richard Socher. -19. :doc:`DeBERTa ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with - Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu - Chen. -20. :doc:`DeBERTa-v2 ` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT - with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, - Weizhu Chen. -21. :doc:`DeiT ` (from Facebook) released with the paper `Training data-efficient image transformers & - distillation through attention `__ by Hugo Touvron, Matthieu Cord, Matthijs - Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou. -22. :doc:`DETR ` (from Facebook) released with the paper `End-to-End Object Detection with Transformers - `__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, - Alexander Kirillov, Sergey Zagoruyko. -23. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale - Generative Pre-training for Conversational Response Generation `__ by Yizhe - Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan. -24. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a - distilled version of BERT: smaller, faster, cheaper and lighter `__ by Victor - Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 - `__, RoBERTa into `DistilRoBERTa - `__, Multilingual BERT into - `DistilmBERT `__ and a German - version of DistilBERT. -25. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain - Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick - Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih. -26. :doc:`EncoderDecoder ` (from Google Research) released with the paper `Leveraging - Pre-trained Checkpoints for Sequence Generation Tasks `__ by Sascha Rothe, Shashi - Narayan, Aliaksei Severyn. -27. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA: - Pre-training text encoders as discriminators rather than generators `__ by Kevin - Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning. -28. 
:doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model - Pre-training for French `__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, - Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab. -29. :doc:`FNet ` (from Google Research) released with the paper `FNet: Mixing Tokens with Fourier - Transforms `__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago - Ontanon. -30. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer: - Filtering out Sequential Redundancy for Efficient Language Processing `__ by - Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le. -31. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative - Pre-Training `__ by Alec Radford, Karthik Narasimhan, Tim Salimans - and Ilya Sutskever. -32. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask - Learners `__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David - Luan, Dario Amodei** and Ilya Sutskever**. -33. :doc:`GPT-J ` (from EleutherAI) released in the repository `kingoflolz/mesh-transformer-jax - `__ by Ben Wang and Aran Komatsuzaki. -34. :doc:`GPT Neo ` (from EleutherAI) released in the repository `EleutherAI/gpt-neo - `__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy. -35. :doc:`Hubert ` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech - Representation Learning by Masked Prediction of Hidden Units `__ by Wei-Ning Hsu, - Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. -36. :doc:`I-BERT ` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization - `__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. -37. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training - of Text and Layout for Document Image Understanding `__ by Yiheng Xu, Minghao Li, - Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou. -38. :doc:`LayoutLMv2 ` (from Microsoft Research Asia) released with the paper `LayoutLMv2: - Multi-modal Pre-training for Visually-Rich Document Understanding `__ by Yang Xu, - Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min - Zhang, Lidong Zhou. -39. :doc:`LayoutXLM ` (from Microsoft Research Asia) released with the paper `LayoutXLM: - Multimodal Pre-training for Multilingual Visually-rich Document Understanding `__ - by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei. -40. :doc:`LED ` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer - `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -41. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document - Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan. -42. :doc:`LUKE ` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity - Representations with Entity-aware Self-attention `__ by Ikuya Yamada, Akari Asai, - Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto. -43. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality - Encoder Representations from Transformers for Open-Domain Question Answering `__ - by Hao Tan and Mohit Bansal. -44. 
:doc:`M2M100 ` (from Facebook) released with the paper `Beyond English-Centric Multilingual - Machine Translation `__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, - Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, - Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin. -45. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by - Jörg Tiedemann. The `Marian Framework `__ is being developed by the Microsoft - Translator Team. -46. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for - Neural Machine Translation `__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, - Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. -47. :doc:`MBart-50 ` (from Facebook) released with the paper `Multilingual Translation with Extensible - Multilingual Pretraining and Finetuning `__ by Yuqing Tang, Chau Tran, Xian Li, - Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. -48. :doc:`Megatron-BERT ` (from NVIDIA) released with the paper `Megatron-LM: Training - Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad - Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -49. :doc:`Megatron-GPT2 ` (from NVIDIA) released with the paper `Megatron-LM: Training - Multi-Billion Parameter Language Models Using Model Parallelism `__ by Mohammad - Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. -50. :doc:`MPNet ` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted - Pre-training for Language Understanding `__ by Kaitao Song, Xu Tan, Tao Qin, - Jianfeng Lu, Tie-Yan Liu. -51. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained - text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir - Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. -52. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted - Gap-sentences for Abstractive Summarization `__ by Jingqing Zhang, Yao Zhao, - Mohammad Saleh and Peter J. Liu. -53. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting - Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi, - Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -54. `REALM `__ (from Google Research) released with - the paper `REALM: Retrieval-Augmented Language Model Pre-Training `__ by Kelvin - Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. -55. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient - Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. -56. :doc:`RemBERT ` (from Google Research) released with the paper `Rethinking embedding coupling in - pre-trained language models `__ by Hyung Won Chung, Thibault Févry, Henry - Tsai, M. Johnson, Sebastian Ruder. -57. :doc:`RoBERTa ` (from Facebook), released together with the paper a `Robustly Optimized BERT - Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar - Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. -58. 
:doc:`RoFormer ` (from ZhuiyiTechnology), released together with the paper a `RoFormer: - Enhanced Transformer with Rotary Position Embedding `__ by Jianlin Su and - Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu. -59. :doc:`SpeechEncoderDecoder ` -60. :doc:`SpeechToTextTransformer ` (from Facebook), released together with the paper - `fairseq S2T: Fast Speech-to-Text Modeling with fairseq `__ by Changhan Wang, Yun - Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino. -61. :doc:`SpeechToTextTransformer2 ` (from Facebook), released together with the paper - `Large-Scale Self- and Semi-Supervised Learning for Speech Translation `__ by - Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. -62. :doc:`Splinter ` (from Tel Aviv University), released together with the paper `Few-Shot - Question Answering by Pretraining Span Selection `__ by Ori Ram, Yuval Kirstain, - Jonathan Berant, Amir Globerson, Omer Levy. -63. :doc:`SqueezeBert ` (from Berkeley) released with the paper `SqueezeBERT: What can computer - vision teach NLP about efficient neural networks? `__ by Forrest N. Iandola, - Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -64. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a - Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam - Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -65. :doc:`T5v1.1 ` (from Google AI) released in the repository - `google-research/text-to-text-transfer-transformer - `__ by - Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi - Zhou and Wei Li and Peter J. Liu. -66. :doc:`TAPAS ` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via - Pre-training `__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, - Francesco Piccinno and Julian Martin Eisenschlos. -67. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL: - Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*, - Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. -68. :doc:`Vision Transformer (ViT) ` (from Google AI) released with the paper `An Image is Worth 16x16 - Words: Transformers for Image Recognition at Scale `__ by Alexey Dosovitskiy, - Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias - Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby. -69. :doc:`VisualBERT ` (from UCLA NLP) released with the paper `VisualBERT: A Simple and - Performant Baseline for Vision and Language `__ by Liunian Harold Li, Mark - Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang. -70. :doc:`Wav2Vec2 ` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for - Self-Supervised Learning of Speech Representations `__ by Alexei Baevski, Henry - Zhou, Abdelrahman Mohamed, Michael Auli. -71. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model - Pretraining `__ by Guillaume Lample and Alexis Conneau. -72. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: - Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, - Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. -73. 
:doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised - Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay - Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke - Zettlemoyer and Veselin Stoyanov. -74. :doc:`XLNet ` (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive - Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming - Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le. -75. :doc:`XLSR-Wav2Vec2 ` (from Facebook AI) released with the paper `Unsupervised - Cross-Lingual Representation Learning For Speech Recognition `__ by Alexis - Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli. - - -Supported frameworks -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The table below represents the current support in the library for each of those models, whether they have a Python -tokenizer (called "slow"). A "fast" tokenizer backed by the 🤗 Tokenizers library, whether they have support in Jax (via -Flax), PyTorch, and/or TensorFlow. - -.. - This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually! - -.. rst-class:: center-aligned-table - -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support | -+=============================+================+================+=================+====================+==============+ -| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| BART | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| BeiT | ❌ | ❌ | ✅ | ❌ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| BERT | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| BigBird | ✅ | ✅ | ✅ | ❌ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Canine | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| CLIP | ✅ | ✅ | ✅ | ❌ | ✅ | 
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| DETR | ❌ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| DistilBERT | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| DPR | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| ELECTRA | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| FNet | ✅ | ✅ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| GPT Neo | ❌ | ❌ | ✅ | ❌ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| GPT-J | ❌ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Hubert | ❌ | ❌ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| LED | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| LUKE | ✅ | ❌ 
| ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Marian | ✅ | ❌ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| mBART | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| RAG | ✅ | ❌ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Realm | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| RoFormer | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Speech2Text | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ 
-| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| T5 | ✅ | ✅ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| TAPAS | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| ViT | ❌ | ❌ | ✅ | ❌ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| XLM | ✅ | ❌ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ | -+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ - -.. toctree:: - :maxdepth: 2 - :caption: Get started - - quicktour - installation - philosophy - glossary - -.. toctree:: - :maxdepth: 2 - :caption: Using 🤗 Transformers - - task_summary - model_summary - preprocessing - training - model_sharing - tokenizer_summary - multilingual - -.. toctree:: - :maxdepth: 2 - :caption: Advanced guides - - pretrained_models - examples - troubleshooting - custom_datasets - notebooks - sagemaker - community - converting_tensorflow_models - migration - contributing - add_new_model - add_new_pipeline - fast_tokenizers - performance - parallelism - testing - debugging - serialization - -.. toctree:: - :maxdepth: 2 - :caption: Research - - bertology - perplexity - benchmarks - -.. toctree:: - :maxdepth: 2 - :caption: Main Classes - - main_classes/callback - main_classes/configuration - main_classes/data_collator - main_classes/keras_callbacks - main_classes/logging - main_classes/model - main_classes/optimizer_schedules - main_classes/output - main_classes/pipelines - main_classes/processors - main_classes/tokenizer - main_classes/trainer - main_classes/deepspeed - main_classes/feature_extractor - -.. 
toctree:: - :maxdepth: 2 - :caption: Models - - model_doc/albert - model_doc/auto - model_doc/bart - model_doc/barthez - model_doc/beit - model_doc/bert - model_doc/bertweet - model_doc/bertgeneration - model_doc/bert_japanese - model_doc/bigbird - model_doc/bigbird_pegasus - model_doc/blenderbot - model_doc/blenderbot_small - model_doc/bort - model_doc/byt5 - model_doc/camembert - model_doc/canine - model_doc/clip - model_doc/convbert - model_doc/cpm - model_doc/ctrl - model_doc/deberta - model_doc/deberta_v2 - model_doc/deit - model_doc/detr - model_doc/dialogpt - model_doc/distilbert - model_doc/dpr - model_doc/electra - model_doc/encoderdecoder - model_doc/flaubert - model_doc/fnet - model_doc/fsmt - model_doc/funnel - model_doc/herbert - model_doc/ibert - model_doc/layoutlm - model_doc/layoutlmv2 - model_doc/layoutxlm - model_doc/led - model_doc/longformer - model_doc/luke - model_doc/lxmert - model_doc/marian - model_doc/m2m_100 - model_doc/mbart - model_doc/megatron_bert - model_doc/megatron_gpt2 - model_doc/mobilebert - model_doc/mpnet - model_doc/mt5 - model_doc/gpt - model_doc/gpt2 - model_doc/gptj - model_doc/gpt_neo - model_doc/hubert - model_doc/pegasus - model_doc/phobert - model_doc/prophetnet - model_doc/rag - model_doc/realm - model_doc/reformer - model_doc/rembert - model_doc/retribert - model_doc/roberta - model_doc/roformer - model_doc/speechencoderdecoder - model_doc/speech_to_text - model_doc/speech_to_text_2 - model_doc/splinter - model_doc/squeezebert - model_doc/t5 - model_doc/t5v1.1 - model_doc/tapas - model_doc/transformerxl - model_doc/vit - model_doc/visual_bert - model_doc/wav2vec2 - model_doc/xlm - model_doc/xlmprophetnet - model_doc/xlmroberta - model_doc/xlnet - model_doc/xlsr_wav2vec2 - -.. toctree:: - :maxdepth: 2 - :caption: Internal Helpers - - internal/modeling_utils - internal/pipelines_utils - internal/tokenization_utils - internal/trainer_utils - internal/generation_utils - internal/file_utils From 8b723ae638cc04a0f39192dd9497571d2a886bf7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 27 Dec 2021 12:18:33 +0000 Subject: [PATCH 60/98] up --- .../models/realm/configuration_realm.py | 108 +++++++++--------- .../models/realm/tokenization_realm.py | 40 ++++--- 2 files changed, 76 insertions(+), 72 deletions(-) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 7727a1fdc63a..6d6705966c68 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -34,94 +34,96 @@ class RealmConfig(PretrainedConfig): r""" This is the configuration class to store the configuration of - 1. :class:`~transformers.RealmEmbedder` - 2. :class:`~transformers.RealmRetriever` - 3. :class:`~transformers.RealmKnowledgeAugEncoder` - 4. :class:`~transformers.RealmSearcher` - 5. :class:`~transformers.RealmReader` - 6. :class:`~transformers.RealmForOpenQA` + 1. [`RealmEmbedder`] + 2. [`RealmRetriever`] + 3. [`RealmKnowledgeAugEncoder`] + 4. [`RealmSearcher`] + 5. [`RealmReader`] + 6. [`RealmForOpenQA`] It is used to instantiate an REALM model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM - `realm-cc-news-pretrained `__ architecture. + [realm-cc-news-pretrained](https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder) architecture. 
- 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.RealmEmbedder`,
-            :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmKnowledgeAugEncoder`,
-            :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `inputs_ids` passed when calling [`RealmEmbedder`],
+            [`RealmRetriever`], [`RealmKnowledgeAugEncoder`],
+            [`RealmSearcher`], or [`RealmReader`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        retriever_proj_size (:obj:`int`, `optional`, defaults to 128):
+        retriever_proj_size (`int`, *optional*, defaults to 128):
             Dimension of the retriever (embedder) projection.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        num_candidates (:obj:`int`, `optional`, defaults to 8):
+        num_candidates (`int`, *optional*, defaults to 8):
             Number of candidates inputted to the RealmRetriever or RealmKnowledgeAugEncoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048). 
- type_vocab_size (:obj:`int`, `optional`, defaults to 2): - The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RealmEmbedder`, - :class:`~transformers.RealmRetriever`, :class:`~transformers.RealmKnowledgeAugEncoder`, - :class:`~transformers.RealmSearcher`, or :class:`~transformers.RealmReader`. - initializer_range (:obj:`float`, `optional`, defaults to 0.02): + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`RealmEmbedder`], + [`RealmRetriever`], [`RealmKnowledgeAugEncoder`], + [`RealmSearcher`], or [`RealmReader`]. + initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if ``config.is_decoder=True``. - use_scann (:obj:`bool`, `optional`, defaults to :obj:`True`): - Whether or not :class:`~transformers.RealmSearcher` uses `ScaNN` as the vector similarity searcher. This + relevant if `config.is_decoder=True`. + use_scann (`bool`, *optional*, defaults to `True`): + Whether or not [`RealmSearcher`] uses *ScaNN* as the vector similarity searcher. This option has no effect and is reserved for future development. - span_hidden_size (:obj:`int`, `optional`, defaults to 256): + span_hidden_size (`int`, *optional*, defaults to 256): Dimension of the reader's spans. - max_span_width (:obj:`int`, `optional`, defaults to 10): + max_span_width (`int`, *optional*, defaults to 10): Max span width of the reader. - reader_layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-3): + reader_layer_norm_eps (`float`, *optional*, defaults to 1e-3): The epsilon used by the reader's layer normalization layers. - reader_beam_size (:obj:`int`, `optional`, defaults to 5): + reader_beam_size (`int`, *optional*, defaults to 5): Beam size of the reader. - reader_seq_len (:obj:`int`, `optional`, defaults to 288+32): + reader_seq_len (`int`, *optional*, defaults to 288+32): Maximum sequence length of the reader. - num_block_records (:obj:`int`, `optional`, defaults to 13353718): + num_block_records (`int`, *optional*, defaults to 13353718): Number of block records. - searcher_beam_size (:obj:`int`, `optional`, defaults to 5000): - Beam size of the searcher. Note that when eval mode is enabled, `searcher_beam_size` will be the same as - `reader_beam_size`. - searcher_seq_len (:obj:`int`, `optional`, defaults to 64): + searcher_beam_size (`int`, *optional*, defaults to 5000): + Beam size of the searcher. Note that when eval mode is enabled, *searcher_beam_size* will be the same as + *reader_beam_size*. + searcher_seq_len (`int`, *optional*, defaults to 64): Maximum sequence length of the searcher. 
- 
-    Example::
+    Example:
 
-        >>> from transformers import RealmEmbedder, RealmConfig
+    ```python
+    >>> from transformers import RealmEmbedder, RealmConfig
 
-        >>> # Initializing a REALM realm-cc-news-pretrained-* style configuration
-        >>> configuration = RealmConfig()
+    >>> # Initializing a REALM realm-cc-news-pretrained-* style configuration
+    >>> configuration = RealmConfig()
 
-        >>> # Initializing a model from the qqaatw/realm-cc-news-pretrained-embedder style configuration
-        >>> model = RealmEmbedder(configuration)
+    >>> # Initializing a model from the qqaatw/realm-cc-news-pretrained-embedder style configuration
+    >>> model = RealmEmbedder(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "realm"
 
     def __init__(
diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py
index 64848f7795bd..bc5377fdd113 100644
--- a/src/transformers/models/realm/tokenization_realm.py
+++ b/src/transformers/models/realm/tokenization_realm.py
@@ -56,10 +56,10 @@ class RealmTokenizer(BertTokenizer):
     r"""
     Construct a REALM tokenizer.
 
-    :class:`~transformers.RealmTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`RealmTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -74,38 +74,40 @@ def batch_encode_candidates(self, text, **kwargs):
         differences:
 
         1. Handle additional num_candidate axis. (batch_size, num_candidates, text)
-        2. Always pad the sequences to `max_length`.
-        3. Must specify `max_length` in order to stack packs of candidates into a batch.
+        2. Always pad the sequences to *max_length*.
+        3. Must specify *max_length* in order to stack packs of candidates into a batch.
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            text (:obj:`List[List[str]]`):
+            text (`List[List[str]]`):
                 The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
                 num_candidates, text).
-            text_pair (:obj:`List[List[str]]`, `optional`):
+            text_pair (`List[List[str]]`, *optional*):
                 The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
                 num_candidates, text).
             **kwargs:
                 Keyword arguments of the __call__ method.
 
         Returns:
-            :class:`~transformers.BatchEncoding`: Encoded text or text pair.
+            [`BatchEncoding`]: Encoded text or text pair. 
- 
-        Example::
+        Example:
 
-            >>> from transformers import RealmTokenizer
+        ```python
+        >>> from transformers import RealmTokenizer
 
-            >>> # batch_size = 2, num_candidates = 2
-            >>> text = [
-            >>>     ["Hello world!", "Nice to meet you!"],
-            >>>     ["The cute cat.", "The adorable dog."]
-            >>> ]
+        >>> # batch_size = 2, num_candidates = 2
+        >>> text = [
+        ...     ["Hello world!", "Nice to meet you!"],
+        ...     ["The cute cat.", "The adorable dog."]
+        ... ]
 
-            >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert")
-            >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
-        """
+        >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert")
+        >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
+        ```"""
 
         # Always using a fixed sequence length to encode in order to stack candidates into a batch.
         kwargs["padding"] = PaddingStrategy.MAX_LENGTH

From 348936e3b82703eb0fad11f334ea594b0c1b76a5 Mon Sep 17 00:00:00 2001
From: qqaatw 
Date: Wed, 29 Dec 2021 03:22:00 +0800
Subject: [PATCH 61/98] Fix

---
 .../models/realm/modeling_realm.py            | 82 ++++++++-----------
 1 file changed, 36 insertions(+), 46 deletions(-)

diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 4c6a1a7509d3..f179fbc3bd64 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -1830,12 +1830,20 @@ def save_pretrained(self, save_directory):
         self.searcher.save_pretrained(save_directory)
         self.reader.save_pretrained(save_directory)
 
-    def retrieve(self, input_ids, **kwargs):
-        output = self.searcher(input_ids, return_dict=True, **kwargs)
+    def retrieve(self, question):
+        question_ids = self.tokenizer(
+            [question],
+            padding=True,
+            truncation=True,
+            max_length=self.config.searcher_seq_len,
+            return_tensors="pt",
+        ).to(self.searcher.device)
+
+        output = self.searcher(**question_ids, return_dict=True)
         return output
 
-    def read(self, searcher_output, question, answers):
-        def block_has_answer(concat_inputs, answers):
+    def read(self, searcher_output, question, answer_ids):
+        def block_has_answer(concat_inputs, answer_ids):
             """check if retrieved_blocks has answers."""
             has_answers = []
             start_pos = []
@@ -1843,32 +1851,17 @@ def block_has_answer(concat_inputs, answers):
             max_answers = 0
 
             for input_id in concat_inputs.input_ids:
-                pass_sep = False
-                answer_pos = 0
-                start = -1
                 start_pos.append([])
                 end_pos.append([])
-                for answer in answers:
-                    for idx, id in enumerate(input_id):
-                        if id == self.tokenizer.sep_token_id:
-                            pass_sep = True
-                        if not pass_sep:
-                            continue
-                        if answer[answer_pos] == id:
-                            if start == -1:
-                                start = idx
-                            if answer_pos == len(answer) - 1:
-                                start_pos[-1].append(start)
-                                end_pos[-1].append(idx)
-                                answer_pos = 0
-                                start = -1
-                                break
-                            else:
-                                answer_pos += 1
-                        else:
-                            answer_pos = 0
-                            start = -1
-
+                input_id = input_id.tolist()
+                sep_idx = input_id.index(self.tokenizer.sep_token_id)
+                for answer in answer_ids:
+                    for idx in range(sep_idx, len(input_id)):
+                        if answer[0] == input_id[idx]:
+                            if input_id[idx : idx + len(answer)] == answer:
+                                start_pos[-1].append(idx)
+                                end_pos[-1].append(idx + len(answer) - 1)
+
                 if len(start_pos[-1]) == 0:
                     has_answers.append(False)
                 else:
@@ -1878,17 +1871,15 @@ def block_has_answer(concat_inputs, answers):
 
             # Pad -1 to max_answers
             for start_pos_, end_pos_ in zip(start_pos, end_pos):
-                while len(start_pos_) < max_answers:
-                    start_pos_.append(-1)
-                while len(end_pos_) < max_answers:
-                    end_pos_.append(-1)
-
-            assert len(has_answers) == len(start_pos) == len(end_pos)
+                if len(start_pos_) < max_answers:
+                    padded = [-1] * (max_answers - len(start_pos_))
+                    start_pos_ += padded
+                    end_pos_ += padded
 
             return (
-                torch.tensor(has_answers, dtype=torch.bool, device=concat_inputs.input_ids.device),
-                torch.tensor(start_pos, dtype=torch.int64, device=concat_inputs.input_ids.device),
-                torch.tensor(end_pos, dtype=torch.int64, device=concat_inputs.input_ids.device),
+                torch.tensor(has_answers, dtype=torch.bool),
+                torch.tensor(start_pos, dtype=torch.int64),
+                torch.tensor(end_pos, dtype=torch.int64),
             )
 
         text = []
@@ -1899,12 +1890,15 @@ def block_has_answer(concat_inputs, answers):
 
         concat_inputs = self.tokenizer(
             text, text_pair, padding=True, truncation=True, max_length=self.config.reader_seq_len, return_tensors="pt"
-        )
+        ).to(self.reader.device)
 
-        if answers is not None:
+        if answer_ids is not None:
             has_answers, start_positions, end_positions = block_has_answer(
-                concat_inputs.to(searcher_output.retrieved_logits.device), answers
+                concat_inputs, answer_ids
             )
+            has_answers = has_answers.to(self.reader.device)
+            start_positions = start_positions.to(self.reader.device)
+            end_positions = end_positions.to(self.reader.device)
         else:
             has_answers, start_positions, end_positions = (None, None, None)
 
@@ -1948,15 +1942,11 @@ def forward(self, question, answer_ids=None, return_dict=None):
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        question_ids = self.tokenizer(
-            [question], padding=True, truncation=True, max_length=self.config.searcher_seq_len, return_tensors="pt"
-        )
-
-        searcher_output = self.retrieve(**question_ids)
+        searcher_output = self.retrieve(question)
 
         reader_output, predicted_answer = self.read(searcher_output, question, answer_ids)
 
-        if return_dict:
+        if not return_dict:
             return searcher_output, reader_output, predicted_answer
 
         return RealmForOpenQAOutput(

From b86139b8be7d70b75757639f0c66a76707e1c4de Mon Sep 17 00:00:00 2001
From: Patrick von Platen 
Date: Wed, 29 Dec 2021 12:38:06 +0100
Subject: [PATCH 62/98] Update src/transformers/__init__.py

---
 src/transformers/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index fd9b3dd6946f..6cb31bd9c842 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -141,7 +141,6 @@
         "is_tensorboard_available",
         "is_wandb_available",
     ],
-    "keras_callbacks": [],
     "modelcard": ["ModelCard"],
     "modeling_tf_pytorch_utils": [
         "convert_tf_weight_name_to_pt_weight_name",

From 581c20c4602e5be77c5aa879e07693b957107274 Mon Sep 17 00:00:00 2001
From: Patrick von Platen 
Date: Wed, 29 Dec 2021 12:31:54 +0000
Subject: [PATCH 63/98] adapt testing

---
 tests/test_modeling_realm.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py
index 2592f6ffe478..d6916f98b669 100644
--- a/tests/test_modeling_realm.py
+++ b/tests/test_modeling_realm.py
@@ -477,10 +477,12 @@ def test_inference_encoder(self):
     @slow
     def test_inference_open_qa(self):
         # TODO: TF record dataset
+        config = RealmConfig(use_scann=False)
         model = RealmForOpenQA.from_pretrained(
             r"qqaatw/realm-orqa-nq-searcher",
             r"qqaatw/realm-orqa-nq-reader",
             BLOCK_RECORDS_PATH,
+            config=config,
        )
 
         question = "Who is the pioneer in modern computer science?" 
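
The span matcher rewritten in [PATCH 61/98] above is easier to follow outside the model. The sketch below reproduces the same logic with plain Python lists; the token ids and the `[SEP]` id are invented for illustration and are not part of the patch.

```python
SEP_TOKEN_ID = 102  # assumed [SEP] id, as in BERT-style vocabularies


def find_answer_spans(batch_input_ids, answer_ids, sep_token_id=SEP_TOKEN_ID):
    """Locate every answer occurrence after the first [SEP] of each sequence,
    then pad the positions with -1 so all rows have equal length."""
    has_answers, start_pos, end_pos = [], [], []
    for input_id in batch_input_ids:
        starts, ends = [], []
        sep_idx = input_id.index(sep_token_id)
        for answer in answer_ids:
            for idx in range(sep_idx, len(input_id) - len(answer) + 1):
                if input_id[idx : idx + len(answer)] == answer:
                    starts.append(idx)
                    ends.append(idx + len(answer) - 1)
        has_answers.append(bool(starts))
        start_pos.append(starts)
        end_pos.append(ends)

    # Pad -1 to max_answers, exactly as the patch does.
    max_answers = max((len(s) for s in start_pos), default=0)
    for starts, ends in zip(start_pos, end_pos):
        padding = [-1] * (max_answers - len(starts))
        starts += padding
        ends += padding
    return has_answers, start_pos, end_pos


# Block 0 contains the two-token answer [42, 43] after [SEP]; block 1 does not.
batch = [
    [7, 8, 9, SEP_TOKEN_ID, 11, 42, 43, 12],
    [7, 8, 9, SEP_TOKEN_ID, 13, 14, 15, 16],
]
print(find_answer_spans(batch, answer_ids=[[42, 43]]))
# ([True, False], [[5], [-1]], [[6], [-1]])
```

The -1 padding is what lets `block_has_answer` stack the positions into rectangular `torch.int64` tensors even when blocks contain different numbers of answer spans.
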

From c39b31f9488c6f48c067d36231779687c61bfb02 Mon Sep 17 00:00:00 2001
From: Patrick von Platen 
Date: Wed, 29 Dec 2021 15:41:12 +0000
Subject: [PATCH 64/98] change modeling code

---
 .../models/realm/modeling_realm.py            | 146 +++++++++---------
 .../models/realm/retrieval_realm.py           | 129 ++++++++++++++++
 2 files changed, 201 insertions(+), 74 deletions(-)
 create mode 100644 src/transformers/models/realm/retrieval_realm.py

diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index f179fbc3bd64..826b128e70ba 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -1830,57 +1830,77 @@ def save_pretrained(self, save_directory):
         self.searcher.save_pretrained(save_directory)
         self.reader.save_pretrained(save_directory)
 
-    def retrieve(self, question):
+    def block_has_answer(self, concat_inputs, answer_ids):
+        """Check whether the retrieved blocks contain the answers."""
+        has_answers = []
+        start_pos = []
+        end_pos = []
+        max_answers = 0
+
+        for input_id in concat_inputs.input_ids:
+            start_pos.append([])
+            end_pos.append([])
+            input_id = input_id.tolist()
+            sep_idx = input_id.index(self.tokenizer.sep_token_id)
+            for answer in answer_ids:
+                for idx in range(sep_idx, len(input_id)):
+                    if answer[0] == input_id[idx]:
+                        if input_id[idx : idx + len(answer)] == answer:
+                            start_pos[-1].append(idx)
+                            end_pos[-1].append(idx + len(answer) - 1)
+
+            if len(start_pos[-1]) == 0:
+                has_answers.append(False)
+            else:
+                has_answers.append(True)
+                if len(start_pos[-1]) > max_answers:
+                    max_answers = len(start_pos[-1])
+
+        # Pad -1 to max_answers
+        for start_pos_, end_pos_ in zip(start_pos, end_pos):
+            if len(start_pos_) < max_answers:
+                padded = [-1] * (max_answers - len(start_pos_))
+                start_pos_ += padded
+                end_pos_ += padded
+
+        return (
+            torch.tensor(has_answers, dtype=torch.bool),
+            torch.tensor(start_pos, dtype=torch.int64),
+            torch.tensor(end_pos, dtype=torch.int64),
+        )
+
+    @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING)
+    @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(self, question, answer_ids=None, return_dict=None):
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import RealmForOpenQA, RealmTokenizer
+
+        >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", "blocks.tfr")
+        >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-searcher")
+
+        >>> question = "Who is the pioneer in modern computer science?" 
+ >>> answer_ids = tokenizer(["alan mathison turing"], add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False).input_ids + + >>> searcher_output, reader_output, predicted_answer = model(question, answer_ids) + >>> loss = reader_output.loss + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + question_ids = self.tokenizer( - [question], - padding=True, - truncation=True, + [question], + padding=True, + truncation=True, max_length=self.config.searcher_seq_len, return_tensors="pt", ).to(self.searcher.device) - output = self.searcher(**question_ids, return_dict=True) - return output - - def read(self, searcher_output, question, answer_ids): - def block_has_answer(concat_inputs, answer_ids): - """check if retrieved_blocks has answers.""" - has_answers = [] - start_pos = [] - end_pos = [] - max_answers = 0 - - for input_id in concat_inputs.input_ids: - start_pos.append([]) - end_pos.append([]) - input_id = input_id.tolist() - sep_idx = input_id.index(self.tokenizer.sep_token_id) - for answer in answer_ids: - for idx in range(sep_idx, len(input_id)): - if answer[0] == input_id[idx]: - if input_id[idx: idx + len(answer)] == answer: - start_pos[-1].append(idx) - end_pos[-1].append(idx + len(answer)-1) - - if len(start_pos[-1]) == 0: - has_answers.append(False) - else: - has_answers.append(True) - if len(start_pos[-1]) > max_answers: - max_answers = len(start_pos[-1]) - - # Pad -1 to max_answers - for start_pos_, end_pos_ in zip(start_pos, end_pos): - if len(start_pos_) < max_answers: - padded = [-1] * (max_answers - len(start_pos_)) - start_pos_ += padded - end_pos_ += padded - - return ( - torch.tensor(has_answers, dtype=torch.bool), - torch.tensor(start_pos, dtype=torch.int64), - torch.tensor(end_pos, dtype=torch.int64), - ) + searcher_output = self.searcher(**question_ids, return_dict=True) text = [] text_pair = [] @@ -1892,8 +1912,9 @@ def block_has_answer(concat_inputs, answer_ids): text, text_pair, padding=True, truncation=True, max_length=self.config.reader_seq_len, return_tensors="pt" ).to(self.reader.device) + # concat inputs should come from the retriever here if answer_ids is not None: - has_answers, start_positions, end_positions = block_has_answer( + has_answers, start_positions, end_positions = self.block_has_answer( concat_inputs, answer_ids ) has_answers = has_answers.to(self.reader.device) @@ -1913,38 +1934,15 @@ def block_has_answer(concat_inputs, answer_ids): return_dict=True, ) + # this will by handled by the retriever decode method answer = self.tokenizer.decode( concat_inputs.input_ids[output.block_idx][output.start_pos : output.end_pos + 1] ) - return output, answer - - @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING) - @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC) - def forward(self, question, answer_ids=None, return_dict=None): - r""" - Returns: - - Example: - - ```python - >>> import torch - >>> from transformers import RealmForOpenQA, RealmTokenizer - - >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", blocks.tfr) - - >>> question = "Who is the pioneer in modern computer science?" 
- >>> answer_ids = tokenizer(["alan mathison turing"], add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False).input_ids - - >>> searcher_output, reader_output, predicted_answer = model(question, answer_ids) - >>> loss = reader_output.loss - ```""" - - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - searcher_output = self.retrieve(question) + reader_output, predicted_answer = output, answer + import ipdb; ipdb.set_trace() - reader_output, predicted_answer = self.read(searcher_output, question, answer_ids) + # this will by handled by the retriever decode method if not return_dict: return searcher_output, reader_output, predicted_answer diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py new file mode 100644 index 000000000000..f76bc6b17f3b --- /dev/null +++ b/src/transformers/models/realm/retrieval_realm.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# Copyright 2020, The RAG Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""RAG Retriever model implementation.""" + +import os +import pickle +import time +from typing import Iterable, List, Optional, Tuple + +import numpy as np + +from ...file_utils import cached_path, is_datasets_available, is_faiss_available, is_remote_url, requires_backends +from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_utils_base import BatchEncoding +from ...utils import logging +from .configuration_rag import RagConfig +from .tokenization_rag import RagTokenizer + + +if is_datasets_available(): + from datasets import Dataset, load_dataset, load_from_disk + +if is_faiss_available(): + import faiss + + +logger = logging.get_logger(__name__) + + +LEGACY_INDEX_PATH = "https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/" + + + +class RealmRetriever: + + def __init__(self, config, tokenizer, index=None): + super().__init__() + self.tokenizer = tokenizer + self.index + + @classmethod + def from_pretrained(cls, retriever_name_or_path, indexed_dataset=None, **kwargs): + config = kwargs.pop("config", None) or RealmConfig.from_pretrained(retriever_name_or_path, **kwargs) + tokenizer = RealmTokenizer.from_pretrained(retriever_name_or_path, config=config) + + # logic to load tf.records (should probs put it in `datasets`) + index = None + + return cls( + config, + tokenizer=tokenizer, + index=index, + ) + + def save_pretrained(self, save_directory): + # save index here + + self.config.save_pretrained(save_directory) + self.tokenizer.save_pretrained(save_directory) + + def __call__( + self, + question_input_ids: List[List[int]], + question_hidden_states: np.ndarray, + prefix=None, + n_docs=None, + return_tensors=None, + ) -> BatchEncoding: + + n_docs = n_docs if n_docs is not None else self.n_docs + prefix = prefix if prefix is not None else self.config.generator.prefix + retrieved_doc_embeds, doc_ids, docs = self.retrieve(question_hidden_states, n_docs) + + input_strings = 
From bb78ce5cbc238d16025d3cf738f7e8bceffdc21d Mon Sep 17 00:00:00 2001
From: Patrick von Platen
Date: Wed, 29 Dec 2021 16:10:29 +0000
Subject: [PATCH 65/98] fix test

---
 src/transformers/models/realm/modeling_realm.py | 1 -
 tests/test_modeling_realm.py                    | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 826b128e70ba..3d0426e12548 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -1940,7 +1940,6 @@ def forward(self, question, answer_ids=None, return_dict=None):
         )

         reader_output, predicted_answer = output, answer
-        import ipdb; ipdb.set_trace()

         # this will be handled by the retriever decode method

diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py
index d6916f98b669..da82ef206447 100644
--- a/tests/test_modeling_realm.py
+++ b/tests/test_modeling_realm.py
@@ -486,7 +486,7 @@ def test_inference_open_qa(self):
         )

         question = "Who is the pioneer in modern computer science?"
- searcher_output, reader_output, predicted_answer = model(question) + predicted_answer = model(question).predicted_answer self.assertEqual(predicted_answer, "alan mathison turing") From 851d9ea4cdd333e9ae3bc421e0a526a370b3c33f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 29 Dec 2021 17:24:58 +0000 Subject: [PATCH 66/98] up --- .../models/realm/modeling_realm.py | 128 +++++++++++++++++- 1 file changed, 121 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 3d0426e12548..616deedc0470 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1793,17 +1793,67 @@ def mask_to_score(mask): """ +class RealmSearcherWrapper(RealmPreTrainedModel): + + def __init__(self, config): + super().__init__(config) + self.embedder = RealmEmbedder(config) + self.register_buffer( + "block_emb", + torch.zeros(()).new_empty( + size=(config.num_block_records, config.retriever_proj_size), + dtype=torch.float32, + device=torch.device("cpu"), + ), + ) + + @add_start_docstrings( "A wrapper of `RealmSearcher` and `RealmReader` providing end-to-end open domain question answering.", REALM_START_DOCSTRING, ) class RealmForOpenQA(RealmPreTrainedModel): - def __init__(self, config, searcher, reader, tokenizer): + def __init__(self, config, searcher, reader, tokenizer, block_records_path): super().__init__(config) - self.searcher = searcher self.reader = reader self.tokenizer = tokenizer + self.embedder = searcher.embedder + self.block_emb = searcher.block_emb + + self.block_records = convert_tfrecord_to_np( + block_records_path=block_records_path, + num_block_records=config.num_block_records, + ) + if config.use_scann: + try: + import scann # noqa: F401 + except ImportError: + raise ImportError( + "RealmSearcher requires ScaNN to retrieve documents from the corpus." + "Please install it through `pip install scann`." 
+ ) + +# if self.training: +# beam_size = self.config.searcher_beam_size +# else: + beam_size = self.config.reader_beam_size + + if self.config.use_scann and self.block_emb.device != torch.device("cpu"): + self.block_emb = self.block_emb.cpu() + + if self.config.use_scann: + self.retriever = ScaNNSearcher( + db=self.block_emb, + num_neighbors=beam_size, + ) + else: + self.retriever = BruteForceSearcher( + db=self.block_emb, + num_neighbors=beam_size, + ) + self.init_weights() + @classmethod def from_pretrained( cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, block_records_path, **kwargs @@ -1819,12 +1869,12 @@ def from_pretrained( """ config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) - searcher = RealmSearcher.from_pretrained( - searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs + searcher = RealmSearcherWrapper.from_pretrained( + searcher_pretrained_name_or_path, config=config, **kwargs ) reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path) - return cls(config, searcher, reader, tokenizer) + return cls(config, searcher, reader, tokenizer, block_records_path) def save_pretrained(self, save_directory): self.searcher.save_pretrained(save_directory) @@ -1869,6 +1919,70 @@ def block_has_answer(self, concat_inputs, answer_ids): torch.tensor(end_pos, dtype=torch.int64), ) + def search( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is not None and input_ids.shape[0] != 1) or ( + inputs_embeds is not None and inputs_embeds.shape[0] != 1 + ): + raise ValueError("The batch_size of the inputs must be 1.") + + question_outputs = self.embedder( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # [1, projection_size] + question_projection = question_outputs[0] + + # [1, searcher_beam_size] + retrieved_block_ids = self.retriever.search_batched(question_projection) + + # [searcher_beam_size] + retrieved_block_ids = retrieved_block_ids.squeeze() + + # [searcher_beam_size] + retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) + + # [searcher_beam_size, projection_size] + retrieved_block_emb = torch.index_select( + self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) + ) + + # [searcher_beam_size] + retrieved_logits = torch.einsum( + "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device) + ) + + if not return_dict: + return (retrieved_logits, retrieved_blocks, retrieved_block_ids) + + return RealmSearcherOutput( + retrieved_logits=retrieved_logits, + retrieved_blocks=retrieved_blocks, + retrieved_block_ids=retrieved_block_ids, + ) + @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING) @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC) def forward(self, question, answer_ids=None, return_dict=None): @@ -1898,9 +2012,9 @@ def 
forward(self, question, answer_ids=None, return_dict=None): truncation=True, max_length=self.config.searcher_seq_len, return_tensors="pt", - ).to(self.searcher.device) + ).to(self.device) - searcher_output = self.searcher(**question_ids, return_dict=True) + searcher_output = self.search(**question_ids, return_dict=True) text = [] text_pair = [] From 8e340a12c510e4747e3628d2c2608ae20bc73334 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 29 Dec 2021 17:45:55 +0000 Subject: [PATCH 67/98] up --- .../models/realm/modeling_realm.py | 164 ++---------------- 1 file changed, 13 insertions(+), 151 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 616deedc0470..afb84b2b6490 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1489,129 +1489,6 @@ def forward( ) -@add_start_docstrings( - "The searcher of REALM outputting relevance score (before softmax) and corresponding document blocks.", - REALM_START_DOCSTRING, -) -class RealmSearcher(RealmPreTrainedModel): - r""" - Args: - block_records_path (`str`): - Block records path. - """ - - def __init__(self, config, block_records_path): - super().__init__(config) - self.embedder = RealmEmbedder(config) - self.searcher = None - self.block_records = convert_tfrecord_to_np( - block_records_path=block_records_path, - num_block_records=config.num_block_records, - ) - self.register_buffer( - "block_emb", - torch.zeros(()).new_empty( - size=(config.num_block_records, config.retriever_proj_size), - dtype=torch.float32, - device=torch.device("cpu"), - ), - ) - if config.use_scann: - try: - import scann # noqa: F401 - except ImportError: - raise ImportError( - "RealmSearcher requires ScaNN to retrieve documents from the corpus." - "Please install it through `pip install scann`." 
- ) - self.init_weights() - - @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("1, searcher_seq_len")) - @replace_return_docstrings(output_type=RealmSearcherOutput, config_class=_CONFIG_FOR_DOC) - def forward( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Returns: - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is not None and input_ids.shape[0] != 1) or ( - inputs_embeds is not None and inputs_embeds.shape[0] != 1 - ): - raise ValueError("The batch_size of the inputs must be 1.") - - if self.training: - beam_size = self.config.searcher_beam_size - else: - beam_size = self.config.reader_beam_size - - if self.config.use_scann and self.block_emb.device != torch.device("cpu"): - self.block_emb = self.block_emb.cpu() - if self.searcher is None: - if self.config.use_scann: - self.searcher = ScaNNSearcher( - db=self.block_emb, - num_neighbors=beam_size, - ) - else: - self.searcher = BruteForceSearcher( - db=self.block_emb, - num_neighbors=beam_size, - ) - - question_outputs = self.embedder( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # [1, projection_size] - question_projection = question_outputs[0] - - # [1, searcher_beam_size] - retrieved_block_ids = self.searcher.search_batched(question_projection) - - # [searcher_beam_size] - retrieved_block_ids = retrieved_block_ids.squeeze() - - # [searcher_beam_size] - retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) - - # [searcher_beam_size, projection_size] - retrieved_block_emb = torch.index_select( - self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) - ) - - # [searcher_beam_size] - retrieved_logits = torch.einsum( - "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device) - ) - - if not return_dict: - return (retrieved_logits, retrieved_blocks, retrieved_block_ids) - - return RealmSearcherOutput( - retrieved_logits=retrieved_logits, - retrieved_blocks=retrieved_blocks, - retrieved_block_ids=retrieved_block_ids, - ) - - @add_start_docstrings( "The reader of REALM.", REALM_START_DOCSTRING, @@ -1793,7 +1670,7 @@ def mask_to_score(mask): """ -class RealmSearcherWrapper(RealmPreTrainedModel): +class RealmSearcher(RealmPreTrainedModel): def __init__(self, config): super().__init__(config) @@ -1806,6 +1683,7 @@ def __init__(self, config): device=torch.device("cpu"), ), ) + self.init_weights() @add_start_docstrings( @@ -1825,33 +1703,7 @@ def __init__(self, config, searcher, reader, tokenizer, block_records_path): block_records_path=block_records_path, num_block_records=config.num_block_records, ) - if config.use_scann: - try: - import scann # noqa: F401 - except ImportError: - raise ImportError( - "RealmSearcher requires ScaNN to retrieve documents from the corpus." - "Please install it through `pip install scann`." 
- ) - -# if self.training: -# beam_size = self.config.searcher_beam_size -# else: - beam_size = self.config.reader_beam_size - - if self.config.use_scann and self.block_emb.device != torch.device("cpu"): - self.block_emb = self.block_emb.cpu() - if self.config.use_scann: - self.retriever = ScaNNSearcher( - db=self.block_emb, - num_neighbors=beam_size, - ) - else: - self.retriever = BruteForceSearcher( - db=self.block_emb, - num_neighbors=beam_size, - ) self.init_weights() @classmethod @@ -1869,7 +1721,7 @@ def from_pretrained( """ config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) - searcher = RealmSearcherWrapper.from_pretrained( + searcher = RealmSearcher.from_pretrained( searcher_pretrained_name_or_path, config=config, **kwargs ) reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) @@ -1936,6 +1788,16 @@ def search( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if self.training: + beam_size = self.config.searcher_beam_size + else: + beam_size = self.config.reader_beam_size + + self.retriever = BruteForceSearcher( + db=self.block_emb, + num_neighbors=beam_size, + ) + if (input_ids is not None and input_ids.shape[0] != 1) or ( inputs_embeds is not None and inputs_embeds.shape[0] != 1 ): From 620ac3696f82e4633a5e2d62c002ce8dc003dd5a Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 29 Dec 2021 17:50:19 +0000 Subject: [PATCH 68/98] up --- .../models/realm/modeling_realm.py | 169 +++++++++--------- 1 file changed, 85 insertions(+), 84 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index afb84b2b6490..47d8e474e7f0 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1672,9 +1672,15 @@ def mask_to_score(mask): class RealmSearcher(RealmPreTrainedModel): - def __init__(self, config): + def __init__(self, config, block_records_path): super().__init__(config) self.embedder = RealmEmbedder(config) + self.searcher = None + self.block_records = convert_tfrecord_to_np( + block_records_path=block_records_path, + num_block_records=config.num_block_records, + ) + self.register_buffer( "block_emb", torch.zeros(()).new_empty( @@ -1685,13 +1691,87 @@ def __init__(self, config): ) self.init_weights() + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.training: + beam_size = self.config.searcher_beam_size + else: + beam_size = self.config.reader_beam_size + + self.searcher = BruteForceSearcher( + db=self.block_emb, + num_neighbors=beam_size, + ) + + if (input_ids is not None and input_ids.shape[0] != 1) or ( + inputs_embeds is not None and inputs_embeds.shape[0] != 1 + ): + raise ValueError("The batch_size of the inputs must be 1.") + + question_outputs = self.embedder( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # [1, projection_size] + question_projection = question_outputs[0] + + # [1, 
searcher_beam_size] + retrieved_block_ids = self.searcher.search_batched(question_projection) + + # [searcher_beam_size] + retrieved_block_ids = retrieved_block_ids.squeeze() + + # [searcher_beam_size] + retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) + + # [searcher_beam_size, projection_size] + retrieved_block_emb = torch.index_select( + self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) + ) + + # [searcher_beam_size] + retrieved_logits = torch.einsum( + "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device) + ) + + if not return_dict: + return (retrieved_logits, retrieved_blocks, retrieved_block_ids) + + return RealmSearcherOutput( + retrieved_logits=retrieved_logits, + retrieved_blocks=retrieved_blocks, + retrieved_block_ids=retrieved_block_ids, + ) + @add_start_docstrings( "A wrapper of `RealmSearcher` and `RealmReader` providing end-to-end open domain question answering.", REALM_START_DOCSTRING, ) class RealmForOpenQA(RealmPreTrainedModel): - def __init__(self, config, searcher, reader, tokenizer, block_records_path): + def __init__(self, config, searcher, reader, tokenizer): super().__init__(config) self.reader = reader self.tokenizer = tokenizer @@ -1699,11 +1779,6 @@ def __init__(self, config, searcher, reader, tokenizer, block_records_path): self.embedder = searcher.embedder self.block_emb = searcher.block_emb - self.block_records = convert_tfrecord_to_np( - block_records_path=block_records_path, - num_block_records=config.num_block_records, - ) - self.init_weights() @classmethod @@ -1722,11 +1797,11 @@ def from_pretrained( """ config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) searcher = RealmSearcher.from_pretrained( - searcher_pretrained_name_or_path, config=config, **kwargs + searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs ) reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path) - return cls(config, searcher, reader, tokenizer, block_records_path) + return cls(config, searcher, reader, tokenizer) def save_pretrained(self, save_directory): self.searcher.save_pretrained(save_directory) @@ -1771,80 +1846,6 @@ def block_has_answer(self, concat_inputs, answer_ids): torch.tensor(end_pos, dtype=torch.int64), ) - def search( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Returns: - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if self.training: - beam_size = self.config.searcher_beam_size - else: - beam_size = self.config.reader_beam_size - - self.retriever = BruteForceSearcher( - db=self.block_emb, - num_neighbors=beam_size, - ) - - if (input_ids is not None and input_ids.shape[0] != 1) or ( - inputs_embeds is not None and inputs_embeds.shape[0] != 1 - ): - raise ValueError("The batch_size of the inputs must be 1.") - - question_outputs = self.embedder( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # [1, projection_size] - 
question_projection = question_outputs[0] - - # [1, searcher_beam_size] - retrieved_block_ids = self.retriever.search_batched(question_projection) - - # [searcher_beam_size] - retrieved_block_ids = retrieved_block_ids.squeeze() - - # [searcher_beam_size] - retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) - - # [searcher_beam_size, projection_size] - retrieved_block_emb = torch.index_select( - self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) - ) - - # [searcher_beam_size] - retrieved_logits = torch.einsum( - "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device) - ) - - if not return_dict: - return (retrieved_logits, retrieved_blocks, retrieved_block_ids) - - return RealmSearcherOutput( - retrieved_logits=retrieved_logits, - retrieved_blocks=retrieved_blocks, - retrieved_block_ids=retrieved_block_ids, - ) - @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING) @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC) def forward(self, question, answer_ids=None, return_dict=None): @@ -1876,7 +1877,7 @@ def forward(self, question, answer_ids=None, return_dict=None): return_tensors="pt", ).to(self.device) - searcher_output = self.search(**question_ids, return_dict=True) + searcher_output = self.searcher(**question_ids, return_dict=True) text = [] text_pair = [] From 9708e44bbec3e65f489ed80fc0cc597c64019d3b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 29 Dec 2021 17:59:32 +0000 Subject: [PATCH 69/98] correct more --- .../models/realm/modeling_realm.py | 113 +++++++++--------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 47d8e474e7f0..993a8041d6eb 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1673,9 +1673,9 @@ def mask_to_score(mask): class RealmSearcher(RealmPreTrainedModel): def __init__(self, config, block_records_path): + # TODO(PVP) - this class has to be removed super().__init__(config) self.embedder = RealmEmbedder(config) - self.searcher = None self.block_records = convert_tfrecord_to_np( block_records_path=block_records_path, num_block_records=config.num_block_records, @@ -1691,7 +1691,60 @@ def __init__(self, config, block_records_path): ) self.init_weights() - def forward( + +@add_start_docstrings( + "A wrapper of `RealmSearcher` and `RealmReader` providing end-to-end open domain question answering.", + REALM_START_DOCSTRING, +) +class RealmForOpenQA(RealmPreTrainedModel): + def __init__(self, config, searcher, reader, tokenizer): + super().__init__(config) + self.reader = reader + self.tokenizer = tokenizer + + self.embedder = searcher.embedder + self.block_records = searcher.block_records + self.block_emb = searcher.block_emb + +# if self.training: +# beam_size = self.config.searcher_beam_size +# else: + beam_size = self.config.reader_beam_size + + self.retriever = BruteForceSearcher( + db=self.block_emb, + num_neighbors=beam_size, + ) + + # TODO(PVP) - init should be here + + @classmethod + def from_pretrained( + cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, block_records_path, **kwargs + ): + """ + Args: + searcher_pretrained_name_or_path (`str`): + Searcher pretrained name or path. + reader_pretrained_name_or_path (`str`): + Reader pretrained name or path. 
+ block_records_path (`str`): + Block records path. + + """ + config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) + searcher = RealmSearcher.from_pretrained( + searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs + ) + reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) + tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path) + return cls(config, searcher, reader, tokenizer) + + def save_pretrained(self, save_directory): + self.searcher.save_pretrained(save_directory) + self.reader.save_pretrained(save_directory) + + def search( self, input_ids=None, attention_mask=None, @@ -1708,16 +1761,6 @@ def forward( """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if self.training: - beam_size = self.config.searcher_beam_size - else: - beam_size = self.config.reader_beam_size - - self.searcher = BruteForceSearcher( - db=self.block_emb, - num_neighbors=beam_size, - ) - if (input_ids is not None and input_ids.shape[0] != 1) or ( inputs_embeds is not None and inputs_embeds.shape[0] != 1 ): @@ -1738,7 +1781,7 @@ def forward( question_projection = question_outputs[0] # [1, searcher_beam_size] - retrieved_block_ids = self.searcher.search_batched(question_projection) + retrieved_block_ids = self.retriever.search_batched(question_projection) # [searcher_beam_size] retrieved_block_ids = retrieved_block_ids.squeeze() @@ -1765,48 +1808,6 @@ def forward( retrieved_block_ids=retrieved_block_ids, ) - -@add_start_docstrings( - "A wrapper of `RealmSearcher` and `RealmReader` providing end-to-end open domain question answering.", - REALM_START_DOCSTRING, -) -class RealmForOpenQA(RealmPreTrainedModel): - def __init__(self, config, searcher, reader, tokenizer): - super().__init__(config) - self.reader = reader - self.tokenizer = tokenizer - - self.embedder = searcher.embedder - self.block_emb = searcher.block_emb - - self.init_weights() - - @classmethod - def from_pretrained( - cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, block_records_path, **kwargs - ): - """ - Args: - searcher_pretrained_name_or_path (`str`): - Searcher pretrained name or path. - reader_pretrained_name_or_path (`str`): - Reader pretrained name or path. - block_records_path (`str`): - Block records path. 
- - """ - config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) - searcher = RealmSearcher.from_pretrained( - searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs - ) - reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) - tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path) - return cls(config, searcher, reader, tokenizer) - - def save_pretrained(self, save_directory): - self.searcher.save_pretrained(save_directory) - self.reader.save_pretrained(save_directory) - def block_has_answer(self, concat_inputs, answer_ids): """check if retrieved_blocks has answers.""" has_answers = [] @@ -1877,7 +1878,7 @@ def forward(self, question, answer_ids=None, return_dict=None): return_tensors="pt", ).to(self.device) - searcher_output = self.searcher(**question_ids, return_dict=True) + searcher_output = self.search(**question_ids, return_dict=True) text = [] text_pair = [] From 818718c11b65a2d42a884fe3afc1fa0f6a8d6489 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 30 Dec 2021 12:27:58 +0000 Subject: [PATCH 70/98] make retriever work --- docs/source/model_doc/realm.rst | 4 +- src/transformers/__init__.py | 4 +- src/transformers/models/realm/__init__.py | 4 +- .../models/realm/configuration_realm.py | 8 +- .../models/realm/modeling_realm.py | 184 ++++++----------- .../models/realm/retrieval_realm.py | 185 +++++++++--------- src/transformers/models/realm/utils_realm.py | 30 +-- src/transformers/utils/dummy_pt_objects.py | 2 +- tests/test_modeling_realm.py | 18 +- utils/check_repo.py | 4 +- 10 files changed, 195 insertions(+), 248 deletions(-) diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index b8fcf5b2d53f..9b58eaaaecdd 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -61,10 +61,10 @@ RealmEmbedder :members: forward -RealmRetriever +RealmScorer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -.. autoclass:: transformers.RealmRetriever +.. 
autoclass:: transformers.RealmScorer :members: forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 6cb31bd9c842..698aca8568a4 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1185,7 +1185,7 @@ "RealmKnowledgeAugEncoder", "RealmPreTrainedModel", "RealmReader", - "RealmRetriever", + "RealmScorer", "RealmSearcher", "load_tf_weights_in_realm", ] @@ -3097,7 +3097,7 @@ RealmKnowledgeAugEncoder, RealmPreTrainedModel, RealmReader, - RealmRetriever, + RealmScorer, RealmSearcher, load_tf_weights_in_realm, ) diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index 94dcc2e1e5fa..7017470a85d1 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -34,7 +34,7 @@ "RealmKnowledgeAugEncoder", "RealmPreTrainedModel", "RealmReader", - "RealmRetriever", + "RealmScorer", "RealmSearcher", "load_tf_weights_in_realm", ] @@ -52,7 +52,7 @@ RealmKnowledgeAugEncoder, RealmPreTrainedModel, RealmReader, - RealmRetriever, + RealmScorer, RealmSearcher, load_tf_weights_in_realm, ) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 6d6705966c68..f8a83d8a539d 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -35,7 +35,7 @@ class RealmConfig(PretrainedConfig): This is the configuration class to store the configuration of 1. [`RealmEmbedder`] - 2. [`RealmRetriever`] + 2. [`RealmScorer`] 3. [`RealmKnowledgeAugEncoder`] 4. [`RealmSearcher`] 5. [`RealmReader`] @@ -53,7 +53,7 @@ class RealmConfig(PretrainedConfig): vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`RealmEmbedder`], - [`RealmRetriever`], [`RealmKnowledgeAugEncoder`], + [`RealmScorer`], [`RealmKnowledgeAugEncoder`], [`RealmSearcher`], or [`RealmReader`]. hidden_size (`int`, *optional*, defaults to 768): Dimension of the encoder layers and the pooler layer. @@ -64,7 +64,7 @@ class RealmConfig(PretrainedConfig): num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. num_candidates (`int`, *optional*, defaults to 8): - Number of candidates inputted to the RealmRetriever or RealmKnowledgeAugEncoder. + Number of candidates inputted to the RealmScorer or RealmKnowledgeAugEncoder. intermediate_size (`int`, *optional*, defaults to 3072): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): @@ -79,7 +79,7 @@ class RealmConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): The vocabulary size of the `token_type_ids` passed when calling [`RealmEmbedder`], - [`RealmRetriever`], [`RealmKnowledgeAugEncoder`], + [`RealmScorer`], [`RealmKnowledgeAugEncoder`], [`RealmSearcher`], or [`RealmReader`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
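[Editor's note] The `RealmRetriever` → `RealmScorer` rename in this patch is purely a naming change, so it may help to restate what the renamed module computes: one inner product per (query, candidate) pair, before any softmax. A minimal sketch with random stand-in embeddings (all names and sizes below are illustrative, not taken from the checkpoints):

```python
import torch

batch_size, num_candidates, proj_size = 2, 8, 128
query_score = torch.randn(batch_size, proj_size)                      # [batch, proj]
candidate_score = torch.randn(batch_size, num_candidates, proj_size)  # [batch, cand, proj]

# One dot product per (query, candidate) pair; a softmax over candidates is
# applied downstream wherever the score is consumed as a distribution.
relevance_score = torch.einsum("bd,bcd->bc", query_score, candidate_score)
assert relevance_score.shape == (batch_size, num_candidates)
```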
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 993a8041d6eb..8f67bbad30a0 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -18,7 +18,7 @@ import math import os from dataclasses import dataclass -from typing import Optional, Tuple +from typing import Optional, Tuple, TypeVar import numpy as np import torch @@ -47,6 +47,7 @@ from .utils_realm import BruteForceSearcher, ScaNNSearcher, convert_tfrecord_to_np +T = TypeVar('T', bound='Module') logger = logging.get_logger(__name__) _BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert" _EMBEDDER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-embedder" @@ -814,9 +815,9 @@ class RealmEmbedderOutput(ModelOutput): @dataclass -class RealmRetrieverOutput(ModelOutput): +class RealmScorerOutput(ModelOutput): """ - Outputs of RealmRetriever models. + Outputs of RealmScorer models. Args: relevance_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates)`): @@ -967,7 +968,7 @@ def forward(self, sequence_output): return prediction_scores -class RealmRetrieverProjection(nn.Module): +class RealmScorerProjection(nn.Module): def __init__(self, config): super().__init__() self.predictions = RealmLMPredictionHead(config) @@ -1159,7 +1160,7 @@ def __init__(self, config): super().__init__(config) self.bert = RealmBertModel(self.config) - self.cls = RealmRetrieverProjection(self.config) + self.cls = RealmScorerProjection(self.config) self.init_weights() def get_input_embeddings(self): @@ -1219,7 +1220,7 @@ def forward( "The retriever of REALM outputting relevance score representing the score of document candidates (before softmax).", REALM_START_DOCSTRING, ) -class RealmRetriever(RealmPreTrainedModel): +class RealmScorer(RealmPreTrainedModel): r""" Args: query_embedder ([`RealmEmbedder`]): @@ -1236,7 +1237,7 @@ def __init__(self, config, query_embedder=None): self.init_weights() @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) - @replace_return_docstrings(output_type=RealmRetrieverOutput, config_class=_CONFIG_FOR_DOC) + @replace_return_docstrings(output_type=RealmScorerOutput, config_class=_CONFIG_FOR_DOC) def forward( self, input_ids=None, @@ -1333,7 +1334,7 @@ def forward( if not return_dict: return relevance_score, query_score, candidate_score - return RealmRetrieverOutput( + return RealmScorerOutput( relevance_score=relevance_score, query_score=query_score, candidate_score=candidate_score ) @@ -1382,7 +1383,7 @@ def forward( ): r""" relevance_score (`torch.FloatTensor` of shape `(batch_size, num_candidates)`, *optional*): - Relevance score derived from RealmRetriever, must be specified if you want to compute the masked language + Relevance score derived from RealmScorer, must be specified if you want to compute the masked language modeling loss. 
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): @@ -1676,10 +1677,10 @@ def __init__(self, config, block_records_path): # TODO(PVP) - this class has to be removed super().__init__(config) self.embedder = RealmEmbedder(config) - self.block_records = convert_tfrecord_to_np( - block_records_path=block_records_path, - num_block_records=config.num_block_records, - ) +# self.block_records = convert_tfrecord_to_np( +# block_records_path=block_records_path, +# num_block_records=config.num_block_records, +# ) self.register_buffer( "block_emb", @@ -1697,30 +1698,26 @@ def __init__(self, config, block_records_path): REALM_START_DOCSTRING, ) class RealmForOpenQA(RealmPreTrainedModel): - def __init__(self, config, searcher, reader, tokenizer): + def __init__(self, config, searcher, reader, tokenizer, retriever): super().__init__(config) - self.reader = reader - self.tokenizer = tokenizer - self.embedder = searcher.embedder - self.block_records = searcher.block_records + self.reader = reader +# self.block_records = searcher.block_records self.block_emb = searcher.block_emb + self.tokenizer = tokenizer + self.retriever = retriever -# if self.training: -# beam_size = self.config.searcher_beam_size -# else: - beam_size = self.config.reader_beam_size +# self.init_weights() - self.retriever = BruteForceSearcher( - db=self.block_emb, - num_neighbors=beam_size, - ) - - # TODO(PVP) - init should be here + @property + def beam_size(self): + if self.training: + return self.config.searcher_beam_size + return self.config.reader_beam_size @classmethod def from_pretrained( - cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, block_records_path, **kwargs + cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, retriever, block_records_path, **kwargs ): """ Args: @@ -1738,76 +1735,12 @@ def from_pretrained( ) reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path) - return cls(config, searcher, reader, tokenizer) + return cls(config, searcher, reader, tokenizer, retriever) def save_pretrained(self, save_directory): self.searcher.save_pretrained(save_directory) self.reader.save_pretrained(save_directory) - def search( - self, - input_ids=None, - attention_mask=None, - token_type_ids=None, - position_ids=None, - head_mask=None, - inputs_embeds=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - Returns: - """ - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - if (input_ids is not None and input_ids.shape[0] != 1) or ( - inputs_embeds is not None and inputs_embeds.shape[0] != 1 - ): - raise ValueError("The batch_size of the inputs must be 1.") - - question_outputs = self.embedder( - input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask, - inputs_embeds=inputs_embeds, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - # [1, projection_size] - question_projection = question_outputs[0] - - # [1, searcher_beam_size] - retrieved_block_ids = self.retriever.search_batched(question_projection) - - # [searcher_beam_size] - retrieved_block_ids = retrieved_block_ids.squeeze() - - # [searcher_beam_size] - retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) - - # [searcher_beam_size, projection_size] - 
retrieved_block_emb = torch.index_select( - self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) - ) - - # [searcher_beam_size] - retrieved_logits = torch.einsum( - "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device) - ) - - if not return_dict: - return (retrieved_logits, retrieved_blocks, retrieved_block_ids) - - return RealmSearcherOutput( - retrieved_logits=retrieved_logits, - retrieved_blocks=retrieved_blocks, - retrieved_block_ids=retrieved_block_ids, - ) - def block_has_answer(self, concat_inputs, answer_ids): """check if retrieved_blocks has answers.""" has_answers = [] @@ -1870,6 +1803,7 @@ def forward(self, question, answer_ids=None, return_dict=None): return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # TODO(PVP) move out - tokenizer! question_ids = self.tokenizer( [question], padding=True, @@ -1878,54 +1812,62 @@ def forward(self, question, answer_ids=None, return_dict=None): return_tensors="pt", ).to(self.device) - searcher_output = self.search(**question_ids, return_dict=True) + if question_ids.input_ids is not None and question_ids.input_ids.shape[0] != 1: + raise ValueError("The batch_size of the inputs must be 1.") - text = [] - text_pair = [] - for retrieved_block in searcher_output.retrieved_blocks: - text.append(question) - text_pair.append(retrieved_block.decode()) + question_outputs = self.embedder(**question_ids, return_dict=True) - concat_inputs = self.tokenizer( - text, text_pair, padding=True, truncation=True, max_length=self.config.reader_seq_len, return_tensors="pt" - ).to(self.reader.device) + # [1, projection_size] + question_projection = question_outputs[0] + # [1, searcher_beam_size] - # concat inputs should come from the retriever here - if answer_ids is not None: - has_answers, start_positions, end_positions = self.block_has_answer( - concat_inputs, answer_ids - ) - has_answers = has_answers.to(self.reader.device) - start_positions = start_positions.to(self.reader.device) - end_positions = end_positions.to(self.reader.device) - else: - has_answers, start_positions, end_positions = (None, None, None) + batch_scores = torch.einsum("BD,QD->QB", self.block_emb, question_projection) + _, retrieved_block_ids = torch.topk(batch_scores, k=self.beam_size, dim=-1) + retrieved_block_ids = retrieved_block_ids.squeeze().cpu() + + # Must return cpu tensor for subsequent numpy operations + # [searcher_beam_size] + # [searcher_beam_size] + has_answers, start_pos, end_pos, concat_inputs = self.retriever(retrieved_block_ids, question, answer_ids) + + if has_answers is not None: + has_answers = torch.tensor(has_answers, dtype=torch.bool, device=self.reader.device) + start_pos = torch.tensor(start_pos, dtype=torch.long, device=self.reader.device) + end_pos = torch.tensor(end_pos, dtype=torch.long, device=self.reader.device) + + # TODO(PVP) - keep + # [searcher_beam_size, projection_size] + retrieved_block_emb = torch.index_select( + self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) + ) + # [searcher_beam_size] + retrieved_logits = torch.einsum( + "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device) + ) output = self.reader( input_ids=concat_inputs.input_ids[0 : self.config.reader_beam_size], attention_mask=concat_inputs.attention_mask[0 : self.config.reader_beam_size], token_type_ids=concat_inputs.token_type_ids[0 : self.config.reader_beam_size], - relevance_score=searcher_output.retrieved_logits, + 
relevance_score=retrieved_logits, has_answers=has_answers, - start_positions=start_positions, - end_positions=end_positions, + start_positions=start_pos, + end_positions=end_pos, return_dict=True, ) - # this will by handled by the retriever decode method + # TODO(PVP) - tokenizer! move out answer = self.tokenizer.decode( concat_inputs.input_ids[output.block_idx][output.start_pos : output.end_pos + 1] ) reader_output, predicted_answer = output, answer - # this will by handled by the retriever decode method - if not return_dict: - return searcher_output, reader_output, predicted_answer + return {}, reader_output, predicted_answer return RealmForOpenQAOutput( - searcher_output=searcher_output, + searcher_output={}, reader_output=reader_output, predicted_answer=predicted_answer, ) diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py index f76bc6b17f3b..dd0211e10f04 100644 --- a/src/transformers/models/realm/retrieval_realm.py +++ b/src/transformers/models/realm/retrieval_realm.py @@ -12,118 +12,117 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""RAG Retriever model implementation.""" - -import os -import pickle -import time -from typing import Iterable, List, Optional, Tuple - +"""Realm Retriever model implementation.""" import numpy as np -from ...file_utils import cached_path, is_datasets_available, is_faiss_available, is_remote_url, requires_backends -from ...tokenization_utils import PreTrainedTokenizer -from ...tokenization_utils_base import BatchEncoding +#from ...tokenization_utils_base import BatchEncoding from ...utils import logging -from .configuration_rag import RagConfig -from .tokenization_rag import RagTokenizer - - -if is_datasets_available(): - from datasets import Dataset, load_dataset, load_from_disk - -if is_faiss_available(): - import faiss logger = logging.get_logger(__name__) -LEGACY_INDEX_PATH = "https://storage.googleapis.com/huggingface-nlp/datasets/wiki_dpr/" +def convert_tfrecord_to_np(block_records_path, num_block_records): + import tensorflow.compat.v1 as tf + blocks_dataset = tf.data.TFRecordDataset(block_records_path, buffer_size=512 * 1024 * 1024) + blocks_dataset = blocks_dataset.batch(num_block_records, drop_remainder=True) + np_record = next(blocks_dataset.take(1).as_numpy_iterator()) + return np_record -class RealmRetriever: - def __init__(self, config, tokenizer, index=None): - super().__init__() - self.tokenizer = tokenizer - self.index +class ScaNNSearcher: + def __init__( + self, + db, + num_neighbors, + dimensions_per_block=2, + num_leaves=1000, + num_leaves_to_search=100, + training_sample_size=100000, + ): + """Build scann searcher.""" + + from scann.scann_ops.py.scann_ops_pybind import builder as Builder + + builder = Builder(db=db, num_neighbors=num_neighbors, distance_measure="dot_product") + builder = builder.tree( + num_leaves=num_leaves, num_leaves_to_search=num_leaves_to_search, training_sample_size=training_sample_size + ) + builder = builder.score_ah(dimensions_per_block=dimensions_per_block) - @classmethod - def from_pretrained(cls, retriever_name_or_path, indexed_dataset=None, **kwargs): - config = kwargs.pop("config", None) or RealmConfig.from_pretrained(retriever_name_or_path, **kwargs) - tokenizer = RealmTokenizer.from_pretrained(retriever_name_or_path, config=config) + self.searcher = builder.build() - # logic to load tf.records (should probs 
put it in `datasets`) - index = None + def search_batched(self, question_projection): + retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) + # Must return cpu tensor for subsequent numpy operations +# return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) + return retrieved_block_ids.astype("int64") - return cls( - config, - tokenizer=tokenizer, - index=index, + +class RealmRetriever: + def __init__(self, config, tokenizer, block_records_path): + super().__init__() + self.config = config + self.block_records = convert_tfrecord_to_np( + block_records_path=block_records_path, + num_block_records=config.num_block_records, ) + self.tokenizer = tokenizer - def save_pretrained(self, save_directory): - # save index here +# ) -> BatchEncoding: + def __call__(self, retrieved_block_ids, question, answer_ids, return_tensors="pt"): + retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) - self.config.save_pretrained(save_directory) - self.tokenizer.save_pretrained(save_directory) + text = [] + text_pair = [] + for retrieved_block in retrieved_blocks: + text.append(question) + text_pair.append(retrieved_block.decode()) - def __call__( - self, - question_input_ids: List[List[int]], - question_hidden_states: np.ndarray, - prefix=None, - n_docs=None, - return_tensors=None, - ) -> BatchEncoding: - - n_docs = n_docs if n_docs is not None else self.n_docs - prefix = prefix if prefix is not None else self.config.generator.prefix - retrieved_doc_embeds, doc_ids, docs = self.retrieve(question_hidden_states, n_docs) - - input_strings = self.question_encoder_tokenizer.batch_decode(question_input_ids, skip_special_tokens=True) - context_input_ids, context_attention_mask = self.postprocess_docs( - docs, input_strings, prefix, n_docs, return_tensors=return_tensors + concat_inputs = self.tokenizer( + text, text_pair, padding=True, truncation=True, max_length=self.config.reader_seq_len ) + concat_inputs_tensors = concat_inputs.convert_to_tensors(return_tensors) - if self.return_tokenized_docs: - retrived_doc_text = [] - retrived_doc_title = [] - - for b_idx in range(len(docs)): - for doc_idx in range(n_docs): - retrived_doc_text.append(docs[b_idx]["text"][doc_idx]) - retrived_doc_title.append(docs[b_idx]["title"][doc_idx]) - - tokenized_docs = self.ctx_encoder_tokenizer( - retrived_doc_title, - retrived_doc_text, - truncation=True, - padding="longest", - return_tensors=return_tensors, - ) - - return BatchEncoding( - { - "context_input_ids": context_input_ids, - "context_attention_mask": context_attention_mask, - "retrieved_doc_embeds": retrieved_doc_embeds, - "doc_ids": doc_ids, - "tokenized_doc_ids": tokenized_docs["input_ids"], - "tokenized_doc_attention_mask": tokenized_docs["attention_mask"], - }, - tensor_type=return_tensors, - ) - + # concat inputs should come from the retriever here + if answer_ids is not None: + return self.block_has_answer(concat_inputs, answer_ids) + (concat_inputs_inputs_tensors,) else: - return BatchEncoding( - { - "context_input_ids": context_input_ids, - "context_attention_mask": context_attention_mask, - "retrieved_doc_embeds": retrieved_doc_embeds, - "doc_ids": doc_ids, - }, - tensor_type=return_tensors, - ) + return (None, None, None, concat_inputs_tensors) + + + def block_has_answer(self, concat_inputs, answer_ids): + """check if retrieved_blocks has answers.""" + has_answers = [] + start_pos = [] + end_pos = [] + max_answers = 0 + + for input_id in concat_inputs.input_ids: + 
start_pos.append([]) + end_pos.append([]) + sep_idx = input_id.index(self.tokenizer.sep_token_id) + for answer in answer_ids: + for idx in range(sep_idx, len(input_id)): + if answer[0] == input_id[idx]: + if input_id[idx: idx + len(answer)] == answer: + start_pos[-1].append(idx) + end_pos[-1].append(idx + len(answer) - 1) + + if len(start_pos[-1]) == 0: + has_answers.append(False) + else: + has_answers.append(True) + if len(start_pos[-1]) > max_answers: + max_answers = len(start_pos[-1]) + + # Pad -1 to max_answers + for start_pos_, end_pos_ in zip(start_pos, end_pos): + if len(start_pos_) < max_answers: + padded = [-1] * (max_answers - len(start_pos_)) + start_pos_ += padded + end_pos_ += padded + + return has_answers, start_pos, end_pos diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py index 24c927b8ecd8..4cd71d794d39 100644 --- a/src/transformers/models/realm/utils_realm.py +++ b/src/transformers/models/realm/utils_realm.py @@ -17,19 +17,6 @@ import torch -class BruteForceSearcher: - def __init__(self, db, num_neighbors): - """Build brute force searcher.""" - self.db = db - self.num_neighbors = num_neighbors - - def search_batched(self, question_projection): - batch_scores = torch.einsum("BD,QD->QB", self.db, question_projection) - _, retrieved_block_ids = torch.topk(batch_scores, k=self.num_neighbors, dim=-1) - # Must return cpu tensor for subsequent numpy operations - return retrieved_block_ids.cpu() - - class ScaNNSearcher: def __init__( self, @@ -55,11 +42,24 @@ def __init__( def search_batched(self, question_projection): retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) # Must return cpu tensor for subsequent numpy operations - return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) +# return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) + return retrieved_block_ids.astype("int64") -def convert_tfrecord_to_np(block_records_path, num_block_records): +class BruteForceSearcher: + def __init__(self, db, num_neighbors): + """Build brute force searcher.""" + self.db = db + self.num_neighbors = num_neighbors + def search_batched(self, question_projection): + batch_scores = torch.einsum("BD,QD->QB", self.db, question_projection) + _, retrieved_block_ids = torch.topk(batch_scores, k=self.num_neighbors, dim=-1) + # Must return cpu tensor for subsequent numpy operations + return retrieved_block_ids.cpu() + + +def convert_tfrecord_to_np(block_records_path, num_block_records): import tensorflow.compat.v1 as tf blocks_dataset = tf.data.TFRecordDataset(block_records_path, buffer_size=512 * 1024 * 1024) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 28d34226844a..debe1af3ecae 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3985,7 +3985,7 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RealmRetriever: +class RealmScorer: def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index da82ef206447..dd4fb6f3b919 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -33,8 +33,9 @@ RealmForOpenQA, RealmKnowledgeAugEncoder, RealmReader, - RealmRetriever, + RealmScorer, RealmSearcher, + RealmTokenizer, ) # Direct download link @@ -282,7 +283,7 @@ def create_and_check_retriever( 
token_labels, choice_labels, ): - model = RealmRetriever(config=config) + model = RealmScorer(config=config) model.to(torch_device) model.eval() result = model( @@ -349,9 +350,9 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): ( RealmEmbedder, RealmKnowledgeAugEncoder, - # RealmRetriever is excluded from common tests as it is a container model + # RealmScorer is excluded from common tests as it is a container model # consisting of two RealmEmbedders & simple inner product calculation. - # RealmRetriever + # RealmScorer ) if is_torch_available() else () @@ -429,7 +430,7 @@ def test_reader_from_pretrained(self): @slow def test_retriever_from_pretrained(self): - model = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever") + model = RealmScorer.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever") self.assertIsNotNone(model) @slow @@ -477,10 +478,15 @@ def test_inference_encoder(self): @slow def test_inference_open_qa(self): # TODO: TF record dataset + from transformers.models.realm.retrieval_realm import RealmRetriever + config = RealmConfig(use_scann=False) + tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-searcher") + retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH) model = RealmForOpenQA.from_pretrained( r"qqaatw/realm-orqa-nq-searcher", r"qqaatw/realm-orqa-nq-reader", + retriever, BLOCK_RECORDS_PATH, config=config, ) @@ -522,7 +528,7 @@ def test_inference_reader(self): def test_inference_retriever(self): num_candidates = 2 - model = RealmRetriever.from_pretrained( + model = RealmScorer.from_pretrained( "qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates ) diff --git a/utils/check_repo.py b/utils/check_repo.py index 6037a51c65f2..2b2d00a2a471 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -75,7 +75,7 @@ "ProphetNetDecoderWrapper", # Building part of bigger (tested) model. "RealmBertModel", # Building part of bigger (tested) model. "RealmReader", # Not regular model. - "RealmRetriever", # Not regular model. + "RealmScorer", # Not regular model. "RealmSearcher", # Not regular model. "RealmForOpenQA", # Not regular model. "ReformerForMaskedLM", # Needs to be setup as decoder. 
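[Editor's note] Before the second check_repo.py hunk, a quick recap of what this patch's test changes wire together end to end. The sketch below condenses `test_inference_open_qa` as of this commit; the checkpoint names come from the test itself, while `BLOCK_RECORDS_PATH` is a placeholder for a local copy of the block records file:

```python
from transformers import RealmConfig, RealmForOpenQA, RealmTokenizer
from transformers.models.realm.retrieval_realm import RealmRetriever

BLOCK_RECORDS_PATH = "/path/to/blocks.tfr"  # placeholder: local block records

config = RealmConfig(use_scann=False)
tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-searcher")
retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH)

model = RealmForOpenQA.from_pretrained(
    "qqaatw/realm-orqa-nq-searcher",
    "qqaatw/realm-orqa-nq-reader",
    retriever,
    BLOCK_RECORDS_PATH,
    config=config,
)

predicted_answer = model("Who is the pioneer in modern computer science?").predicted_answer
```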
@@ -136,7 +136,7 @@ "RagTokenForGeneration", "RealmEmbedder", "RealmForOpenQA", - "RealmRetriever", + "RealmScorer", "RealmReader", "RealmSearcher", "TFDPRReader", From 6492942232b5fd9192721bbb05b0dde0f4bae341 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 30 Dec 2021 12:58:43 +0000 Subject: [PATCH 71/98] update --- .../models/realm/modeling_realm.py | 95 +++---------------- .../models/realm/retrieval_realm.py | 7 +- tests/test_modeling_realm.py | 13 ++- 3 files changed, 29 insertions(+), 86 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 8f67bbad30a0..4af7142ca579 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -26,8 +26,6 @@ from torch import nn from torch.nn import CrossEntropyLoss -from transformers.models.realm.tokenization_realm import RealmTokenizer - from ...activations import ACT2FN from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings from ...modeling_outputs import ( @@ -1673,15 +1671,10 @@ def mask_to_score(mask): class RealmSearcher(RealmPreTrainedModel): - def __init__(self, config, block_records_path): + def __init__(self, config): # TODO(PVP) - this class has to be removed super().__init__(config) self.embedder = RealmEmbedder(config) -# self.block_records = convert_tfrecord_to_np( -# block_records_path=block_records_path, -# num_block_records=config.num_block_records, -# ) - self.register_buffer( "block_emb", torch.zeros(()).new_empty( @@ -1698,13 +1691,11 @@ def __init__(self, config, block_records_path): REALM_START_DOCSTRING, ) class RealmForOpenQA(RealmPreTrainedModel): - def __init__(self, config, searcher, reader, tokenizer, retriever): + def __init__(self, config, searcher, reader, retriever): super().__init__(config) self.embedder = searcher.embedder self.reader = reader -# self.block_records = searcher.block_records self.block_emb = searcher.block_emb - self.tokenizer = tokenizer self.retriever = retriever # self.init_weights() @@ -1717,7 +1708,7 @@ def beam_size(self): @classmethod def from_pretrained( - cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, retriever, block_records_path, **kwargs + cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, retriever, **kwargs ): """ Args: @@ -1725,64 +1716,22 @@ def from_pretrained( Searcher pretrained name or path. reader_pretrained_name_or_path (`str`): Reader pretrained name or path. - block_records_path (`str`): - Block records path. 
""" config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) searcher = RealmSearcher.from_pretrained( - searcher_pretrained_name_or_path, block_records_path, config=config, **kwargs + searcher_pretrained_name_or_path, config=config, **kwargs ) reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) - tokenizer = RealmTokenizer.from_pretrained(searcher_pretrained_name_or_path) - return cls(config, searcher, reader, tokenizer, retriever) + return cls(config, searcher, reader, retriever) def save_pretrained(self, save_directory): self.searcher.save_pretrained(save_directory) self.reader.save_pretrained(save_directory) - def block_has_answer(self, concat_inputs, answer_ids): - """check if retrieved_blocks has answers.""" - has_answers = [] - start_pos = [] - end_pos = [] - max_answers = 0 - - for input_id in concat_inputs.input_ids: - start_pos.append([]) - end_pos.append([]) - input_id = input_id.tolist() - sep_idx = input_id.index(self.tokenizer.sep_token_id) - for answer in answer_ids: - for idx in range(sep_idx, len(input_id)): - if answer[0] == input_id[idx]: - if input_id[idx: idx + len(answer)] == answer: - start_pos[-1].append(idx) - end_pos[-1].append(idx + len(answer) - 1) - - if len(start_pos[-1]) == 0: - has_answers.append(False) - else: - has_answers.append(True) - if len(start_pos[-1]) > max_answers: - max_answers = len(start_pos[-1]) - - # Pad -1 to max_answers - for start_pos_, end_pos_ in zip(start_pos, end_pos): - if len(start_pos_) < max_answers: - padded = [-1] * (max_answers - len(start_pos_)) - start_pos_ += padded - end_pos_ += padded - - return ( - torch.tensor(has_answers, dtype=torch.bool), - torch.tensor(start_pos, dtype=torch.int64), - torch.tensor(end_pos, dtype=torch.int64), - ) - @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING) @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC) - def forward(self, question, answer_ids=None, return_dict=None): + def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, return_dict=None): r""" Returns: @@ -1803,19 +1752,10 @@ def forward(self, question, answer_ids=None, return_dict=None): return_dict = return_dict if return_dict is not None else self.config.use_return_dict - # TODO(PVP) move out - tokenizer! 
- question_ids = self.tokenizer( - [question], - padding=True, - truncation=True, - max_length=self.config.searcher_seq_len, - return_tensors="pt", - ).to(self.device) - - if question_ids.input_ids is not None and question_ids.input_ids.shape[0] != 1: + if input_ids is not None and input_ids.shape[0] != 1: raise ValueError("The batch_size of the inputs must be 1.") - question_outputs = self.embedder(**question_ids, return_dict=True) + question_outputs = self.embedder(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True) # [1, projection_size] question_projection = question_outputs[0] @@ -1826,16 +1766,14 @@ def forward(self, question, answer_ids=None, return_dict=None): retrieved_block_ids = retrieved_block_ids.squeeze().cpu() # Must return cpu tensor for subsequent numpy operations - # [searcher_beam_size] - # [searcher_beam_size] - has_answers, start_pos, end_pos, concat_inputs = self.retriever(retrieved_block_ids, question, answer_ids) + # Retrieve possible answers + has_answers, start_pos, end_pos, concat_inputs = self.retriever(retrieved_block_ids, input_ids, answer_ids) if has_answers is not None: has_answers = torch.tensor(has_answers, dtype=torch.bool, device=self.reader.device) start_pos = torch.tensor(start_pos, dtype=torch.long, device=self.reader.device) end_pos = torch.tensor(end_pos, dtype=torch.long, device=self.reader.device) - # TODO(PVP) - keep # [searcher_beam_size, projection_size] retrieved_block_emb = torch.index_select( self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) @@ -1845,7 +1783,7 @@ def forward(self, question, answer_ids=None, return_dict=None): "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(question_projection.device) ) - output = self.reader( + reader_output = self.reader( input_ids=concat_inputs.input_ids[0 : self.config.reader_beam_size], attention_mask=concat_inputs.attention_mask[0 : self.config.reader_beam_size], token_type_ids=concat_inputs.token_type_ids[0 : self.config.reader_beam_size], @@ -1856,18 +1794,13 @@ def forward(self, question, answer_ids=None, return_dict=None): return_dict=True, ) - # TODO(PVP) - tokenizer! 
move out - answer = self.tokenizer.decode( - concat_inputs.input_ids[output.block_idx][output.start_pos : output.end_pos + 1] - ) - - reader_output, predicted_answer = output, answer + predicted_answer_ids = concat_inputs.input_ids[reader_output.block_idx][reader_output.start_pos : reader_output.end_pos + 1] if not return_dict: - return {}, reader_output, predicted_answer + return answer_ids, searcher_output, reader_output return RealmForOpenQAOutput( + predicted_answer=predicted_answer_ids, searcher_output={}, reader_output=reader_output, - predicted_answer=predicted_answer, ) diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py index dd0211e10f04..a35e39d99f0d 100644 --- a/src/transformers/models/realm/retrieval_realm.py +++ b/src/transformers/models/realm/retrieval_realm.py @@ -72,9 +72,11 @@ def __init__(self, config, tokenizer, block_records_path): self.tokenizer = tokenizer # ) -> BatchEncoding: - def __call__(self, retrieved_block_ids, question, answer_ids, return_tensors="pt"): + def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, return_tensors="pt"): retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) + question = self.tokenizer.decode(question_input_ids[0], skip_special_tokens=True) + text = [] text_pair = [] for retrieved_block in retrieved_blocks: @@ -88,11 +90,10 @@ def __call__(self, retrieved_block_ids, question, answer_ids, return_tensors="pt # concat inputs should come from the retriever here if answer_ids is not None: - return self.block_has_answer(concat_inputs, answer_ids) + (concat_inputs_inputs_tensors,) + return self.block_has_answer(concat_inputs, answer_ids) + (concat_inputs_tensors,) else: return (None, None, None, concat_inputs_tensors) - def block_has_answer(self, concat_inputs, answer_ids): """check if retrieved_blocks has answers.""" has_answers = [] diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index dd4fb6f3b919..937d23826f56 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -487,13 +487,22 @@ def test_inference_open_qa(self): r"qqaatw/realm-orqa-nq-searcher", r"qqaatw/realm-orqa-nq-reader", retriever, - BLOCK_RECORDS_PATH, config=config, ) question = "Who is the pioneer in modern computer science?" 
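Once the reader has selected a block and a span, the predicted answer is simply a slice of the concatenated inputs, as the new `predicted_answer_ids` line above shows. A toy illustration of that slicing (the ids are placeholders, not real vocabulary entries):

```python
import torch

# concat_inputs.input_ids has shape [reader_beam_size, reader_seq_len];
# block_idx, start_pos and end_pos are scalar outputs of the reader.
input_ids = torch.tensor([[101, 2054, 102, 4074, 28078, 21442, 102, 0]])
block_idx, start_pos, end_pos = 0, 3, 5
predicted_answer_ids = input_ids[block_idx][start_pos : end_pos + 1]
assert predicted_answer_ids.tolist() == [4074, 28078, 21442]
# tokenizer.decode(predicted_answer_ids) then turns these ids back into text.
```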
- predicted_answer = model(question).predicted_answer + question = tokenizer( + [question], + padding=True, + truncation=True, + max_length=model.config.searcher_seq_len, + return_tensors="pt", + ).to(model.device) + + predicted_answer_ids = model(**question).predicted_answer + + predicted_answer = tokenizer.decode(predicted_answer_ids) self.assertEqual(predicted_answer, "alan mathison turing") @slow From 0fb7b8676a9d81d04b29d9cac107678c46020bc4 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 30 Dec 2021 12:58:59 +0000 Subject: [PATCH 72/98] make style --- .../models/realm/configuration_realm.py | 3 +-- .../models/realm/modeling_realm.py | 21 +++++++++---------- .../models/realm/retrieval_realm.py | 8 +++---- .../models/realm/tokenization_realm.py | 3 +-- src/transformers/models/realm/utils_realm.py | 2 +- tests/test_modeling_realm.py | 4 +--- 6 files changed, 18 insertions(+), 23 deletions(-) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index f8a83d8a539d..2efe31bba478 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -122,8 +122,7 @@ class RealmConfig(PretrainedConfig): >>> # Accessing the model configuration >>> configuration = model.config - ``` -""" + ```""" model_type = "realm" def __init__( diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 4af7142ca579..e6a712d7ca20 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -45,7 +45,7 @@ from .utils_realm import BruteForceSearcher, ScaNNSearcher, convert_tfrecord_to_np -T = TypeVar('T', bound='Module') +T = TypeVar("T", bound="Module") logger = logging.get_logger(__name__) _BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert" _EMBEDDER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-embedder" @@ -1670,7 +1670,6 @@ def mask_to_score(mask): class RealmSearcher(RealmPreTrainedModel): - def __init__(self, config): # TODO(PVP) - this class has to be removed super().__init__(config) @@ -1698,7 +1697,7 @@ def __init__(self, config, searcher, reader, retriever): self.block_emb = searcher.block_emb self.retriever = retriever -# self.init_weights() + # self.init_weights() @property def beam_size(self): @@ -1707,9 +1706,7 @@ def beam_size(self): return self.config.reader_beam_size @classmethod - def from_pretrained( - cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, retriever, **kwargs - ): + def from_pretrained(cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, retriever, **kwargs): """ Args: searcher_pretrained_name_or_path (`str`): @@ -1719,9 +1716,7 @@ def from_pretrained( """ config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) - searcher = RealmSearcher.from_pretrained( - searcher_pretrained_name_or_path, config=config, **kwargs - ) + searcher = RealmSearcher.from_pretrained(searcher_pretrained_name_or_path, config=config, **kwargs) reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) return cls(config, searcher, reader, retriever) @@ -1755,7 +1750,9 @@ def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, re if input_ids is not None and input_ids.shape[0] != 1: raise ValueError("The batch_size of the inputs must be 1.") - question_outputs = 
self.embedder(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True) + question_outputs = self.embedder( + input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True + ) # [1, projection_size] question_projection = question_outputs[0] @@ -1794,7 +1791,9 @@ def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, re return_dict=True, ) - predicted_answer_ids = concat_inputs.input_ids[reader_output.block_idx][reader_output.start_pos : reader_output.end_pos + 1] + predicted_answer_ids = concat_inputs.input_ids[reader_output.block_idx][ + reader_output.start_pos : reader_output.end_pos + 1 + ] if not return_dict: return answer_ids, searcher_output, reader_output diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py index a35e39d99f0d..beb2235bbfc4 100644 --- a/src/transformers/models/realm/retrieval_realm.py +++ b/src/transformers/models/realm/retrieval_realm.py @@ -15,7 +15,7 @@ """Realm Retriever model implementation.""" import numpy as np -#from ...tokenization_utils_base import BatchEncoding +# from ...tokenization_utils_base import BatchEncoding from ...utils import logging @@ -57,7 +57,7 @@ def __init__( def search_batched(self, question_projection): retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) # Must return cpu tensor for subsequent numpy operations -# return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) + # return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) return retrieved_block_ids.astype("int64") @@ -71,7 +71,7 @@ def __init__(self, config, tokenizer, block_records_path): ) self.tokenizer = tokenizer -# ) -> BatchEncoding: + # ) -> BatchEncoding: def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, return_tensors="pt"): retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) @@ -108,7 +108,7 @@ def block_has_answer(self, concat_inputs, answer_ids): for answer in answer_ids: for idx in range(sep_idx, len(input_id)): if answer[0] == input_id[idx]: - if input_id[idx: idx + len(answer)] == answer: + if input_id[idx : idx + len(answer)] == answer: start_pos[-1].append(idx) end_pos[-1].append(idx + len(answer) - 1) diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py index bc5377fdd113..7f6ee9a41a61 100644 --- a/src/transformers/models/realm/tokenization_realm.py +++ b/src/transformers/models/realm/tokenization_realm.py @@ -106,8 +106,7 @@ def batch_encode_candidates(self, text, **kwargs): >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") - ``` -""" + ```""" # Always using a fixed sequence length to encode in order to stack candidates into a batch. 
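The comment just above states the design choice: `batch_encode_candidates` always pads to a fixed `max_length` because only equal-length encodings can be stacked into a `[batch_size, num_candidates, seq_len]` tensor. A small illustration of that constraint, independent of any tokenizer:

```python
import torch

# Equal-length candidate encodings stack cleanly into one tensor ...
same_len = [torch.tensor([101, 7592, 102, 0, 0]), torch.tensor([101, 2088, 999, 102, 0])]
batch = torch.stack(same_len).unsqueeze(0)  # [1, num_candidates=2, seq_len=5]
assert batch.shape == (1, 2, 5)

# ... while ragged ones cannot, hence the fixed-length padding:
ragged = [torch.tensor([101, 7592, 102]), torch.tensor([101, 2088, 999, 102])]
try:
    torch.stack(ragged)
except RuntimeError:
    pass  # torch.stack requires tensors of equal shape
```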
kwargs["padding"] = PaddingStrategy.MAX_LENGTH diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py index 4cd71d794d39..187a282fc274 100644 --- a/src/transformers/models/realm/utils_realm.py +++ b/src/transformers/models/realm/utils_realm.py @@ -42,7 +42,7 @@ def __init__( def search_batched(self, question_projection): retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) # Must return cpu tensor for subsequent numpy operations -# return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) + # return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) return retrieved_block_ids.astype("int64") diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 937d23826f56..6020e9740648 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -537,9 +537,7 @@ def test_inference_reader(self): def test_inference_retriever(self): num_candidates = 2 - model = RealmScorer.from_pretrained( - "qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates - ) + model = RealmScorer.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates) input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) candidate_input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]]) From 38e2ed059f7b281089690cce0e301a86fdc815bb Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Mon, 3 Jan 2022 12:30:24 +0000 Subject: [PATCH 73/98] finish main structure --- docs/source/index.mdx | 1 + docs/source/model_doc/realm.rst | 7 -- src/transformers/__init__.py | 2 - src/transformers/models/realm/__init__.py | 2 - .../models/realm/modeling_realm.py | 78 ++++--------------- src/transformers/utils/dummy_pt_objects.py | 8 +- tests/test_modeling_realm.py | 55 +------------ utils/check_repo.py | 2 - 8 files changed, 23 insertions(+), 132 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 69d9dbf540b8..9be321e99abf 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -243,6 +243,7 @@ Flax), PyTorch, and/or TensorFlow. | ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ | | QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ | | RAG | ✅ | ❌ | ✅ | ✅ | ❌ | +| Realm | ✅ | ❌ | ✅ | ❌ | ❌ | | Reformer | ✅ | ✅ | ✅ | ❌ | ❌ | | RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ | | RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ | diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst index 9b58eaaaecdd..f673ed6d93ac 100644 --- a/docs/source/model_doc/realm.rst +++ b/docs/source/model_doc/realm.rst @@ -75,13 +75,6 @@ RealmKnowledgeAugEncoder :members: forward -RealmSearcher -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.RealmSearcher - :members: forward - - RealmReader ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 698aca8568a4..73e1ef2f2a2a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1186,7 +1186,6 @@ "RealmPreTrainedModel", "RealmReader", "RealmScorer", - "RealmSearcher", "load_tf_weights_in_realm", ] ) @@ -3098,7 +3097,6 @@ RealmPreTrainedModel, RealmReader, RealmScorer, - RealmSearcher, load_tf_weights_in_realm, ) from .models.reformer import ( diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index 7017470a85d1..03a69e01bf03 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -35,7 +35,6 @@ "RealmPreTrainedModel", "RealmReader", "RealmScorer", - "RealmSearcher", "load_tf_weights_in_realm", ] @@ -53,7 +52,6 @@ RealmPreTrainedModel, RealmReader, RealmScorer, - RealmSearcher, load_tf_weights_in_realm, ) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index e6a712d7ca20..874e94856c48 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -831,25 +831,6 @@ class RealmScorerOutput(ModelOutput): candidate_score: torch.FloatTensor = None -@dataclass -class RealmSearcherOutput(ModelOutput): - """ - Outputs of RealmSearcher models. - - Args: - retrieved_logits (`torch.FloatTensor` of shape `(config.searcher_beam_size,)`): - The relevance score of document candidates (before softmax). - retrieved_blocks (`np.ndarray` of shape `(config.searcher_beam_size,)`): - Retrieved document blocks. - retrieved_block_ids (`torch.LongTensor` of shape `(config.searcher_beam_size,)`): - IDs of retrieved blocks. 
- """ - - retrieved_logits: torch.FloatTensor = None - retrieved_blocks: np.ndarray = None - retrieved_block_ids: torch.LongTensor = None - - @dataclass class RealmReaderOutput(ModelOutput): """ @@ -1669,35 +1650,22 @@ def mask_to_score(mask): """ -class RealmSearcher(RealmPreTrainedModel): - def __init__(self, config): - # TODO(PVP) - this class has to be removed - super().__init__(config) - self.embedder = RealmEmbedder(config) - self.register_buffer( - "block_emb", - torch.zeros(()).new_empty( - size=(config.num_block_records, config.retriever_proj_size), - dtype=torch.float32, - device=torch.device("cpu"), - ), - ) - self.init_weights() - - @add_start_docstrings( - "A wrapper of `RealmSearcher` and `RealmReader` providing end-to-end open domain question answering.", + "`RealmForOpenQA` for end-to-end open domain question answering.", REALM_START_DOCSTRING, ) class RealmForOpenQA(RealmPreTrainedModel): - def __init__(self, config, searcher, reader, retriever): + def __init__(self, config, retriever=None): super().__init__(config) - self.embedder = searcher.embedder - self.reader = reader - self.block_emb = searcher.block_emb + self.embedder = RealmEmbedder(config) + self.reader = RealmReader(config) + self.block_emb = nn.Parameter(torch.FloatTensor(size=(config.num_block_records, config.retriever_proj_size)).uniform_()) + + # add retriever for + # single-call forward pass self.retriever = retriever - # self.init_weights() + self.init_weights() @property def beam_size(self): @@ -1705,25 +1673,6 @@ def beam_size(self): return self.config.searcher_beam_size return self.config.reader_beam_size - @classmethod - def from_pretrained(cls, searcher_pretrained_name_or_path, reader_pretrained_name_or_path, retriever, **kwargs): - """ - Args: - searcher_pretrained_name_or_path (`str`): - Searcher pretrained name or path. - reader_pretrained_name_or_path (`str`): - Reader pretrained name or path. - - """ - config = kwargs.pop("config", None) or RealmConfig.from_pretrained(searcher_pretrained_name_or_path, **kwargs) - searcher = RealmSearcher.from_pretrained(searcher_pretrained_name_or_path, config=config, **kwargs) - reader = RealmReader.from_pretrained(reader_pretrained_name_or_path, config=config, **kwargs) - return cls(config, searcher, reader, retriever) - - def save_pretrained(self, save_directory): - self.searcher.save_pretrained(save_directory) - self.reader.save_pretrained(save_directory) - @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING) @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC) def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, return_dict=None): @@ -1736,12 +1685,15 @@ def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, re >>> import torch >>> from transformers import RealmForOpenQA, RealmTokenizer - >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", blocks.tfr) + >>> retriever = RealmRetriever.from_pretrained("qqaatw/realm-open-qa") + >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-open-qa") + >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-open-qa", retriever=retriever) >>> question = "Who is the pioneer in modern computer science?" 
-        >>> answer_ids = tokenizer(["alan mathison turing"], add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False).input_ids
+        >>> question_ids = tokenizer(question, return_tensors="pt").input_ids
+        >>> answer_ids = tokenizer("alan mathison turing", add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False, return_tensors="pt").input_ids
 
-        >>> searcher_output, reader_output, predicted_answer = model(question, answer_ids)
+        >>> searcher_output, reader_output, predicted_answer = model(question_ids, answer_ids)
         >>> loss = reader_output.loss
         ```"""
 
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index debe1af3ecae..65407f5dfbe8 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -3979,18 +3979,16 @@ def __init__(self, *args, **kwargs):
     def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
-
-class RealmReader:
-    def __init__(self, *args, **kwargs):
+    def forward(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class RealmScorer:
+class RealmReader:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class RealmSearcher:
+class RealmScorer:
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py
index 6020e9740648..35a008bab698 100644
--- a/tests/test_modeling_realm.py
+++ b/tests/test_modeling_realm.py
@@ -34,7 +34,6 @@
         RealmKnowledgeAugEncoder,
         RealmReader,
         RealmScorer,
-        RealmSearcher,
         RealmTokenizer,
     )
 
@@ -300,31 +299,6 @@ def create_and_check_retriever(
             result.candidate_score.shape, (self.batch_size, self.num_candidates, self.retriever_proj_size)
         )
 
-    def create_and_check_searcher(
-        self,
-        config,
-        input_ids,
-        token_type_ids,
-        input_mask,
-        retriever_encoder_inputs,
-        reader_inputs,
-        searcher_inputs,
-        sequence_labels,
-        token_labels,
-        choice_labels,
-    ):
-        model = RealmSearcher(config=config)
-        model.to(torch_device)
-        model.eval()
-        result = model(
-            searcher_inputs[0],
-            attention_mask=searcher_inputs[1],
-            token_type_ids=searcher_inputs[2],
-        )
-        self.parent.assertEqual(result.retrieved_logits.shape, (self.searcher_beam_size,))
-        self.parent.assertEqual(result.retrieved_blocks.shape, (self.searcher_beam_size,))
-        self.parent.assertEqual(result.retrieved_block_ids.shape, (self.searcher_beam_size,))
-
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         (
@@ -433,12 +407,6 @@ def test_retriever_from_pretrained(self):
         model = RealmScorer.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever")
         self.assertIsNotNone(model)
 
-    @slow
-    def test_searcher_from_pretrained(self):
-        # TODO: TF record dataset
-        model = RealmSearcher.from_pretrained("qqaatw/realm-orqa-nq-searcher", BLOCK_RECORDS_PATH)
-        self.assertIsNotNone(model)
-
 
 @require_torch
 class RealmModelIntegrationTest(unittest.TestCase):
@@ -477,16 +445,16 @@ def test_inference_encoder(self):
 
     @slow
     def test_inference_open_qa(self):
-        # TODO: TF record dataset
         from transformers.models.realm.retrieval_realm import RealmRetriever
 
         config = RealmConfig(use_scann=False)
+
         tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-searcher")
         retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH)
+
         model = RealmForOpenQA.from_pretrained(
-            r"qqaatw/realm-orqa-nq-searcher",
-            r"qqaatw/realm-orqa-nq-reader",
-            retriever,
+            "patrickvonplaten/realm-open-qa",
+            retriever=retriever,
config=config, ) @@ -548,18 +516,3 @@ def test_inference_retriever(self): expected_slice = torch.tensor([[0.7410, 0.7170]]) self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4)) - - @slow - def test_inference_searcher(self): - # TODO: TF record dataset - config = RealmConfig(searcher_beam_size=5) - model = RealmSearcher.from_pretrained("qqaatw/realm-orqa-nq-searcher", BLOCK_RECORDS_PATH, config=config) - - input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]]) - output = model(input_ids)[0] - - expected_shape = torch.Size((5,)) - self.assertEqual(output.shape, expected_shape) - - expected_slice = torch.tensor([[5.2747, 4.3768, 5.0444, 5.4152, 5.2922]]) - self.assertTrue(torch.allclose(output, expected_slice, atol=1e-4), output) diff --git a/utils/check_repo.py b/utils/check_repo.py index 2b2d00a2a471..21549350535f 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -76,7 +76,6 @@ "RealmBertModel", # Building part of bigger (tested) model. "RealmReader", # Not regular model. "RealmScorer", # Not regular model. - "RealmSearcher", # Not regular model. "RealmForOpenQA", # Not regular model. "ReformerForMaskedLM", # Needs to be setup as decoder. "Speech2Text2DecoderWrapper", # Building part of bigger (tested) model. @@ -138,7 +137,6 @@ "RealmForOpenQA", "RealmScorer", "RealmReader", - "RealmSearcher", "TFDPRReader", "TFGPT2DoubleHeadsModel", "TFOpenAIGPTDoubleHeadsModel", From bc56dbb919c350a4ad377a7d98734d1d33656cd0 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Tue, 4 Jan 2022 16:32:56 +0800 Subject: [PATCH 74/98] Resolve merge conflict --- .../models/realm/modeling_realm.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 874e94856c48..7acdd3bf45fd 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -884,7 +884,7 @@ class RealmReaderOutput(ModelOutput): class RealmForOpenQAOutput(ModelOutput): """ - Outputs of RealmReader models. + Outputs of RealmForOpenQA models. 
Args: searcher_output (`dict`): @@ -1659,10 +1659,14 @@ def __init__(self, config, retriever=None): super().__init__(config) self.embedder = RealmEmbedder(config) self.reader = RealmReader(config) - self.block_emb = nn.Parameter(torch.FloatTensor(size=(config.num_block_records, config.retriever_proj_size)).uniform_()) - - # add retriever for - # single-call forward pass + self.register_buffer( + "block_emb", + torch.zeros(()).new_empty( + size=(config.num_block_records, config.retriever_proj_size), + dtype=torch.FloatTensor, + device=torch.device("cpu"), + ), + ) self.retriever = retriever self.init_weights() @@ -1708,13 +1712,15 @@ def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, re # [1, projection_size] question_projection = question_outputs[0] - # [1, searcher_beam_size] batch_scores = torch.einsum("BD,QD->QB", self.block_emb, question_projection) + # [1, searcher_beam_size] _, retrieved_block_ids = torch.topk(batch_scores, k=self.beam_size, dim=-1) + + # [searcher_beam_size] + # Must return cpu tensor for subsequent numpy operations retrieved_block_ids = retrieved_block_ids.squeeze().cpu() - # Must return cpu tensor for subsequent numpy operations # Retrieve possible answers has_answers, start_pos, end_pos, concat_inputs = self.retriever(retrieved_block_ids, input_ids, answer_ids) From a62ae6f0871a6519ba2c16b34361e86721b8edf4 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Tue, 4 Jan 2022 23:37:13 +0800 Subject: [PATCH 75/98] Make everything work --- docs/source/_toctree.yml | 2 + docs/source/model_doc/realm.mdx | 80 +++++++++++++++ docs/source/model_doc/realm.rst | 89 ----------------- src/transformers/__init__.py | 2 + src/transformers/models/realm/__init__.py | 6 ++ .../models/realm/configuration_realm.py | 19 ++-- .../models/realm/modeling_realm.py | 98 +++++++++++-------- .../models/realm/retrieval_realm.py | 27 +++-- tests/test_modeling_realm.py | 58 +++++------ 9 files changed, 196 insertions(+), 185 deletions(-) create mode 100644 docs/source/model_doc/realm.mdx delete mode 100644 docs/source/model_doc/realm.rst diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 1094b406b70e..85c66694741f 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -236,6 +236,8 @@ title: QDQBert - local: model_doc/rag title: RAG + - local: model_doc/realm + title: REALM - local: model_doc/reformer title: Reformer - local: model_doc/rembert diff --git a/docs/source/model_doc/realm.mdx b/docs/source/model_doc/realm.mdx new file mode 100644 index 000000000000..c42ddbad32a3 --- /dev/null +++ b/docs/source/model_doc/realm.mdx @@ -0,0 +1,80 @@ + + +# REALM + +## Overview + +The REALM model was proposed in `REALM: Retrieval-Augmented Language Model Pre-Training +`__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a +retrieval-augmented language model that firstly retrieves neural knowledge from a textual knowledge corpus and then +utilizes retrieved documents to process question answering tasks. + +The abstract from the paper is the following: + +*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks +such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, +requiring ever-larger networks to cover more facts. 
To capture knowledge in a more modular and interpretable way, we +augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend +over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the +first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language +modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We +demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the +challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both +explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous +methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as +interpretability and modularity.* + +This model was contributed by `qqaatw `__. The original code can be found `here +`__. + +## RealmConfig + +[[autodoc]] RealmConfig + +## RealmTokenizer + +[[autodoc]] RealmTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + - batch_encode_candidates + +## RealmRetriever + +[[autodoc]] RealmRetriever + +## RealmEmbedder + +[[autodoc]] RealmEmbedder + - forward + +## RealmScorer + +[[autodoc]] RealmScorer + - forward + +## RealmKnowledgeAugEncoder + +[[autodoc]] RealmKnowledgeAugEncoder + - forward + +## RealmReader + +[[autodoc]] RealmReader + - forward + +## RealmForOpenQA + +[[autodoc]] RealmForOpenQA + - forward \ No newline at end of file diff --git a/docs/source/model_doc/realm.rst b/docs/source/model_doc/realm.rst deleted file mode 100644 index f673ed6d93ac..000000000000 --- a/docs/source/model_doc/realm.rst +++ /dev/null @@ -1,89 +0,0 @@ -.. - Copyright 2020 The HuggingFace Team. All rights reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on - an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the - specific language governing permissions and limitations under the License. - -REALM ------------------------------------------------------------------------------------------------------------------------ - -Overview -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The REALM model was proposed in `REALM: Retrieval-Augmented Language Model Pre-Training -`__ by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a -retrieval-augmented language model that firstly retrieves neural knowledge from a textual knowledge corpus and then -utilizes retrieved documents to process question answering tasks. - -The abstract from the paper is the following: - -*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks -such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network, -requiring ever-larger networks to cover more facts. 
To capture knowledge in a more modular and interpretable way, we -augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend -over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the -first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language -modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We -demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the -challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both -explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous -methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as -interpretability and modularity.* - -This model was contributed by `qqaatw `__. The original code can be found `here -`__. - -RealmConfig -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RealmConfig - :members: - - -RealmTokenizer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RealmTokenizer - :members: build_inputs_with_special_tokens, get_special_tokens_mask, - create_token_type_ids_from_sequences, save_vocabulary, batch_encode_candidates - - -RealmEmbedder -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RealmEmbedder - :members: forward - - -RealmScorer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RealmScorer - :members: forward - - -RealmKnowledgeAugEncoder -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RealmKnowledgeAugEncoder - :members: forward - - -RealmReader -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. autoclass:: transformers.RealmReader - :members: forward - - -RealmForOpenQA -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. 
autoclass:: transformers.RealmForOpenQA - :members: from_pretrained, forward diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 73e1ef2f2a2a..ec07152b7e22 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1185,6 +1185,7 @@ "RealmKnowledgeAugEncoder", "RealmPreTrainedModel", "RealmReader", + "RealmRetriever", "RealmScorer", "load_tf_weights_in_realm", ] @@ -3096,6 +3097,7 @@ RealmKnowledgeAugEncoder, RealmPreTrainedModel, RealmReader, + RealmRetriever, RealmScorer, load_tf_weights_in_realm, ) diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index 03a69e01bf03..66bfa56948a2 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -37,6 +37,9 @@ "RealmScorer", "load_tf_weights_in_realm", ] + _import_structure["retrieval_realm"] = [ + "RealmRetriever", + ] if TYPE_CHECKING: @@ -54,6 +57,9 @@ RealmScorer, load_tf_weights_in_realm, ) + from .retrieval_realm import ( + RealmRetriever + ) else: diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index 2efe31bba478..f794e31d08a5 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -24,8 +24,10 @@ "realm-cc-news-pretrained-bert": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-bert/resolve/main/config.json", "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/config.json", "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/config.json", - "realm-orqa-nq-searcher": "https://huggingface.co/qqaatw/realm-orqa-nq-searcher/resolve/main/config.json", - "realm-orqa-nq-reader": "https://huggingface.co/qqaatw/realm-orqa-nq-searcher/resolve/main/config.json", + "realm-orqa-nq-openqa": "https://huggingface.co/qqaatw/realm-orqa-nq-openqa/resolve/main/config.json", + "realm-orqa-nq-reader": "https://huggingface.co/qqaatw/realm-orqa-nq-reader/resolve/main/config.json", + "realm-orqa-wq-openqa": "https://huggingface.co/qqaatw/realm-orqa-wq-openqa/resolve/main/config.json", + "realm-orqa-wq-reader": "https://huggingface.co/qqaatw/realm-orqa-wq-reader/resolve/main/config.json", # See all REALM models at https://huggingface.co/models?filter=realm } @@ -37,7 +39,7 @@ class RealmConfig(PretrainedConfig): 1. [`RealmEmbedder`] 2. [`RealmScorer`] 3. [`RealmKnowledgeAugEncoder`] - 4. [`RealmSearcher`] + 4. [`RealmRetriever`] 5. [`RealmReader`] 6. [`RealmForOpenQA`] @@ -53,8 +55,7 @@ class RealmConfig(PretrainedConfig): vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`RealmEmbedder`], - [`RealmScorer`], [`RealmKnowledgeAugEncoder`], - [`RealmSearcher`], or [`RealmReader`]. + [`RealmScorer`], [`RealmKnowledgeAugEncoder`], or [`RealmReader`]. hidden_size (`int`, *optional*, defaults to 768): Dimension of the encoder layers and the pooler layer. retriever_proj_size (`int`, *optional*, defaults to 128): @@ -79,8 +80,7 @@ class RealmConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). 
type_vocab_size (`int`, *optional*, defaults to 2): The vocabulary size of the `token_type_ids` passed when calling [`RealmEmbedder`], - [`RealmScorer`], [`RealmKnowledgeAugEncoder`], - [`RealmSearcher`], or [`RealmReader`]. + [`RealmScorer`], [`RealmKnowledgeAugEncoder`], or [`RealmReader`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): @@ -88,9 +88,6 @@ class RealmConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - use_scann (`bool`, *optional*, defaults to `True`): - Whether or not [`RealmSearcher`] uses *ScaNN* as the vector similarity searcher. This - option has no effect and is reserved for future development. span_hidden_size (`int`, *optional*, defaults to 256): Dimension of the reader's spans. max_span_width (`int`, *optional*, defaults to 10): @@ -142,7 +139,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, use_cache=True, - use_scann=True, span_hidden_size=256, max_span_width=10, reader_layer_norm_eps=1e-3, @@ -174,7 +170,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.use_scann = use_scann # Reader config self.span_hidden_size = span_hidden_size diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 7acdd3bf45fd..93092a9c2832 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -91,17 +91,20 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): for name, array in zip(names, arrays): # For reader - if isinstance(model, RealmReader) and "reader" not in name: + if not isinstance(model, RealmForOpenQA) and isinstance(model, RealmReader) and "reader" not in name: logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter") continue - elif not isinstance(model, RealmReader) and "reader" in name: + elif not isinstance(model, RealmForOpenQA) and not isinstance(model, RealmReader) and "reader" in name: logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter") continue - name = name.replace("reader/module/bert/", "bert/") - name = name.replace("reader/module/cls/", "cls/") - name = name.replace("reader/dense/", "qa_outputs/dense_intermediate/") - name = name.replace("reader/dense_1/", "qa_outputs/dense_output/") - name = name.replace("reader/layer_normalization", "qa_outputs/layer_normalization") + + # For reader + reader_prefix = "" if isinstance(model, RealmReader) else "reader/" + name = name.replace("reader/module/bert/", f"{reader_prefix}bert/") + name = name.replace("reader/module/cls/", f"{reader_prefix}cls/") + name = name.replace("reader/dense/", f"{reader_prefix}qa_outputs/dense_intermediate/") + name = name.replace("reader/dense_1/", f"{reader_prefix}qa_outputs/dense_output/") + name = name.replace("reader/layer_normalization", f"{reader_prefix}qa_outputs/layer_normalization") # For embedder and retriever embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" @@ -110,12 +113,6 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") name = 
name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") - # Fine-tuned checkpoints - name = name.replace("module/module/module/module/bert/", f"{embedder_prefix}bert/") - name = name.replace("module/module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") - name = name.replace("module/module/module/dense/", f"{embedder_prefix}cls/dense/") - name = name.replace("module/module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") - name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model @@ -848,9 +845,9 @@ class RealmReaderOutput(ModelOutput): reader_correct (`torch.BoolTensor` of shape `(config.reader_beam_size, num_candidates)`, *optional*): Whether or not a span candidate contains answer. block_idx (`torch.LongTensor` of shape `()`): - The index of retrieved evidence blocks in which the predicted answer in most likely + The index of retrieved evidence blocks in which the predicted answer most likely. candidate (`torch.LongTensor` of shape `()`): - . + The index of retrieved span candidates in which the predicted answer most likely. start_pos (`torch.IntTensor` of shape `()`): Predicted answer starting position in *RealmReader*'s inputs. end_pos: (`torch.IntTensor` of shape `()`): @@ -887,17 +884,14 @@ class RealmForOpenQAOutput(ModelOutput): Outputs of RealmForOpenQA models. Args: - searcher_output (`dict`): - Searcher output. reader_output (`dict`): Reader output. - predicted_answer (`str`): - Predicted answer. + predicted_answer_ids (`torch.LongTensor` of shape `(answer_sequence_length)`): + Predicted answer ids. """ - searcher_output: dict = None reader_output: dict = None - predicted_answer: str = None + predicted_answer_ids: torch.LongTensor = None class RealmPredictionHeadTransform(nn.Module): @@ -975,11 +969,11 @@ def span_candidates(masks): Generate span candidates. Args: - masks: [num_retrievals, max_sequence_len] + masks: [num_retrievals, max_sequence_len] Returns: - starts: [num_spans] ends: [num_spans] span_masks: [num_retrievals, num_spans] - whether spans locate in evidence block. + starts: [num_spans] ends: [num_spans] span_masks: [num_retrievals, num_spans] + whether spans locate in evidence block. """ _, max_sequence_len = masks.shape @@ -1196,7 +1190,7 @@ def forward( @add_start_docstrings( - "The retriever of REALM outputting relevance score representing the score of document candidates (before softmax).", + "The scorer of REALM outputting relevance score representing the score of document candidates (before softmax).", REALM_START_DOCSTRING, ) class RealmScorer(RealmPreTrainedModel): @@ -1640,9 +1634,29 @@ def mask_to_score(mask): REALM_FOR_OPEN_QA_DOCSTRING = r""" Args: - question (`str`): - OpenQA Question. - answer_ids (`torch.LongTensor` of shape `(num_answers, answer_length)`, *optional*): + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`RealmTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for + details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token (should not be used in this model by design). + + [What are token type IDs?](../glossary#token-type-ids) + answer_ids (`list` of shape `(num_answers, answer_length)`, *optional*): Answer ids for computing the marginal log-likelihood loss. Indices should be in `[-1, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-1` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` return_dict (`bool`, *optional*): @@ -1663,7 +1677,7 @@ def __init__(self, config, retriever=None): "block_emb", torch.zeros(()).new_empty( size=(config.num_block_records, config.retriever_proj_size), - dtype=torch.FloatTensor, + dtype=torch.float32, device=torch.device("cpu"), ), ) @@ -1677,9 +1691,15 @@ def beam_size(self): return self.config.searcher_beam_size return self.config.reader_beam_size - @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING) + @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING.format("1, sequence_length")) @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC) - def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, return_dict=None): + def forward(self, + input_ids, + attention_mask=None, + token_type_ids=None, + answer_ids=None, + return_dict=None, + ): r""" Returns: @@ -1687,7 +1707,7 @@ def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, re ```python >>> import torch - >>> from transformers import RealmForOpenQA, RealmTokenizer + >>> from transformers import RealmForOpenQA, RealmRetriever, RealmTokenizer >>> retriever = RealmRetriever.from_pretrained("qqaatw/realm-open-qa") >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-open-qa") @@ -1712,13 +1732,12 @@ def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, re # [1, projection_size] question_projection = question_outputs[0] - + # [1, block_emb_size] batch_scores = torch.einsum("BD,QD->QB", self.block_emb, question_projection) # [1, searcher_beam_size] _, retrieved_block_ids = torch.topk(batch_scores, k=self.beam_size, dim=-1) - # [searcher_beam_size] - # Must return cpu tensor for subsequent numpy operations + # Must convert to cpu tensor for subsequent numpy operations retrieved_block_ids = retrieved_block_ids.squeeze().cpu() # Retrieve possible answers @@ -1729,6 +1748,8 @@ def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, re start_pos = torch.tensor(start_pos, dtype=torch.long, device=self.reader.device) end_pos = torch.tensor(end_pos, dtype=torch.long, device=self.reader.device) + concat_inputs = concat_inputs.to(self.reader.device) + # [searcher_beam_size, projection_size] retrieved_block_emb = torch.index_select( self.block_emb, dim=0, index=retrieved_block_ids.to(self.block_emb.device) @@ -1754,10 +1775,9 @@ def forward(self, input_ids, token_type_ids, attention_mask, answer_ids=None, re ] if not return_dict: - return answer_ids, searcher_output, reader_output + return reader_output, predicted_answer_ids return RealmForOpenQAOutput( - predicted_answer=predicted_answer_ids, - searcher_output={}, reader_output=reader_output, - ) + 
predicted_answer_ids=predicted_answer_ids, + ) \ No newline at end of file diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py index beb2235bbfc4..361bcead0ebc 100644 --- a/src/transformers/models/realm/retrieval_realm.py +++ b/src/transformers/models/realm/retrieval_realm.py @@ -15,9 +15,9 @@ """Realm Retriever model implementation.""" import numpy as np -# from ...tokenization_utils_base import BatchEncoding from ...utils import logging - +from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings +from .modeling_realm import REALM_START_DOCSTRING logger = logging.get_logger(__name__) @@ -31,7 +31,6 @@ def convert_tfrecord_to_np(block_records_path, num_block_records): return np_record - class ScaNNSearcher: def __init__( self, @@ -56,12 +55,20 @@ def __init__( def search_batched(self, question_projection): retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) - # Must return cpu tensor for subsequent numpy operations - # return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) return retrieved_block_ids.astype("int64") class RealmRetriever: + """"The retriever of REALM outputting retrieved evidence block and whether the block has answers." + + Parameters: + config ([`RealmConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model + weights. + tokenizer ([`RealmTokenizer`]): RealmTokenizer to encode retrieved texts. + block_records_path ([`str`]): The path of `block_records`, which cantains evidence texts. 
+ """ def __init__(self, config, tokenizer, block_records_path): super().__init__() self.config = config @@ -71,7 +78,6 @@ def __init__(self, config, tokenizer, block_records_path): ) self.tokenizer = tokenizer - # ) -> BatchEncoding: def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, return_tensors="pt"): retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) @@ -88,7 +94,6 @@ def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, return_t ) concat_inputs_tensors = concat_inputs.convert_to_tensors(return_tensors) - # concat inputs should come from the retriever here if answer_ids is not None: return self.block_has_answer(concat_inputs, answer_ids) + (concat_inputs_tensors,) else: @@ -104,11 +109,13 @@ def block_has_answer(self, concat_inputs, answer_ids): for input_id in concat_inputs.input_ids: start_pos.append([]) end_pos.append([]) - sep_idx = input_id.index(self.tokenizer.sep_token_id) + input_id_list = input_id.tolist() + # Checking answers after the [SEP] token + sep_idx = input_id_list.index(self.tokenizer.sep_token_id) for answer in answer_ids: for idx in range(sep_idx, len(input_id)): - if answer[0] == input_id[idx]: - if input_id[idx : idx + len(answer)] == answer: + if answer[0] == input_id_list[idx]: + if input_id_list[idx : idx + len(answer)] == answer: start_pos[-1].append(idx) end_pos[-1].append(idx + len(answer) - 1) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 35a008bab698..1a277c89bf76 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -39,8 +39,8 @@ # Direct download link # https://storage.cloud.google.com/orqa-data/enwiki-20181220/blocks.tfr -# BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" -BLOCK_RECORDS_PATH = "/home/patrick/realm/blocks.tfr" +BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" +#BLOCK_RECORDS_PATH = "/home/patrick/realm/blocks.tfr" class RealmModelTester: @@ -126,7 +126,6 @@ def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) candiate_input_ids = ids_tensor([self.batch_size, self.num_candidates, self.seq_length], self.vocab_size) reader_input_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.vocab_size) - searcher_input_ids = ids_tensor([1, self.searcher_seq_len], self.vocab_size) input_mask = None candiate_input_mask = None @@ -135,7 +134,6 @@ def prepare_config_and_inputs(self): input_mask = random_attention_mask([self.batch_size, self.seq_length]) candiate_input_mask = random_attention_mask([self.batch_size, self.num_candidates, self.seq_length]) reader_input_mask = random_attention_mask([self.reader_beam_size, self.reader_seq_len]) - searcher_input_mask = random_attention_mask([1, self.reader_seq_len]) token_type_ids = None candidate_token_type_ids = None @@ -146,7 +144,6 @@ def prepare_config_and_inputs(self): [self.batch_size, self.num_candidates, self.seq_length], self.type_vocab_size ) reader_token_type_ids = ids_tensor([self.reader_beam_size, self.reader_seq_len], self.type_vocab_size) - searcher_token_type_ids = ids_tensor([1, self.searcher_seq_len], self.type_vocab_size) sequence_labels = None token_labels = None @@ -159,19 +156,17 @@ def prepare_config_and_inputs(self): config = self.get_config() # inputs with additional num_candidates axis. 
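A detail worth noting about the `block_has_answer` helper touched above: it builds ragged per-block span lists, and the original implementation padded them with -1 up to the longest list ("Pad -1 to max_answers") before tensorizing. A sketch of that padding convention (`pad_spans` is a hypothetical name, not part of the module):

```python
import torch

def pad_spans(start_pos, end_pos, pad_value=-1):
    # Pad ragged per-block span lists to rectangular [num_blocks, max_answers] tensors.
    max_answers = max(len(spans) for spans in start_pos)
    for starts, ends in zip(start_pos, end_pos):
        starts += [pad_value] * (max_answers - len(starts))
        ends += [pad_value] * (max_answers - len(ends))
    return torch.tensor(start_pos), torch.tensor(end_pos)

starts, ends = pad_spans([[4], [2, 7]], [[5], [3, 8]])
assert starts.tolist() == [[4, -1], [2, 7]] and ends.tolist() == [[5, -1], [3, 8]]
```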
- retriever_encoder_inputs = (candiate_input_ids, candiate_input_mask, candidate_token_type_ids) + scorer_encoder_inputs = (candiate_input_ids, candiate_input_mask, candidate_token_type_ids) # reader inputs reader_inputs = (reader_input_ids, reader_input_mask, reader_token_type_ids) - searcher_inputs = (searcher_input_ids, searcher_input_mask, searcher_token_type_ids) return ( config, input_ids, token_type_ids, input_mask, - retriever_encoder_inputs, + scorer_encoder_inputs, reader_inputs, - searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -200,9 +195,8 @@ def create_and_check_embedder( input_ids, token_type_ids, input_mask, - retriever_encoder_inputs, + scorer_encoder_inputs, reader_inputs, - searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -219,9 +213,8 @@ def create_and_check_encoder( input_ids, token_type_ids, input_mask, - retriever_encoder_inputs, + scorer_encoder_inputs, reader_inputs, - searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -231,9 +224,9 @@ def create_and_check_encoder( model.eval() relevance_score = floats_tensor([self.batch_size, self.num_candidates]) result = model( - retriever_encoder_inputs[0], - attention_mask=retriever_encoder_inputs[1], - token_type_ids=retriever_encoder_inputs[2], + scorer_encoder_inputs[0], + attention_mask=scorer_encoder_inputs[1], + token_type_ids=scorer_encoder_inputs[2], relevance_score=relevance_score, labels=token_labels, ) @@ -247,9 +240,8 @@ def create_and_check_reader( input_ids, token_type_ids, input_mask, - retriever_encoder_inputs, + scorer_encoder_inputs, reader_inputs, - searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -269,15 +261,14 @@ def create_and_check_reader( self.parent.assertEqual(result.start_pos.shape, ()) self.parent.assertEqual(result.end_pos.shape, ()) - def create_and_check_retriever( + def create_and_check_scorer( self, config, input_ids, token_type_ids, input_mask, - retriever_encoder_inputs, + scorer_encoder_inputs, reader_inputs, - searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -289,9 +280,9 @@ def create_and_check_retriever( input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, - candidate_input_ids=retriever_encoder_inputs[0], - candidate_attention_mask=retriever_encoder_inputs[1], - candidate_token_type_ids=retriever_encoder_inputs[2], + candidate_input_ids=scorer_encoder_inputs[0], + candidate_attention_mask=scorer_encoder_inputs[1], + candidate_token_type_ids=scorer_encoder_inputs[2], ) self.parent.assertEqual(result.relevance_score.shape, (self.batch_size, self.num_candidates)) self.parent.assertEqual(result.query_score.shape, (self.batch_size, self.retriever_proj_size)) @@ -306,9 +297,8 @@ def prepare_config_and_inputs_for_common(self): input_ids, token_type_ids, input_mask, - retriever_encoder_inputs, + scorer_encoder_inputs, reader_inputs, - searcher_inputs, sequence_labels, token_labels, choice_labels, @@ -362,7 +352,7 @@ def test_model_various_embeddings(self): def test_retriever(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_retriever(*config_and_inputs) + self.model_tester.create_and_check_scorer(*config_and_inputs) def test_training(self): if not self.model_tester.is_training: @@ -392,9 +382,7 @@ def test_encoder_from_pretrained(self): @slow def test_open_qa_from_pretrained(self): # TODO: TF record dataset - model = RealmForOpenQA.from_pretrained( - "qqaatw/realm-orqa-nq-searcher", "qqaatw/realm-orqa-nq-reader", BLOCK_RECORDS_PATH - ) + 
model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-openqa", BLOCK_RECORDS_PATH) self.assertIsNotNone(model) @slow @@ -403,7 +391,7 @@ def test_reader_from_pretrained(self): self.assertIsNotNone(model) @slow - def test_retriever_from_pretrained(self): + def test_scorer_from_pretrained(self): model = RealmScorer.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever") self.assertIsNotNone(model) @@ -449,11 +437,11 @@ def test_inference_open_qa(self): config = RealmConfig(use_scann=False) - tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-searcher") + tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-openqa") retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH) model = RealmForOpenQA.from_pretrained( - "patrickvonplaten/realm-open-qa", + "qqaatw/realm-orqa-nq-openqa", retriever=retriever, config=config, ) @@ -468,7 +456,7 @@ def test_inference_open_qa(self): return_tensors="pt", ).to(model.device) - predicted_answer_ids = model(**question).predicted_answer + predicted_answer_ids = model(**question).predicted_answer_ids predicted_answer = tokenizer.decode(predicted_answer_ids) self.assertEqual(predicted_answer, "alan mathison turing") @@ -502,7 +490,7 @@ def test_inference_reader(self): self.assertTrue(torch.allclose(output.end_pos, expected_end_pos, atol=1e-4)) @slow - def test_inference_retriever(self): + def test_inference_scorer(self): num_candidates = 2 model = RealmScorer.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates) From 167b17be47a25788bf7e1ae7851ac33980e5d9db Mon Sep 17 00:00:00 2001 From: qqaatw Date: Tue, 4 Jan 2022 23:45:16 +0800 Subject: [PATCH 76/98] Style --- src/transformers/models/realm/__init__.py | 8 ++------ src/transformers/models/realm/modeling_realm.py | 14 ++++++-------- src/transformers/models/realm/retrieval_realm.py | 13 +++++++------ tests/test_modeling_realm.py | 2 +- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index 66bfa56948a2..33105a6927f2 100644 --- a/src/transformers/models/realm/__init__.py +++ b/src/transformers/models/realm/__init__.py @@ -37,9 +37,7 @@ "RealmScorer", "load_tf_weights_in_realm", ] - _import_structure["retrieval_realm"] = [ - "RealmRetriever", - ] + _import_structure["retrieval_realm"] = ["RealmRetriever"] if TYPE_CHECKING: @@ -57,9 +55,7 @@ RealmScorer, load_tf_weights_in_realm, ) - from .retrieval_realm import ( - RealmRetriever - ) + from .retrieval_realm import RealmRetriever else: diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 93092a9c2832..917087985e09 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -18,9 +18,8 @@ import math import os from dataclasses import dataclass -from typing import Optional, Tuple, TypeVar +from typing import Optional, Tuple -import numpy as np import torch from packaging import version from torch import nn @@ -42,10 +41,8 @@ ) from ...utils import logging from .configuration_realm import RealmConfig -from .utils_realm import BruteForceSearcher, ScaNNSearcher, convert_tfrecord_to_np -T = TypeVar("T", bound="Module") logger = logging.get_logger(__name__) _BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert" _EMBEDDER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-embedder" @@ -1693,13 +1690,14 @@ def beam_size(self): 
@add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING.format("1, sequence_length"))
     @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC)
-    def forward(self,
+    def forward(
+        self,
         input_ids,
-        attention_mask=None, 
+        attention_mask=None,
         token_type_ids=None,
         answer_ids=None,
         return_dict=None,
-    ):
+    ):
         r"""
         Returns:

@@ -1780,4 +1778,4 @@ def forward(self,
         return RealmForOpenQAOutput(
             reader_output=reader_output,
             predicted_answer_ids=predicted_answer_ids,
-        )
\ No newline at end of file
+        )
diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py
index 361bcead0ebc..7e8e60e7ed4f 100644
--- a/src/transformers/models/realm/retrieval_realm.py
+++ b/src/transformers/models/realm/retrieval_realm.py
@@ -16,8 +16,7 @@
 import numpy as np

 from ...utils import logging
-from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
-from .modeling_realm import REALM_START_DOCSTRING
+

 logger = logging.get_logger(__name__)

@@ -31,6 +30,7 @@ def convert_tfrecord_to_np(block_records_path, num_block_records):

     return np_record

+
 class ScaNNSearcher:
     def __init__(
         self,
@@ -59,16 +59,17 @@ def search_batched(self, question_projection):


 class RealmRetriever:
-    """"The retriever of REALM outputting retrieved evidence block and whether the block has answers."
-    
+    """The retriever of REALM outputting the retrieved evidence block and whether the block has answers.
+
     Parameters:
         config ([`RealmConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
             configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-        tokenizer ([`RealmTokenizer`]): RealmTokenizer to encode retrieved texts.
-        block_records_path ([`str`]): The path of `block_records`, which cantains evidence texts.
+        tokenizer ([`RealmTokenizer`]): The tokenizer to encode retrieved texts.
+        block_records_path (`str`): The path of `block_records`, which contains evidence texts.
""" + def __init__(self, config, tokenizer, block_records_path): super().__init__() self.config = config diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 1a277c89bf76..9b05eb2c217d 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -40,7 +40,7 @@ # Direct download link # https://storage.cloud.google.com/orqa-data/enwiki-20181220/blocks.tfr BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" -#BLOCK_RECORDS_PATH = "/home/patrick/realm/blocks.tfr" +# BLOCK_RECORDS_PATH = "/home/patrick/realm/blocks.tfr" class RealmModelTester: From 9627fe8d6f7b9183d0fffd7146429aa895cef292 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 5 Jan 2022 00:02:13 +0800 Subject: [PATCH 77/98] Fixup --- .../models/realm/modeling_realm.py | 14 ++-- src/transformers/models/realm/utils_realm.py | 69 ------------------- src/transformers/utils/dummy_pt_objects.py | 5 ++ 3 files changed, 13 insertions(+), 75 deletions(-) delete mode 100644 src/transformers/models/realm/utils_realm.py diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 917087985e09..40c4609e9969 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -54,8 +54,10 @@ "qqaatw/realm-cc-news-pretrained-bert", "qqaatw/realm-cc-news-pretrained-embedder", "qqaatw/realm-cc-news-pretrained-retriever", - "qqaatw/realm-orqa-nq-searcher", + "qqaatw/realm-orqa-nq-openqa", "qqaatw/realm-orqa-nq-reader", + "qqaatw/realm-orqa-wq-openqa", + "qqaatw/realm-orqa-wq-reader", # See all REALM models at https://huggingface.co/models?filter=realm ] @@ -838,13 +840,13 @@ class RealmReaderOutput(ModelOutput): reader_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided): Reader loss. retriever_correct (`torch.BoolTensor` of shape `(config.searcher_beam_size,)`, *optional*): - Whether or not a evidence block derived from *RealmSearcher* contains answer. + Whether or not an evidence block contains answer. reader_correct (`torch.BoolTensor` of shape `(config.reader_beam_size, num_candidates)`, *optional*): Whether or not a span candidate contains answer. block_idx (`torch.LongTensor` of shape `()`): - The index of retrieved evidence blocks in which the predicted answer most likely. + The index of the retrieved evidence block in which the predicted answer is most likely. candidate (`torch.LongTensor` of shape `()`): - The index of retrieved span candidates in which the predicted answer most likely. + The index of the retrieved span candidates in which the predicted answer is most likely. start_pos (`torch.IntTensor` of shape `()`): Predicted answer starting position in *RealmReader*'s inputs. end_pos: (`torch.IntTensor` of shape `()`): @@ -1498,7 +1500,7 @@ def forward( ): r""" relevance_score (`torch.FloatTensor` of shape `(searcher_beam_size,)`, *optional*): - Relevance score derived from *RealmSearcher*, must be specified if you want to compute the marginal log + Relevance score, which must be specified if you want to compute the marginal log loss. start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. @@ -1509,7 +1511,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). 
Position outside of the sequence are not taken into account for computing the loss. has_answers (`torch.BoolTensor` of shape `(searcher_beam_size,)`, *optional*): - Whether or not the evidence blocks derived from *RealmSearcher* have answer(s). + Whether or not the evidence block has answer(s). Returns: """ diff --git a/src/transformers/models/realm/utils_realm.py b/src/transformers/models/realm/utils_realm.py deleted file mode 100644 index 187a282fc274..000000000000 --- a/src/transformers/models/realm/utils_realm.py +++ /dev/null @@ -1,69 +0,0 @@ -# coding=utf-8 -# Copyright 2021 The REALM authors and The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Utilities for REALM.""" - -import torch - - -class ScaNNSearcher: - def __init__( - self, - db, - num_neighbors, - dimensions_per_block=2, - num_leaves=1000, - num_leaves_to_search=100, - training_sample_size=100000, - ): - """Build scann searcher.""" - - from scann.scann_ops.py.scann_ops_pybind import builder as Builder - - builder = Builder(db=db, num_neighbors=num_neighbors, distance_measure="dot_product") - builder = builder.tree( - num_leaves=num_leaves, num_leaves_to_search=num_leaves_to_search, training_sample_size=training_sample_size - ) - builder = builder.score_ah(dimensions_per_block=dimensions_per_block) - - self.searcher = builder.build() - - def search_batched(self, question_projection): - retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu()) - # Must return cpu tensor for subsequent numpy operations - # return torch.tensor(retrieved_block_ids.astype("int64"), device=torch.device("cpu")) - return retrieved_block_ids.astype("int64") - - -class BruteForceSearcher: - def __init__(self, db, num_neighbors): - """Build brute force searcher.""" - self.db = db - self.num_neighbors = num_neighbors - - def search_batched(self, question_projection): - batch_scores = torch.einsum("BD,QD->QB", self.db, question_projection) - _, retrieved_block_ids = torch.topk(batch_scores, k=self.num_neighbors, dim=-1) - # Must return cpu tensor for subsequent numpy operations - return retrieved_block_ids.cpu() - - -def convert_tfrecord_to_np(block_records_path, num_block_records): - import tensorflow.compat.v1 as tf - - blocks_dataset = tf.data.TFRecordDataset(block_records_path, buffer_size=512 * 1024 * 1024) - blocks_dataset = blocks_dataset.batch(num_block_records, drop_remainder=True) - np_record = next(blocks_dataset.take(1).as_numpy_iterator()) - - return np_record diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 65407f5dfbe8..91a4fae468a6 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -3988,6 +3988,11 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class RealmRetriever: + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class RealmScorer: def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) From 
8bbebd485e6e71135de24fa4c1b7d0d8371f86b8 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Wed, 5 Jan 2022 00:13:28 +0800 Subject: [PATCH 78/98] Fixup --- .../models/realm/configuration_realm.py | 18 ++-- .../models/realm/modeling_realm.py | 96 ++++++++++--------- .../models/realm/retrieval_realm.py | 3 +- .../models/realm/tokenization_realm.py | 12 +-- 4 files changed, 65 insertions(+), 64 deletions(-) diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index f794e31d08a5..f36a599f69db 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" REALM model configuration """ +""" REALM model configuration""" from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -47,15 +47,15 @@ class RealmConfig(PretrainedConfig): Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM [realm-cc-news-pretrained](https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder) architecture. - Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model - outputs. Read the documentation from [`PretrainedConfig`] for more information. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Args: vocab_size (`int`, *optional*, defaults to 30522): Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`RealmEmbedder`], - [`RealmScorer`], [`RealmKnowledgeAugEncoder`], or [`RealmReader`]. + `inputs_ids` passed when calling [`RealmEmbedder`], [`RealmScorer`], [`RealmKnowledgeAugEncoder`], or + [`RealmReader`]. hidden_size (`int`, *optional*, defaults to 768): Dimension of the encoder layers and the pooler layer. retriever_proj_size (`int`, *optional*, defaults to 128): @@ -69,8 +69,8 @@ class RealmConfig(PretrainedConfig): intermediate_size (`int`, *optional*, defaults to 3072): Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`): - The non-linear activation function (function or string) in the encoder and pooler. If string, - `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. hidden_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): @@ -79,8 +79,8 @@ class RealmConfig(PretrainedConfig): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): - The vocabulary size of the `token_type_ids` passed when calling [`RealmEmbedder`], - [`RealmScorer`], [`RealmKnowledgeAugEncoder`], or [`RealmReader`]. 
+ The vocabulary size of the `token_type_ids` passed when calling [`RealmEmbedder`], [`RealmScorer`], + [`RealmKnowledgeAugEncoder`], or [`RealmReader`]. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 40c4609e9969..7adec5d17afa 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch REALM model. """ +""" PyTorch REALM model.""" import math @@ -792,12 +792,13 @@ class RealmEmbedderOutput(ModelOutput): Projected score. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape `(batch_size, sequence_length, hidden_size)`. + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -852,12 +853,13 @@ class RealmReaderOutput(ModelOutput): end_pos: (`torch.IntTensor` of shape `()`): Predicted answer ending position in *RealmReader*'s inputs. hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape `(batch_size, sequence_length, hidden_size)`. + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`. + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -1063,15 +1065,14 @@ def _flatten_inputs(self, *inputs): REALM_START_DOCSTRING = r""" - This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. 
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage - and behavior. + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. Parameters: config ([`RealmConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the - configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model - weights. + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ REALM_INPUTS_DOCSTRING = r""" @@ -1079,9 +1080,8 @@ def _flatten_inputs(self, *inputs): input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using [`RealmTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for - details. + Indices can be obtained using [`RealmTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): @@ -1092,14 +1092,16 @@ def _flatten_inputs(self, *inputs): [What are attention masks?](../glossary#attention-mask) token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. [What are token type IDs?](../glossary#token-type-ids) position_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`. + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. [What are position IDs?](../glossary#position-ids) head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): @@ -1109,9 +1111,9 @@ def _flatten_inputs(self, *inputs): - 0 indicates the head is **masked**. inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert *input_ids* indices into associated vectors - than the model's internal embedding lookup matrix. + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. @@ -1230,9 +1232,8 @@ def forward( candidate_input_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`): Indices of candidate input sequence tokens in the vocabulary. - Indices can be obtained using [`RealmTokenizer`]. 
See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for - details. + Indices can be obtained using [`RealmTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) candidate_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*): @@ -1243,7 +1244,8 @@ def forward( [What are attention masks?](../glossary#attention-mask) candidate_token_type_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token. @@ -1359,8 +1361,9 @@ def forward( modeling loss. labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): - Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored - (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` mlm_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid calculating joint loss on certain positions. If not specified, the loss will not be masked. @@ -1377,14 +1380,11 @@ def forward( >>> import torch >>> from transformers import RealmTokenizer, RealmKnowledgeAugEncoder - >>> tokenizer = RealmTokenizer.from_pretrained('qqaatw/realm-cc-news-pretrained-bert') - >>> model = RealmKnowledgeAugEncoder.from_pretrained('qqaatw/realm-cc-news-pretrained-bert', num_candidates=2) + >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") + >>> model = RealmKnowledgeAugEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert", num_candidates=2) >>> # batch_size = 2, num_candidates = 2 - >>> text = [ - >>> ["Hello world!", "Nice to meet you!"], - >>> ["The cute cat.", "The adorable dog."] - >>> ] + >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]] >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") >>> outputs = model(**inputs) @@ -1500,16 +1500,15 @@ def forward( ): r""" relevance_score (`torch.FloatTensor` of shape `(searcher_beam_size,)`, *optional*): - Relevance score, which must be specified if you want to compute the marginal log - loss. + Relevance score, which must be specified if you want to compute the marginal log loss. start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the - sequence are not taken into account for computing the loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. has_answers (`torch.BoolTensor` of shape `(searcher_beam_size,)`, *optional*): Whether or not the evidence block has answer(s). @@ -1636,9 +1635,8 @@ def mask_to_score(mask): input_ids (`torch.LongTensor` of shape `({0})`): Indices of input sequence tokens in the vocabulary. - Indices can be obtained using [`RealmTokenizer`]. See - [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for - details. + Indices can be obtained using [`RealmTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): @@ -1649,15 +1647,17 @@ def mask_to_score(mask): [What are attention masks?](../glossary#attention-mask) token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): - Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`: + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: - 0 corresponds to a *sentence A* token, - 1 corresponds to a *sentence B* token (should not be used in this model by design). [What are token type IDs?](../glossary#token-type-ids) answer_ids (`list` of shape `(num_answers, answer_length)`, *optional*): - Answer ids for computing the marginal log-likelihood loss. Indices should be in `[-1, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-1` are ignored (masked), - the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + Answer ids for computing the marginal log-likelihood loss. Indices should be in `[-1, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-1` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` return_dict (`bool`, *optional*): Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. """ @@ -1715,7 +1715,13 @@ def forward( >>> question = "Who is the pioneer in modern computer science?" >>> quastion_ids = tokenizer(question, return_tensors="pt").input_ids - >>> answer_ids = tokenizer("alan mathison turing", add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False, return_tensors="pt").input_ids + >>> answer_ids = tokenizer( + ... "alan mathison turing", + ... add_special_tokens=False, + ... return_token_type_ids=False, + ... return_attention_mask=False, + ... 
return_tensors="pt",
+        ... ).input_ids

         >>> searcher_output, reader_output, predicted_answer = model(question_ids, answer_ids)
         >>> loss = reader_output.loss
         ```"""
diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py
index 7e8e60e7ed4f..563a89ad94bd 100644
--- a/src/transformers/models/realm/retrieval_realm.py
+++ b/src/transformers/models/realm/retrieval_realm.py
@@ -64,8 +64,7 @@ class RealmRetriever:
     Parameters:
         config ([`RealmConfig`]): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
-            weights.
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
         tokenizer ([`RealmTokenizer`]): The tokenizer to encode retrieved texts.
         block_records_path (`str`): The path of `block_records`, which contains evidence texts.
     """
diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py
index 7f6ee9a41a61..545214278b92 100644
--- a/src/transformers/models/realm/tokenization_realm.py
+++ b/src/transformers/models/realm/tokenization_realm.py
@@ -56,11 +56,10 @@ class RealmTokenizer(BertTokenizer):
     r"""
     Construct a REALM tokenizer.

-    [`RealmTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
-    tokenization: punctuation splitting and wordpiece.
+    [`RealmTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting and
+    wordpiece.

-    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
-    parameters.
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
""" vocab_files_names = VOCAB_FILES_NAMES @@ -99,10 +98,7 @@ def batch_encode_candidates(self, text, **kwargs): >>> from transformers import RealmTokenizer >>> # batch_size = 2, num_candidates = 2 - >>> text = [ - >>> ["Hello world!", "Nice to meet you!"], - >>> ["The cute cat.", "The adorable dog."] - >>> ] + >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]] >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt") From 5c9118a6db3cea0851808772163218bee42c4d4f Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 6 Jan 2022 00:36:58 +0800 Subject: [PATCH 79/98] Update training test --- tests/test_modeling_realm.py | 55 +++++++++++++++++++++++++++--------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 9b05eb2c217d..2f1cb9c4fa23 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -33,6 +33,7 @@ RealmForOpenQA, RealmKnowledgeAugEncoder, RealmReader, + RealmRetriever, RealmScorer, RealmTokenizer, ) @@ -50,7 +51,7 @@ def __init__( batch_size=13, retriever_proj_size=128, seq_length=7, - is_training=False, + is_training=True, use_input_mask=True, use_token_type_ids=True, use_labels=True, @@ -67,7 +68,6 @@ def __init__( type_sequence_label_size=2, initializer_range=0.02, layer_norm_eps=1e-12, - use_scann=True, span_hidden_size=50, max_span_width=10, reader_layer_norm_eps=1e-3, @@ -103,7 +103,6 @@ def __init__( self.type_sequence_label_size = type_sequence_label_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.use_scann = use_scann # Reader config self.span_hidden_size = span_hidden_size @@ -315,7 +314,7 @@ class RealmModelTest(ModelTesterMixin, unittest.TestCase): RealmEmbedder, RealmKnowledgeAugEncoder, # RealmScorer is excluded from common tests as it is a container model - # consisting of two RealmEmbedders & simple inner product calculation. + # consisting of two RealmEmbedders & a simple inner product calculation. 
# RealmScorer ) if is_torch_available() @@ -358,16 +357,46 @@ def test_training(self): if not self.model_tester.is_training: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config, *inputs = self.model_tester.prepare_config_and_inputs() + input_ids, token_type_ids, input_mask, scorer_encoder_inputs = inputs[0:4] config.return_dict = True - for model_class in [RealmKnowledgeAugEncoder]: - model = model_class(config) - model.to(torch_device) - model.train() - inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) - loss = model(**inputs).loss - loss.backward() + tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-openqa") + + # RealmKnowledgeAugEncoder training + model = RealmKnowledgeAugEncoder(config) + model.to(torch_device) + model.train() + + inputs_dict = { + "input_ids": scorer_encoder_inputs[0].to(torch_device), + "attention_mask": scorer_encoder_inputs[1].to(torch_device), + "token_type_ids": scorer_encoder_inputs[2].to(torch_device), + "relevance_score": floats_tensor([self.model_tester.batch_size, self.model_tester.num_candidates]), + } + inputs_dict["labels"] = torch.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device + ) + inputs = inputs_dict + loss = model(**inputs).loss + loss.backward() + + # RealmForOpenQA training + config.vocab_size = 30522 + retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH) # TODO: TF record dataset + model = RealmForOpenQA(config, retriever) + model.to(torch_device) + model.train() + + inputs_dict = { + "input_ids": input_ids[:1].to(torch_device), + "attention_mask": input_mask[:1].to(torch_device), + "token_type_ids": token_type_ids[:1].to(torch_device), + "answer_ids": input_ids[:1].tolist(), + } + inputs = self._prepare_for_class(inputs_dict, RealmForOpenQA) + loss = model(**inputs).reader_output.loss + loss.backward() @slow def test_embedder_from_pretrained(self): @@ -435,7 +464,7 @@ def test_inference_encoder(self): def test_inference_open_qa(self): from transformers.models.realm.retrieval_realm import RealmRetriever - config = RealmConfig(use_scann=False) + config = RealmConfig() tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-openqa") retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH) From d6d94be708375a320d6c511cfdfaeee437916e72 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 5 Jan 2022 18:48:48 +0000 Subject: [PATCH 80/98] fix retriever --- .../models/realm/modeling_realm.py | 24 ++++++----- .../models/realm/retrieval_realm.py | 41 ++++++++++++++----- tests/test_modeling_realm.py | 7 ++-- 3 files changed, 48 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 7adec5d17afa..178646b08a2c 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -1133,15 +1133,15 @@ class RealmEmbedder(RealmPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = RealmBertModel(self.config) + self.realm = RealmBertModel(self.config) self.cls = RealmScorerProjection(self.config) self.init_weights() def get_input_embeddings(self): - return self.bert.embeddings.word_embeddings + return self.realm.embeddings.word_embeddings def set_input_embeddings(self, value): - self.bert.embeddings.word_embeddings = value + self.realm.embeddings.word_embeddings = value 
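A side note on the `bert` -> `realm` attribute rename above: the attribute name determines the prefix under which the backbone weights are saved and loaded. A minimal usage sketch of the embedder after the rename — the checkpoint name and the `projected_score` output field come from this series, but the exact call pattern is illustrative, not the final API:

    import torch

    from transformers import RealmTokenizer
    from transformers.models.realm.modeling_realm import RealmEmbedder

    tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-embedder")
    embedder = RealmEmbedder.from_pretrained("qqaatw/realm-cc-news-pretrained-embedder")

    # Project a question into the retrieval space used for relevance scoring.
    inputs = tokenizer("Who is the pioneer in modern computer science?", return_tensors="pt")
    with torch.no_grad():
        # Shape: (batch_size, retriever_proj_size), i.e. (1, 128) with the default config.
        query_embedding = embedder(**inputs).projected_score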
@add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
     @replace_return_docstrings(output_type=RealmEmbedderOutput, config_class=_CONFIG_FOR_DOC)
@@ -1163,7 +1163,7 @@ def forward(

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

-        bert_outputs = self.bert(
+        bert_outputs = self.realm(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -1320,15 +1320,15 @@ class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
-        self.bert = RealmBertModel(self.config)
+        self.realm = RealmBertModel(self.config)
         self.cls = RealmOnlyMLMHead(self.config)
         self.init_weights()

     def get_input_embeddings(self):
-        return self.bert.embeddings.word_embeddings
+        return self.realm.embeddings.word_embeddings

     def set_input_embeddings(self, value):
-        self.bert.embeddings.word_embeddings = value
+        self.realm.embeddings.word_embeddings = value

     def get_output_embeddings(self):
         return self.cls.predictions.decoder
@@ -1397,7 +1397,7 @@ def forward(
             input_ids, attention_mask, token_type_ids
         )

-        joint_outputs = self.bert(
+        joint_outputs = self.realm(
             flattened_input_ids,
             attention_mask=flattened_attention_mask,
             token_type_ids=flattened_token_type_ids,
@@ -1474,7 +1474,7 @@ def __init__(self, config):
         super().__init__(config)
         self.num_labels = config.num_labels

-        self.bert = RealmBertModel(config)
+        self.realm = RealmBertModel(config)
         self.cls = RealmOnlyMLMHead(config)
         self.qa_outputs = RealmReaderProjection(config)

@@ -1522,7 +1522,7 @@ def forward(
             raise ValueError("You have to specify `token_type_ids` to separate question block and evidence block.")
         if token_type_ids.size(1) < self.config.max_span_width:
             raise ValueError("The input sequence length must be greater than or equal to config.max_span_width.")
-        outputs = self.bert(
+        outputs = self.realm(
             input_ids,
             attention_mask=attention_mask,
             token_type_ids=token_type_ids,
@@ -1747,7 +1747,9 @@ def forward(
         retrieved_block_ids = retrieved_block_ids.squeeze().cpu()

         # Retrieve possible answers
-        has_answers, start_pos, end_pos, concat_inputs = self.retriever(retrieved_block_ids, input_ids, answer_ids)
+        has_answers, start_pos, end_pos, concat_inputs = self.retriever(
+            retrieved_block_ids, input_ids, answer_ids, max_length=self.config.reader_seq_len
+        )

         if has_answers is not None:
             has_answers = torch.tensor(has_answers, dtype=torch.bool, device=self.reader.device)
diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py
index 563a89ad94bd..1747eadebb79 100644
--- a/src/transformers/models/realm/retrieval_realm.py
+++ b/src/transformers/models/realm/retrieval_realm.py
@@ -13,9 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Realm Retriever model implementation."""
+import os
+from typing import Optional, Union
+
 import numpy as np

+from huggingface_hub import hf_hub_download
+
 from ...utils import logging
+from .tokenization_realm import RealmTokenizer
+
+
+_REALM_BLOCK_RECORDS_FILENAME = "block_records.npy"

 logger = logging.get_logger(__name__)

@@ -69,16 +78,12 @@ class RealmRetriever:
         block_records_path (`str`): The path of `block_records`, which contains evidence texts.
""" - def __init__(self, config, tokenizer, block_records_path): + def __init__(self, block_records, tokenizer): super().__init__() - self.config = config - self.block_records = convert_tfrecord_to_np( - block_records_path=block_records_path, - num_block_records=config.num_block_records, - ) + self.block_records = block_records self.tokenizer = tokenizer - def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, return_tensors="pt"): + def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, max_length=None, return_tensors="pt"): retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0) question = self.tokenizer.decode(question_input_ids[0], skip_special_tokens=True) @@ -89,9 +94,7 @@ def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, return_t text.append(question) text_pair.append(retrieved_block.decode()) - concat_inputs = self.tokenizer( - text, text_pair, padding=True, truncation=True, max_length=self.config.reader_seq_len - ) + concat_inputs = self.tokenizer(text, text_pair, padding=True, truncation=True, max_length=max_length) concat_inputs_tensors = concat_inputs.convert_to_tensors(return_tensors) if answer_ids is not None: @@ -99,6 +102,24 @@ def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, return_t else: return (None, None, None, concat_inputs_tensors) + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *init_inputs, **kwargs): + + block_records_path = hf_hub_download( + repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs + ) + block_records = np.load(block_records_path, allow_pickle=True) + + tokenizer = RealmTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + + return cls(block_records, tokenizer) + + def save_pretrained(self, save_directory): + # save block records + np.save(os.path.join(save_directory, _REALM_BLOCK_RECORDS_FILENAME), self.block_records) + # save tokenizer + self.tokenizer.save_pretrained(save_directory) + def block_has_answer(self, concat_inputs, answer_ids): """check if retrieved_blocks has answers.""" has_answers = [] diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 9b05eb2c217d..45c74931ba3b 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -39,8 +39,8 @@ # Direct download link # https://storage.cloud.google.com/orqa-data/enwiki-20181220/blocks.tfr -BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" -# BLOCK_RECORDS_PATH = "/home/patrick/realm/blocks.tfr" +# BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" +BLOCK_RECORDS_PATH = "/home/patrick/realm/block_records.npy" class RealmModelTester: @@ -438,7 +438,8 @@ def test_inference_open_qa(self): config = RealmConfig(use_scann=False) tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-openqa") - retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH) + retriever = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-openqa") + # retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH) model = RealmForOpenQA.from_pretrained( "qqaatw/realm-orqa-nq-openqa", From e172e730fb6d5d3a49c76ec87b29e65b8c8195e7 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 5 Jan 2022 18:51:05 +0000 Subject: [PATCH 81/98] remove hardcoded path --- tests/test_modeling_realm.py | 3 --- 1 file changed, 3 deletions(-) diff --git 
a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 45c74931ba3b..b3437a292bd8 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -39,8 +39,6 @@ # Direct download link # https://storage.cloud.google.com/orqa-data/enwiki-20181220/blocks.tfr -# BLOCK_RECORDS_PATH = r"/mnt/sda1/REALM/language/language/data/enwiki-20181220/blocks.tfr" -BLOCK_RECORDS_PATH = "/home/patrick/realm/block_records.npy" class RealmModelTester: @@ -439,7 +437,6 @@ def test_inference_open_qa(self): tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-openqa") retriever = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-openqa") - # retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH) model = RealmForOpenQA.from_pretrained( "qqaatw/realm-orqa-nq-openqa", From ec695cb7ee636838ef17399944cf69395a3f039b Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 6 Jan 2022 14:28:16 +0800 Subject: [PATCH 82/98] Fix --- .../models/realm/modeling_realm.py | 79 +++++++++++-------- .../models/realm/retrieval_realm.py | 1 + tests/test_modeling_realm.py | 4 +- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 7adec5d17afa..d1e764a8ca48 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -89,28 +89,42 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): arrays.append(array) for name, array in zip(names, arrays): - # For reader - if not isinstance(model, RealmForOpenQA) and isinstance(model, RealmReader) and "reader" not in name: - logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter") - continue - elif not isinstance(model, RealmForOpenQA) and not isinstance(model, RealmReader) and "reader" in name: + if isinstance(model, RealmReader) and "reader" not in name: logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter") continue - # For reader - reader_prefix = "" if isinstance(model, RealmReader) else "reader/" - name = name.replace("reader/module/bert/", f"{reader_prefix}bert/") - name = name.replace("reader/module/cls/", f"{reader_prefix}cls/") - name = name.replace("reader/dense/", f"{reader_prefix}qa_outputs/dense_intermediate/") - name = name.replace("reader/dense_1/", f"{reader_prefix}qa_outputs/dense_output/") - name = name.replace("reader/layer_normalization", f"{reader_prefix}qa_outputs/layer_normalization") - - # For embedder and retriever - embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" - name = name.replace("module/module/module/bert/", f"{embedder_prefix}bert/") - name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") - name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") - name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") + # For pretrained openqa reader + if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmForOpenQA): + name = name.replace("bert/", "reader/realm/") + name = name.replace("cls/", "reader/cls/") + + # For pretrained encoder + if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmKnowledgeAugEncoder): + name = name.replace("bert/", "realm/") + + # For finetuned reader + if name.startswith("reader"): + reader_prefix = "" if isinstance(model, RealmReader) else "reader/" + name = name.replace("reader/module/bert/", 
f"{reader_prefix}realm/") + name = name.replace("reader/module/cls/", f"{reader_prefix}cls/") + name = name.replace("reader/dense/", f"{reader_prefix}qa_outputs/dense_intermediate/") + name = name.replace("reader/dense_1/", f"{reader_prefix}qa_outputs/dense_output/") + name = name.replace("reader/layer_normalization", f"{reader_prefix}qa_outputs/layer_normalization") + + # For embedder and scorer + if name.startswith("module/module/module/"): # finetuned + embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" + name = name.replace("module/module/module/module/bert/", f"{embedder_prefix}realm/") + name = name.replace("module/module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") + name = name.replace("module/module/module/dense/", f"{embedder_prefix}cls/dense/") + name = name.replace("module/module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") + name = name.replace("module/module/module/bert/", f"{embedder_prefix}realm/") + name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") + elif name.startswith("module/module/"): # pretrained + embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" + name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") + name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") + name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v @@ -643,7 +657,7 @@ def forward(self, hidden_states): class RealmBertModel(PreTrainedModel): """ - Same as the original BertModel but remvoe docstrings and inherit PreTrainedModel directly. + Same as the original BertModel but remove docstrings and inherit PreTrainedModel directly. 
""" def __init__(self, config, add_pooling_layer=True): @@ -1133,15 +1147,15 @@ class RealmEmbedder(RealmPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = RealmBertModel(self.config) + self.realm = RealmBertModel(self.config) self.cls = RealmScorerProjection(self.config) self.init_weights() def get_input_embeddings(self): - return self.bert.embeddings.word_embeddings + return self.realm.embeddings.word_embeddings def set_input_embeddings(self, value): - self.bert.embeddings.word_embeddings = value + self.realm.embeddings.word_embeddings = value @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) @replace_return_docstrings(output_type=RealmEmbedderOutput, config_class=_CONFIG_FOR_DOC) @@ -1163,7 +1177,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict - bert_outputs = self.bert( + bert_outputs = self.realm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1320,15 +1334,15 @@ def forward( class RealmKnowledgeAugEncoder(RealmPreTrainedModel): def __init__(self, config): super().__init__(config) - self.bert = RealmBertModel(self.config) + self.realm = RealmBertModel(self.config) self.cls = RealmOnlyMLMHead(self.config) self.init_weights() def get_input_embeddings(self): - return self.bert.embeddings.word_embeddings + return self.realm.embeddings.word_embeddings def set_input_embeddings(self, value): - self.bert.embeddings.word_embeddings = value + self.realm.embeddings.word_embeddings = value def get_output_embeddings(self): return self.cls.predictions.decoder @@ -1397,7 +1411,7 @@ def forward( input_ids, attention_mask, token_type_ids ) - joint_outputs = self.bert( + joint_outputs = self.realm( flattened_input_ids, attention_mask=flattened_attention_mask, token_type_ids=flattened_token_type_ids, @@ -1474,7 +1488,7 @@ def __init__(self, config): super().__init__(config) self.num_labels = config.num_labels - self.bert = RealmBertModel(config) + self.realm = RealmBertModel(config) self.cls = RealmOnlyMLMHead(config) self.qa_outputs = RealmReaderProjection(config) @@ -1522,7 +1536,7 @@ def forward( raise ValueError("You have to specify `token_type_ids` to separate question block and evidence block.") if token_type_ids.size(1) < self.config.max_span_width: raise ValueError("The input sequence length must be greater than or equal to config.max_span_width.") - outputs = self.bert( + outputs = self.realm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1714,16 +1728,15 @@ def forward( >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-open-qa", retriever=retriever) >>> question = "Who is the pioneer in modern computer science?" - >>> quastion_ids = tokenizer(question, return_tensors="pt").input_ids + >>> quastion_ids = tokenizer([question], return_tensors="pt").input_ids >>> answer_ids = tokenizer( ... "alan mathison turing", ... add_special_tokens=False, ... return_token_type_ids=False, ... return_attention_mask=False, - ... 
return_tensors="pt",
         ... ).input_ids
-        >>> searcher_output, reader_output, predicted_answer = model(question_ids, answer_ids)
+        >>> reader_output, predicted_answer = model(**question_ids, answer_ids=answer_ids)
         >>> loss = reader_output.loss
         ```"""
diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py
index 563a89ad94bd..8db2be4a801d 100644
--- a/src/transformers/models/realm/retrieval_realm.py
+++ b/src/transformers/models/realm/retrieval_realm.py
@@ -32,6 +32,7 @@ def convert_tfrecord_to_np(block_records_path, num_block_records):


 class ScaNNSearcher:
+    """Note that ScaNNSearcher cannot currently be used within the model. It might, however, be included in future versions."""
     def __init__(
         self,
         db,
diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py
index 2f1cb9c4fa23..a603e05d2a4e 100644
--- a/tests/test_modeling_realm.py
+++ b/tests/test_modeling_realm.py
@@ -447,7 +447,7 @@ def test_inference_encoder(self):
         vocab_size = 30522

         model = RealmKnowledgeAugEncoder.from_pretrained(
-            "qqaatw/realm-cc-news-pretrained-bert", num_candidates=num_candidates
+            "qqaatw/realm-cc-news-pretrained-encoder", num_candidates=num_candidates
         )
         input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])
         relevance_score = torch.tensor([[0.3, 0.7]], dtype=torch.float32)
@@ -522,7 +522,7 @@ def test_inference_scorer(self):
         num_candidates = 2

-        model = RealmScorer.from_pretrained("qqaatw/realm-cc-news-pretrained-retriever", num_candidates=num_candidates)
+        model = RealmScorer.from_pretrained("qqaatw/realm-cc-news-pretrained-scorer", num_candidates=num_candidates)

         input_ids = torch.tensor([[0, 1, 2, 3, 4, 5]])
         candidate_input_ids = torch.tensor([[0, 1, 2, 3, 4, 5], [6, 7, 8, 9, 10, 11]])

From dc865ca1d5fe02fdab1f1bd025fe5f68faa34d4f Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Thu, 6 Jan 2022 15:18:15 +0800
Subject: [PATCH 83/98] Fix modeling test

---
 tests/test_modeling_realm.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py
index 0f3cb59a583f..4851a0c06c41 100644
--- a/tests/test_modeling_realm.py
+++ b/tests/test_modeling_realm.py
@@ -38,9 +38,6 @@
     RealmTokenizer,
 )

-# Direct download link
-# https://storage.cloud.google.com/orqa-data/enwiki-20181220/blocks.tfr
-

 class RealmModelTester:
     def __init__(
@@ -380,8 +377,10 @@ def test_training(self):
         loss.backward()

         # RealmForOpenQA training
-        config.vocab_size = 30522  # the retrieved texts will inevitably have more than 99 vocabs.
-        retriever = RealmRetriever(config, tokenizer, BLOCK_RECORDS_PATH)  # TODO: TF record dataset
+        config.vocab_size = 30522  # the retrieved texts will inevitably have more than 99 vocabs.
+        # TODO: think how to provide a dummy block_records.
+        block_records = np.load(block_records_path, allow_pickle=True)
+        retriever = RealmRetriever(config, block_records, tokenizer)
         model = RealmForOpenQA(config, retriever)
         model.to(torch_device)
         model.train()

From ebd507c882d68e0767e07afb203ca0707a1a62bc Mon Sep 17 00:00:00 2001
From: qqaatw
Date: Thu, 6 Jan 2022 15:40:04 +0800
Subject: [PATCH 84/98] Update model links

---
 .../models/realm/configuration_realm.py       |  5 ++--
 .../models/realm/modeling_realm.py            |  9 +++----
 .../models/realm/tokenization_realm.py        | 24 ++++++++++++-------
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py
index f36a599f69db..cba795880642 100644
--- a/src/transformers/models/realm/configuration_realm.py
+++ b/src/transformers/models/realm/configuration_realm.py
@@ -21,9 +21,10 @@
 logger = logging.get_logger(__name__)

 REALM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "realm-cc-news-pretrained-bert": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-bert/resolve/main/config.json",
     "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/config.json",
-    "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/config.json",
+    "realm-cc-news-pretrained-encoder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-encoder/resolve/main/config.json",
+    "realm-cc-news-pretrained-scorer": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-scorer/resolve/main/config.json",
+    "realm-cc-news-pretrained-openqa": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-openqa/resolve/main/config.json",
     "realm-orqa-nq-openqa": "https://huggingface.co/qqaatw/realm-orqa-nq-openqa/resolve/main/config.json",
     "realm-orqa-nq-reader": "https://huggingface.co/qqaatw/realm-orqa-nq-reader/resolve/main/config.json",
     "realm-orqa-wq-openqa": "https://huggingface.co/qqaatw/realm-orqa-wq-openqa/resolve/main/config.json",
diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py
index 15cdf4e569c6..ecbf0ada58bd 100644
--- a/src/transformers/models/realm/modeling_realm.py
+++ b/src/transformers/models/realm/modeling_realm.py
@@ -44,16 +44,17 @@
 logger = logging.get_logger(__name__)

-_BERT_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-bert"
 _EMBEDDER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-embedder"
-_RETRIEVER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-retriever"
+_ENCODER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-encoder"
+_SCORER_CHECKPOINT_FOR_DOC = "qqaatw/realm-cc-news-pretrained-scorer"
 _CONFIG_FOR_DOC = "RealmConfig"
 _TOKENIZER_FOR_DOC = "RealmTokenizer"
 
 REALM_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "qqaatw/realm-cc-news-pretrained-bert",
     "qqaatw/realm-cc-news-pretrained-embedder",
-    "qqaatw/realm-cc-news-pretrained-retriever",
+    "qqaatw/realm-cc-news-pretrained-encoder",
+    "qqaatw/realm-cc-news-pretrained-scorer",
+    "qqaatw/realm-cc-news-pretrained-openqa",
     "qqaatw/realm-orqa-nq-openqa",
     "qqaatw/realm-orqa-nq-reader",
     "qqaatw/realm-orqa-wq-openqa",
diff --git a/src/transformers/models/realm/tokenization_realm.py b/src/transformers/models/realm/tokenization_realm.py
index 545214278b92..e29b0a7c53cb 100644
--- a/src/transformers/models/realm/tokenization_realm.py
+++ b/src/transformers/models/realm/tokenization_realm.py
@@ -27,28 +27,36 @@
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
         "realm-cc-news-pretrained-embedder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-embedder/resolve/main/vocab.txt",
-        "realm-cc-news-pretrained-retriever": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-retriever/resolve/main/vocab.txt",
         "realm-cc-news-pretrained-encoder": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-encoder/resolve/main/vocab.txt",
-        "realm-orqa-nq-searcher": "https://huggingface.co/qqaatw/realm-orqa-nq-searcher/resolve/main/vocab.txt",
-        "realm-orqa-nq-reader": "https://huggingface.co/qqaatw/realm-orqa-nq-searcher/resolve/main/vocab.txt",
+        "realm-cc-news-pretrained-scorer": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-scorer/resolve/main/vocab.txt",
+        "realm-cc-news-pretrained-openqa": "https://huggingface.co/qqaatw/realm-cc-news-pretrained-openqa/resolve/main/vocab.txt",
+        "realm-orqa-nq-openqa": "https://huggingface.co/qqaatw/realm-orqa-nq-openqa/resolve/main/vocab.txt",
+        "realm-orqa-nq-reader": "https://huggingface.co/qqaatw/realm-orqa-nq-reader/resolve/main/vocab.txt",
+        "realm-orqa-wq-openqa": "https://huggingface.co/qqaatw/realm-orqa-wq-openqa/resolve/main/vocab.txt",
+        "realm-orqa-wq-reader": "https://huggingface.co/qqaatw/realm-orqa-wq-reader/resolve/main/vocab.txt",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     "realm-cc-news-pretrained-embedder": 512,
-    "realm-cc-news-pretrained-retriever": 512,
     "realm-cc-news-pretrained-encoder": 512,
-    "realm-orqa-nq-searcher": 512,
+    "realm-cc-news-pretrained-scorer": 512,
+    "realm-cc-news-pretrained-openqa": 512,
+    "realm-orqa-nq-openqa": 512,
     "realm-orqa-nq-reader": 512,
+    "realm-orqa-wq-openqa": 512,
+    "realm-orqa-wq-reader": 512,
 }
 
-
 PRETRAINED_INIT_CONFIGURATION = {
     "realm-cc-news-pretrained-embedder": {"do_lower_case": True},
-    "realm-cc-news-pretrained-retriever": {"do_lower_case": True},
     "realm-cc-news-pretrained-encoder": {"do_lower_case": True},
-    "realm-orqa-nq-searcher": {"do_lower_case": True},
+    "realm-cc-news-pretrained-scorer": {"do_lower_case": True},
+    "realm-cc-news-pretrained-openqa": {"do_lower_case": True},
+    "realm-orqa-nq-openqa": {"do_lower_case": True},
     "realm-orqa-nq-reader": {"do_lower_case": True},
+    "realm-orqa-wq-openqa": {"do_lower_case": True},
+    "realm-orqa-wq-reader": {"do_lower_case": True},
 }

From db2f4fe0364550d9ab6f2286cd8d85d3338b0bb5 Mon Sep 17 00:00:00 2001
From: qqaatw 
Date: Thu, 6 Jan 2022 16:24:41 +0800
Subject: [PATCH 85/98] Initial retrieval test

---
 tests/test_retrieval_realm.py | 91 +++++++++++++++++++++++++++++++++++
 1 file changed, 91 insertions(+)
 create mode 100644 tests/test_retrieval_realm.py

diff --git a/tests/test_retrieval_realm.py 
b/tests/test_retrieval_realm.py new file mode 100644 index 000000000000..b9abaed81846 --- /dev/null +++ b/tests/test_retrieval_realm.py @@ -0,0 +1,91 @@ +import os +import shutil +import tempfile +from unittest import TestCase + +import numpy as np +from datasets import Dataset + +from transformers.models.realm.configuration_realm import RealmConfig +from transformers.models.realm.retrieval_realm import RealmRetriever +from transformers.models.realm.tokenization_realm import RealmTokenizer, VOCAB_FILES_NAMES +from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch + + +class RealmRetrieverTest(TestCase): + def setUp(self): + self.tmpdirname = tempfile.mkdtemp() + self.num_block_records = 10 + + # Realm tok + vocab_tokens = [ + "[UNK]", + "[CLS]", + "[SEP]", + "[PAD]", + "[MASK]", + "want", + "##want", + "##ed", + "wa", + "un", + "runn", + "##ing", + ",", + "low", + "lowest", + ] + realm_tokenizer_path = os.path.join(self.tmpdirname, "realm_tokenizer") + os.makedirs(realm_tokenizer_path, exist_ok=True) + self.vocab_file = os.path.join(realm_tokenizer_path, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self) -> RealmTokenizer: + return RealmTokenizer.from_pretrained(os.path.join(self.tmpdirname, "realm_tokenizer")) + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def get_dummy_dataset(self): + dataset = Dataset.from_dict( + { + "id": ["0", "1"], + "question": ["foo", "bar"], + "answers": [["Foo", "Bar"], ["Bar"]], + } + ) + return dataset + + def get_dummy_block_records(self): + np_block_records = np.array( + [ + "This is the first record", + "This is the second record", + ], + np.object, + ) + return np_block_records + + def get_dummy_retriever(self): + config = RealmConfig( + num_block_records=self.num_block_records + ) + retriever = RealmRetriever( + config, + block_records=self.get_dummy_block_records, + tokenizer=self.get_tokenizer(), + ) + return retriever + + def test_retrieve(self): + pass + + def test_block_has_answer(self): + pass + + def test_from_pretrained(self): + pass + + def test_save_pretrained(self): + pass From 16577d74930bf0764a5f22bd1dc6c1246750eac2 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 6 Jan 2022 16:41:21 +0800 Subject: [PATCH 86/98] Fix modeling test --- tests/test_modeling_realm.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py index 4851a0c06c41..81a801340848 100644 --- a/tests/test_modeling_realm.py +++ b/tests/test_modeling_realm.py @@ -16,6 +16,8 @@ import unittest +import copy +import numpy as np from tests.test_modeling_common import floats_tensor from transformers import RealmConfig, is_torch_available @@ -377,11 +379,23 @@ def test_training(self): loss.backward() # RealmForOpenQA training - config.vocab_size = 30522 # the retrieved texts will inevitably have more than 99 vocabs. - # TODO: think how to provide a dummy block_records. - block_records = np.load(block_records_path, allow_pickle=True) - retriever = RealmRetriever(config, block_records, tokenizer) - model = RealmForOpenQA(config, retriever) + openqa_config = copy.deepcopy(config) + openqa_config.vocab_size = 30522 # the retrieved texts will inevitably have more than 99 vocabs. 
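+        # Keep the retrieval space tiny: five dummy evidence blocks below, with a
+        # searcher beam of two, so the open-QA training step stays cheap.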
+        openqa_config.num_block_records = 5
+        openqa_config.searcher_beam_size = 2
+
+        block_records = np.array(
+            [
+                b"This is the first record.",
+                b"This is the second record.",
+                b"This is the third record.",
+                b"This is the fourth record.",
+                b"This is the fifth record.",
+            ],
+            dtype=np.object,
+        )
+        retriever = RealmRetriever(block_records, tokenizer)
+        model = RealmForOpenQA(openqa_config, retriever)
         model.to(torch_device)
         model.train()

From 881bbd2ff6b4263c1564c6ca920e89fb7c35694f Mon Sep 17 00:00:00 2001
From: qqaatw 
Date: Thu, 6 Jan 2022 18:43:36 +0800
Subject: [PATCH 87/98] Complete retrieval tests

---
 .../models/realm/retrieval_realm.py           |  17 ++-
 tests/test_retrieval_realm.py                 | 107 ++++++++++++++----
 2 files changed, 96 insertions(+), 28 deletions(-)

diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py
index e3445218874e..2cee1978f156 100644
--- a/src/transformers/models/realm/retrieval_realm.py
+++ b/src/transformers/models/realm/retrieval_realm.py
@@ -69,14 +69,11 @@ def search_batched(self, question_projection):
 
 
 class RealmRetriever:
-    """The retriever of REALM outputting the retrieved evidence block and whether the block has answers."
+    """The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer positions.
 
     Parameters:
-        config ([`RealmConfig`]): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+        block_records (`np.array`): `block_records` which contains evidence texts.
         tokenizer ([`RealmTokenizer`]): The tokenizer to encode retrieved texts.
-        block_records_path (`str`): The path of `block_records`, which cantains evidence texts.
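+
+    Example (a minimal usage sketch; the single evidence block below is made up):
+
+    ```python
+    >>> import numpy as np
+    >>> from transformers import RealmTokenizer
+    >>> from transformers.models.realm.retrieval_realm import RealmRetriever
+
+    >>> block_records = np.array([b"Alan Turing was an English mathematician."], dtype=np.object)
+    >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-encoder")
+    >>> retriever = RealmRetriever(block_records, tokenizer)
+    ```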
""" def __init__(self, block_records, tokenizer): @@ -105,10 +102,12 @@ def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, max_leng @classmethod def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *init_inputs, **kwargs): - - block_records_path = hf_hub_download( - repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs - ) + if os.path.isdir(pretrained_model_name_or_path): + block_records_path = os.path.join(pretrained_model_name_or_path, _REALM_BLOCK_RECORDS_FILENAME) + else: + block_records_path = hf_hub_download( + repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs + ) block_records = np.load(block_records_path, allow_pickle=True) tokenizer = RealmTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) diff --git a/tests/test_retrieval_realm.py b/tests/test_retrieval_realm.py index b9abaed81846..4e4d6b7d5908 100644 --- a/tests/test_retrieval_realm.py +++ b/tests/test_retrieval_realm.py @@ -2,20 +2,21 @@ import shutil import tempfile from unittest import TestCase +from unittest.mock import patch import numpy as np from datasets import Dataset from transformers.models.realm.configuration_realm import RealmConfig -from transformers.models.realm.retrieval_realm import RealmRetriever +from transformers.models.realm.retrieval_realm import RealmRetriever, _REALM_BLOCK_RECORDS_FILENAME from transformers.models.realm.tokenization_realm import RealmTokenizer, VOCAB_FILES_NAMES -from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch +from transformers.testing_utils import require_sentencepiece, require_tokenizers class RealmRetrieverTest(TestCase): def setUp(self): self.tmpdirname = tempfile.mkdtemp() - self.num_block_records = 10 + self.num_block_records = 5 # Realm tok vocab_tokens = [ @@ -24,6 +25,17 @@ def setUp(self): "[SEP]", "[PAD]", "[MASK]", + "test", + "question", + "this", + "is", + "the", + "first", + "second", + "third", + "fourth", + "fifth", + "record", "want", "##want", "##ed", @@ -40,6 +52,9 @@ def setUp(self): self.vocab_file = os.path.join(realm_tokenizer_path, VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + realm_block_records_path = os.path.join(self.tmpdirname, "realm_block_records") + os.makedirs(realm_block_records_path, exist_ok=True) def get_tokenizer(self) -> RealmTokenizer: return RealmTokenizer.from_pretrained(os.path.join(self.tmpdirname, "realm_tokenizer")) @@ -47,6 +62,12 @@ def get_tokenizer(self) -> RealmTokenizer: def tearDown(self): shutil.rmtree(self.tmpdirname) + def get_config(self): + config = RealmConfig( + num_block_records=self.num_block_records + ) + return config + def get_dummy_dataset(self): dataset = Dataset.from_dict( { @@ -58,34 +79,82 @@ def get_dummy_dataset(self): return dataset def get_dummy_block_records(self): - np_block_records = np.array( + block_records = np.array( [ - "This is the first record", - "This is the second record", + b"This is the first record", + b"This is the second record", + b"This is the third record", + b"This is the fourth record", + b"This is the fifth record", ], - np.object, + dtype=np.object, ) - return np_block_records + return block_records def get_dummy_retriever(self): - config = RealmConfig( - num_block_records=self.num_block_records - ) retriever = RealmRetriever( - config, - 
block_records=self.get_dummy_block_records, + block_records=self.get_dummy_block_records(), tokenizer=self.get_tokenizer(), ) return retriever def test_retrieve(self): - pass + config = self.get_config() + retriever = self.get_dummy_retriever() + tokenizer = retriever.tokenizer + + retrieved_block_ids = np.array([0, 3], dtype=np.long) + question_input_ids = tokenizer(["Test question"]).input_ids + answer_ids = tokenizer(["the fourth"], + add_special_tokens=False, + return_token_type_ids=False, + return_attention_mask=False, + ).input_ids + max_length=config.reader_seq_len + + has_answers, start_pos, end_pos, concat_inputs = retriever(retrieved_block_ids, question_input_ids, answer_ids, max_length) + + self.assertEqual(len(has_answers), 2) + self.assertEqual(len(start_pos), 2) + self.assertEqual(len(end_pos), 2) + self.assertEqual(concat_inputs.input_ids.shape, (2, 10)) + self.assertEqual(concat_inputs.attention_mask.shape, (2, 10)) + self.assertEqual(concat_inputs.token_type_ids.shape, (2, 10)) + self.assertEqual(tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[0]), ['[CLS]', 'test', 'question', '[SEP]', 'this', 'is', 'the', 'first', 'record', '[SEP]']) + self.assertEqual(tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[1]), ['[CLS]', 'test', 'question', '[SEP]', 'this', 'is', 'the', 'fourth', 'record', '[SEP]']) + def test_block_has_answer(self): - pass + config = self.get_config() + retriever = self.get_dummy_retriever() + tokenizer = retriever.tokenizer + + retrieved_block_ids = np.array([0, 3], dtype=np.long) + question_input_ids = tokenizer(["Test question"]).input_ids + answer_ids = tokenizer(["the fourth"], + add_special_tokens=False, + return_token_type_ids=False, + return_attention_mask=False, + ).input_ids + max_length=config.reader_seq_len + + has_answers, start_pos, end_pos, _ = retriever(retrieved_block_ids, question_input_ids, answer_ids, max_length) + + self.assertEqual([False, True], has_answers) + self.assertEqual([[-1], [6]], start_pos) + self.assertEqual([[-1], [7]], end_pos) + + def test_save_load_pretrained(self): + retriever = self.get_dummy_retriever() + retriever.save_pretrained(os.path.join(self.tmpdirname, "realm_block_records")) + + # Test local path + retriever.from_pretrained(os.path.join(self.tmpdirname, "realm_block_records")) + self.assertEqual(retriever.block_records[0], b"This is the first record") - def test_from_pretrained(self): - pass + # Test mocked remote path + with patch("transformers.models.realm.retrieval_realm.hf_hub_download") as mock_hf_hub_download: + mock_hf_hub_download.return_value = os.path.join(os.path.join(self.tmpdirname, "realm_block_records"), _REALM_BLOCK_RECORDS_FILENAME) + retriever = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-openqa") - def test_save_pretrained(self): - pass + self.assertEqual(retriever.block_records[0], b"This is the first record") From 1927e4fe682d6d6e711bf96232c267cd7a094a6b Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 6 Jan 2022 19:03:05 +0800 Subject: [PATCH 88/98] Fix --- src/transformers/models/realm/__init__.py | 2 +- .../models/realm/configuration_realm.py | 11 ++------- .../models/realm/modeling_realm.py | 23 +++++++++---------- .../models/realm/retrieval_realm.py | 5 ++-- tests/test_retrieval_realm.py | 15 ++++++++++++ 5 files changed, 32 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/realm/__init__.py b/src/transformers/models/realm/__init__.py index 33105a6927f2..42456b52af98 100644 --- a/src/transformers/models/realm/__init__.py +++ 
b/src/transformers/models/realm/__init__.py @@ -2,7 +2,7 @@ # There's no way to ignore "F401 '...' imported but unused" warnings in this # module, but to preserve other warnings. So, don't check this module at all. -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2021 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/src/transformers/models/realm/configuration_realm.py b/src/transformers/models/realm/configuration_realm.py index cba795880642..06bf641a9194 100644 --- a/src/transformers/models/realm/configuration_realm.py +++ b/src/transformers/models/realm/configuration_realm.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" REALM model configuration""" +""" REALM model configuration.""" from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -86,9 +86,6 @@ class RealmConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. span_hidden_size (`int`, *optional*, defaults to 256): Dimension of the reader's spans. max_span_width (`int`, *optional*, defaults to 10): @@ -139,7 +136,6 @@ def __init__( type_vocab_size=2, initializer_range=0.02, layer_norm_eps=1e-12, - use_cache=True, span_hidden_size=256, max_span_width=10, reader_layer_norm_eps=1e-3, @@ -170,7 +166,6 @@ def __init__( self.initializer_range = initializer_range self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps - self.use_cache = use_cache # Reader config self.span_hidden_size = span_hidden_size @@ -179,9 +174,7 @@ def __init__( self.reader_beam_size = reader_beam_size self.reader_seq_len = reader_seq_len - # Searcher config + # Retrieval config self.num_block_records = num_block_records self.searcher_beam_size = searcher_beam_size self.searcher_seq_len = searcher_seq_len - - # TODO: Remove use_cache diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index ecbf0ada58bd..42912b44caaa 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -14,7 +14,6 @@ # limitations under the License. 
""" PyTorch REALM model.""" - import math import os from dataclasses import dataclass @@ -1178,7 +1177,7 @@ def forward( return_dict = return_dict if return_dict is not None else self.config.use_return_dict - bert_outputs = self.realm( + realm_outputs = self.realm( input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, @@ -1191,22 +1190,22 @@ def forward( ) # [batch_size, hidden_size] - pooler_output = bert_outputs[1] + pooler_output = realm_outputs[1] # [batch_size, retriever_proj_size] projected_score = self.cls(pooler_output) if not return_dict: - return (projected_score,) + bert_outputs[2:4] + return (projected_score,) + realm_outputs[2:4] else: return RealmEmbedderOutput( projected_score=projected_score, - hidden_states=bert_outputs.hidden_states, - attentions=bert_outputs.attentions, + hidden_states=realm_outputs.hidden_states, + attentions=realm_outputs.attentions, ) @add_start_docstrings( - "The scorer of REALM outputting relevance score representing the score of document candidates (before softmax).", + "The scorer of REALM outputting relevance scores representing the score of document candidates (before softmax).", REALM_START_DOCSTRING, ) class RealmScorer(RealmPreTrainedModel): @@ -1395,8 +1394,8 @@ def forward( >>> import torch >>> from transformers import RealmTokenizer, RealmKnowledgeAugEncoder - >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-bert") - >>> model = RealmKnowledgeAugEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-bert", num_candidates=2) + >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-encoder") + >>> model = RealmKnowledgeAugEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-encoder", num_candidates=2) >>> # batch_size = 2, num_candidates = 2 >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]] @@ -1724,9 +1723,9 @@ def forward( >>> import torch >>> from transformers import RealmForOpenQA, RealmRetriever, RealmTokenizer - >>> retriever = RealmRetriever.from_pretrained("qqaatw/realm-open-qa") - >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-open-qa") - >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-open-qa", retriever=retriever) + >>> retriever = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-openqa") + >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-openqa") + >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-cc-news-pretrained-openqa", retriever=retriever) >>> question = "Who is the pioneer in modern computer science?" >>> quastion_ids = tokenizer([question], return_tensors="pt").input_ids diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py index 2cee1978f156..0980184c448b 100644 --- a/src/transformers/models/realm/retrieval_realm.py +++ b/src/transformers/models/realm/retrieval_realm.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2020, The RAG Authors and The HuggingFace Inc. team. +# Copyright 2021 The REALM authors and The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-"""Realm Retriever model implementation.""" +"""REALM Retriever model implementation.""" + import os from typing import Optional, Union diff --git a/tests/test_retrieval_realm.py b/tests/test_retrieval_realm.py index 4e4d6b7d5908..88e964ae650c 100644 --- a/tests/test_retrieval_realm.py +++ b/tests/test_retrieval_realm.py @@ -1,3 +1,18 @@ +# coding=utf-8 +# Copyright 2021 The REALM authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os import shutil import tempfile From 4048d7dd5990977435c7b7d1974c145fb617cb43 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 6 Jan 2022 19:10:19 +0800 Subject: [PATCH 89/98] style --- .../models/realm/modeling_realm.py | 11 +++-- .../models/realm/retrieval_realm.py | 12 +++-- tests/test_modeling_realm.py | 5 +- tests/test_retrieval_realm.py | 46 +++++++++++-------- 4 files changed, 43 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 42912b44caaa..36790c81f17b 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ b/src/transformers/models/realm/modeling_realm.py @@ -97,7 +97,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmForOpenQA): name = name.replace("bert/", "reader/realm/") name = name.replace("cls/", "reader/cls/") - + # For pretrained encoder if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmKnowledgeAugEncoder): name = name.replace("bert/", "realm/") @@ -112,7 +112,7 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): name = name.replace("reader/layer_normalization", f"{reader_prefix}qa_outputs/layer_normalization") # For embedder and scorer - if name.startswith("module/module/module/"): # finetuned + if name.startswith("module/module/module/"): # finetuned embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" name = name.replace("module/module/module/module/bert/", f"{embedder_prefix}realm/") name = name.replace("module/module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") @@ -120,11 +120,10 @@ def load_tf_weights_in_realm(model, config, tf_checkpoint_path): name = name.replace("module/module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") name = name.replace("module/module/module/bert/", f"{embedder_prefix}realm/") name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/") - elif name.startswith("module/module/"): # pretrained + elif name.startswith("module/module/"): # pretrained embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/" name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/") name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/") - name = name.split("/") # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v @@ -1395,7 +1394,9 @@ def forward( >>> 
from transformers import RealmTokenizer, RealmKnowledgeAugEncoder
 
         >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-encoder")
-        >>> model = RealmKnowledgeAugEncoder.from_pretrained("qqaatw/realm-cc-news-pretrained-encoder", num_candidates=2)
+        >>> model = RealmKnowledgeAugEncoder.from_pretrained(
+        ...     "qqaatw/realm-cc-news-pretrained-encoder", num_candidates=2
+        ... )
 
         >>> # batch_size = 2, num_candidates = 2
         >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
diff --git a/src/transformers/models/realm/retrieval_realm.py b/src/transformers/models/realm/retrieval_realm.py
index 0980184c448b..e509a8b4488a 100644
--- a/src/transformers/models/realm/retrieval_realm.py
+++ b/src/transformers/models/realm/retrieval_realm.py
@@ -42,7 +42,8 @@ def convert_tfrecord_to_np(block_records_path, num_block_records):
 
 
 class ScaNNSearcher:
-    """Note that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included. """
+    """Note that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included."""
+
     def __init__(
         self,
         db,
@@ -70,11 +71,12 @@ def search_batched(self, question_projection):
 
 
 class RealmRetriever:
-    """The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer positions.
+    """The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer
+    positions.
 
-    Parameters:
-        block_records (`np.array`): `block_records` which contains evidence texts.
-        tokenizer ([`RealmTokenizer`]): The tokenizer to encode retrieved texts.
+        Parameters:
+            block_records (`np.array`): `block_records` which contains evidence texts.
+            tokenizer ([`RealmTokenizer`]): The tokenizer to encode retrieved texts.
     """
diff --git a/tests/test_modeling_realm.py b/tests/test_modeling_realm.py
index 81a801340848..fc14690ea261 100644
--- a/tests/test_modeling_realm.py
+++ b/tests/test_modeling_realm.py
@@ -15,8 +15,9 @@ """ Testing suite for the PyTorch REALM model. 
""" -import unittest import copy +import unittest + import numpy as np from tests.test_modeling_common import floats_tensor @@ -364,7 +365,7 @@ def test_training(self): model = RealmKnowledgeAugEncoder(config) model.to(torch_device) model.train() - + inputs_dict = { "input_ids": scorer_encoder_inputs[0].to(torch_device), "attention_mask": scorer_encoder_inputs[1].to(torch_device), diff --git a/tests/test_retrieval_realm.py b/tests/test_retrieval_realm.py index 88e964ae650c..ca1aa2ca28a7 100644 --- a/tests/test_retrieval_realm.py +++ b/tests/test_retrieval_realm.py @@ -23,9 +23,8 @@ from datasets import Dataset from transformers.models.realm.configuration_realm import RealmConfig -from transformers.models.realm.retrieval_realm import RealmRetriever, _REALM_BLOCK_RECORDS_FILENAME -from transformers.models.realm.tokenization_realm import RealmTokenizer, VOCAB_FILES_NAMES -from transformers.testing_utils import require_sentencepiece, require_tokenizers +from transformers.models.realm.retrieval_realm import _REALM_BLOCK_RECORDS_FILENAME, RealmRetriever +from transformers.models.realm.tokenization_realm import VOCAB_FILES_NAMES, RealmTokenizer class RealmRetrieverTest(TestCase): @@ -67,20 +66,18 @@ def setUp(self): self.vocab_file = os.path.join(realm_tokenizer_path, VOCAB_FILES_NAMES["vocab_file"]) with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) - + realm_block_records_path = os.path.join(self.tmpdirname, "realm_block_records") os.makedirs(realm_block_records_path, exist_ok=True) def get_tokenizer(self) -> RealmTokenizer: return RealmTokenizer.from_pretrained(os.path.join(self.tmpdirname, "realm_tokenizer")) - + def tearDown(self): shutil.rmtree(self.tmpdirname) def get_config(self): - config = RealmConfig( - num_block_records=self.num_block_records - ) + config = RealmConfig(num_block_records=self.num_block_records) return config def get_dummy_dataset(self): @@ -92,7 +89,7 @@ def get_dummy_dataset(self): } ) return dataset - + def get_dummy_block_records(self): block_records = np.array( [ @@ -120,14 +117,17 @@ def test_retrieve(self): retrieved_block_ids = np.array([0, 3], dtype=np.long) question_input_ids = tokenizer(["Test question"]).input_ids - answer_ids = tokenizer(["the fourth"], + answer_ids = tokenizer( + ["the fourth"], add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False, ).input_ids - max_length=config.reader_seq_len + max_length = config.reader_seq_len - has_answers, start_pos, end_pos, concat_inputs = retriever(retrieved_block_ids, question_input_ids, answer_ids, max_length) + has_answers, start_pos, end_pos, concat_inputs = retriever( + retrieved_block_ids, question_input_ids, answer_ids, max_length + ) self.assertEqual(len(has_answers), 2) self.assertEqual(len(start_pos), 2) @@ -135,9 +135,14 @@ def test_retrieve(self): self.assertEqual(concat_inputs.input_ids.shape, (2, 10)) self.assertEqual(concat_inputs.attention_mask.shape, (2, 10)) self.assertEqual(concat_inputs.token_type_ids.shape, (2, 10)) - self.assertEqual(tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[0]), ['[CLS]', 'test', 'question', '[SEP]', 'this', 'is', 'the', 'first', 'record', '[SEP]']) - self.assertEqual(tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[1]), ['[CLS]', 'test', 'question', '[SEP]', 'this', 'is', 'the', 'fourth', 'record', '[SEP]']) - + self.assertEqual( + tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[0]), + ["[CLS]", "test", "question", "[SEP]", "this", "is", 
"the", "first", "record", "[SEP]"], + ) + self.assertEqual( + tokenizer.convert_ids_to_tokens(concat_inputs.input_ids[1]), + ["[CLS]", "test", "question", "[SEP]", "this", "is", "the", "fourth", "record", "[SEP]"], + ) def test_block_has_answer(self): config = self.get_config() @@ -146,12 +151,13 @@ def test_block_has_answer(self): retrieved_block_ids = np.array([0, 3], dtype=np.long) question_input_ids = tokenizer(["Test question"]).input_ids - answer_ids = tokenizer(["the fourth"], + answer_ids = tokenizer( + ["the fourth"], add_special_tokens=False, return_token_type_ids=False, return_attention_mask=False, ).input_ids - max_length=config.reader_seq_len + max_length = config.reader_seq_len has_answers, start_pos, end_pos, _ = retriever(retrieved_block_ids, question_input_ids, answer_ids, max_length) @@ -162,14 +168,16 @@ def test_block_has_answer(self): def test_save_load_pretrained(self): retriever = self.get_dummy_retriever() retriever.save_pretrained(os.path.join(self.tmpdirname, "realm_block_records")) - + # Test local path retriever.from_pretrained(os.path.join(self.tmpdirname, "realm_block_records")) self.assertEqual(retriever.block_records[0], b"This is the first record") # Test mocked remote path with patch("transformers.models.realm.retrieval_realm.hf_hub_download") as mock_hf_hub_download: - mock_hf_hub_download.return_value = os.path.join(os.path.join(self.tmpdirname, "realm_block_records"), _REALM_BLOCK_RECORDS_FILENAME) + mock_hf_hub_download.return_value = os.path.join( + os.path.join(self.tmpdirname, "realm_block_records"), _REALM_BLOCK_RECORDS_FILENAME + ) retriever = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-openqa") self.assertEqual(retriever.block_records[0], b"This is the first record") From 34322b57ebd77e7aa2a06db9a3241d04cb942539 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Thu, 6 Jan 2022 19:41:37 +0800 Subject: [PATCH 90/98] Fix tests --- tests/test_retrieval_realm.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_retrieval_realm.py b/tests/test_retrieval_realm.py index ca1aa2ca28a7..4060af50d077 100644 --- a/tests/test_retrieval_realm.py +++ b/tests/test_retrieval_realm.py @@ -126,7 +126,7 @@ def test_retrieve(self): max_length = config.reader_seq_len has_answers, start_pos, end_pos, concat_inputs = retriever( - retrieved_block_ids, question_input_ids, answer_ids, max_length + retrieved_block_ids, question_input_ids, answer_ids=answer_ids, max_length=max_length, return_tensors="np" ) self.assertEqual(len(has_answers), 2) @@ -159,7 +159,9 @@ def test_block_has_answer(self): ).input_ids max_length = config.reader_seq_len - has_answers, start_pos, end_pos, _ = retriever(retrieved_block_ids, question_input_ids, answer_ids, max_length) + has_answers, start_pos, end_pos, _ = retriever( + retrieved_block_ids, question_input_ids, answer_ids=answer_ids, max_length=max_length, return_tensors="np" + ) self.assertEqual([False, True], has_answers) self.assertEqual([[-1], [6]], start_pos) From 06a5412d70521e2db5f2dd19d2906532a48b87a9 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Fri, 7 Jan 2022 00:39:03 +0800 Subject: [PATCH 91/98] Fix docstring example --- src/transformers/models/realm/modeling_realm.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/realm/modeling_realm.py b/src/transformers/models/realm/modeling_realm.py index 36790c81f17b..e4b3db6d3888 100644 --- a/src/transformers/models/realm/modeling_realm.py +++ 
b/src/transformers/models/realm/modeling_realm.py @@ -1724,20 +1724,21 @@ def forward( >>> import torch >>> from transformers import RealmForOpenQA, RealmRetriever, RealmTokenizer - >>> retriever = RealmRetriever.from_pretrained("qqaatw/realm-cc-news-pretrained-openqa") - >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-cc-news-pretrained-openqa") - >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-cc-news-pretrained-openqa", retriever=retriever) + >>> retriever = RealmRetriever.from_pretrained("qqaatw/realm-orqa-nq-openqa") + >>> tokenizer = RealmTokenizer.from_pretrained("qqaatw/realm-orqa-nq-openqa") + >>> model = RealmForOpenQA.from_pretrained("qqaatw/realm-orqa-nq-openqa", retriever=retriever) >>> question = "Who is the pioneer in modern computer science?" - >>> quastion_ids = tokenizer([question], return_tensors="pt").input_ids + >>> question_ids = tokenizer([question], return_tensors="pt") >>> answer_ids = tokenizer( - ... "alan mathison turing", + ... ["alan mathison turing"], ... add_special_tokens=False, ... return_token_type_ids=False, ... return_attention_mask=False, >>> ).input_ids - >>> reader_output, predicted_answer = model(**question_ids, answer_ids=answer_ids) + >>> reader_output, predicted_answer_ids = model(**question_ids, answer_ids=answer_ids, return_dict=False) + >>> predicted_answer = tokenizer.decode(predicted_answer_ids) >>> loss = reader_output.loss ```""" From 712c4b76ac6fd33dd745cd91287a0122ffc3f31d Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 9 Jan 2022 02:07:21 +0800 Subject: [PATCH 92/98] Minor fix of retrieval test --- tests/test_retrieval_realm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_retrieval_realm.py b/tests/test_retrieval_realm.py index 4060af50d077..de79d775904c 100644 --- a/tests/test_retrieval_realm.py +++ b/tests/test_retrieval_realm.py @@ -172,7 +172,7 @@ def test_save_load_pretrained(self): retriever.save_pretrained(os.path.join(self.tmpdirname, "realm_block_records")) # Test local path - retriever.from_pretrained(os.path.join(self.tmpdirname, "realm_block_records")) + retriever = retriever.from_pretrained(os.path.join(self.tmpdirname, "realm_block_records")) self.assertEqual(retriever.block_records[0], b"This is the first record") # Test mocked remote path From 1701f70819ceebbcde391500232ad921a8ed2c65 Mon Sep 17 00:00:00 2001 From: qqaatw Date: Sun, 9 Jan 2022 02:20:33 +0800 Subject: [PATCH 93/98] Update license headers and docs --- docs/source/model_doc/realm.mdx | 4 ++-- src/transformers/models/realm/__init__.py | 2 +- src/transformers/models/realm/configuration_realm.py | 4 ++-- src/transformers/models/realm/retrieval_realm.py | 10 ++++++---- src/transformers/models/realm/tokenization_realm.py | 4 ++-- tests/test_modeling_realm.py | 3 +-- tests/test_retrieval_realm.py | 2 +- tests/test_tokenization_realm.py | 3 +-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/source/model_doc/realm.mdx b/docs/source/model_doc/realm.mdx index c42ddbad32a3..2d42a0e159f6 100644 --- a/docs/source/model_doc/realm.mdx +++ b/docs/source/model_doc/realm.mdx @@ -1,4 +1,4 @@ -