
Commit 2fc33eb

Track the number of tokens seen to metrics (#27274)
* Add tokens seen
* Address comments, add to TrainingArgs
* Update log
* Apply suggestions from code review
* Use self.args
* Fix docstring

Co-authored-by: amyeroberts <[email protected]>
1 parent 303c1d6 commit 2fc33eb

File tree: 3 files changed (+29, -0 lines)


src/transformers/trainer.py

Lines changed: 13 additions & 0 deletions
@@ -1838,6 +1838,17 @@ def _inner_training_loop(
             step = -1
             for step, inputs in enumerate(epoch_iterator):
                 total_batched_samples += 1
+
+                if self.args.include_num_input_tokens_seen:
+                    main_input_name = getattr(self.model, "main_input_name", "input_ids")
+                    if main_input_name not in inputs:
+                        logger.warning(
+                            "Tried to track the number of tokens seen, however the current model is "
+                            "not configured properly to know what item is the input. To fix this, add "
+                            "a `main_input_name` attribute to the model class you are using."
+                        )
+                    else:
+                        self.state.num_input_tokens_seen += self.accelerator.gather(inputs[main_input_name]).numel()
                 if rng_to_sync:
                     self._load_rng_state(resume_from_checkpoint)
                     rng_to_sync = False
@@ -2640,6 +2651,8 @@ def log(self, logs: Dict[str, float]) -> None:
         """
         if self.state.epoch is not None:
             logs["epoch"] = round(self.state.epoch, 2)
+        if self.args.include_num_input_tokens_seen:
+            logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen

         output = {**logs, **{"step": self.state.global_step}}
         self.state.log_history.append(output)
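
For intuition, the counting logic added above boils down to `Tensor.numel()` on the batch's main input: every element is counted, so padding tokens are included, and in distributed runs `self.accelerator.gather` first concatenates the per-process batches so each rank records the global total. A minimal single-process sketch (plain PyTorch, no Accelerate, illustrative values only):

    import torch

    # Two toy batches; numel() counts every element, padding included.
    batches = [
        {"input_ids": torch.tensor([[101, 7592, 102, 0], [101, 2088, 999, 102]])},  # 2 x 4 = 8
        {"input_ids": torch.tensor([[101, 102]])},                                  # 1 x 2 = 2
    ]

    num_input_tokens_seen = 0
    main_input_name = "input_ids"  # same default the Trainer falls back to
    for inputs in batches:
        num_input_tokens_seen += inputs[main_input_name].numel()

    print(num_input_tokens_seen)  # 10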

src/transformers/trainer_callback.py

Lines changed: 3 additions & 0 deletions
@@ -59,6 +59,8 @@ class TrainerState:
             Run an evaluation every X steps.
         save_steps (`int`, *optional*, defaults to 500):
             Save checkpoint every X updates steps.
+        num_input_tokens_seen (`int`, *optional*, defaults to 0):
+            The number of tokens seen during training (number of input tokens, not the number of prediction tokens).
         total_flos (`float`, *optional*, defaults to 0):
             The total number of floating operations done by the model since the beginning of training (stored as floats
             to avoid overflow).
@@ -87,6 +89,7 @@ class TrainerState:
     eval_steps: int = 500
     save_steps: int = 500
     num_train_epochs: int = 0
+    num_input_tokens_seen: int = 0
     total_flos: float = 0
     log_history: List[Dict[str, float]] = None
     best_metric: Optional[float] = None
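
Once the field exists on `TrainerState`, any callback can read it. A hedged sketch of one possible consumer, a hypothetical `TokenBudgetCallback` that stops training after a fixed token budget (the `on_log` hook and `control.should_training_stop` flag are existing transformers APIs; the callback itself is not part of this commit):

    from transformers import TrainerCallback

    class TokenBudgetCallback(TrainerCallback):
        """Hypothetical: stop training once a budget of input tokens is exhausted."""

        def __init__(self, max_tokens: int):
            self.max_tokens = max_tokens

        def on_log(self, args, state, control, logs=None, **kwargs):
            # Requires TrainingArguments(include_num_input_tokens_seen=True);
            # otherwise state.num_input_tokens_seen stays at its default of 0.
            if state.num_input_tokens_seen >= self.max_tokens:
                control.should_training_stop = True
            return control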

src/transformers/training_args.py

Lines changed: 13 additions & 0 deletions
@@ -637,6 +637,12 @@ class TrainingArguments:
             This will iterate over the entire training dataloader once beforehand,

             and will slow down the entire process.
+
+        include_num_input_tokens_seen (`bool`, *optional*):
+            Whether or not to track the number of input tokens seen throughout training.
+
+            May be slower in distributed training as gather operations must be called.
+
         neftune_noise_alpha (`Optional[float]`):
             If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance
             for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the
@@ -1258,6 +1264,13 @@ class TrainingArguments:
         metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."},
     )

+    include_num_input_tokens_seen: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training)"
+        },
+    )
+
     neftune_noise_alpha: float = field(
         default=None,
         metadata={
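
Putting the three files together, enabling the feature is a single flag on `TrainingArguments`; the running total then appears as `num_input_tokens_seen` in each log entry and on `trainer.state`. A hedged usage sketch, assuming `model` and `train_dataset` are defined elsewhere:

    from transformers import Trainer, TrainingArguments

    args = TrainingArguments(
        output_dir="out",
        include_num_input_tokens_seen=True,  # adds "num_input_tokens_seen" to every log entry
    )
    trainer = Trainer(model=model, args=args, train_dataset=train_dataset)  # assumed defined
    trainer.train()
    print(trainer.state.num_input_tokens_seen)  # cumulative input tokens, padding included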
