Skip to content

Commit 942f479

Browse files
committed
[BE][6/n] replace large c4_mini datasets by c4_test with the first 2K entries
ghstack-source-id: 319f496 Pull Request resolved: #512
1 parent 48485a8 commit 942f479

File tree

5 files changed

+2011
-45014
lines changed

5 files changed

+2011
-45014
lines changed

test/assets/c4_test/data.json

Lines changed: 2000 additions & 0 deletions
Large diffs are not rendered by default.

test/datasets/test_checkpoint.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111

1212
class TestCheckpoint:
1313
def test_c4_resumption(self):
14-
dataset_name = "c4_mini"
15-
dataset_path = "./torchtitan/datasets/c4_mini"
14+
dataset_name = "c4_test"
15+
dataset_path = "./test/assets/c4_test"
1616
batch_size = 1
1717
seq_len = 1024
1818
world_size = 4

torchtitan/datasets/c4_mini/data.json

Lines changed: 0 additions & 45000 deletions
This file was deleted.

torchtitan/datasets/hf_datasets.py

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
# map from dataset name to a local directory, or
2929
# a dataset repository on the HF hub
3030
_supported_datasets = {
31-
"c4_mini": "torchtitan/datasets/c4_mini",
31+
"c4_test": "test/assets/c4_test",
3232
"c4": "allenai/c4",
3333
}
3434

@@ -48,8 +48,8 @@ class HuggingFaceDataset(IterableDataset, Stateful):
4848
rank (int): rank of the current data parallel process
4949
infinite (bool): whether to loop infinitely over the dataset
5050
51-
We currently support the c4 dataset and a subset of it:
52-
c4_mini (45K training entries)
51+
We currently support the c4 dataset, and a subset of it for testing purposes:
52+
c4_test (2K training entries)
5353
c4 (177M training entries - this dataset is streamed due to the size)
5454
5555
>> c4 (EN) <<:
@@ -83,12 +83,12 @@ def __init__(
8383
if dataset_path:
8484
logger.warning(
8585
f"Dataset {dataset_name} is not tested or verified. "
86-
f"Recommended datasets are: {list(_supported_datasets.keys())}."
86+
f"Recommended datasets are: {list(_supported_datasets.keys())}"
8787
)
8888
else:
8989
raise ValueError(
9090
f"Dataset {dataset_name} is not supported. "
91-
f"Supported datasets are: {list(_supported_datasets.keys())}."
91+
f"Supported datasets are: {list(_supported_datasets.keys())}"
9292
)
9393

9494
if not dataset_path:
@@ -132,15 +132,12 @@ def __iter__(self):
132132
yield input, label
133133

134134
if not self.infinite:
135-
logger.warning(f"Dataset {self.dataset_name} has run out of data.")
135+
logger.warning(f"Dataset {self.dataset_name} has run out of data")
136136
break
137137
else:
138138
# Reset offset for the next iteration
139139
self._sample_idx = 0
140-
logger.warning(
141-
f"Dataset {self.dataset_name} is being re-looped. "
142-
"Loss related metrics might be misleading."
143-
)
140+
logger.warning(f"Dataset {self.dataset_name} is being re-looped")
144141

145142
def _get_data_iter(self):
146143
if self._sample_idx == 0:
@@ -188,7 +185,7 @@ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
188185

189186
if self._rank_id not in state_dict:
190187
logger.warning(
191-
f"DataLoader state is empty for dp rank {self._dp_rank}, expected key {self._rank_id}."
188+
f"DataLoader state is empty for dp rank {self._dp_rank}, expected key {self._rank_id}"
192189
)
193190
return
194191
super().load_state_dict(pickle.loads(state_dict[self._rank_id]))

train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ steps = 10
3838
data_parallel_degree = -1
3939
tensor_parallel_degree = 1
4040
compile = false
41-
dataset = "c4_mini" # supported datasets: c4_mini (45K), c4 (177M)
41+
dataset = "c4_test" # supported datasets: c4_test (2K), c4 (177M)
4242

4343
[experimental]
4444
pipeline_parallel_degree = 1

0 commit comments

Comments
 (0)