Skip to content

Commit 4ed5bb2

Browse files
Fix dataset builder default version (#4356)
* Fix dataset builder default version
* Fix dummy_data command and tests
* Fix BuilderConfig version type hint
* Revert default BuilderConfig.version to str
* Revert default BuilderConfig.version to Version
* Test dataset builder config version
* Refactor tests
1 parent 0bb4727 commit 4ed5bb2

File tree

4 files changed

+69
-14
lines changed

4 files changed

+69
-14
lines changed

src/datasets/builder.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,7 @@ class BuilderConfig:
9595
"""
9696

9797
name: str = "default"
98-
version: Optional[Union[str, utils.Version]] = "0.0.0"
98+
version: Optional[Union[utils.Version, str]] = utils.Version("0.0.0")
9999
data_dir: Optional[str] = None
100100
data_files: Optional[DataFilesDict] = None
101101
description: Optional[str] = None
@@ -193,8 +193,8 @@ class DatasetBuilder:
193193
pre-defined set of configurations in :meth:`datasets.DatasetBuilder.builder_configs`.
194194
"""
195195

196-
# Default version.
197-
VERSION = utils.Version("0.0.0")
196+
# Default version
197+
VERSION = None # Default version set in BuilderConfig
198198

199199
# Class for the builder config.
200200
BUILDER_CONFIG_CLASS = BuilderConfig

src/datasets/commands/dummy_data.py

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -295,14 +295,9 @@ def run(self):
295295
auto_generate_results = []
296296
with tempfile.TemporaryDirectory() as tmp_dir:
297297
for builder_config in builder_configs:
298-
if builder_config is None:
299-
name = None
300-
version = builder_cls.VERSION
301-
else:
302-
version = builder_config.version
303-
name = builder_config.name
304-
298+
name = builder_config.name if builder_config else None
305299
dataset_builder = builder_cls(name=name, hash=dataset_module.hash, cache_dir=tmp_dir)
300+
version = builder_config.version if builder_config else dataset_builder.config.version
306301
mock_dl_manager = MockDownloadManager(
307302
dataset_name=self._dataset_name,
308303
config=builder_config,

tests/test_builder.py

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -756,3 +756,66 @@ def test_custom_writer_batch_size(tmp_path, writer_batch_size, default_writer_ba
756756
builder.download_and_prepare(try_from_hf_gcs=False, download_mode=DownloadMode.FORCE_REDOWNLOAD)
757757
dataset = builder.as_dataset("train")
758758
assert len(dataset.data[0].chunks) == expected_chunks
759+
760+
761+
class DummyBuilderWithVersion(GeneratorBasedBuilder):
762+
VERSION = "2.0.0"
763+
764+
def _info(self):
765+
return DatasetInfo(features=Features({"text": Value("string")}))
766+
767+
def _split_generators(self, dl_manager):
768+
pass
769+
770+
def _generate_examples(self):
771+
pass
772+
773+
774+
class DummyBuilderWithBuilderConfigs(GeneratorBasedBuilder):
775+
BUILDER_CONFIGS = [BuilderConfig(name="custom", version="2.0.0")]
776+
777+
def _info(self):
778+
return DatasetInfo(features=Features({"text": Value("string")}))
779+
780+
def _split_generators(self, dl_manager):
781+
pass
782+
783+
def _generate_examples(self):
784+
pass
785+
786+
787+
class CustomBuilderConfig(BuilderConfig):
788+
def __init__(self, date=None, language=None, version="2.0.0", **kwargs):
789+
name = f"{date}.{language}"
790+
super().__init__(name=name, version=version, **kwargs)
791+
self.date = date
792+
self.language = language
793+
794+
795+
class DummyBuilderWithCustomBuilderConfigs(GeneratorBasedBuilder):
796+
BUILDER_CONFIGS = [CustomBuilderConfig(date="20220501", language="en")]
797+
BUILDER_CONFIG_CLASS = CustomBuilderConfig
798+
799+
def _info(self):
800+
return DatasetInfo(features=Features({"text": Value("string")}))
801+
802+
def _split_generators(self, dl_manager):
803+
pass
804+
805+
def _generate_examples(self):
806+
pass
807+
808+
809+
@pytest.mark.parametrize(
810+
"builder_class, kwargs",
811+
[
812+
(DummyBuilderWithVersion, {}),
813+
(DummyBuilderWithBuilderConfigs, {"name": "custom"}),
814+
(DummyBuilderWithCustomBuilderConfigs, {"name": "20220501.en"}),
815+
(DummyBuilderWithCustomBuilderConfigs, {"date": "20220501", "language": "ca"}),
816+
],
817+
)
818+
def test_builder_config_version(builder_class, kwargs, tmp_path):
819+
cache_dir = str(tmp_path)
820+
builder = builder_class(cache_dir=cache_dir, **kwargs)
821+
assert builder.config.version == "2.0.0"

tests/test_dataset_common.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -136,10 +136,7 @@ def check_load_dataset(self, dataset_name, configs, is_local=False, use_local_du
136136
logger.info("Skip tests for this dataset for now")
137137
return
138138

139-
if config is not None:
140-
version = config.version
141-
else:
142-
version = dataset_builder.VERSION
139+
version = config.version if config else dataset_builder.config.version
143140

144141
def check_if_url_is_valid(url):
145142
if is_remote_url(url) and "\\" in url:

0 commit comments

Comments
 (0)