Skip to content

Commit 1b1c19c

Browse files
authored
Merge pull request #406 from ssl-hep/3.0_develop_add_dataset
Add dataset keyword
2 parents 43740a5 + e2d1f7f commit 1b1c19c

File tree

8 files changed

+141
-25
lines changed

8 files changed

+141
-25
lines changed

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,11 @@ FuncADL_CMS = "servicex.func_adl.func_adl_dataset:FuncADLQuery_CMS"
101101
PythonFunction = "servicex.python_dataset:PythonQuery"
102102
UprootRaw = "servicex.uproot_raw.uproot_raw:UprootRawQuery"
103103

104+
[project.entry-points.'servicex.dataset']
105+
Rucio = "servicex.dataset_identifier:RucioDatasetIdentifier"
106+
FileList = "servicex.dataset_identifier:FileListDataset"
107+
CERNOpenData = "servicex.dataset_identifier:CERNOpenDataDatasetIdentifier"
108+
104109
[tool.hatch.build.targets.sdist]
105110
# hatchling always includes:
106111
# pyproject.toml, .gitignore, any README, any LICENSE, AUTHORS

servicex/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022, IRIS-HEP
1+
# Copyright (c) 2024, IRIS-HEP
22
# All rights reserved.
33
#
44
# Redistribution and use in source and binary forms, with or without
@@ -38,7 +38,8 @@
3838
from .models import ResultFormat, ResultDestination
3939
from .dataset_group import DatasetGroup
4040
from .dataset_identifier import RucioDatasetIdentifier, FileListDataset
41-
from . import query
41+
import servicex.dataset as dataset
42+
import servicex.query as query
4243

4344
__all__ = [
4445
"ServiceXClient",
@@ -60,5 +61,6 @@
6061
"DefinitionList",
6162
"ServiceXSpec",
6263
"deliver",
64+
"dataset",
6365
"query"
6466
]

servicex/databinder_models.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022, IRIS-HEP
1+
# Copyright (c) 2024, IRIS-HEP
22
# All rights reserved.
33
#
44
# Redistribution and use in source and binary forms, with or without
@@ -33,7 +33,8 @@
3333
model_validator,
3434
)
3535

36-
from servicex.dataset_identifier import RucioDatasetIdentifier, FileListDataset
36+
from servicex.dataset_identifier import (DataSetIdentifier, RucioDatasetIdentifier,
37+
FileListDataset)
3738
from servicex.query_core import Query as SXQuery, QueryStringGenerator
3839
from servicex.models import ResultFormat
3940

@@ -43,31 +44,37 @@ class Sample(BaseModel):
4344
Codegen: Optional[str] = None
4445
RucioDID: Optional[str] = None
4546
XRootDFiles: Optional[Union[str, List[str]]] = None
47+
Dataset: Optional[DataSetIdentifier] = None
4648
NFiles: Optional[int] = Field(default=None)
4749
Query: Optional[Union[str, SXQuery, QueryStringGenerator]] = Field(default=None)
4850
IgnoreLocalCache: bool = False
4951

5052
model_config = {"arbitrary_types_allowed": True}
5153

5254
@property
53-
def dataset_identifier(self):
54-
if self.RucioDID:
55-
return RucioDatasetIdentifier(self.RucioDID, num_files=self.NFiles or 0)
55+
def dataset_identifier(self) -> DataSetIdentifier:
56+
if self.Dataset:
57+
return self.Dataset
58+
elif self.RucioDID:
59+
return RucioDatasetIdentifier(self.RucioDID, num_files=self.NFiles)
5660
elif self.XRootDFiles:
5761
return FileListDataset(self.XRootDFiles)
62+
else: # pragma: no cover
63+
raise RuntimeError("No valid dataset found, somehow validation failed")
5864

5965
@model_validator(mode="before")
6066
@classmethod
6167
def validate_did_xor_file(cls, values):
6268
"""
63-
Ensure that only one of RootFile or RucioDID is specified.
69+
Ensure that exactly one of Dataset, XRootDFiles, or RucioDID is specified.
6470
:param values:
6571
:return:
6672
"""
67-
if "XRootDFiles" in values and "RucioDID" in values:
68-
raise ValueError("Only specify one of XRootDFiles or RucioDID, not both.")
69-
if "XRootDFiles" not in values and "RucioDID" not in values:
70-
raise ValueError("Must specify one of XRootDFiles or RucioDID.")
73+
count = sum(["RucioDID" in values, "XRootDFiles" in values, "Dataset" in values])
74+
if count > 1:
75+
raise ValueError("Only specify one of Dataset, XRootDFiles, or RucioDID.")
76+
if count == 0:
77+
raise ValueError("Must specify one of Dataset, XRootDFiles, or RucioDID.")
7178
return values
7279

7380

servicex/dataset/__init__.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Copyright (c) 2024, IRIS-HEP
2+
# All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# * Redistributions of source code must retain the above copyright notice, this
8+
# list of conditions and the following disclaimer.
9+
#
10+
# * Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# * Neither the name of the copyright holder nor the names of its
15+
# contributors may be used to endorse or promote products derived from
16+
# this software without specific prior written permission.
17+
#
18+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22+
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23+
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24+
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25+
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26+
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27+
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28+
29+
from ..dataset_identifier import (RucioDatasetIdentifier as Rucio, # noqa: F401
30+
FileListDataset as FileList,
31+
CERNOpenDataDatasetIdentifier as CERNOpenData,
32+
DataSetIdentifier as GenericDataSet)
33+
34+
35+
# Public API of this package: the short aliases re-exported from
# ..dataset_identifier above. Python only honours the name ``__all__``.
__all__ = ['Rucio', 'FileList', 'CERNOpenData', 'GenericDataSet']

servicex/dataset_identifier.py

Lines changed: 44 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2022, IRIS-HEP
1+
# Copyright (c) 2024, IRIS-HEP
22
# All rights reserved.
33
#
44
# Redistribution and use in source and binary forms, with or without
@@ -25,7 +25,7 @@
2525
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
2626
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2727
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28-
from typing import List, Union
28+
from typing import List, Union, Optional
2929

3030
from servicex.models import TransformRequest
3131

@@ -35,14 +35,14 @@ class DataSetIdentifier:
3535
Base class for specifying the dataset to transform. This can either be a list of
3636
xRootD URIs or a rucio DID
3737
"""
38-
def __init__(self, scheme: str, dataset: str, num_files: int = None):
38+
def __init__(self, scheme: str, dataset: str, num_files: Optional[int] = None):
3939
self.scheme = scheme
4040
self.dataset = dataset
4141
self.num_files = num_files
4242

4343
@property
4444
def did(self):
45-
num_files_arg = f"?files={self.num_files}" if self.num_files else ""
45+
num_files_arg = f"?files={self.num_files}" if self.num_files is not None else ""
4646
return f"{self.scheme}://{self.dataset}{num_files_arg}"
4747

4848
def populate_transform_request(self, transform_request: TransformRequest) -> None:
@@ -51,7 +51,7 @@ def populate_transform_request(self, transform_request: TransformRequest) -> Non
5151

5252

5353
class RucioDatasetIdentifier(DataSetIdentifier):
54-
def __init__(self, dataset: str, num_files: int = None):
54+
def __init__(self, dataset: str, num_files: Optional[int] = None):
5555
r"""
5656
Rucio Dataset - this will be looked up using the Rucio data management
5757
service.
@@ -64,19 +64,56 @@ def __init__(self, dataset: str, num_files: int = None):
6464
"""
6565
super().__init__("rucio", dataset, num_files=num_files)
6666

67+
yaml_tag = '!Rucio'
6768

68-
class FileListDataset:
69+
@classmethod
70+
def from_yaml(cls, _, node):
71+
return cls(node.value)
72+
73+
74+
class FileListDataset(DataSetIdentifier):
6975
def __init__(self, files: Union[List[str], str]):
7076
r"""
7177
Dataset specified as a list of XRootD URIs.
7278
7379
:param files: Either a list of URIs or a single URI string
7480
"""
75-
if type(files) is str:
81+
self.files: List[str]
82+
if isinstance(files, str):
7683
self.files = [files]
7784
else:
7885
self.files = files
7986

8087
def populate_transform_request(self, transform_request: TransformRequest) -> None:
8188
transform_request.file_list = self.files
8289
transform_request.did = None
90+
91+
@property
92+
def did(self):
93+
return None
94+
95+
yaml_tag = '!FileList'
96+
97+
@classmethod
98+
def from_yaml(cls, constructor, node):
99+
return cls(constructor.construct_sequence(node))
100+
101+
102+
class CERNOpenDataDatasetIdentifier(DataSetIdentifier):
103+
def __init__(self, dataset: int, num_files: Optional[int] = None):
104+
r"""
105+
CERN Open Data Dataset - this will be looked up using the CERN Open Data DID finder.
106+
107+
:param dataset: The dataset ID - this is an integer.
108+
:param num_files: Maximum number of files to return. This is useful during development
109+
to perform quick runs. ServiceX is careful to make sure it always
110+
returns the same subset of files.
111+
112+
"""
113+
super().__init__("cernopendata", f'{dataset}', num_files=num_files)
114+
115+
yaml_tag = '!CERNOpenData'
116+
117+
@classmethod
118+
def from_yaml(cls, _, node):
119+
return cls(int(node.value))

servicex/query/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
2727
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2828

29-
3029
import sys
3130
if sys.version_info < (3, 10):
3231
from importlib_metadata import entry_points

servicex/servicex_client.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,9 @@ def _load_ServiceXSpec(
7777
from importlib.metadata import entry_points
7878

7979
plugins = entry_points(group="servicex.query")
80+
for _ in plugins:
81+
yaml.register_class(_.load())
82+
plugins = entry_points(group="servicex.dataset")
8083
for _ in plugins:
8184
yaml.register_class(_.load())
8285

tests/test_databinder.py

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,11 @@
22
from unittest.mock import patch
33
from pydantic import ValidationError
44

5-
from servicex import ServiceXSpec, FileListDataset, RucioDatasetIdentifier
5+
from servicex import ServiceXSpec, FileListDataset, RucioDatasetIdentifier, dataset
66

77

88
def basic_spec(samples=None):
99
return {
10-
"General": {
11-
"Codegen": "python",
12-
},
1310
"Sample": samples
1411
or [{"Name": "sampleA", "XRootDFiles": "root://a.root", "Query": "a"}],
1512
}
@@ -35,7 +32,6 @@ def test_load_config():
3532

3633

3734
def test_single_root_file():
38-
3935
spec = ServiceXSpec.model_validate(
4036
basic_spec(
4137
samples=[
@@ -52,6 +48,7 @@ def test_single_root_file():
5248
assert spec.Sample[0].dataset_identifier.files == [
5349
"root://eospublic.cern.ch//file1.root"
5450
]
51+
assert spec.Sample[0].dataset_identifier.did is None
5552

5653

5754
def test_list_of_root_files():
@@ -118,6 +115,19 @@ def test_rucio_did_numfiles():
118115
)
119116

120117

118+
def test_cernopendata():
119+
spec = ServiceXSpec.model_validate({
120+
"Sample": [
121+
{
122+
"Name": "sampleA",
123+
"Dataset": dataset.CERNOpenData(1507),
124+
"Function": "a"
125+
}
126+
]
127+
})
128+
assert spec.Sample[0].dataset_identifier.did == "cernopendata://1507"
129+
130+
121131
def test_invalid_dataset_identifier():
122132
with pytest.raises(ValidationError):
123133
ServiceXSpec.model_validate(
@@ -172,6 +182,7 @@ def test_submit_mapping(transformed_result, codegen_list):
172182

173183
def test_yaml(tmp_path):
174184
from servicex.servicex_client import _load_ServiceXSpec
185+
from servicex.dataset import FileList, Rucio, CERNOpenData
175186
# Nominal paths
176187
with open(path := (tmp_path / "python.yaml"), "w") as f:
177188
f.write("""
@@ -194,12 +205,29 @@ def run_query(input_filenames=None):
194205
RucioDID: user.kchoi:user.kchoi.fcnc_tHq_ML.ttH.v11
195206
Query: !UprootRaw |
196207
[{"treename": "nominal"}]
208+
- Name: ttH4
209+
Dataset: !Rucio user.kchoi:user.kchoi.fcnc_tHq_ML.ttH.v11
210+
Query: !UprootRaw '[{"treename": "nominal"}]'
211+
- Name: ttH5
212+
Dataset: !FileList ["/path/to/file1.root", "/path/to/file2.root"]
213+
Query: !UprootRaw '[{"treename": "nominal"}]'
214+
- Name: ttH6
215+
Dataset: !CERNOpenData 1507
216+
Query: !UprootRaw '[{"treename": "nominal"}]'
197217
""")
198218
f.flush()
199219
result = _load_ServiceXSpec(path)
200220
assert type(result.Sample[0].Query).__name__ == 'PythonQuery'
201221
assert type(result.Sample[1].Query).__name__ == 'FuncADLQuery_Uproot'
202222
assert type(result.Sample[2].Query).__name__ == 'UprootRawQuery'
223+
assert isinstance(result.Sample[3].dataset_identifier, Rucio)
224+
assert (result.Sample[3].dataset_identifier.did
225+
== 'rucio://user.kchoi:user.kchoi.fcnc_tHq_ML.ttH.v11')
226+
assert isinstance(result.Sample[4].dataset_identifier, FileList)
227+
assert (result.Sample[4].dataset_identifier.files
228+
== ["/path/to/file1.root", "/path/to/file2.root"])
229+
assert isinstance(result.Sample[5].dataset_identifier, CERNOpenData)
230+
assert result.Sample[5].dataset_identifier.did == 'cernopendata://1507'
203231

204232
# Path from string
205233
result2 = _load_ServiceXSpec(str(path))

0 commit comments

Comments
 (0)