From d47f1925acd48550d46c2fa070fb243d99fb4d24 Mon Sep 17 00:00:00 2001
From: Peter Onyisi
Date: Fri, 27 Sep 2024 20:46:26 +0000
Subject: [PATCH 1/8] Add XRootD dataset identifier class

---
 pyproject.toml                 |  1 +
 servicex/dataset/__init__.py   |  5 +++--
 servicex/dataset_identifier.py | 20 ++++++++++++++++++++
 tests/test_databinder.py       | 17 +++++++++++++++++
 4 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 008ed025..d90094ae 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,6 +107,7 @@ UprootRaw = "servicex.uproot_raw.uproot_raw:UprootRawQuery"
 Rucio = "servicex.dataset_identifier:RucioDatasetIdentifier"
 FileList = "servicex.dataset_identifier:FileListDataset"
 CERNOpenData = "servicex.dataset_identifier:CERNOpenDataDatasetIdentifier"
+XRootD = "servicex.dataset_identifier:XRootDDatasetIdentifier"
 
 [tool.hatch.build.targets.sdist]
 # hatchling always includes:
diff --git a/servicex/dataset/__init__.py b/servicex/dataset/__init__.py
index 070f0f35..ff14c5aa 100644
--- a/servicex/dataset/__init__.py
+++ b/servicex/dataset/__init__.py
@@ -29,7 +29,8 @@
 from ..dataset_identifier import (RucioDatasetIdentifier as Rucio,  # noqa: F401
                                   FileListDataset as FileList,
                                   CERNOpenDataDatasetIdentifier as CERNOpenData,
-                                  DataSetIdentifier as GenericDataSet)
+                                  DataSetIdentifier as GenericDataSet,
+                                  XRootDDatasetIdentifier as XRootD)
 
 
-__any__ = ['Rucio', 'FileList', 'CERNOpenData', 'GenericDataSet']
+__any__ = ['Rucio', 'FileList', 'CERNOpenData', 'GenericDataSet', 'XRootD']
diff --git a/servicex/dataset_identifier.py b/servicex/dataset_identifier.py
index de765bc8..d795d65d 100644
--- a/servicex/dataset_identifier.py
+++ b/servicex/dataset_identifier.py
@@ -117,3 +117,23 @@ def __init__(self, dataset: int, num_files: Optional[int] = None):
     @classmethod
     def from_yaml(cls, _, node):
         return cls(int(node.value))
+
+
+class XRootDDatasetIdentifier(DataSetIdentifier):
+    def __init__(self, pattern: str, num_files: Optional[int] = None):
+        r"""
+        XRootD dataset - files are located by expanding the wildcards in the given XRootD URL pattern.
+
+        :param pattern: The XRootD URL pattern; it may contain wildcards.
+        :param num_files: Maximum number of files to return. This is useful during development
+                          to perform quick runs. ServiceX is careful to make sure it always
+                          returns the same subset of files.
+
+        """
+        super().__init__("xrootd", pattern, num_files=num_files)
+
+    yaml_tag = '!XRootD'
+
+    @classmethod
+    def from_yaml(cls, _, node):
+        return cls(node.value)
diff --git a/tests/test_databinder.py b/tests/test_databinder.py
index 40cd9695..2327a5df 100644
--- a/tests/test_databinder.py
+++ b/tests/test_databinder.py
@@ -168,6 +168,19 @@ def test_cernopendata():
     assert spec.Sample[0].dataset_identifier.did == "cernopendata://1507"
 
 
+def test_xrootd():
+    spec = ServiceXSpec.model_validate({
+        "Sample": [
+            {
+                "Name": "sampleA",
+                "Dataset": dataset.XRootD('root://blablabla/*/?.root'),
+                "Function": "a"
+            }
+        ]
+    })
+    assert spec.Sample[0].dataset_identifier.did == "xrootd://root://blablabla/*/?.root"
+
+
 def test_invalid_dataset_identifier():
     with pytest.raises(ValidationError):
         ServiceXSpec.model_validate(
@@ -328,6 +341,8 @@ def run_query(input_filenames=None):
             - Name: ttH6
               Dataset: !CERNOpenData 1507
               Query: !UprootRaw '[{"treename": "nominal"}]'
+            - Name: ttH7
+              Dataset: !XRootD root://eosatlas.cern.ch//eos/atlas/path/*/file.root
         """)
         f.flush()
         result = _load_ServiceXSpec(path)
@@ -342,6 +357,8 @@
                 == ["/path/to/file1.root", "/path/to/file2.root"])
     assert isinstance(result.Sample[5].dataset_identifier, CERNOpenData)
     assert result.Sample[5].dataset_identifier.did == 'cernopendata://1507'
+    assert (result.Sample[6].dataset_identifier.did 
+            == 'xrootd://root://eosatlas.cern.ch//eos/atlas/path/*/file.root')
 
     # Path from string
     result2 = _load_ServiceXSpec(str(path))

From 43320295c3adf59eae7c9cff890357a327e64b7d Mon Sep 17 00:00:00 2001
From: Peter Onyisi
Date: Fri, 27 Sep 2024 20:47:43 +0000
Subject: [PATCH 2/8] Flake8

---
 tests/test_databinder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_databinder.py b/tests/test_databinder.py
index 2327a5df..89b99b41 100644
--- a/tests/test_databinder.py
+++ b/tests/test_databinder.py
@@ -357,7 +357,7 @@ def run_query(input_filenames=None):
                 == ["/path/to/file1.root", "/path/to/file2.root"])
     assert isinstance(result.Sample[5].dataset_identifier, CERNOpenData)
     assert result.Sample[5].dataset_identifier.did == 'cernopendata://1507'
-    assert (result.Sample[6].dataset_identifier.did 
+    assert (result.Sample[6].dataset_identifier.did
             == 'xrootd://root://eosatlas.cern.ch//eos/atlas/path/*/file.root')
 
     # Path from string
     result2 = _load_ServiceXSpec(str(path))

From 0d7e8e808d23d0b9239399d8287684fd6a6bd3e0 Mon Sep 17 00:00:00 2001
From: Peter Onyisi
Date: Fri, 27 Sep 2024 21:28:03 +0000
Subject: [PATCH 3/8] Add documentation

---
 docs/datasets.rst          | 30 ++++++++++++++++++++++++++++++
 docs/index.rst             |  1 +
 docs/transform_request.rst |  4 ++--
 3 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 docs/datasets.rst

diff --git a/docs/datasets.rst b/docs/datasets.rst
new file mode 100644
index 00000000..ac2c42f1
--- /dev/null
+++ b/docs/datasets.rst
@@ -0,0 +1,30 @@
+Specifying input datasets
+=========================
+
+When making a ServiceX request, you need to specify where the source data are to be found. This is done by creating instances of ``servicex.dataset`` classes (when writing Python code) or by using appropriate YAML tags (when writing a YAML configuration). The possibilities are given below:
+
+Rucio
+^^^^^
+This will look up a dataset using a query to the Rucio data management system. The request is assumed to be for a Rucio dataset or container.
+ * Python: ``{ "Dataset": servicex.dataset.Rucio("my.rucio.dataset.name") }``
+ * YAML: ``Dataset: !Rucio my.rucio.dataset.name``
+
+CERN Open Data Portal
+^^^^^^^^^^^^^^^^^^^^^
+This looks up files in datasets using their integer ID in the `CERN Open Data Portal <https://opendata.cern.ch>`_. (If you access the web page for a dataset, this is the number following ``opendata.cern.ch/record/`` in the URL.)
+ * Python: ``{ "Dataset": servicex.dataset.CERNOpenData(179) }``
+ * YAML: ``Dataset: !CERNOpenData 179``
+
+A list of network-accessible files
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+If you have URLs for files that ServiceX can access via either the HTTP or XRootD protocols, then these URLs can be given directly to ServiceX. Note that the ServiceX instance must have permissions to read these files; in particular if generic members of your experiment can't access the files, ServiceX will probably not be able to either.
+ * Python: ``{ "Dataset": servicex.dataset.FileList(["http://server/file1.root", "root://server/file2.root"]) }``
+ * YAML: ``Dataset: !FileList ["http://server/file1.root", "root://server/file2.root"]``
+
+XRootD pattern (useful for EOS)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Files can also be located using wildcard patterns with XRootD. So, for example, if you want to include all files in the directory ``/eos/opendata/mystuff`` in the CERN EOS system, given that the directory is made available to the world as ``root://eospublic.cern.ch//eos/opendata/mystuff``, you can ask for ``root://eospublic.cern.ch//eos/opendata/mystuff/*``.
+
+*Note: available from ServiceX client version 3.0.1.*
+ * Python: ``{ "Dataset": servicex.dataset.XRootD("root://eospublic.cern.ch//eos/opendata/mystuff/*") }``
+ * YAML: ``Dataset: !XRootD root://eospublic.cern.ch//eos/opendata/mystuff/*``
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
index 32b5c902..88f177c6 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -65,6 +65,7 @@ re-running queries that have already been executed.
    connect_servicex
    query_types
    transform_request
+   datasets
    examples
    command_line
    contribute
diff --git a/docs/transform_request.rst b/docs/transform_request.rst
index 56fdcad0..016a16ed 100644
--- a/docs/transform_request.rst
+++ b/docs/transform_request.rst
@@ -41,5 +41,5 @@ data, select ``LocalCache`` for ``Delivery``. If you are located at an analysis
 
 The Definitions Sections
 ^^^^^^^^^^^^^^^^^^^^^^^^
-The Definitions section is a dictionary of values that can be substituted into fields in the Sample
-sections. This is useful for defining common values that are used in multiple samples. This is an advanced concept.
\ No newline at end of file
+The Definitions section (only available when setting up the request using YAML files) is a list of values that can be substituted into fields in the Sample
+sections, defined using the YAML anchor/alias syntax. This is useful for defining common values that are used in multiple samples. This is an advanced concept.
\ No newline at end of file

From fc0a0b7a4ab997fcaff392f3a103c505dfaa061a Mon Sep 17 00:00:00 2001
From: Peter Onyisi
Date: Fri, 27 Sep 2024 21:55:46 +0000
Subject: [PATCH 4/8] Avoid server access

---
 tests/test_servicex_dataset.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/tests/test_servicex_dataset.py b/tests/test_servicex_dataset.py
index 626a5c66..f700c597 100644
--- a/tests/test_servicex_dataset.py
+++ b/tests/test_servicex_dataset.py
@@ -27,7 +27,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 import tempfile
 from typing import List
-from unittest.mock import AsyncMock
+from unittest.mock import AsyncMock, patch
 from pathlib import PurePath
 
 import pytest
@@ -422,7 +422,7 @@ async def test_submit_fatal(mocker):
 
 
 @pytest.mark.asyncio
-async def test_submit_generic(mocker):
+async def test_submit_generic(mocker, codegen_list):
     """ Uses Uproot-Raw classes which go through the generic query mechanism """
     import json
     sx = AsyncMock()
@@ -446,10 +446,12 @@
     client.servicex = sx
     client.query_cache = mock_cache
 
-    datasource = client.generic_query(
-        dataset_identifier=did,
-        query=UprootRawQuery({'treename': 'CollectionTree'})
-    )
+    with patch('servicex.servicex_client.ServiceXClient.get_code_generators',
+               return_value=codegen_list):
+        datasource = client.generic_query(
+            dataset_identifier=did,
+            query=UprootRawQuery({'treename': 'CollectionTree'})
+        )
     with ExpandableProgress(display_progress=False) as progress:
         datasource.result_format = ResultFormat.parquet
         _ = await datasource.submit_and_download(signed_urls_only=False,

From 1b5f67cb8aa1faf5eb3a805d844b71fe7695749f Mon Sep 17 00:00:00 2001
From: Peter Onyisi
Date: Fri, 27 Sep 2024 21:58:24 +0000
Subject: [PATCH 5/8] Better patch

---
 tests/test_servicex_dataset.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_servicex_dataset.py b/tests/test_servicex_dataset.py
index f700c597..d6f647d5 100644
--- a/tests/test_servicex_dataset.py
+++ b/tests/test_servicex_dataset.py
@@ -442,12 +442,12 @@
     mock_cache = mocker.MagicMock(QueryCache)
     mocker.patch("servicex.minio_adapter.MinioAdapter", return_value=mock_minio)
     did = FileListDataset("/foo/bar/baz.root")
-    client = ServiceXClient(backend='servicex-uc-af', config_path='tests/example_config.yaml')
-    client.servicex = sx
-    client.query_cache = mock_cache
-
     with patch('servicex.servicex_client.ServiceXClient.get_code_generators',
                return_value=codegen_list):
+        client = ServiceXClient(backend='servicex-uc-af', config_path='tests/example_config.yaml')
+        client.servicex = sx
+        client.query_cache = mock_cache
+
         datasource = client.generic_query(
             dataset_identifier=did,
             query=UprootRawQuery({'treename': 'CollectionTree'})

From 4cd5bc21f5a41b9b376d43c5efb1a3a8f900f56c Mon Sep 17 00:00:00 2001
From: Peter Onyisi
Date: Fri, 27 Sep 2024 22:00:54 +0000
Subject: [PATCH 6/8] Patch another test

---
 tests/test_servicex_dataset.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tests/test_servicex_dataset.py b/tests/test_servicex_dataset.py
index d6f647d5..053aff78 100644
--- a/tests/test_servicex_dataset.py
+++ b/tests/test_servicex_dataset.py
@@ -488,14 +488,16 @@
     mock_cache = mocker.MagicMock(QueryCache)
     mocker.patch("servicex.minio_adapter.MinioAdapter", return_value=mock_minio)
     did = FileListDataset("/foo/bar/baz.root")
-    client = ServiceXClient(backend='servicex-uc-af', config_path='tests/example_config.yaml')
-    client.servicex = sx
-    client.query_cache = mock_cache
+    with patch('servicex.servicex_client.ServiceXClient.get_code_generators',
+               return_value=codegen_list):
+        client = ServiceXClient(backend='servicex-uc-af', config_path='tests/example_config.yaml')
+        client.servicex = sx
+        client.query_cache = mock_cache
 
-    datasource = client.generic_query(
-        dataset_identifier=did,
-        query=UprootRawQuery({'treename': 'CollectionTree'})
-    )
+        datasource = client.generic_query(
+            dataset_identifier=did,
+            query=UprootRawQuery({'treename': 'CollectionTree'})
+        )
     with ExpandableProgress(display_progress=False) as progress:
         datasource.result_format = ResultFormat.parquet
         _ = await datasource.submit_and_download(signed_urls_only=False,

From 709046501210f04429022845fc1a6e578d50ec3e Mon Sep 17 00:00:00 2001
From: Peter Onyisi
Date: Fri, 27 Sep 2024 22:01:31 +0000
Subject: [PATCH 7/8] Fix

---
 tests/test_servicex_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_servicex_dataset.py b/tests/test_servicex_dataset.py
index 053aff78..e6e44b4e 100644
--- a/tests/test_servicex_dataset.py
+++ b/tests/test_servicex_dataset.py
@@ -470,7 +470,7 @@
 
 
 @pytest.mark.asyncio
-async def test_submit_cancelled(mocker):
+async def test_submit_cancelled(mocker, codegen_list):
     """ Uses Uproot-Raw classes which go through the query cancelled mechanism """
     import json
     sx = AsyncMock()

From c124d55162cbc5804c22d1e360e2f02305cffa33 Mon Sep 17 00:00:00 2001
From: Peter Onyisi
Date: Mon, 30 Sep 2024 16:50:37 +0000
Subject: [PATCH 8/8] Fix reduction in code coverage

---
 tests/test_servicex_dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_servicex_dataset.py b/tests/test_servicex_dataset.py
index e6e44b4e..d6e5bbfc 100644
--- a/tests/test_servicex_dataset.py
+++ b/tests/test_servicex_dataset.py
@@ -442,7 +442,7 @@
     mock_cache = mocker.MagicMock(QueryCache)
     mocker.patch("servicex.minio_adapter.MinioAdapter", return_value=mock_minio)
     did = FileListDataset("/foo/bar/baz.root")
-    with patch('servicex.servicex_client.ServiceXClient.get_code_generators',
+    with patch('servicex.servicex_adapter.ServiceXAdapter.get_code_generators',
               return_value=codegen_list):
         client = ServiceXClient(backend='servicex-uc-af', config_path='tests/example_config.yaml')
         client.servicex = sx
         client.query_cache = mock_cache
@@ -488,7 +488,7 @@
     mock_cache = mocker.MagicMock(QueryCache)
     mocker.patch("servicex.minio_adapter.MinioAdapter", return_value=mock_minio)
     did = FileListDataset("/foo/bar/baz.root")
-    with patch('servicex.servicex_client.ServiceXClient.get_code_generators',
+    with patch('servicex.servicex_adapter.ServiceXAdapter.get_code_generators',
               return_value=codegen_list):
         client = ServiceXClient(backend='servicex-uc-af', config_path='tests/example_config.yaml')
         client.servicex = sx
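
For reference, a minimal end-to-end usage sketch of the XRootD identifier added by this series (not part of the patches). It assumes the servicex 3.x client API with ``deliver``, ``dataset``, and ``query`` importable from ``servicex``; the EOS pattern, sample name, and tree name are placeholders.

# Usage sketch (illustrative only): build a spec with the new XRootD dataset
# identifier and ask ServiceX to deliver the results locally.
from servicex import ServiceXSpec, dataset, query, deliver

spec = ServiceXSpec.model_validate({
    "Sample": [
        {
            "Name": "eos_wildcard_sample",
            # The wildcard pattern is expanded by the XRootD DID finder
            # (note the "xrootd://" prefix checked in the tests above).
            "Dataset": dataset.XRootD("root://eospublic.cern.ch//eos/opendata/mystuff/*"),
            "Query": query.UprootRaw([{"treename": "CollectionTree"}]),
        }
    ]
})

# deliver() submits the transform and returns the delivered files keyed by sample name.
results = deliver(spec)

The same request can be written in YAML with ``Dataset: !XRootD root://eospublic.cern.ch//eos/opendata/mystuff/*``, mirroring the ``!Rucio`` and ``!CERNOpenData`` tags documented in docs/datasets.rst.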