From 681c0ec52a7c8b3a8370b908e3c1063d42994645 Mon Sep 17 00:00:00 2001 From: Gordon Watts Date: Fri, 22 Sep 2023 11:07:49 -0700 Subject: [PATCH 1/8] Allow awkward 2.0 in the setup.py file --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index be40c552..790f635d 100644 --- a/setup.py +++ b/setup.py @@ -37,8 +37,8 @@ install_requires=[ "idna==2.10", # Required to thread version needle with requests library "pandas~=1.0", - "uproot>=4.0.1, <5", - "awkward>=1.0.1, <2", + "uproot>=4.0.1", + "awkward>=1.0.1", "backoff>=2.0", "aiohttp~=3.6", "minio~=5.0", @@ -48,6 +48,7 @@ "google-auth", "confuse", "pyarrow>=1.0", + "fsspec", ], extras_require={ "test": [ From 7fa30420dffc4e7391f5a911e5092db2d82566b4 Mon Sep 17 00:00:00 2001 From: Gordon Watts Date: Fri, 22 Sep 2023 11:08:03 -0700 Subject: [PATCH 2/8] Change test to use supported features of awkward 1 and 2. --- tests/test_data_conversions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_data_conversions.py b/tests/test_data_conversions.py index 9234ceb2..a95ad9b6 100644 --- a/tests/test_data_conversions.py +++ b/tests/test_data_conversions.py @@ -7,7 +7,7 @@ def check_awkward_accessible(col: ak.Array): "Check to make sure we can look at every item in column" - ak.repartition(col, 3) # type: ignore + ak.sum(col) def check_pandas_accessible(col): From 111102c7bda0fdc2ef5c691f83df7f01f74ffa82 Mon Sep 17 00:00:00 2001 From: Gordon Watts Date: Fri, 22 Sep 2023 11:14:12 -0700 Subject: [PATCH 3/8] Fix up uproot.lazy reference --- tests/test_data_conversions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_data_conversions.py b/tests/test_data_conversions.py index a95ad9b6..8e6b2179 100644 --- a/tests/test_data_conversions.py +++ b/tests/test_data_conversions.py @@ -120,9 +120,9 @@ def test_combine_awkward_from_root(good_root_file_path): def load_df(): import uproot as uproot - with 
uproot.open(good_root_file_path) as f_in: + with uproot.open(good_root_file_path) as f_in: # type: ignore tree_name = f_in.keys()[0] - return uproot.lazy(f"{good_root_file_path}:{tree_name}") + return f_in[tree_name].arrays() # type: ignore df1 = load_df() df2 = load_df() From 32429936527fc9382e4a1ad3262299cc2a54c6dd Mon Sep 17 00:00:00 2001 From: Gordon Watts Date: Fri, 22 Sep 2023 11:18:47 -0700 Subject: [PATCH 4/8] Add dask_awkward --- servicex/data_conversions.py | 10 +++++++++- setup.py | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/servicex/data_conversions.py b/servicex/data_conversions.py index 9f200eb7..68bece1c 100644 --- a/servicex/data_conversions.py +++ b/servicex/data_conversions.py @@ -155,7 +155,15 @@ def do_the_work(file: Path) -> ak.Array: with uproot.open(file) as f_in: tree_name = f_in.keys()[0] - return uproot.lazy(f"{file}:{tree_name}") + if hasattr(uproot, "lazy"): + return uproot.lazyarray(f"{file}:{tree_name}") + + if hasattr(uproot, "dask"): + return uproot.dask(f"{file}:{tree_name}") + + assert ( + False + ), "Uproot version does not have either `dask` or `lazy` - please fix environment!" return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file)) diff --git a/setup.py b/setup.py index 790f635d..d3496bd2 100644 --- a/setup.py +++ b/setup.py @@ -39,6 +39,7 @@ "pandas~=1.0", "uproot>=4.0.1", "awkward>=1.0.1", + "dask_awkward", "backoff>=2.0", "aiohttp~=3.6", "minio~=5.0", From b994247cfb1f0858124d7c3a7e5b51c2cc08eae8 Mon Sep 17 00:00:00 2001 From: Gordon Watts Date: Fri, 22 Sep 2023 11:20:01 -0700 Subject: [PATCH 5/8] Add minor documentation change --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 737be2d4..222a4c58 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,7 @@ The file can contain an `api_endpoint` as mentioned earlier. 
In addition the oth All strings are expanded using python's [os.path.expand](https://docs.python.org/3/library/os.path.html#os.path.expandvars) method - so `$NAME` and `${NAME}` will work to expand existing environment variables. For non-standard use cases, the user can specify: + - The code generator that is used by the backend. This is done by passing a `codegen` argument to ServiceXDataset. This argument is normally inherited from the backend type set in `servicex.yaml`, but can be overridden with any valid `codegen` contained in the default type listing. A `codegen` entry can also be added to a backend in the yaml file to use as default. - The type of backend, using the `backend_type` argument on ServiceXDataset. This overrides the backend type setting in the `servicex.yaml` file. @@ -206,7 +207,8 @@ Implemented: - Exceptions are used to report back errors of all sorts from the service to the user's code. - Data is return in the following forms: - `pandas.DataFrame` an in process DataFrame of all the data requested - - `awkward` an in process `JaggedArray` or dictionary of `JaggedArray`s + - `awkward` an in process `JaggedArray` or dictionary of `JaggedArray`s. + - If you have `awkward` 2.0 installed, then a `dask_awkward` array is returned instead. - A list of root files that can be opened with `uproot` and used as desired. - Not all output formats are compatible with all transformations. 
- Complete returned data must fit in the process' memory From e9e30c7c8687699c7a127709c936807c345863b3 Mon Sep 17 00:00:00 2001 From: Gordon Watts Date: Fri, 22 Sep 2023 11:22:48 -0700 Subject: [PATCH 6/8] Fix up type errors --- servicex/data_conversions.py | 8 ++++---- tests/test_data_conversions.py | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/servicex/data_conversions.py b/servicex/data_conversions.py index 68bece1c..81772337 100644 --- a/servicex/data_conversions.py +++ b/servicex/data_conversions.py @@ -97,7 +97,7 @@ async def _convert_root_to_pandas(self, file: Path): def do_the_work(file: Path) -> DataFrame: import uproot as uproot - with uproot.open(file) as f_in: + with uproot.open(file) as f_in: # type: ignore r = f_in[f_in.keys()[0]] return r.arrays(library="pd") # type: ignore @@ -152,14 +152,14 @@ async def _convert_root_to_awkward(self, file: Path): def do_the_work(file: Path) -> ak.Array: import uproot as uproot - with uproot.open(file) as f_in: + with uproot.open(file) as f_in: # type: ignore tree_name = f_in.keys()[0] if hasattr(uproot, "lazy"): - return uproot.lazyarray(f"{file}:{tree_name}") + return uproot.lazy(f"{file}:{tree_name}") # type: ignore if hasattr(uproot, "dask"): - return uproot.dask(f"{file}:{tree_name}") + return uproot.dask(f"{file}:{tree_name}") # type: ignore assert ( False diff --git a/tests/test_data_conversions.py b/tests/test_data_conversions.py index 8e6b2179..a17af4dc 100644 --- a/tests/test_data_conversions.py +++ b/tests/test_data_conversions.py @@ -51,14 +51,14 @@ async def test_parquet_to_pandas(good_uproot_file_path): @pytest.mark.asyncio async def test_parquet_to_awkward(good_uproot_file_path): df = await DataConverterAdaptor("parquet").convert_to_awkward(good_uproot_file_path) - assert len(df["JetPT"]) == 115714 + assert len(df["JetPT"]) == 115714 # type: ignore check_awkward_accessible(df["JetPT"]) # type: ignore @pytest.mark.asyncio async def 
test_root_to_awkward(good_root_file_path): df = await DataConverterAdaptor("root-file").convert_to_awkward(good_root_file_path) - assert len(df["JetPt"]) == 283458 + assert len(df["JetPt"]) == 283458 # type: ignore check_awkward_accessible(df["JetPt"]) # type: ignore @@ -84,7 +84,7 @@ def test_combine_pandas_from_root(good_root_file_path): def load_df(): import uproot as uproot - with uproot.open(good_root_file_path) as f_in: + with uproot.open(good_root_file_path) as f_in: # type: ignore r = f_in[f_in.keys()[0]] return r.arrays(library="pd") # type: ignore @@ -142,7 +142,7 @@ def load_df(): df1 = load_df() df2 = load_df() - combined = DataConverterAdaptor("root-file").combine_awkward([df1, df2]) + combined = DataConverterAdaptor("root-file").combine_awkward([df1, df2]) # type: ignore - assert len(combined) == len(df1) + len(df2) + assert len(combined) == len(df1) + len(df2) # type: ignore check_awkward_accessible(combined["JetPT"]) # type: ignore From e7cc93c663c65b4a67c993950e16d0a9bc3eb7b3 Mon Sep 17 00:00:00 2001 From: Gordon Watts Date: Fri, 22 Sep 2023 11:26:43 -0700 Subject: [PATCH 7/8] Fix up setup to help with testing all python versions --- setup.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index d3496bd2..51b84ace 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,20 @@ else: version = version.split("/")[-1] +# Awkward 2.0 is only allowed on Python 3.8+ - so we need to shift the +# awkward requirement a little bit. 
+if sys.version_info < (3, 8): + awkward_requirements = [ + "awkward>=1.0.1,<2", + "uproot>=4.0.1,<5", + ] +else: + awkward_requirements = [ + "awkward>=1.0.1", + "dask_awkward", + "fsspec", + "uproot>=4.0.1", + ] setup( name="servicex", version=version, @@ -38,8 +52,6 @@ "idna==2.10", # Required to thread version needle with requests library "pandas~=1.0", "uproot>=4.0.1", - "awkward>=1.0.1", - "dask_awkward", "backoff>=2.0", "aiohttp~=3.6", "minio~=5.0", @@ -49,8 +61,8 @@ "google-auth", "confuse", "pyarrow>=1.0", - "fsspec", - ], + ] + + awkward_requirements, extras_require={ "test": [ "pytest>=3.9", From ff11ce8c4f76c54d82bb874ca5c5499dcbbf997c Mon Sep 17 00:00:00 2001 From: Gordon Watts Date: Fri, 22 Sep 2023 11:45:39 -0700 Subject: [PATCH 8/8] Add comment reminding us to get rid of this when we upgrade --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 51b84ace..e6f2e8b1 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ # Awkward 2.0 is only allowed on Python 3.8+ - so we need to shift the # awkward requirement a little bit. +# TODO: Remove this when we stop supporting 3.7. if sys.version_info < (3, 8): awkward_requirements = [ "awkward>=1.0.1,<2",