diff --git a/README.md b/README.md index 737be2d4..222a4c58 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,7 @@ The file can contain an `api_endpoint` as mentioned earlier. In addition the oth All strings are expanded using python's [os.path.expand](https://docs.python.org/3/library/os.path.html#os.path.expandvars) method - so `$NAME` and `${NAME}` will work to expand existing environment variables. For non-standard use cases, the user can specify: + - The code generator that is used by the backend. This is done by passing a `codegen` argument to ServiceXDataset. This argument is normally inherited from the backend type set in `servicex.yaml`, but can be overridden with any valid `codegen` contained in the default type listing. A `codegen` entry can also be added to a backend in the yaml file to use as default. - The type of backend, using the `backend_type` argument on ServiceXDataset. This overrides the backend type setting in the `servicex.yaml` file. @@ -206,7 +207,8 @@ Implemented: - Exceptions are used to report back errors of all sorts from the service to the user's code. - Data is return in the following forms: - `pandas.DataFrame` an in process DataFrame of all the data requested - - `awkward` an in process `JaggedArray` or dictionary of `JaggedArray`s + - `awkward` an in process `JaggedArray` or dictionary of `JaggedArray`s. + - If you have `awkward` 2.0 installed, then a `dask_awkward` array is returned instead. - A list of root files that can be opened with `uproot` and used as desired. - Not all output formats are compatible with all transformations. - Complete returned data must fit in the process' memory diff --git a/servicex/data_conversions.py b/servicex/data_conversions.py index 9f200eb7..81772337 100644 --- a/servicex/data_conversions.py +++ b/servicex/data_conversions.py @@ -97,7 +97,7 @@ async def _convert_root_to_pandas(self, file: Path): def do_the_work(file: Path) -> DataFrame: import uproot as uproot - with uproot.open(file) as f_in: + with uproot.open(file) as f_in: # type: ignore r = f_in[f_in.keys()[0]] return r.arrays(library="pd") # type: ignore @@ -152,10 +152,18 @@ async def _convert_root_to_awkward(self, file: Path): def do_the_work(file: Path) -> ak.Array: import uproot as uproot - with uproot.open(file) as f_in: + with uproot.open(file) as f_in: # type: ignore tree_name = f_in.keys()[0] - return uproot.lazy(f"{file}:{tree_name}") + if hasattr(uproot, "lazy"): + return uproot.lazy(f"{file}:{tree_name}") # type: ignore + + if hasattr(uproot, "dask"): + return uproot.dask(f"{file}:{tree_name}") # type: ignore + + assert ( + False + ), "Uproot version does not have either `dask` or `lazy` - please fix environment!" return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file)) diff --git a/setup.py b/setup.py index be40c552..e6f2e8b1 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,21 @@ else: version = version.split("/")[-1] +# Awkward 2.0 is only allowed on Python 3.8+ - so we need to shift the +# awkward requirement a little bit. +# TODO: Remove this when we stop supporting 3.7. +if sys.version_info < (3, 8): + awkward_requirements = [ + "awkward>=1.0.1,<2", + "uproot>=4.0.1,<5", + ] +else: + awkward_requirements = [ + "awkward>=1.0.1", + "dask_awkward", + "fsspec", + "uproot>=4.0.1", + ] setup( name="servicex", version=version, @@ -37,8 +52,7 @@ install_requires=[ "idna==2.10", # Required to thread version needle with requests library "pandas~=1.0", - "uproot>=4.0.1, <5", - "awkward>=1.0.1, <2", + "uproot>=4.0.1", "backoff>=2.0", "aiohttp~=3.6", "minio~=5.0", @@ -48,7 +62,8 @@ "google-auth", "confuse", "pyarrow>=1.0", - ], + ] + + awkward_requirements, extras_require={ "test": [ "pytest>=3.9", diff --git a/tests/test_data_conversions.py b/tests/test_data_conversions.py index 9234ceb2..a17af4dc 100644 --- a/tests/test_data_conversions.py +++ b/tests/test_data_conversions.py @@ -7,7 +7,7 @@ def check_awkward_accessible(col: ak.Array): "Check to make sure we can look at every item in column" - ak.repartition(col, 3) # type: ignore + ak.sum(col) def check_pandas_accessible(col): @@ -51,14 +51,14 @@ async def test_parquet_to_pandas(good_uproot_file_path): @pytest.mark.asyncio async def test_parquet_to_awkward(good_uproot_file_path): df = await DataConverterAdaptor("parquet").convert_to_awkward(good_uproot_file_path) - assert len(df["JetPT"]) == 115714 + assert len(df["JetPT"]) == 115714 # type: ignore check_awkward_accessible(df["JetPT"]) # type: ignore @pytest.mark.asyncio async def test_root_to_awkward(good_root_file_path): df = await DataConverterAdaptor("root-file").convert_to_awkward(good_root_file_path) - assert len(df["JetPt"]) == 283458 + assert len(df["JetPt"]) == 283458 # type: ignore check_awkward_accessible(df["JetPt"]) # type: ignore @@ -84,7 +84,7 @@ def test_combine_pandas_from_root(good_root_file_path): def load_df(): import uproot as uproot - with uproot.open(good_root_file_path) as f_in: + with uproot.open(good_root_file_path) as f_in: # type: ignore r = f_in[f_in.keys()[0]] return r.arrays(library="pd") # type: ignore @@ -120,9 +120,9 @@ def test_combine_awkward_from_root(good_root_file_path): def load_df(): import uproot as uproot - with uproot.open(good_root_file_path) as f_in: + with uproot.open(good_root_file_path) as f_in: # type: ignore tree_name = f_in.keys()[0] - return uproot.lazy(f"{good_root_file_path}:{tree_name}") + return f_in[tree_name].arrays() # type: ignore df1 = load_df() df2 = load_df() @@ -142,7 +142,7 @@ def load_df(): df1 = load_df() df2 = load_df() - combined = DataConverterAdaptor("root-file").combine_awkward([df1, df2]) + combined = DataConverterAdaptor("root-file").combine_awkward([df1, df2]) # type: ignore - assert len(combined) == len(df1) + len(df2) + assert len(combined) == len(df1) + len(df2) # type: ignore check_awkward_accessible(combined["JetPT"]) # type: ignore