Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ The file can contain an `api_endpoint` as mentioned earlier. In addition the oth
All strings are expanded using python's [os.path.expand](https://docs.python.org/3/library/os.path.html#os.path.expandvars) method - so `$NAME` and `${NAME}` will work to expand existing environment variables.

For non-standard use cases, the user can specify:

- The code generator that is used by the backend. This is done by passing a `codegen` argument to ServiceXDataset. This argument is normally inherited from the backend type set in `servicex.yaml`, but can be overridden with any valid `codegen` contained in the default type listing. A `codegen` entry can also be added to a backend in the yaml file to use as default.
- The type of backend, using the `backend_type` argument on ServiceXDataset. This overrides the backend type setting in the `servicex.yaml` file.

Expand All @@ -206,7 +207,8 @@ Implemented:
- Exceptions are used to report back errors of all sorts from the service to the user's code.
- Data is return in the following forms:
- `pandas.DataFrame` an in process DataFrame of all the data requested
- `awkward` an in process `JaggedArray` or dictionary of `JaggedArray`s
- `awkward` an in process `JaggedArray` or dictionary of `JaggedArray`s.
- If you have `awkward` 2.0 installed, then a `dask_awkward` array is returned instead.
- A list of root files that can be opened with `uproot` and used as desired.
- Not all output formats are compatible with all transformations.
- Complete returned data must fit in the process' memory
Expand Down
14 changes: 11 additions & 3 deletions servicex/data_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ async def _convert_root_to_pandas(self, file: Path):
def do_the_work(file: Path) -> DataFrame:
import uproot as uproot

with uproot.open(file) as f_in:
with uproot.open(file) as f_in: # type: ignore
r = f_in[f_in.keys()[0]]
return r.arrays(library="pd") # type: ignore

Expand Down Expand Up @@ -152,10 +152,18 @@ async def _convert_root_to_awkward(self, file: Path):
def do_the_work(file: Path) -> ak.Array:
import uproot as uproot

with uproot.open(file) as f_in:
with uproot.open(file) as f_in: # type: ignore
tree_name = f_in.keys()[0]

return uproot.lazy(f"{file}:{tree_name}")
if hasattr(uproot, "lazy"):
return uproot.lazy(f"{file}:{tree_name}") # type: ignore

if hasattr(uproot, "dask"):
return uproot.dask(f"{file}:{tree_name}") # type: ignore

assert (
False
), "Uproot version does not have either `dask` or `lazy` - please fix environment!"

return await asyncio.wrap_future(_conversion_pool.submit(do_the_work, file))

Expand Down
21 changes: 18 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,21 @@
else:
version = version.split("/")[-1]

# Awkward 2.0 is only allowed on Python 3.8+ - so we need to shift the
# awkward requirement a little bit.
# TODO: Remove this when we stop supporting 3.7.
if sys.version_info < (3, 8):
awkward_requirements = [
"awkward>=1.0.1,<2",
"uproot>=4.0.1,<5",
]
else:
awkward_requirements = [
"awkward>=1.0.1",
"dask_awkward",
"fsspec",
"uproot>=4.0.1",
]
setup(
name="servicex",
version=version,
Expand All @@ -37,8 +52,7 @@
install_requires=[
"idna==2.10", # Required to thread version needle with requests library
"pandas~=1.0",
"uproot>=4.0.1, <5",
"awkward>=1.0.1, <2",
"uproot>=4.0.1",
"backoff>=2.0",
"aiohttp~=3.6",
"minio~=5.0",
Expand All @@ -48,7 +62,8 @@
"google-auth",
"confuse",
"pyarrow>=1.0",
],
]
+ awkward_requirements,
extras_require={
"test": [
"pytest>=3.9",
Expand Down
16 changes: 8 additions & 8 deletions tests/test_data_conversions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

def check_awkward_accessible(col: ak.Array):
"Check to make sure we can look at every item in column"
ak.repartition(col, 3) # type: ignore
ak.sum(col)


def check_pandas_accessible(col):
Expand Down Expand Up @@ -51,14 +51,14 @@ async def test_parquet_to_pandas(good_uproot_file_path):
@pytest.mark.asyncio
async def test_parquet_to_awkward(good_uproot_file_path):
df = await DataConverterAdaptor("parquet").convert_to_awkward(good_uproot_file_path)
assert len(df["JetPT"]) == 115714
assert len(df["JetPT"]) == 115714 # type: ignore
check_awkward_accessible(df["JetPT"]) # type: ignore


@pytest.mark.asyncio
async def test_root_to_awkward(good_root_file_path):
df = await DataConverterAdaptor("root-file").convert_to_awkward(good_root_file_path)
assert len(df["JetPt"]) == 283458
assert len(df["JetPt"]) == 283458 # type: ignore
check_awkward_accessible(df["JetPt"]) # type: ignore


Expand All @@ -84,7 +84,7 @@ def test_combine_pandas_from_root(good_root_file_path):
def load_df():
import uproot as uproot

with uproot.open(good_root_file_path) as f_in:
with uproot.open(good_root_file_path) as f_in: # type: ignore
r = f_in[f_in.keys()[0]]
return r.arrays(library="pd") # type: ignore

Expand Down Expand Up @@ -120,9 +120,9 @@ def test_combine_awkward_from_root(good_root_file_path):
def load_df():
import uproot as uproot

with uproot.open(good_root_file_path) as f_in:
with uproot.open(good_root_file_path) as f_in: # type: ignore
tree_name = f_in.keys()[0]
return uproot.lazy(f"{good_root_file_path}:{tree_name}")
return f_in[tree_name].arrays() # type: ignore

df1 = load_df()
df2 = load_df()
Expand All @@ -142,7 +142,7 @@ def load_df():
df1 = load_df()
df2 = load_df()

combined = DataConverterAdaptor("root-file").combine_awkward([df1, df2])
combined = DataConverterAdaptor("root-file").combine_awkward([df1, df2]) # type: ignore

assert len(combined) == len(df1) + len(df2)
assert len(combined) == len(df1) + len(df2) # type: ignore
check_awkward_accessible(combined["JetPT"]) # type: ignore