Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 70 additions & 6 deletions gnomad/resources/resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

import logging
from abc import ABC, abstractmethod
from functools import reduce
from typing import Any, Callable, Dict, List, Optional
from functools import reduce, wraps
from typing import Any, Callable, Dict, Iterable, List, Optional

import hail as hl
from hail.linalg import BlockMatrix
Expand Down Expand Up @@ -373,9 +373,47 @@ def __init__(self, default_version: str, versions: Dict[str, BlockMatrixResource
super().__init__(default_version, versions)


class ResourceNotAvailable(Exception):
"""Exception raised if a resource is not available from the selected source."""


class GnomadPublicResource(BaseResource, ABC):
"""Base class for the gnomAD project's public resources."""

def __init_subclass__(cls, *, read_resource_methods: Iterable[str] = []) -> None:
super().__init_subclass__()

# Some resources may not be available from all sources due to delays in syncing, etc.
# This wraps all methods that read the resource and adds a check for if the resource
# is available from the selected source. If the resource is not available, this
# throws a more helpful error than if the read were simply allowed to fail.
def _wrap_read_resource_method(method_name):
original_method = getattr(cls, method_name)

@wraps(original_method)
def read_resource(self, *args, **kwargs):
# If one of the known sources is selected, check if the resource is available.
# For custom sources, skip the check and attempt to read the resource.
resource_source = gnomad_public_resource_configuration.source
if (
isinstance(resource_source, GnomadPublicResourceSource)
and resource_source != GnomadPublicResourceSource.GNOMAD
):
if not self.is_resource_available():
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think it makes sense to check if the gnomAD resource exists here( given how occasionally the repo lags behind buckets)? I'm wondering if it makes sense to add functionality to load from the gnomAD source on failure? Or if that gets more complicated as we add other cloud providers public buckets.

Copy link
Contributor Author

@nawatts nawatts Jun 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was assuming that resources will always be present in the gnomAD bucket, since they are copied from there to the cloud provider buckets.

I did not consider the case where an older version of this code points to a resource that no longer exists in the gnomAD buckets. Ideally, those URLs would not change but, practically, they do. That's a good point. We could catch that and suggest users update to the most recent version of gnomad_methods.

I'll claim that we should not automatically load from the gnomAD source if the resource doesn't exist. Most Hail resources are in the requester pays bucket, so if a user has configured their pipeline to use one of the cloud provider buckets, automatically falling back to the gnomAD buckets could result in unexpected egress/operation costs for that user. Also, they may not have Hail configured to allow requester pays access. I think the best we can do is point them towards the solution.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did not think of the cost associated with the requester-pays bucket, it absolutely makes sense to not automatically load the gnomAD source.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #390 for this.

raise ResourceNotAvailable(
f"This resource is not currently available from {resource_source.value}.\n\n"
"To load resources directly from gnomAD instead, use:\n\n"
">>> from gnomad.resources.config import gnomad_public_resource_configuration, GnomadPublicResourceSource\n"
">>> gnomad_public_resource_configuration.source = GnomadPublicResourceSource.GNOMAD"
)

return original_method(self, *args, **kwargs)

setattr(cls, method_name, read_resource)

for method_name in read_resource_methods:
_wrap_read_resource_method(method_name)

def _get_path(self) -> str:
resource_source = gnomad_public_resource_configuration.source
if resource_source == GnomadPublicResourceSource.GNOMAD:
Expand Down Expand Up @@ -406,20 +444,46 @@ def _set_path(self, path):

return super()._set_path(path)

def is_resource_available(self) -> bool:
"""
Check if this resource is available from the selected source.

:return: True if the resource is available.
"""
path = self.path

# Hail Tables, MatrixTables, and BlockMatrices are directories.
# For those, check for the existence of the _SUCCESS object.
path_to_test = (
f"{path}/_SUCCESS"
if any(path.endswith(ext) for ext in (".ht", ".mt", ".bm"))
else path
)

return hl.current_backend().fs.exists(path_to_test)


class GnomadPublicTableResource(TableResource, GnomadPublicResource):
class GnomadPublicTableResource(
TableResource, GnomadPublicResource, read_resource_methods=("ht",)
):
"""Resource class for a public Hail Table published by the gnomAD project."""


class GnomadPublicMatrixTableResource(MatrixTableResource, GnomadPublicResource):
class GnomadPublicMatrixTableResource(
MatrixTableResource, GnomadPublicResource, read_resource_methods=("mt",)
):
"""Resource class for a public Hail MatrixTable published by the gnomAD project."""


class GnomadPublicPedigreeResource(PedigreeResource, GnomadPublicResource):
class GnomadPublicPedigreeResource(
PedigreeResource, GnomadPublicResource, read_resource_methods=("ht", "pedigree")
):
"""Resource class for a public pedigree published by the gnomAD project."""


class GnomadPublicBlockMatrixResource(BlockMatrixResource, GnomadPublicResource):
class GnomadPublicBlockMatrixResource(
BlockMatrixResource, GnomadPublicResource, read_resource_methods=("bm",)
):
"""Resource class for a public Hail BlockMatrix published by the gnomAD project."""


Expand Down
29 changes: 17 additions & 12 deletions tests/resources/test_resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,9 @@ def test_read_gnomad_public_table_resource(

gnomad_public_resource_configuration.source = source

resource.ht()
read_table.assert_called_with(expected_read_path)
with patch.object(resource, "is_resource_available", return_value=True):
resource.ht()
read_table.assert_called_with(expected_read_path)


class TestGnomadPublicMatrixTableResource:
Expand All @@ -158,8 +159,9 @@ def test_read_gnomad_public_matrix_table_resource(

gnomad_public_resource_configuration.source = source

resource.mt()
read_matrix_table.assert_called_with(expected_read_path)
with patch.object(resource, "is_resource_available", return_value=True):
resource.mt()
read_matrix_table.assert_called_with(expected_read_path)


class TestGnomadPublicPedigreeResource:
Expand All @@ -178,9 +180,10 @@ def test_read_gnomad_public_pedigree_resource(

gnomad_public_resource_configuration.source = source

resource.pedigree()
read_pedigree.assert_called()
assert read_pedigree.call_args[0][0] == expected_read_path
with patch.object(resource, "is_resource_available", return_value=True):
resource.pedigree()
read_pedigree.assert_called()
assert read_pedigree.call_args[0][0] == expected_read_path

@pytest.mark.parametrize(
"resource_path,source,expected_read_path",
Expand All @@ -195,9 +198,10 @@ def test_import_gnomad_public_pedigree_resource(

gnomad_public_resource_configuration.source = source

resource.ht()
import_fam.assert_called()
assert import_fam.call_args[0][0] == expected_read_path
with patch.object(resource, "is_resource_available", return_value=True):
resource.ht()
import_fam.assert_called()
assert import_fam.call_args[0][0] == expected_read_path


class TestGnomadPublicBlockMatrixResource:
Expand All @@ -216,5 +220,6 @@ def test_read_gnomad_public_block_matrix_resource(

gnomad_public_resource_configuration.source = source

resource.bm()
read_block_matrix.assert_called_with(expected_read_path)
with patch.object(resource, "is_resource_available", return_value=True):
resource.bm()
read_block_matrix.assert_called_with(expected_read_path)