Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/resource_sources.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@ gnomAD data is available through `multiple cloud providers' public datasets prog

The functions in the :doc:`gnomad.resources </api_reference/resources/index>` package can be configured to load data from different sources.

By default, resources are loaded from Google Cloud Public Datasets. This can be configured using the ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` environment variable.
If Hail determines that it is running in a cloud provider's Spark environment, resources will default to being read from that cloud provider's datasets program.
For example, resources will be read from Azure Open Datasets if Hail determines that it is running on an Azure HDInsight cluster.
Otherwise, resources will default to being read from Google Cloud Public Datasets.
This can be configured using the ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` environment variable.

To load resources from a different source (for example, the gnomAD project's public GCS bucket), use:

Expand Down
34 changes: 30 additions & 4 deletions gnomad/resources/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,13 @@ class GnomadPublicResourceSource(Enum):
def get_default_public_resource_source() -> Union[GnomadPublicResourceSource, str]:
"""
Get the default source for public gnomAD resources.

.. note::

Default is pulled from the `GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE` environment variable if it exists. Otherwise `GOOGLE_CLOUD_PUBLIC_DATASETS` is used.

The default source is determined by...

- If the ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` environment variable is set, use the source configured there.
- Otherwise, if Hail determines that it is running in a cloud provider's Spark environment, use the source from that cloud provider.
For example, use Azure Open Datasets if running on an Azure HDInsight cluster.
- Otherwise, use Google Cloud Public Datasets.

:returns: Default resource source
"""
Expand All @@ -44,6 +47,29 @@ def get_default_public_resource_source() -> Union[GnomadPublicResourceSource, st
)
return default_source_from_env

try:
from hail.utils import guess_cloud_spark_provider
except ImportError:
pass
else:
cloud_spark_provider = guess_cloud_spark_provider()
default_resource_sources_by_provider = {
"dataproc": GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS,
"hdinsight": GnomadPublicResourceSource.AZURE_OPEN_DATASETS,
}
if cloud_spark_provider:
try:
default_source_from_provider = default_resource_sources_by_provider[
cloud_spark_provider
]
logger.info(
"Using default source for gnomAD resources based on cloud provider: %s",
default_source_from_provider,
)
return default_source_from_provider
except KeyError:
pass

return GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS


Expand Down
37 changes: 37 additions & 0 deletions tests/resources/test_resource_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,43 @@ def test_get_default_source_from_environment(
):
assert get_default_public_resource_source() == expected_default_source

@pytest.mark.parametrize(
    "cloud_spark_provider,expected_default_source",
    [
        ("dataproc", GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS),
        ("hdinsight", GnomadPublicResourceSource.AZURE_OPEN_DATASETS),
        ("unknown", GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS),
        (None, GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS),
    ],
)
def test_get_default_source_from_cloud_spark_provider(
    self, cloud_spark_provider, expected_default_source
):
    """
    Test that default source is set based on cloud Spark provider.

    Known providers map to their own datasets program; an unrecognized
    provider name or ``None`` falls back to Google Cloud Public Datasets.

    :param cloud_spark_provider: Value the mocked
        ``hail.utils.guess_cloud_spark_provider`` returns.
    :param expected_default_source: Source expected from
        ``get_default_public_resource_source`` for that provider.
    """
    with patch(
        "hail.utils.guess_cloud_spark_provider",
        return_value=cloud_spark_provider,
        create=True,
    ), patch.dict(os.environ):
        # The environment variable takes precedence over provider detection,
        # so an ambient value (developer shell, CI) would mask the behavior
        # under test. Remove it for the duration of the test; patch.dict
        # restores the original environment on exit.
        os.environ.pop("GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE", None)
        assert get_default_public_resource_source() == expected_default_source

def test_default_source_from_environment_overrides_cloud_spark_provider(self):
    """
    Test that the environment variable wins over the detected cloud Spark provider.

    The mocked provider is ``hdinsight``, yet the source configured through
    ``GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE`` must be the one returned.
    """
    configured_source = "gs://my-bucket/gnomad-resources"
    env_overrides = {"GNOMAD_DEFAULT_PUBLIC_RESOURCE_SOURCE": configured_source}
    provider_patch = patch(
        "hail.utils.guess_cloud_spark_provider",
        return_value="hdinsight",
        create=True,
    )

    with provider_patch, patch.dict(os.environ, env_overrides):
        assert get_default_public_resource_source() == configured_source


def gnomad_public_resource_test_parameters(
path: str,
Expand Down