diff --git a/CHANGELOG.md b/CHANGELOG.md index 508895de7..cecd28f88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ * Added function `make_faf_index_dict` to create a look-up Dictionary for entries contained in the filter allele frequency annotation array [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) * Added function `make_freq_index_dict` to create a look-up Dictionary for entries contained in the frequency annotation array [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) * VersionedResource objects are no longer subclasses of BaseResource [(#359)](https://github.com/broadinstitute/gnomad_methods/pull/359) +* gnomAD resources can now be imported from different sources [(#373)](https://github.com/broadinstitute/gnomad_methods/pull/373) ## Version 0.5.0 - April 22nd, 2021 diff --git a/gnomad/resources/config.py b/gnomad/resources/config.py new file mode 100644 index 000000000..a2f8a3e8d --- /dev/null +++ b/gnomad/resources/config.py @@ -0,0 +1,47 @@ +"""Configuration for loading resources.""" + +from enum import Enum +from typing import Union + + +class GnomadPublicResourceSource(Enum): + """Sources for public gnomAD resources.""" + + GNOMAD = "gnomAD" + GOOGLE_CLOUD_PUBLIC_DATASETS = "Google Cloud Public Datasets" + + +DEFAULT_GNOMAD_PUBLIC_RESOURCE_SOURCE = GnomadPublicResourceSource.GNOMAD + + +class _GnomadPublicResourceConfiguration: + """Configuration for public gnomAD resources.""" + + __source: Union[ + GnomadPublicResourceSource, str + ] = DEFAULT_GNOMAD_PUBLIC_RESOURCE_SOURCE + + @property + def source(self) -> Union[GnomadPublicResourceSource, str]: + """ + Get the source for public gnomAD resource files. + + This is used to determine which URLs gnomAD resources will be loaded from. + + :returns: Source name or path to root of resources directory + """ + return self.__source + + @source.setter + def source(self, source: Union[GnomadPublicResourceSource, str]) -> None: + """ + Set the default source for resource files. + + This is used to determine which URLs gnomAD resources will be loaded from. + + :param source: Source name or path to root of resources directory + """ + self.__source = source + + +gnomad_public_resource_configuration = _GnomadPublicResourceConfiguration() diff --git a/gnomad/resources/grch37/gnomad.py b/gnomad/resources/grch37/gnomad.py index 99a04ad50..89de0702c 100644 --- a/gnomad/resources/grch37/gnomad.py +++ b/gnomad/resources/grch37/gnomad.py @@ -2,7 +2,7 @@ from gnomad.resources.resource_utils import ( DataException, - TableResource, + GnomadPublicTableResource, VersionedTableResource, ) @@ -124,7 +124,9 @@ def public_release(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_public_release_ht_path(data_type, release)) + release: GnomadPublicTableResource( + path=_public_release_ht_path(data_type, release) + ) for release in releases }, ) @@ -150,7 +152,9 @@ def coverage(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_public_coverage_ht_path(data_type, release)) + release: GnomadPublicTableResource( + path=_public_coverage_ht_path(data_type, release) + ) for release in releases }, ) @@ -176,13 +180,15 @@ def liftover(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_liftover_data_path(data_type, release)) + release: GnomadPublicTableResource( + path=_liftover_data_path(data_type, release) + ) for release in releases }, ) -def public_pca_loadings(subpop: str = "") -> TableResource: +def public_pca_loadings(subpop: str = "") -> GnomadPublicTableResource: """ Return the TableResource containing sites and loadings from population PCA. @@ -194,7 +200,7 @@ def public_pca_loadings(subpop: str = "") -> TableResource: 'Available subpops are "eas" or "nfe", default value "" for global' ) - return TableResource(path=_public_pca_ht_path(subpop)) + return GnomadPublicTableResource(path=_public_pca_ht_path(subpop)) def release_vcf_path(data_type: str, version: str, contig: str) -> str: diff --git a/gnomad/resources/grch37/gnomad_ld.py b/gnomad/resources/grch37/gnomad_ld.py index 78affb55d..7cba27d86 100644 --- a/gnomad/resources/grch37/gnomad_ld.py +++ b/gnomad/resources/grch37/gnomad_ld.py @@ -1,6 +1,9 @@ # noqa: D100 -from gnomad.resources.resource_utils import TableResource, BlockMatrixResource +from gnomad.resources.resource_utils import ( + GnomadPublicTableResource, + GnomadPublicBlockMatrixResource, +) from gnomad.resources.grch37.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE from typing import Optional @@ -67,16 +70,16 @@ def _ld_scores_path( return f'gs://gnomad-public-requester-pays/release/{version}/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." if adj else ""}ld_scores.ht' -def ld_matrix(pop: str) -> BlockMatrixResource: +def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: """Get resource for the LD matrix for the given population.""" - return BlockMatrixResource(path=_ld_matrix_path("genomes", pop)) + return GnomadPublicBlockMatrixResource(path=_ld_matrix_path("genomes", pop)) -def ld_index(pop: str) -> TableResource: +def ld_index(pop: str) -> GnomadPublicTableResource: """Get resource for the LD indices for the given population.""" - return TableResource(path=_ld_index_path("genomes", pop)) + return GnomadPublicTableResource(path=_ld_index_path("genomes", pop)) -def ld_scores(pop: str) -> TableResource: +def ld_scores(pop: str) -> GnomadPublicTableResource: """Get resource for the LD scores for the given population.""" - return TableResource(path=_ld_scores_path("genomes", pop)) + return GnomadPublicTableResource(path=_ld_scores_path("genomes", pop)) diff --git a/gnomad/resources/grch37/reference_data.py b/gnomad/resources/grch37/reference_data.py index d475bc1fb..c7c89b0d4 100644 --- a/gnomad/resources/grch37/reference_data.py +++ b/gnomad/resources/grch37/reference_data.py @@ -1,15 +1,15 @@ # noqa: D100 from gnomad.resources.resource_utils import ( - MatrixTableResource, - TableResource, + GnomadPublicMatrixTableResource, + GnomadPublicTableResource, VersionedMatrixTableResource, VersionedTableResource, import_sites_vcf, ) import hail as hl -na12878_giab = MatrixTableResource( +na12878_giab = GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt", import_func=hl.import_vcf, import_args={ @@ -20,7 +20,7 @@ }, ) -hapmap = TableResource( +hapmap = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/hapmap/hapmap_3.3.b37.ht", import_func=import_sites_vcf, import_args={ @@ -31,7 +31,7 @@ }, ) -kgp_omni = TableResource( +kgp_omni = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/kgp/1000G_omni2.5.b37.ht", import_func=import_sites_vcf, import_args={ @@ -42,7 +42,7 @@ }, ) -mills = TableResource( +mills = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/mills/Mills_and_1000G_gold_standard.indels.b37.ht", import_func=import_sites_vcf, import_args={ @@ -53,7 +53,7 @@ }, ) -syndip = MatrixTableResource( +syndip = GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch37/syndip/hybrid.m37m.mt", import_func=hl.import_vcf, import_args={ @@ -67,7 +67,7 @@ vep_context = VersionedTableResource( default_version="85", versions={ - "85": TableResource( + "85": GnomadPublicTableResource( path="gs://gnomad-public-requester-pays/resources/context/grch37_context_vep_annotated.ht", ) }, @@ -76,7 +76,7 @@ dbsnp = VersionedTableResource( default_version="20180423", versions={ - "20180423": TableResource( + "20180423": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/dbsnp/All_20180423.ht", import_func=import_sites_vcf, import_args={ @@ -93,7 +93,7 @@ clinvar = VersionedTableResource( default_version="20181028", versions={ - "20181028": TableResource( + "20181028": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/clinvar/clinvar_20181028.vep.ht", import_func=import_sites_vcf, import_args={ @@ -110,7 +110,7 @@ kgp_phase_3 = VersionedMatrixTableResource( default_version="phase_3_split", versions={ - "phase_3_split": MatrixTableResource( + "phase_3_split": GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.split.mt", import_func=hl.import_vcf, import_args={ @@ -121,7 +121,7 @@ "reference_genome": "GRCh37", }, ), - "phase_3": MatrixTableResource( + "phase_3": GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.mt", import_func=hl.import_vcf, import_args={ @@ -138,7 +138,7 @@ kgp = VersionedTableResource( default_version="phase_1_hc", versions={ - "phase_1_hc": TableResource( + "phase_1_hc": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/kgp/1000G_phase1.snps.high_confidence.b37.ht", import_func=import_sites_vcf, import_args={ @@ -152,13 +152,15 @@ }, ) -cpg_sites = TableResource(path="gs://gnomad-public/resources/grch37/cpg_sites/cpg.ht") +cpg_sites = GnomadPublicTableResource( + path="gs://gnomad-public/resources/grch37/cpg_sites/cpg.ht" +) -methylation_sites = TableResource( +methylation_sites = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/methylation_sites/methylation.ht" ) -lcr_intervals = TableResource( +lcr_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/lcr_intervals/LCR.GRCh37_compliant.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -167,7 +169,7 @@ }, ) -decoy_intervals = TableResource( +decoy_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/decoy_intervals/mm-2-merged.GRCh37_compliant.ht", import_func=hl.import_bed, import_args={ @@ -176,7 +178,7 @@ }, ) -purcell_5k_intervals = TableResource( +purcell_5k_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/purcell_5k_intervals/purcell5k.ht", import_func=hl.import_locus_intervals, import_args={ @@ -185,7 +187,7 @@ }, ) -seg_dup_intervals = TableResource( +seg_dup_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/seg_dup_intervals/hg19_self_chain_split_both.ht", import_func=hl.import_bed, import_args={ @@ -194,7 +196,7 @@ }, ) -exome_hc_intervals = TableResource( +exome_hc_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/exomes_high_coverage.auto.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -203,7 +205,7 @@ }, ) -high_coverage_intervals = TableResource( +high_coverage_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/high_coverage.auto.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -212,7 +214,7 @@ }, ) -exome_calling_intervals = TableResource( +exome_calling_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/exome_calling_regions.v1.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -221,7 +223,7 @@ }, ) -exome_evaluation_intervals = TableResource( +exome_evaluation_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/exome_evaluation_regions.v1.noheader.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -230,7 +232,7 @@ }, ) -genome_evaluation_intervals = TableResource( +genome_evaluation_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/hg19-v0-wgs_evaluation_regions.v1.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -239,7 +241,7 @@ }, ) -na12878_hc_intervals = TableResource( +na12878_hc_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_intervals.ht", import_func=hl.import_bed, import_args={ @@ -248,7 +250,7 @@ }, ) -syndip_hc_intervals = TableResource( +syndip_hc_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/syndip/syndip_highconf_genome_intervals.ht", import_func=hl.import_bed, import_args={ diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index bcc7d537b..3fedb79aa 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -1,9 +1,9 @@ # noqa: D100 from gnomad.resources.resource_utils import ( - TableResource, + GnomadPublicTableResource, + GnomadPublicMatrixTableResource, VersionedMatrixTableResource, - MatrixTableResource, VersionedTableResource, DataException, ) @@ -147,7 +147,7 @@ gnomad_syndip = VersionedMatrixTableResource( default_version="3.0", versions={ - "3.0": MatrixTableResource( + "3.0": GnomadPublicMatrixTableResource( path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_syndip.b38.mt" ) }, @@ -156,7 +156,7 @@ na12878 = VersionedMatrixTableResource( default_version="3.0", versions={ - "3.0": MatrixTableResource( + "3.0": GnomadPublicMatrixTableResource( path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_na12878.mt" ) }, @@ -209,7 +209,9 @@ def public_release(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_public_release_ht_path(data_type, release)) + release: GnomadPublicTableResource( + path=_public_release_ht_path(data_type, release) + ) for release in releases }, ) @@ -237,7 +239,9 @@ def coverage(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_public_coverage_ht_path(data_type, release)) + release: GnomadPublicTableResource( + path=_public_coverage_ht_path(data_type, release) + ) for release in releases }, ) diff --git a/gnomad/resources/grch38/reference_data.py b/gnomad/resources/grch38/reference_data.py index d71fed87c..a3d69f50d 100644 --- a/gnomad/resources/grch38/reference_data.py +++ b/gnomad/resources/grch38/reference_data.py @@ -2,9 +2,9 @@ from gnomad.resources.resource_utils import ( DBSNP_B154_CHR_CONTIG_RECODING, - TableResource, + GnomadPublicTableResource, + GnomadPublicMatrixTableResource, VersionedTableResource, - MatrixTableResource, VersionedMatrixTableResource, import_sites_vcf, NO_CHR_TO_CHR_CONTIG_RECODING, @@ -57,7 +57,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: # Resources with no versioning needed -purcell_5k_intervals = TableResource( +purcell_5k_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.ht", import_func=_import_purcell_5k, import_args={ @@ -65,7 +65,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -na12878_giab = MatrixTableResource( +na12878_giab = GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt", import_func=hl.import_vcf, import_args={ @@ -76,7 +76,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -na12878_giab_hc_intervals = TableResource( +na12878_giab_hc_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7_hc_regions.ht", import_func=hl.import_bed, import_args={ @@ -90,7 +90,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: vep_context = VersionedTableResource( default_version="95", versions={ - "95": TableResource( + "95": GnomadPublicTableResource( path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.ht", ) }, @@ -99,7 +99,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: syndip = VersionedMatrixTableResource( default_version="20180222", versions={ - "20180222": MatrixTableResource( + "20180222": GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch38/syndip/syndip.b38_20180222.mt", import_func=hl.import_vcf, import_args={ @@ -115,7 +115,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: syndip_hc_intervals = VersionedTableResource( default_version="20180222", versions={ - "20180222": TableResource( + "20180222": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/syndip/syndip_b38_20180222_hc_regions.ht", import_func=hl.import_bed, import_args={ @@ -131,7 +131,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: clinvar = VersionedTableResource( default_version="20190923", versions={ - "20190923": TableResource( + "20190923": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/clinvar/clinvar_20190923.ht", import_func=_import_clinvar, import_args={ @@ -149,7 +149,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: dbsnp = VersionedTableResource( default_version="b154", versions={ - "b154": TableResource( + "b154": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/dbsnp/dbsnp_b154_grch38_all_20200514.ht", import_func=_import_dbsnp, import_args={ @@ -162,7 +162,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: "reference_genome": "GRCh38", }, ), - "b151": TableResource( + "b151": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/dbsnp/dbsnp_b151_grch38_all_20180418.ht", import_func=import_sites_vcf, import_args={ @@ -178,7 +178,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -hapmap = TableResource( +hapmap = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/hapmap/hapmap_3.3.hg38.ht", import_func=import_sites_vcf, import_args={ @@ -188,7 +188,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -kgp_omni = TableResource( +kgp_omni = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/kgp/1000G_omni2.5.hg38.ht", import_func=import_sites_vcf, import_args={ @@ -201,7 +201,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: kgp = VersionedTableResource( default_version="phase_1_hc", versions={ - "phase_1_hc": TableResource( + "phase_1_hc": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/kgp/1000G_phase1.snps.high_confidence.hg38.ht", import_func=import_sites_vcf, import_args={ @@ -213,7 +213,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -mills = TableResource( +mills = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/mills/Mills_and_1000G_gold_standard.indels.hg38.ht", import_func=import_sites_vcf, import_args={ @@ -223,7 +223,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -lcr_intervals = TableResource( +lcr_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/lcr_intervals/LCRFromHengHg38.ht", import_func=hl.import_locus_intervals, import_args={ @@ -233,7 +233,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -seg_dup_intervals = TableResource( +seg_dup_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/seg_dup_intervals/GRCh38_segdups.ht", import_func=hl.import_bed, import_args={ @@ -242,7 +242,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -telomeres_and_centromeres = TableResource( +telomeres_and_centromeres = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/telomeres_and_centromeres/hg38.telomeresAndMergedCentromeres.ht", import_func=hl.import_bed, import_args={ diff --git a/gnomad/resources/import_resources.py b/gnomad/resources/import_resources.py index a3639357f..12a8e0141 100644 --- a/gnomad/resources/import_resources.py +++ b/gnomad/resources/import_resources.py @@ -1,13 +1,18 @@ # noqa: D100 +import argparse import itertools import textwrap from inspect import getmembers from typing import Dict, Tuple, Optional + +from gnomad.resources.config import ( + GnomadPublicResourceSource, + gnomad_public_resource_configuration, +) from gnomad.resources.resource_utils import BaseResource, BaseVersionedResource import gnomad.resources.grch37 as grch37 import gnomad.resources.grch38 as grch38 -import argparse # Generate a dictionary of resource available for import for a given genome build @@ -83,6 +88,8 @@ def get_resources_descriptions( def main(args): """Import selected resources.""" + gnomad_public_resource_configuration.source = GnomadPublicResourceSource.GNOMAD + for resource_arg in args.resources: resource_name, resource = all_resources[resource_arg] print(f"Importing {resource_name}...") diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 4309a04ea..92930ef37 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -2,14 +2,24 @@ import logging from abc import ABC, abstractmethod +from functools import reduce from typing import Any, Callable, Dict, List, Optional import hail as hl from hail.linalg import BlockMatrix +from gnomad.resources.config import ( + GnomadPublicResourceSource, + gnomad_public_resource_configuration, +) + + logger = logging.getLogger("gnomad.resources") +GNOMAD_PUBLIC_BUCKETS = ("gnomad-public", "gnomad-public-requester-pays") + + # Resource classes class BaseResource(ABC): """ @@ -51,11 +61,24 @@ def __init__( ) def __repr__(self): - attr_str = [f"path={self.path}"] + attr_str = [f"path={self._path}"] if self.import_args is not None: attr_str.append(f"import_args={self.import_args}") return f'{self.__class__.__name__}({",".join(attr_str)})' + def _get_path(self): + return self._path + + def _set_path(self, path): + self._path = path # pylint: disable=attribute-defined-outside-init + + # Defining path property this way instead of using a decorator allows _get_path and _set_path + # to be overridden in subclasses without having to reconfigure the property. + path = property( + fget=lambda self: self._get_path(), + fset=lambda self, path: self._set_path(path), + ) + @abstractmethod def import_resource(self, overwrite: bool = True, **kwargs) -> None: """ @@ -351,6 +374,56 @@ def __init__(self, default_version: str, versions: Dict[str, BlockMatrixResource super().__init__(default_version, versions) +class GnomadPublicResource(BaseResource, ABC): + """Base class for the gnomAD project's public resources.""" + + def _get_path(self) -> str: + resource_source = gnomad_public_resource_configuration.source + if resource_source == GnomadPublicResourceSource.GNOMAD: + return self._path + + relative_path = reduce( + lambda path, bucket: path[5 + len(bucket) :] + if path.startswith(f"gs://{bucket}/") + else path, + GNOMAD_PUBLIC_BUCKETS, + self._path, + ) + + if resource_source == GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS: + return f"gs://gcp-public-data--gnomad{relative_path}" + + return ( + f"{resource_source.rstrip('/')}{relative_path}" # pylint: disable=no-member + ) + + def _set_path(self, path): + if not any( + path.startswith(f"gs://{bucket}/") for bucket in GNOMAD_PUBLIC_BUCKETS + ): + raise ValueError( + f"GnomadPublicResource requires a path to a file in one of the public gnomAD buckets ({', '.join(GNOMAD_PUBLIC_BUCKETS)})" + ) + + return super()._set_path(path) + + +class GnomadPublicTableResource(TableResource, GnomadPublicResource): + """Resource class for a public Hail Table published by the gnomAD project.""" + + +class GnomadPublicMatrixTableResource(MatrixTableResource, GnomadPublicResource): + """Resource class for a public Hail MatrixTable published by the gnomAD project.""" + + +class GnomadPublicPedigreeResource(PedigreeResource, GnomadPublicResource): + """Resource class for a public pedigree published by the gnomAD project.""" + + +class GnomadPublicBlockMatrixResource(BlockMatrixResource, GnomadPublicResource): + """Resource class for a public Hail BlockMatrix published by the gnomAD project.""" + + class DataException(Exception): # noqa: D101 pass diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py index 1ba82e8db..901021712 100644 --- a/tests/resources/test_resource_utils.py +++ b/tests/resources/test_resource_utils.py @@ -1,8 +1,15 @@ """Tests for resource classes.""" +from typing import List, Tuple, Union from unittest.mock import patch +import pytest + from gnomad.resources import resource_utils +from gnomad.resources.config import ( + gnomad_public_resource_configuration, + GnomadPublicResourceSource, +) class TestTableResource: @@ -71,3 +78,143 @@ def test_read_block_matrix(self, read_block_matrix): ds = resource.bm() read_block_matrix.assert_called_with("gs://gnomad-public/block_matrix.bm") assert ds == read_block_matrix.return_value + + +def gnomad_public_resource_test_parameters( + path: str, +) -> List[Tuple[str, Union[GnomadPublicResourceSource, str], str]]: + """ + Get parameters for gnomAD public resource tests. + + :param path: Path to resource file inside gnomAD bucket. + """ + return [ + ( + f"gs://gnomad-public{path}", + GnomadPublicResourceSource.GNOMAD, + f"gs://gnomad-public{path}", + ), + ( + f"gs://gnomad-public-requester-pays{path}", + GnomadPublicResourceSource.GNOMAD, + f"gs://gnomad-public-requester-pays{path}", + ), + ( + f"gs://gnomad-public{path}", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + f"gs://gcp-public-data--gnomad{path}", + ), + ( + f"gs://gnomad-public-requester-pays{path}", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + f"gs://gcp-public-data--gnomad{path}", + ), + ( + f"gs://gnomad-public{path}", + "gs://my-bucket/gnomad-resources", + f"gs://my-bucket/gnomad-resources{path}", + ), + ( + f"gs://gnomad-public-requester-pays{path}", + "gs://my-bucket/gnomad-resources", + f"gs://my-bucket/gnomad-resources{path}", + ), + ] + + +class TestGnomadPublicTableResource: + """Tests for GnomadPublicTableResource.""" + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + gnomad_public_resource_test_parameters("/table.ht"), + ) + @patch("hail.read_table") + def test_read_gnomad_public_table_resource( + self, read_table, resource_path, source, expected_read_path + ): + """Test that Table can be read from different sources.""" + resource = resource_utils.GnomadPublicTableResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.ht() + read_table.assert_called_with(expected_read_path) + + +class TestGnomadPublicMatrixTableResource: + """Tests for GnomadPublicMatrixTableResource.""" + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + gnomad_public_resource_test_parameters("/matrix_table.mt"), + ) + @patch("hail.read_matrix_table") + def test_read_gnomad_public_matrix_table_resource( + self, read_matrix_table, resource_path, source, expected_read_path + ): + """Test that MatrixTable can be read from different sources.""" + resource = resource_utils.GnomadPublicMatrixTableResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.mt() + read_matrix_table.assert_called_with(expected_read_path) + + +class TestGnomadPublicPedigreeResource: + """Tests for GnomadPublicPedigreeResource.""" + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + gnomad_public_resource_test_parameters("/pedigree.ped"), + ) + @patch("hail.Pedigree.read") + def test_read_gnomad_public_pedigree_resource( + self, read_pedigree, resource_path, source, expected_read_path + ): + """Test that Pedigree can be read from different sources.""" + resource = resource_utils.GnomadPublicPedigreeResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.pedigree() + read_pedigree.assert_called() + assert read_pedigree.call_args[0][0] == expected_read_path + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + gnomad_public_resource_test_parameters("/pedigree.fam"), + ) + @patch("hail.import_fam") + def test_import_gnomad_public_pedigree_resource( + self, import_fam, resource_path, source, expected_read_path + ): + """Test that pedigree can be imported from different sources.""" + resource = resource_utils.GnomadPublicPedigreeResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.ht() + import_fam.assert_called() + assert import_fam.call_args[0][0] == expected_read_path + + +class TestGnomadPublicBlockMatrixResource: + """Tests for GnomadPublicBlockMatrixResource.""" + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + gnomad_public_resource_test_parameters("/block_matrix.bm"), + ) + @patch("hail.linalg.BlockMatrix.read") + def test_read_gnomad_public_block_matrix_resource( + self, read_block_matrix, resource_path, source, expected_read_path + ): + """Test that BlockMatrix can be read from different sources.""" + resource = resource_utils.GnomadPublicBlockMatrixResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.bm() + read_block_matrix.assert_called_with(expected_read_path)