From 800148a5b72f68f8892aa4cdd7e15fde6c62fcf4 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Thu, 8 Apr 2021 09:45:22 -0400 Subject: [PATCH 01/16] Make path attribute of BaseResource a property --- gnomad/resources/resource_utils.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 4309a04ea..82aad81a2 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -51,11 +51,24 @@ def __init__( ) def __repr__(self): - attr_str = [f"path={self.path}"] + attr_str = [f"path={self._path}"] if self.import_args is not None: attr_str.append(f"import_args={self.import_args}") return f'{self.__class__.__name__}({",".join(attr_str)})' + def _get_path(self): + return self._path + + def _set_path(self, path): + self._path = path # pylint: disable=attribute-defined-outside-init + + # Defining path property this way instead of using a decorator allows _get_path and _set_path + # to be overridden in subclasses without having to reconfigure the property. + path = property( + fget=lambda self: self._get_path(), + fset=lambda self, path: self._set_path(path), + ) + @abstractmethod def import_resource(self, overwrite: bool = True, **kwargs) -> None: """ From d8b3c9e3233c1a3aa46e6dc6d9956ae57032f1ed Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Thu, 8 Apr 2021 09:59:07 -0400 Subject: [PATCH 02/16] Add resource subclasses for public resources --- gnomad/resources/grch37/gnomad.py | 18 +++++--- gnomad/resources/grch37/gnomad_ld.py | 17 ++++--- gnomad/resources/grch37/reference_data.py | 54 ++++++++++++----------- gnomad/resources/grch38/gnomad.py | 16 ++++--- gnomad/resources/grch38/reference_data.py | 36 +++++++-------- gnomad/resources/resource_utils.py | 33 ++++++++++++++ 6 files changed, 111 insertions(+), 63 deletions(-) diff --git a/gnomad/resources/grch37/gnomad.py b/gnomad/resources/grch37/gnomad.py index 99a04ad50..89de0702c 100644 --- a/gnomad/resources/grch37/gnomad.py +++ b/gnomad/resources/grch37/gnomad.py @@ -2,7 +2,7 @@ from gnomad.resources.resource_utils import ( DataException, - TableResource, + GnomadPublicTableResource, VersionedTableResource, ) @@ -124,7 +124,9 @@ def public_release(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_public_release_ht_path(data_type, release)) + release: GnomadPublicTableResource( + path=_public_release_ht_path(data_type, release) + ) for release in releases }, ) @@ -150,7 +152,9 @@ def coverage(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_public_coverage_ht_path(data_type, release)) + release: GnomadPublicTableResource( + path=_public_coverage_ht_path(data_type, release) + ) for release in releases }, ) @@ -176,13 +180,15 @@ def liftover(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_liftover_data_path(data_type, release)) + release: GnomadPublicTableResource( + path=_liftover_data_path(data_type, release) + ) for release in releases }, ) -def public_pca_loadings(subpop: str = "") -> TableResource: +def public_pca_loadings(subpop: str = "") -> GnomadPublicTableResource: """ Return the TableResource containing sites and loadings from population PCA. @@ -194,7 +200,7 @@ def public_pca_loadings(subpop: str = "") -> TableResource: 'Available subpops are "eas" or "nfe", default value "" for global' ) - return TableResource(path=_public_pca_ht_path(subpop)) + return GnomadPublicTableResource(path=_public_pca_ht_path(subpop)) def release_vcf_path(data_type: str, version: str, contig: str) -> str: diff --git a/gnomad/resources/grch37/gnomad_ld.py b/gnomad/resources/grch37/gnomad_ld.py index 78affb55d..7cba27d86 100644 --- a/gnomad/resources/grch37/gnomad_ld.py +++ b/gnomad/resources/grch37/gnomad_ld.py @@ -1,6 +1,9 @@ # noqa: D100 -from gnomad.resources.resource_utils import TableResource, BlockMatrixResource +from gnomad.resources.resource_utils import ( + GnomadPublicTableResource, + GnomadPublicBlockMatrixResource, +) from gnomad.resources.grch37.gnomad import CURRENT_EXOME_RELEASE, CURRENT_GENOME_RELEASE from typing import Optional @@ -67,16 +70,16 @@ def _ld_scores_path( return f'gs://gnomad-public-requester-pays/release/{version}/ld/scores/gnomad.{data_type}.r{version}.{pop}.{"adj." if adj else ""}ld_scores.ht' -def ld_matrix(pop: str) -> BlockMatrixResource: +def ld_matrix(pop: str) -> GnomadPublicBlockMatrixResource: """Get resource for the LD matrix for the given population.""" - return BlockMatrixResource(path=_ld_matrix_path("genomes", pop)) + return GnomadPublicBlockMatrixResource(path=_ld_matrix_path("genomes", pop)) -def ld_index(pop: str) -> TableResource: +def ld_index(pop: str) -> GnomadPublicTableResource: """Get resource for the LD indices for the given population.""" - return TableResource(path=_ld_index_path("genomes", pop)) + return GnomadPublicTableResource(path=_ld_index_path("genomes", pop)) -def ld_scores(pop: str) -> TableResource: +def ld_scores(pop: str) -> GnomadPublicTableResource: """Get resource for the LD scores for the given population.""" - return TableResource(path=_ld_scores_path("genomes", pop)) + return GnomadPublicTableResource(path=_ld_scores_path("genomes", pop)) diff --git a/gnomad/resources/grch37/reference_data.py b/gnomad/resources/grch37/reference_data.py index d475bc1fb..c7c89b0d4 100644 --- a/gnomad/resources/grch37/reference_data.py +++ b/gnomad/resources/grch37/reference_data.py @@ -1,15 +1,15 @@ # noqa: D100 from gnomad.resources.resource_utils import ( - MatrixTableResource, - TableResource, + GnomadPublicMatrixTableResource, + GnomadPublicTableResource, VersionedMatrixTableResource, VersionedTableResource, import_sites_vcf, ) import hail as hl -na12878_giab = MatrixTableResource( +na12878_giab = GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt", import_func=hl.import_vcf, import_args={ @@ -20,7 +20,7 @@ }, ) -hapmap = TableResource( +hapmap = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/hapmap/hapmap_3.3.b37.ht", import_func=import_sites_vcf, import_args={ @@ -31,7 +31,7 @@ }, ) -kgp_omni = TableResource( +kgp_omni = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/kgp/1000G_omni2.5.b37.ht", import_func=import_sites_vcf, import_args={ @@ -42,7 +42,7 @@ }, ) -mills = TableResource( +mills = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/mills/Mills_and_1000G_gold_standard.indels.b37.ht", import_func=import_sites_vcf, import_args={ @@ -53,7 +53,7 @@ }, ) -syndip = MatrixTableResource( +syndip = GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch37/syndip/hybrid.m37m.mt", import_func=hl.import_vcf, import_args={ @@ -67,7 +67,7 @@ vep_context = VersionedTableResource( default_version="85", versions={ - "85": TableResource( + "85": GnomadPublicTableResource( path="gs://gnomad-public-requester-pays/resources/context/grch37_context_vep_annotated.ht", ) }, @@ -76,7 +76,7 @@ dbsnp = VersionedTableResource( default_version="20180423", versions={ - "20180423": TableResource( + "20180423": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/dbsnp/All_20180423.ht", import_func=import_sites_vcf, import_args={ @@ -93,7 +93,7 @@ clinvar = VersionedTableResource( default_version="20181028", versions={ - "20181028": TableResource( + "20181028": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/clinvar/clinvar_20181028.vep.ht", import_func=import_sites_vcf, import_args={ @@ -110,7 +110,7 @@ kgp_phase_3 = VersionedMatrixTableResource( default_version="phase_3_split", versions={ - "phase_3_split": MatrixTableResource( + "phase_3_split": GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.split.mt", import_func=hl.import_vcf, import_args={ @@ -121,7 +121,7 @@ "reference_genome": "GRCh37", }, ), - "phase_3": MatrixTableResource( + "phase_3": GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.mt", import_func=hl.import_vcf, import_args={ @@ -138,7 +138,7 @@ kgp = VersionedTableResource( default_version="phase_1_hc", versions={ - "phase_1_hc": TableResource( + "phase_1_hc": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/kgp/1000G_phase1.snps.high_confidence.b37.ht", import_func=import_sites_vcf, import_args={ @@ -152,13 +152,15 @@ }, ) -cpg_sites = TableResource(path="gs://gnomad-public/resources/grch37/cpg_sites/cpg.ht") +cpg_sites = GnomadPublicTableResource( + path="gs://gnomad-public/resources/grch37/cpg_sites/cpg.ht" +) -methylation_sites = TableResource( +methylation_sites = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/methylation_sites/methylation.ht" ) -lcr_intervals = TableResource( +lcr_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/lcr_intervals/LCR.GRCh37_compliant.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -167,7 +169,7 @@ }, ) -decoy_intervals = TableResource( +decoy_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/decoy_intervals/mm-2-merged.GRCh37_compliant.ht", import_func=hl.import_bed, import_args={ @@ -176,7 +178,7 @@ }, ) -purcell_5k_intervals = TableResource( +purcell_5k_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/purcell_5k_intervals/purcell5k.ht", import_func=hl.import_locus_intervals, import_args={ @@ -185,7 +187,7 @@ }, ) -seg_dup_intervals = TableResource( +seg_dup_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/seg_dup_intervals/hg19_self_chain_split_both.ht", import_func=hl.import_bed, import_args={ @@ -194,7 +196,7 @@ }, ) -exome_hc_intervals = TableResource( +exome_hc_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/exomes_high_coverage.auto.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -203,7 +205,7 @@ }, ) -high_coverage_intervals = TableResource( +high_coverage_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/high_coverage.auto.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -212,7 +214,7 @@ }, ) -exome_calling_intervals = TableResource( +exome_calling_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/exome_calling_regions.v1.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -221,7 +223,7 @@ }, ) -exome_evaluation_intervals = TableResource( +exome_evaluation_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/exome_evaluation_regions.v1.noheader.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -230,7 +232,7 @@ }, ) -genome_evaluation_intervals = TableResource( +genome_evaluation_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/broad_intervals/hg19-v0-wgs_evaluation_regions.v1.interval_list.ht", import_func=hl.import_locus_intervals, import_args={ @@ -239,7 +241,7 @@ }, ) -na12878_hc_intervals = TableResource( +na12878_hc_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_intervals.ht", import_func=hl.import_bed, import_args={ @@ -248,7 +250,7 @@ }, ) -syndip_hc_intervals = TableResource( +syndip_hc_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch37/syndip/syndip_highconf_genome_intervals.ht", import_func=hl.import_bed, import_args={ diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index bcc7d537b..edeb3cc86 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -1,9 +1,9 @@ # noqa: D100 from gnomad.resources.resource_utils import ( - TableResource, + GnomadPublicTableResource, VersionedMatrixTableResource, - MatrixTableResource, + GnomadPublicMatrixTableResource, VersionedTableResource, DataException, ) @@ -147,7 +147,7 @@ gnomad_syndip = VersionedMatrixTableResource( default_version="3.0", versions={ - "3.0": MatrixTableResource( + "3.0": GnomadPublicMatrixTableResource( path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_syndip.b38.mt" ) }, @@ -156,7 +156,7 @@ na12878 = VersionedMatrixTableResource( default_version="3.0", versions={ - "3.0": MatrixTableResource( + "3.0": GnomadPublicMatrixTableResource( path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_na12878.mt" ) }, @@ -209,7 +209,9 @@ def public_release(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_public_release_ht_path(data_type, release)) + release: GnomadPublicTableResource( + path=_public_release_ht_path(data_type, release) + ) for release in releases }, ) @@ -237,7 +239,9 @@ def coverage(data_type: str) -> VersionedTableResource: return VersionedTableResource( current_release, { - release: TableResource(path=_public_coverage_ht_path(data_type, release)) + release: GnomadPublicTableResource( + path=_public_coverage_ht_path(data_type, release) + ) for release in releases }, ) diff --git a/gnomad/resources/grch38/reference_data.py b/gnomad/resources/grch38/reference_data.py index d71fed87c..076c29e57 100644 --- a/gnomad/resources/grch38/reference_data.py +++ b/gnomad/resources/grch38/reference_data.py @@ -2,9 +2,9 @@ from gnomad.resources.resource_utils import ( DBSNP_B154_CHR_CONTIG_RECODING, - TableResource, + GnomadPublicTableResource, VersionedTableResource, - MatrixTableResource, + GnomadPublicMatrixTableResource, VersionedMatrixTableResource, import_sites_vcf, NO_CHR_TO_CHR_CONTIG_RECODING, @@ -57,7 +57,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: # Resources with no versioning needed -purcell_5k_intervals = TableResource( +purcell_5k_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.ht", import_func=_import_purcell_5k, import_args={ @@ -65,7 +65,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -na12878_giab = MatrixTableResource( +na12878_giab = GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt", import_func=hl.import_vcf, import_args={ @@ -76,7 +76,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -na12878_giab_hc_intervals = TableResource( +na12878_giab_hc_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7_hc_regions.ht", import_func=hl.import_bed, import_args={ @@ -90,7 +90,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: vep_context = VersionedTableResource( default_version="95", versions={ - "95": TableResource( + "95": GnomadPublicTableResource( path="gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.ht", ) }, @@ -99,7 +99,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: syndip = VersionedMatrixTableResource( default_version="20180222", versions={ - "20180222": MatrixTableResource( + "20180222": GnomadPublicMatrixTableResource( path="gs://gnomad-public/resources/grch38/syndip/syndip.b38_20180222.mt", import_func=hl.import_vcf, import_args={ @@ -115,7 +115,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: syndip_hc_intervals = VersionedTableResource( default_version="20180222", versions={ - "20180222": TableResource( + "20180222": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/syndip/syndip_b38_20180222_hc_regions.ht", import_func=hl.import_bed, import_args={ @@ -131,7 +131,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: clinvar = VersionedTableResource( default_version="20190923", versions={ - "20190923": TableResource( + "20190923": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/clinvar/clinvar_20190923.ht", import_func=_import_clinvar, import_args={ @@ -149,7 +149,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: dbsnp = VersionedTableResource( default_version="b154", versions={ - "b154": TableResource( + "b154": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/dbsnp/dbsnp_b154_grch38_all_20200514.ht", import_func=_import_dbsnp, import_args={ @@ -162,7 +162,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: "reference_genome": "GRCh38", }, ), - "b151": TableResource( + "b151": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/dbsnp/dbsnp_b151_grch38_all_20180418.ht", import_func=import_sites_vcf, import_args={ @@ -178,7 +178,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -hapmap = TableResource( +hapmap = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/hapmap/hapmap_3.3.hg38.ht", import_func=import_sites_vcf, import_args={ @@ -188,7 +188,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -kgp_omni = TableResource( +kgp_omni = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/kgp/1000G_omni2.5.hg38.ht", import_func=import_sites_vcf, import_args={ @@ -201,7 +201,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: kgp = VersionedTableResource( default_version="phase_1_hc", versions={ - "phase_1_hc": TableResource( + "phase_1_hc": GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/kgp/1000G_phase1.snps.high_confidence.hg38.ht", import_func=import_sites_vcf, import_args={ @@ -213,7 +213,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -mills = TableResource( +mills = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/mills/Mills_and_1000G_gold_standard.indels.hg38.ht", import_func=import_sites_vcf, import_args={ @@ -223,7 +223,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -lcr_intervals = TableResource( +lcr_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/lcr_intervals/LCRFromHengHg38.ht", import_func=hl.import_locus_intervals, import_args={ @@ -233,7 +233,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -seg_dup_intervals = TableResource( +seg_dup_intervals = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/seg_dup_intervals/GRCh38_segdups.ht", import_func=hl.import_bed, import_args={ @@ -242,7 +242,7 @@ def _import_dbsnp(**kwargs) -> hl.Table: }, ) -telomeres_and_centromeres = TableResource( +telomeres_and_centromeres = GnomadPublicTableResource( path="gs://gnomad-public/resources/grch38/telomeres_and_centromeres/hg38.telomeresAndMergedCentromeres.ht", import_func=hl.import_bed, import_args={ diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 82aad81a2..41da56ba2 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -364,6 +364,39 @@ def __init__(self, default_version: str, versions: Dict[str, BlockMatrixResource super().__init__(default_version, versions) +GNOMAD_PUBLIC_BUCKETS = ("gnomad-public", "gnomad-public-requester-pays") + + +class GnomadPublicResource(BaseResource, ABC): + """Base class for the gnomAD project's public resources.""" + + def _set_path(self, path): + if not any( + path.startswith(f"gs://{bucket}") for bucket in GNOMAD_PUBLIC_BUCKETS + ): + raise ValueError( + f"GnomadPublicResource requires a path to a file in one of the public gnomAD buckets ({', '.join(GNOMAD_PUBLIC_BUCKETS)})" + ) + + return super()._set_path(path) + + +class GnomadPublicTableResource(TableResource, GnomadPublicResource): + """Resource class for a public Hail Table published by the gnomAD project.""" + + +class GnomadPublicMatrixTableResource(MatrixTableResource, GnomadPublicResource): + """Resource class for a public Hail MatrixTable published by the gnomAD project.""" + + +class GnomadPublicPedigreeResource(PedigreeResource, GnomadPublicResource): + """Resource class for a public pedigree published by the gnomAD project.""" + + +class GnomadPublicBlockMatrixResource(BlockMatrixResource, GnomadPublicResource): + """Resource class for a public Hail BlockMatrix published by the gnomAD project.""" + + class DataException(Exception): # noqa: D101 pass From 60765d5fc1fb97af53cdec9c9942a8030458bdd9 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Thu, 8 Apr 2021 18:39:16 -0400 Subject: [PATCH 03/16] Add configuration for source of gnomAD public resources --- gnomad/resources/config.py | 46 ++++++++++++++++++++++++++ gnomad/resources/import_resources.py | 9 ++++- gnomad/resources/resource_utils.py | 19 +++++++++++ tests/resources/test_resource_utils.py | 32 ++++++++++++++++++ 4 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 gnomad/resources/config.py diff --git a/gnomad/resources/config.py b/gnomad/resources/config.py new file mode 100644 index 000000000..78ff390dc --- /dev/null +++ b/gnomad/resources/config.py @@ -0,0 +1,46 @@ +"""Configuration for loading resources.""" + +import typing +from enum import Enum + + +class GnomadPublicResourceSource(Enum): + """Sources for public gnomAD resources.""" + + GNOMAD = "gnomAD" + + +DEFAULT_GNOMAD_PUBLIC_RESOURCE_SOURCE = GnomadPublicResourceSource.GNOMAD + + +class _GnomadPublicResourceConfiguration: + """Configuration for public gnomAD resources.""" + + __source: typing.Union[ + GnomadPublicResourceSource, str + ] = DEFAULT_GNOMAD_PUBLIC_RESOURCE_SOURCE + + @property + def source(self) -> typing.Union[GnomadPublicResourceSource, str]: + """ + Get the source for public gnomAD resource files. + + This is used to determine which URLs gnomAD resources will be loaded from. + + :returns: Source name or path to root of resources directory + """ + return self.__source + + @source.setter + def source(self, source: typing.Union[GnomadPublicResourceSource, str]) -> None: + """ + Set the default source for resource files. + + This is used to determine which URLs gnomAD resources will be loaded from. + + :param source: Source name or path to root of resources directory + """ + self.__source = source + + +gnomad_public_resource_configuration = _GnomadPublicResourceConfiguration() diff --git a/gnomad/resources/import_resources.py b/gnomad/resources/import_resources.py index a3639357f..12a8e0141 100644 --- a/gnomad/resources/import_resources.py +++ b/gnomad/resources/import_resources.py @@ -1,13 +1,18 @@ # noqa: D100 +import argparse import itertools import textwrap from inspect import getmembers from typing import Dict, Tuple, Optional + +from gnomad.resources.config import ( + GnomadPublicResourceSource, + gnomad_public_resource_configuration, +) from gnomad.resources.resource_utils import BaseResource, BaseVersionedResource import gnomad.resources.grch37 as grch37 import gnomad.resources.grch38 as grch38 -import argparse # Generate a dictionary of resource available for import for a given genome build @@ -83,6 +88,8 @@ def get_resources_descriptions( def main(args): """Import selected resources.""" + gnomad_public_resource_configuration.source = GnomadPublicResourceSource.GNOMAD + for resource_arg in args.resources: resource_name, resource = all_resources[resource_arg] print(f"Importing {resource_name}...") diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 41da56ba2..140a49156 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -2,11 +2,15 @@ import logging from abc import ABC, abstractmethod +from functools import reduce from typing import Any, Callable, Dict, List, Optional import hail as hl from hail.linalg import BlockMatrix +from .config import GnomadPublicResourceSource, gnomad_public_resource_configuration + + logger = logging.getLogger("gnomad.resources") @@ -370,6 +374,21 @@ def __init__(self, default_version: str, versions: Dict[str, BlockMatrixResource class GnomadPublicResource(BaseResource, ABC): """Base class for the gnomAD project's public resources.""" + def _get_path(self) -> str: + resource_source = gnomad_public_resource_configuration.source + if resource_source == GnomadPublicResourceSource.GNOMAD: + return self._path + + relative_path = reduce( + lambda path, bucket: path[5 + len(bucket) :] + if path.startswith(f"gs://{bucket}") + else path, + GNOMAD_PUBLIC_BUCKETS, + self._path, + ) + + return f"{resource_source.rstrip('/')}{relative_path}" + def _set_path(self, path): if not any( path.startswith(f"gs://{bucket}") for bucket in GNOMAD_PUBLIC_BUCKETS diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py index 1ba82e8db..6d92dfa84 100644 --- a/tests/resources/test_resource_utils.py +++ b/tests/resources/test_resource_utils.py @@ -3,6 +3,10 @@ from unittest.mock import patch from gnomad.resources import resource_utils +from gnomad.resources.config import ( + gnomad_public_resource_configuration, + GnomadPublicResourceSource, +) class TestTableResource: @@ -71,3 +75,31 @@ def test_read_block_matrix(self, read_block_matrix): ds = resource.bm() read_block_matrix.assert_called_with("gs://gnomad-public/block_matrix.bm") assert ds == read_block_matrix.return_value + + +class TestGnomadPublicTableResource: + """Tests for GnomadPublicTableResource.""" + + @patch("hail.read_table") + def test_gnomad_public_table_resource(self, read_table): + """Test that Table can be read from gnomAD bucket.""" + resource = resource_utils.GnomadPublicTableResource( + "gs://gnomad-public/table.ht" + ) + + gnomad_public_resource_configuration.source = GnomadPublicResourceSource.GNOMAD + + resource.ht() + read_table.assert_called_with("gs://gnomad-public/table.ht") + + @patch("hail.read_table") + def test_gnomad_public_table_resource_custom_source(self, read_table): + """Test that Table can be read from custom source.""" + resource = resource_utils.GnomadPublicTableResource( + "gs://gnomad-public/table.ht" + ) + + gnomad_public_resource_configuration.source = "gs://my-bucket/gnomad-resources" + + resource.ht() + read_table.assert_called_with("gs://my-bucket/gnomad-resources/table.ht") From f44ae96abcaf958800f41746efebae5703fcc0a5 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Thu, 8 Apr 2021 18:40:32 -0400 Subject: [PATCH 04/16] Add Google Cloud Public Datasets provider for gnomAD public resources --- gnomad/resources/config.py | 1 + gnomad/resources/resource_utils.py | 3 +++ tests/resources/test_resource_utils.py | 16 ++++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/gnomad/resources/config.py b/gnomad/resources/config.py index 78ff390dc..f91d016b0 100644 --- a/gnomad/resources/config.py +++ b/gnomad/resources/config.py @@ -8,6 +8,7 @@ class GnomadPublicResourceSource(Enum): """Sources for public gnomAD resources.""" GNOMAD = "gnomAD" + GOOGLE_CLOUD_PUBLIC_DATASETS = "Google Cloud Public Datasets" DEFAULT_GNOMAD_PUBLIC_RESOURCE_SOURCE = GnomadPublicResourceSource.GNOMAD diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 140a49156..4d6fff8ad 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -387,6 +387,9 @@ def _get_path(self) -> str: self._path, ) + if resource_source == GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS: + return f"gs://gcp-public-data--gnomad{relative_path}" + return f"{resource_source.rstrip('/')}{relative_path}" def _set_path(self, path): diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py index 6d92dfa84..6f01dc611 100644 --- a/tests/resources/test_resource_utils.py +++ b/tests/resources/test_resource_utils.py @@ -92,6 +92,22 @@ def test_gnomad_public_table_resource(self, read_table): resource.ht() read_table.assert_called_with("gs://gnomad-public/table.ht") + @patch("hail.read_table") + def test_gnomad_public_table_resource_google_cloud_public_datasets( + self, read_table + ): + """Test that Table can be read from Google Cloud Public Datasets bucket.""" + resource = resource_utils.GnomadPublicTableResource( + "gs://gnomad-public/table.ht" + ) + + gnomad_public_resource_configuration.source = ( + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS + ) + + resource.ht() + read_table.assert_called_with("gs://gcp-public-data--gnomad/table.ht") + @patch("hail.read_table") def test_gnomad_public_table_resource_custom_source(self, read_table): """Test that Table can be read from custom source.""" From c1852a1e67a0cba8bd64569340da7f469b91a6d6 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Mon, 26 Apr 2021 17:01:42 -0400 Subject: [PATCH 05/16] Ignore Pylint error --- gnomad/resources/resource_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 4d6fff8ad..87337c296 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -390,7 +390,9 @@ def _get_path(self) -> str: if resource_source == GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS: return f"gs://gcp-public-data--gnomad{relative_path}" - return f"{resource_source.rstrip('/')}{relative_path}" + return ( + f"{resource_source.rstrip('/')}{relative_path}" # pylint: disable=no-member + ) def _set_path(self, path): if not any( From 8e6e20ebe1a0ab465c007e01d882b48d9f6d0ac3 Mon Sep 17 00:00:00 2001 From: Nick Watts <1156625+nawatts@users.noreply.github.com> Date: Wed, 5 May 2021 10:02:34 -0400 Subject: [PATCH 06/16] Reorder imports Co-authored-by: jkgoodrich <33063077+jkgoodrich@users.noreply.github.com> --- gnomad/resources/grch38/gnomad.py | 2 +- gnomad/resources/grch38/reference_data.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gnomad/resources/grch38/gnomad.py b/gnomad/resources/grch38/gnomad.py index edeb3cc86..3fedb79aa 100644 --- a/gnomad/resources/grch38/gnomad.py +++ b/gnomad/resources/grch38/gnomad.py @@ -2,8 +2,8 @@ from gnomad.resources.resource_utils import ( GnomadPublicTableResource, - VersionedMatrixTableResource, GnomadPublicMatrixTableResource, + VersionedMatrixTableResource, VersionedTableResource, DataException, ) diff --git a/gnomad/resources/grch38/reference_data.py b/gnomad/resources/grch38/reference_data.py index 076c29e57..a3d69f50d 100644 --- a/gnomad/resources/grch38/reference_data.py +++ b/gnomad/resources/grch38/reference_data.py @@ -3,8 +3,8 @@ from gnomad.resources.resource_utils import ( DBSNP_B154_CHR_CONTIG_RECODING, GnomadPublicTableResource, - VersionedTableResource, GnomadPublicMatrixTableResource, + VersionedTableResource, VersionedMatrixTableResource, import_sites_vcf, NO_CHR_TO_CHR_CONTIG_RECODING, From 626292b0a679dcb7b18ed6988506173c798df0cd Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 10:38:17 -0400 Subject: [PATCH 07/16] Change import style --- gnomad/resources/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gnomad/resources/config.py b/gnomad/resources/config.py index f91d016b0..a2f8a3e8d 100644 --- a/gnomad/resources/config.py +++ b/gnomad/resources/config.py @@ -1,7 +1,7 @@ """Configuration for loading resources.""" -import typing from enum import Enum +from typing import Union class GnomadPublicResourceSource(Enum): @@ -17,12 +17,12 @@ class GnomadPublicResourceSource(Enum): class _GnomadPublicResourceConfiguration: """Configuration for public gnomAD resources.""" - __source: typing.Union[ + __source: Union[ GnomadPublicResourceSource, str ] = DEFAULT_GNOMAD_PUBLIC_RESOURCE_SOURCE @property - def source(self) -> typing.Union[GnomadPublicResourceSource, str]: + def source(self) -> Union[GnomadPublicResourceSource, str]: """ Get the source for public gnomAD resource files. @@ -33,7 +33,7 @@ def source(self) -> typing.Union[GnomadPublicResourceSource, str]: return self.__source @source.setter - def source(self, source: typing.Union[GnomadPublicResourceSource, str]) -> None: + def source(self, source: Union[GnomadPublicResourceSource, str]) -> None: """ Set the default source for resource files. From ed1f887b98f4428ea936c600afed9b38f96f82fc Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 10:39:30 -0400 Subject: [PATCH 08/16] Use absolute import --- gnomad/resources/resource_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 87337c296..78c72d7d1 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -8,7 +8,10 @@ import hail as hl from hail.linalg import BlockMatrix -from .config import GnomadPublicResourceSource, gnomad_public_resource_configuration +from gnomad.resources.config import ( + GnomadPublicResourceSource, + gnomad_public_resource_configuration, +) logger = logging.getLogger("gnomad.resources") From 54a1b7205e6c76e361258a15c99d7ca5000c5cb5 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 10:40:46 -0400 Subject: [PATCH 09/16] Move constants to beginning of file --- gnomad/resources/resource_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 78c72d7d1..e4ecf58f4 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -17,6 +17,9 @@ logger = logging.getLogger("gnomad.resources") +GNOMAD_PUBLIC_BUCKETS = ("gnomad-public", "gnomad-public-requester-pays") + + # Resource classes class BaseResource(ABC): """ @@ -371,9 +374,6 @@ def __init__(self, default_version: str, versions: Dict[str, BlockMatrixResource super().__init__(default_version, versions) -GNOMAD_PUBLIC_BUCKETS = ("gnomad-public", "gnomad-public-requester-pays") - - class GnomadPublicResource(BaseResource, ABC): """Base class for the gnomAD project's public resources.""" From 87755630e4346ba5dfc6e47b71f29d0db0e1272c Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 11:01:54 -0400 Subject: [PATCH 10/16] Parametrize tests for GnomadPublicTableResource --- tests/resources/test_resource_utils.py | 62 ++++++++++++-------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py index 6f01dc611..a20530911 100644 --- a/tests/resources/test_resource_utils.py +++ b/tests/resources/test_resource_utils.py @@ -2,6 +2,8 @@ from unittest.mock import patch +import pytest + from gnomad.resources import resource_utils from gnomad.resources.config import ( gnomad_public_resource_configuration, @@ -80,42 +82,34 @@ def test_read_block_matrix(self, read_block_matrix): class TestGnomadPublicTableResource: """Tests for GnomadPublicTableResource.""" + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + [ + ( + "gs://gnomad-public/table.ht", + GnomadPublicResourceSource.GNOMAD, + "gs://gnomad-public/table.ht", + ), + ( + "gs://gnomad-public/table.ht", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + "gs://gcp-public-data--gnomad/table.ht", + ), + ( + "gs://gnomad-public/table.ht", + "gs://my-bucket/gnomad-resources", + "gs://my-bucket/gnomad-resources/table.ht", + ), + ], + ) @patch("hail.read_table") - def test_gnomad_public_table_resource(self, read_table): - """Test that Table can be read from gnomAD bucket.""" - resource = resource_utils.GnomadPublicTableResource( - "gs://gnomad-public/table.ht" - ) - - gnomad_public_resource_configuration.source = GnomadPublicResourceSource.GNOMAD - - resource.ht() - read_table.assert_called_with("gs://gnomad-public/table.ht") - - @patch("hail.read_table") - def test_gnomad_public_table_resource_google_cloud_public_datasets( - self, read_table + def test_read_gnomad_public_table_resource( + self, read_table, resource_path, source, expected_read_path ): - """Test that Table can be read from Google Cloud Public Datasets bucket.""" - resource = resource_utils.GnomadPublicTableResource( - "gs://gnomad-public/table.ht" - ) - - gnomad_public_resource_configuration.source = ( - GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS - ) - - resource.ht() - read_table.assert_called_with("gs://gcp-public-data--gnomad/table.ht") - - @patch("hail.read_table") - def test_gnomad_public_table_resource_custom_source(self, read_table): - """Test that Table can be read from custom source.""" - resource = resource_utils.GnomadPublicTableResource( - "gs://gnomad-public/table.ht" - ) + """Test that Table can be read from different sources.""" + resource = resource_utils.GnomadPublicTableResource(resource_path) - gnomad_public_resource_configuration.source = "gs://my-bucket/gnomad-resources" + gnomad_public_resource_configuration.source = source resource.ht() - read_table.assert_called_with("gs://my-bucket/gnomad-resources/table.ht") + read_table.assert_called_with(expected_read_path) From a2d658e603d05461b78293cd54b90af6f04afc32 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 11:22:51 -0400 Subject: [PATCH 11/16] Add tests for other types of gnomAD public resources --- tests/resources/test_resource_utils.py | 142 +++++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py index a20530911..89fbbf137 100644 --- a/tests/resources/test_resource_utils.py +++ b/tests/resources/test_resource_utils.py @@ -113,3 +113,145 @@ def test_read_gnomad_public_table_resource( resource.ht() read_table.assert_called_with(expected_read_path) + + +class TestGnomadPublicMatrixTableResource: + """Tests for GnomadPublicMatrixTableResource.""" + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + [ + ( + "gs://gnomad-public/matrix_table.mt", + GnomadPublicResourceSource.GNOMAD, + "gs://gnomad-public/matrix_table.mt", + ), + ( + "gs://gnomad-public/matrix_table.mt", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + "gs://gcp-public-data--gnomad/matrix_table.mt", + ), + ( + "gs://gnomad-public/matrix_table.mt", + "gs://my-bucket/gnomad-resources", + "gs://my-bucket/gnomad-resources/matrix_table.mt", + ), + ], + ) + @patch("hail.read_matrix_table") + def test_read_gnomad_public_matrix_table_resource( + self, read_matrix_table, resource_path, source, expected_read_path + ): + """Test that MatrixTable can be read from different sources.""" + resource = resource_utils.GnomadPublicMatrixTableResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.mt() + read_matrix_table.assert_called_with(expected_read_path) + + +class TestGnomadPublicPedigreeResource: + """Tests for GnomadPublicPedigreeResource.""" + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + [ + ( + "gs://gnomad-public/pedigree.ped", + GnomadPublicResourceSource.GNOMAD, + "gs://gnomad-public/pedigree.ped", + ), + ( + "gs://gnomad-public/pedigree.ped", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + "gs://gcp-public-data--gnomad/pedigree.ped", + ), + ( + "gs://gnomad-public/pedigree.ped", + "gs://my-bucket/gnomad-resources", + "gs://my-bucket/gnomad-resources/pedigree.ped", + ), + ], + ) + @patch("hail.Pedigree.read") + def test_read_gnomad_public_pedigree_resource( + self, read_pedigree, resource_path, source, expected_read_path + ): + """Test that Pedigree can be read from different sources.""" + resource = resource_utils.GnomadPublicPedigreeResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.pedigree() + read_pedigree.assert_called() + assert read_pedigree.call_args[0][0] == expected_read_path + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + [ + ( + "gs://gnomad-public/pedigree.fam", + GnomadPublicResourceSource.GNOMAD, + "gs://gnomad-public/pedigree.fam", + ), + ( + "gs://gnomad-public/pedigree.fam", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + "gs://gcp-public-data--gnomad/pedigree.fam", + ), + ( + "gs://gnomad-public/pedigree.fam", + "gs://my-bucket/gnomad-resources", + "gs://my-bucket/gnomad-resources/pedigree.fam", + ), + ], + ) + @patch("hail.import_fam") + def test_import_gnomad_public_pedigree_resource( + self, import_fam, resource_path, source, expected_read_path + ): + """Test that pedigree can be imported from different sources.""" + resource = resource_utils.GnomadPublicPedigreeResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.ht() + import_fam.assert_called() + assert import_fam.call_args[0][0] == expected_read_path + + +class TestGnomadPublicBlockMatrixResource: + """Tests for GnomadPublicBlockMatrixResource.""" + + @pytest.mark.parametrize( + "resource_path,source,expected_read_path", + [ + ( + "gs://gnomad-public/block_matrix.bm", + GnomadPublicResourceSource.GNOMAD, + "gs://gnomad-public/block_matrix.bm", + ), + ( + "gs://gnomad-public/block_matrix.bm", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + "gs://gcp-public-data--gnomad/block_matrix.bm", + ), + ( + "gs://gnomad-public/block_matrix.bm", + "gs://my-bucket/gnomad-resources", + "gs://my-bucket/gnomad-resources/block_matrix.bm", + ), + ], + ) + @patch("hail.linalg.BlockMatrix.read") + def test_read_gnomad_public_block_matrix_resource( + self, read_block_matrix, resource_path, source, expected_read_path + ): + """Test that BlockMatrix can be read from different sources.""" + resource = resource_utils.GnomadPublicBlockMatrixResource(resource_path) + + gnomad_public_resource_configuration.source = source + + resource.bm() + read_block_matrix.assert_called_with(expected_read_path) From 4e715b52ca7282f1f90b24885087b35e1638f8db Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 12:54:21 -0400 Subject: [PATCH 12/16] Refactor test parameters for gnomAD public resources --- tests/resources/test_resource_utils.py | 115 +++++++------------------ 1 file changed, 30 insertions(+), 85 deletions(-) diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py index 89fbbf137..f4b613e20 100644 --- a/tests/resources/test_resource_utils.py +++ b/tests/resources/test_resource_utils.py @@ -79,28 +79,37 @@ def test_read_block_matrix(self, read_block_matrix): assert ds == read_block_matrix.return_value +def gnomad_public_resource_test_parameters(path: str): + """ + Get parameters for gnomAD public resource tests. + + :param path: Path to resource file inside gnomAD bucket. + """ + return [ + ( + f"gs://gnomad-public{path}", + GnomadPublicResourceSource.GNOMAD, + f"gs://gnomad-public{path}", + ), + ( + f"gs://gnomad-public{path}", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + f"gs://gcp-public-data--gnomad{path}", + ), + ( + f"gs://gnomad-public{path}", + "gs://my-bucket/gnomad-resources", + f"gs://my-bucket/gnomad-resources{path}", + ), + ] + + class TestGnomadPublicTableResource: """Tests for GnomadPublicTableResource.""" @pytest.mark.parametrize( "resource_path,source,expected_read_path", - [ - ( - "gs://gnomad-public/table.ht", - GnomadPublicResourceSource.GNOMAD, - "gs://gnomad-public/table.ht", - ), - ( - "gs://gnomad-public/table.ht", - GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, - "gs://gcp-public-data--gnomad/table.ht", - ), - ( - "gs://gnomad-public/table.ht", - "gs://my-bucket/gnomad-resources", - "gs://my-bucket/gnomad-resources/table.ht", - ), - ], + gnomad_public_resource_test_parameters("/table.ht"), ) @patch("hail.read_table") def test_read_gnomad_public_table_resource( @@ -120,23 +129,7 @@ class TestGnomadPublicMatrixTableResource: @pytest.mark.parametrize( "resource_path,source,expected_read_path", - [ - ( - "gs://gnomad-public/matrix_table.mt", - GnomadPublicResourceSource.GNOMAD, - "gs://gnomad-public/matrix_table.mt", - ), - ( - "gs://gnomad-public/matrix_table.mt", - GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, - "gs://gcp-public-data--gnomad/matrix_table.mt", - ), - ( - "gs://gnomad-public/matrix_table.mt", - "gs://my-bucket/gnomad-resources", - "gs://my-bucket/gnomad-resources/matrix_table.mt", - ), - ], + gnomad_public_resource_test_parameters("/matrix_table.mt"), ) @patch("hail.read_matrix_table") def test_read_gnomad_public_matrix_table_resource( @@ -156,23 +149,7 @@ class TestGnomadPublicPedigreeResource: @pytest.mark.parametrize( "resource_path,source,expected_read_path", - [ - ( - "gs://gnomad-public/pedigree.ped", - GnomadPublicResourceSource.GNOMAD, - "gs://gnomad-public/pedigree.ped", - ), - ( - "gs://gnomad-public/pedigree.ped", - GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, - "gs://gcp-public-data--gnomad/pedigree.ped", - ), - ( - "gs://gnomad-public/pedigree.ped", - "gs://my-bucket/gnomad-resources", - "gs://my-bucket/gnomad-resources/pedigree.ped", - ), - ], + gnomad_public_resource_test_parameters("/pedigree.ped"), ) @patch("hail.Pedigree.read") def test_read_gnomad_public_pedigree_resource( @@ -189,23 +166,7 @@ def test_read_gnomad_public_pedigree_resource( @pytest.mark.parametrize( "resource_path,source,expected_read_path", - [ - ( - "gs://gnomad-public/pedigree.fam", - GnomadPublicResourceSource.GNOMAD, - "gs://gnomad-public/pedigree.fam", - ), - ( - "gs://gnomad-public/pedigree.fam", - GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, - "gs://gcp-public-data--gnomad/pedigree.fam", - ), - ( - "gs://gnomad-public/pedigree.fam", - "gs://my-bucket/gnomad-resources", - "gs://my-bucket/gnomad-resources/pedigree.fam", - ), - ], + gnomad_public_resource_test_parameters("/pedigree.fam"), ) @patch("hail.import_fam") def test_import_gnomad_public_pedigree_resource( @@ -226,23 +187,7 @@ class TestGnomadPublicBlockMatrixResource: @pytest.mark.parametrize( "resource_path,source,expected_read_path", - [ - ( - "gs://gnomad-public/block_matrix.bm", - GnomadPublicResourceSource.GNOMAD, - "gs://gnomad-public/block_matrix.bm", - ), - ( - "gs://gnomad-public/block_matrix.bm", - GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, - "gs://gcp-public-data--gnomad/block_matrix.bm", - ), - ( - "gs://gnomad-public/block_matrix.bm", - "gs://my-bucket/gnomad-resources", - "gs://my-bucket/gnomad-resources/block_matrix.bm", - ), - ], + gnomad_public_resource_test_parameters("/block_matrix.bm"), ) @patch("hail.linalg.BlockMatrix.read") def test_read_gnomad_public_block_matrix_resource( From dcc6dd5bdb8a34f8db5e2964df84b6e4957e4f7e Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 13:04:14 -0400 Subject: [PATCH 13/16] Fix paths for public resources in requester pays bucket --- gnomad/resources/resource_utils.py | 2 +- tests/resources/test_resource_utils.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index e4ecf58f4..6209aa9e8 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -384,7 +384,7 @@ def _get_path(self) -> str: relative_path = reduce( lambda path, bucket: path[5 + len(bucket) :] - if path.startswith(f"gs://{bucket}") + if path.startswith(f"gs://{bucket}/") else path, GNOMAD_PUBLIC_BUCKETS, self._path, diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py index f4b613e20..967766a59 100644 --- a/tests/resources/test_resource_utils.py +++ b/tests/resources/test_resource_utils.py @@ -91,16 +91,31 @@ def gnomad_public_resource_test_parameters(path: str): GnomadPublicResourceSource.GNOMAD, f"gs://gnomad-public{path}", ), + ( + f"gs://gnomad-public-requester-pays{path}", + GnomadPublicResourceSource.GNOMAD, + f"gs://gnomad-public-requester-pays{path}", + ), ( f"gs://gnomad-public{path}", GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, f"gs://gcp-public-data--gnomad{path}", ), + ( + f"gs://gnomad-public-requester-pays{path}", + GnomadPublicResourceSource.GOOGLE_CLOUD_PUBLIC_DATASETS, + f"gs://gcp-public-data--gnomad{path}", + ), ( f"gs://gnomad-public{path}", "gs://my-bucket/gnomad-resources", f"gs://my-bucket/gnomad-resources{path}", ), + ( + f"gs://gnomad-public-requester-pays{path}", + "gs://my-bucket/gnomad-resources", + f"gs://my-bucket/gnomad-resources{path}", + ), ] From e54892465a504a66c9aa9c2a662684cd9ae220fa Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 13:24:21 -0400 Subject: [PATCH 14/16] Fix path validation for gnomAD public resources --- gnomad/resources/resource_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gnomad/resources/resource_utils.py b/gnomad/resources/resource_utils.py index 6209aa9e8..92930ef37 100644 --- a/gnomad/resources/resource_utils.py +++ b/gnomad/resources/resource_utils.py @@ -399,7 +399,7 @@ def _get_path(self) -> str: def _set_path(self, path): if not any( - path.startswith(f"gs://{bucket}") for bucket in GNOMAD_PUBLIC_BUCKETS + path.startswith(f"gs://{bucket}/") for bucket in GNOMAD_PUBLIC_BUCKETS ): raise ValueError( f"GnomadPublicResource requires a path to a file in one of the public gnomAD buckets ({', '.join(GNOMAD_PUBLIC_BUCKETS)})" From bc1b495507ace375abcfea3d0f1310583f3c0791 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 14:08:46 -0400 Subject: [PATCH 15/16] Add return type for test helper function --- tests/resources/test_resource_utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/resources/test_resource_utils.py b/tests/resources/test_resource_utils.py index 967766a59..901021712 100644 --- a/tests/resources/test_resource_utils.py +++ b/tests/resources/test_resource_utils.py @@ -1,5 +1,6 @@ """Tests for resource classes.""" +from typing import List, Tuple, Union from unittest.mock import patch import pytest @@ -79,7 +80,9 @@ def test_read_block_matrix(self, read_block_matrix): assert ds == read_block_matrix.return_value -def gnomad_public_resource_test_parameters(path: str): +def gnomad_public_resource_test_parameters( + path: str, +) -> List[Tuple[str, Union[GnomadPublicResourceSource, str], str]]: """ Get parameters for gnomAD public resource tests. From 842768cbd442bbd796275732f0844b261a079508 Mon Sep 17 00:00:00 2001 From: Nick Watts Date: Wed, 5 May 2021 14:10:24 -0400 Subject: [PATCH 16/16] Add changelog entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 508895de7..cecd28f88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ * Added function `make_faf_index_dict` to create a look-up Dictionary for entries contained in the filter allele frequency annotation array [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) * Added function `make_freq_index_dict` to create a look-up Dictionary for entries contained in the frequency annotation array [(#349)](https://github.com/broadinstitute/gnomad_methods/pull/349/files) * VersionedResource objects are no longer subclasses of BaseResource [(#359)](https://github.com/broadinstitute/gnomad_methods/pull/359) +* gnomAD resources can now be imported from different sources [(#373)](https://github.com/broadinstitute/gnomad_methods/pull/373) ## Version 0.5.0 - April 22nd, 2021