diff --git a/Makefile b/Makefile index 5645b0b7e..02219e1c2 100644 --- a/Makefile +++ b/Makefile @@ -267,6 +267,7 @@ NONCONFIGURE_OBJS = hfile_libcurl.o PLUGIN_EXT = PLUGIN_OBJS = +bgzf_internal_h = bgzf_internal.h $(htslib_bgzf_h) cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h) cram_io_h = cram/cram_io.h $(cram_misc_h) cram_misc_h = cram/misc.h @@ -492,7 +493,7 @@ hts-object-files: $(LIBHTS_OBJS) $(CC) -shared $(LDFLAGS) -o $@ $< hts.dll.a $(LIBS) -bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) +bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(bgzf_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_kseq_h) @@ -504,7 +505,7 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c -vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) +vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) $(bgzf_internal_h) sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) simd.o simd.pico: simd.c config.h $(htslib_sam_h) $(sam_internal_h) diff --git a/bgzf.c b/bgzf.c index 8cf3b7828..cb280268f 100644 --- a/bgzf.c +++ b/bgzf.c @@ -48,12 +48,13 @@ #include "htslib/hts_endian.h" #include "cram/pooled_alloc.h" #include "hts_internal.h" +#include "bgzf_internal.h" +#include "htslib/khash.h" #ifndef EFTYPE #define EFTYPE ENOEXEC #endif -#define BGZF_CACHE #define BGZF_MT #define BLOCK_HEADER_LENGTH 18 @@ -76,21 +77,15 @@ */ static const uint8_t g_magic[19] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\0\0"; -#ifdef BGZF_CACHE typedef struct { int size; uint8_t *block; int64_t end_offset; } cache_t; -#include "htslib/khash.h" -KHASH_MAP_INIT_INT64(cache, cache_t) -#endif +KHASH_MAP_INIT_INT64(bgzf_cache, cache_t) -struct bgzf_cache_t { - khash_t(cache) *h; - khint_t last_pos; -}; +// struct bgzf_cache_t is defined in bgzf_internal.h #ifdef BGZF_MT @@ -409,20 +404,21 @@ static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename) errno = EFTYPE; return NULL; } -#ifdef BGZF_CACHE + if (!(fp->cache = malloc(sizeof(*fp->cache)))) { free(fp->uncompressed_block); free(fp); return NULL; } - if (!(fp->cache->h = kh_init(cache))) { + if (!(fp->cache->h = kh_init(bgzf_cache))) { free(fp->uncompressed_block); free(fp->cache); free(fp); return NULL; } fp->cache->last_pos = 0; -#endif + fp->cache->private_data = NULL; + fp->cache->private_data_cleanup = (bgzf_private_data_cleanup_func *) NULL; return fp; } @@ -442,6 +438,15 @@ static BGZF *bgzf_write_init(const char *mode) fp = (BGZF*)calloc(1, sizeof(BGZF)); if (fp == NULL) goto mem_fail; fp->is_write = 1; + + fp->cache = malloc(sizeof(bgzf_cache_t)); + if (!fp->cache) + goto mem_fail; + fp->cache->h = NULL; + fp->cache->last_pos = 0; + fp->cache->private_data = NULL; + fp->cache->private_data_cleanup = (bgzf_private_data_cleanup_func *) NULL; + int compress_level = mode2level(mode); if ( compress_level==-2 ) { @@ -479,6 +484,7 @@ static BGZF *bgzf_write_init(const char *mode) fail: if (fp != NULL) { + free(fp->cache); free(fp->uncompressed_block); free(fp->gz_stream); free(fp); @@ -896,15 +902,16 @@ static int check_header(const uint8_t *header) && unpackInt16((uint8_t*)&header[14]) == 2) ? 0 : -1; } -#ifdef BGZF_CACHE static void free_cache(BGZF *fp) { khint_t k; - if (fp->is_write) return; - khash_t(cache) *h = fp->cache->h; - for (k = kh_begin(h); k < kh_end(h); ++k) - if (kh_exist(h, k)) free(kh_val(h, k).block); - kh_destroy(cache, h); + if (fp->cache->h) { + khash_t(bgzf_cache) *h = fp->cache->h; + for (k = kh_begin(h); k < kh_end(h); ++k) + if (kh_exist(h, k)) free(kh_val(h, k).block); + kh_destroy(bgzf_cache, h); + } + bgzf_clear_private_data(fp); free(fp->cache); } @@ -913,8 +920,8 @@ static int load_block_from_cache(BGZF *fp, int64_t block_address) khint_t k; cache_t *p; - khash_t(cache) *h = fp->cache->h; - k = kh_get(cache, h, block_address); + khash_t(bgzf_cache) *h = fp->cache->h; + k = kh_get(bgzf_cache, h, block_address); if (k == kh_end(h)) return 0; p = &kh_val(h, k); if (fp->block_length != 0) fp->block_offset = 0; @@ -937,7 +944,7 @@ static void cache_block(BGZF *fp, int size) uint8_t *block = NULL; cache_t *p; //fprintf(stderr, "Cache block at %llx\n", (int)fp->block_address); - khash_t(cache) *h = fp->cache->h; + khash_t(bgzf_cache) *h = fp->cache->h; if (BGZF_MAX_BLOCK_SIZE >= fp->cache_size) return; if (fp->block_length < 0 || fp->block_length > BGZF_MAX_BLOCK_SIZE) return; if ((kh_size(h) + 1) * BGZF_MAX_BLOCK_SIZE > (uint32_t)fp->cache_size) { @@ -959,13 +966,13 @@ static void cache_block(BGZF *fp, int size) if (k != k_orig) { block = kh_val(h, k).block; - kh_del(cache, h, k); + kh_del(bgzf_cache, h, k); } } else { block = (uint8_t*)malloc(BGZF_MAX_BLOCK_SIZE); } if (!block) return; - k = kh_put(cache, h, fp->block_address, &ret); + k = kh_put(bgzf_cache, h, fp->block_address, &ret); if (ret <= 0) { // kh_put failed, or in there already (shouldn't happen) free(block); return; @@ -976,11 +983,6 @@ static void cache_block(BGZF *fp, int size) p->block = block; memcpy(p->block, fp->uncompressed_block, p->size); } -#else -static void free_cache(BGZF *fp) {} -static int load_block_from_cache(BGZF *fp, int64_t block_address) {return 0;} -static void cache_block(BGZF *fp, int size) {} -#endif /* * Absolute htell in this compressed file. diff --git a/bgzf_internal.h b/bgzf_internal.h new file mode 100644 index 000000000..b3a790df6 --- /dev/null +++ b/bgzf_internal.h @@ -0,0 +1,70 @@ +/* bgzf_internal.h -- internal bgzf functions; not part of the public API. + + Copyright (C) 2025 Genome Research Ltd. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include "htslib/bgzf.h" + +/* + * BGZF private data interface + * This exists so that we can pass BCF headers into interfaces that have + * traditionally only taken a BGZF pointer without a corresponding bcf_hdr_t *, + * notably the bcf_readrec() function used by BCF iterators. + * + * To preserve the BGZF API and ABI, this is tagged on to the existing + * opaque bgzf_cache_t structure. bgzf_cache_t is now defined here so we can + * inline lookups. + */ + +typedef void bgzf_private_data_cleanup_func(void *private_data); + +struct kh_bgzf_cache_s; + +struct bgzf_cache_t { + struct kh_bgzf_cache_s *h; + unsigned int last_pos; + void *private_data; + bgzf_private_data_cleanup_func *private_data_cleanup; +}; + +// Set private data. cleanup will be called on bgzf_close() or +// bgzf_clear_private_data(); + +static inline void bgzf_set_private_data(BGZF *fp, void *private_data, + bgzf_private_data_cleanup_func *fn) { + assert(fp->cache != NULL); + fp->cache->private_data = private_data; + fp->cache->private_data_cleanup = fn; +} + +static inline void bgzf_clear_private_data(BGZF *fp) { + assert(fp->cache != NULL); + if (fp->cache->private_data) { + if (fp->cache->private_data_cleanup) + fp->cache->private_data_cleanup(fp->cache->private_data); + fp->cache->private_data = NULL; + } +} + +static inline void * bgzf_get_private_data(BGZF *fp) { + assert(fp->cache != NULL); + return fp->cache->private_data; +} diff --git a/vcf.c b/vcf.c index a03cfde7c..2731fe41a 100644 --- a/vcf.c +++ b/vcf.c @@ -51,6 +51,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/kstring.h" #include "htslib/sam.h" #include "htslib/khash.h" +#include "bgzf_internal.h" #if 0 // This helps on Intel a bit, often 6-7% faster VCF parsing. @@ -117,6 +118,7 @@ typedef struct hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields size_t *key_len;// length of h->id[BCF_DT_ID] strings int version; //cached version + uint32_t ref_count; // reference count, low bit indicates bcf_hdr_destroy() has been called } bcf_hdr_aux_t; @@ -189,6 +191,30 @@ static int bcf_get_version(const bcf_hdr_t *hdr, const char *verstr) return VCF_DEF; } +// Header reference counting + +static void bcf_hdr_incr_ref(bcf_hdr_t *h) +{ + bcf_hdr_aux_t *aux = get_hdr_aux(h); + aux->ref_count += 2; +} + +static void bcf_hdr_decr_ref(bcf_hdr_t *h) +{ + bcf_hdr_aux_t *aux = get_hdr_aux(h); + if (aux->ref_count >= 2) + aux->ref_count -= 2; + + if (aux->ref_count == 0) + bcf_hdr_destroy(h); +} + +static void hdr_bgzf_private_data_cleanup(void *data) +{ + bcf_hdr_t *h = (bcf_hdr_t *) data; + bcf_hdr_decr_ref(h); +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -1498,6 +1524,7 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) aux->key_len = NULL; aux->dict = *((vdict_t*)h->dict[0]); aux->version = 0; + aux->ref_count = 1; free(h->dict[0]); h->dict[0] = aux; @@ -1522,6 +1549,12 @@ void bcf_hdr_destroy(bcf_hdr_t *h) int i; khint_t k; if (!h) return; + bcf_hdr_aux_t *aux = get_hdr_aux(h); + if (aux->ref_count > 1) // Refs still held, so delay destruction + { + aux->ref_count &= ~1; + return; + } for (i = 0; i < 3; ++i) { vdict_t *d = (vdict_t*)h->dict[i]; if (d == 0) continue; @@ -1529,7 +1562,6 @@ void bcf_hdr_destroy(bcf_hdr_t *h) if (kh_exist(d, k)) free((char*)kh_key(d, k)); if ( i==0 ) { - bcf_hdr_aux_t *aux = get_hdr_aux(h); for (k=kh_begin(aux->gen); kgen); k++) if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); kh_destroy(hdict, aux->gen); @@ -1597,6 +1629,10 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp) htxt[hlen] = '\0'; // Ensure htxt is terminated if ( bcf_hdr_parse(h, htxt) < 0 ) goto fail; free(htxt); + + bcf_hdr_incr_ref(h); + bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup); + return h; fail: hts_log_error("Failed to read BCF header"); @@ -1638,6 +1674,9 @@ int bcf_hdr_write(htsFile *hfp, bcf_hdr_t *h) if ( bgzf_write(fp, htxt.s, htxt.l) != htxt.l ) return -1; if ( bgzf_flush(fp) < 0) return -1; + bcf_hdr_incr_ref(h); + bgzf_set_private_data(fp, h, hdr_bgzf_private_data_cleanup); + free(htxt.s); return 0; } @@ -1805,6 +1844,63 @@ static const char *get_type_name(int type) { return types[t]; } +/** + * updatephasing - updates 1st phasing based on other phasing status + * @param p - pointer to phase value array + * @param end - end of array + * @param q - pointer to consumed data + * @param samples - no. of samples in array + * @param ploidy - no. of phasing values per sample + * @param type - value type (one of BCF_BT_...) + * Returns 0 on success and 1 on failure + * Update for haploids made only if it is not unknown (.) + */ +static int updatephasing(uint8_t *p, uint8_t *end, uint8_t **q, int samples, int ploidy, int type) +{ + int j, k; + unsigned int inc = 1 << bcf_type_shift[type]; + ptrdiff_t bytes = samples * ploidy * inc; + + if (samples < 0 || ploidy < 0 || end - p < bytes) + return 1; + + /* + * This works because phasing is stored in the least-significant bit + * of the GT encoding, and the data is always stored little-endian. + * Thus it's possible to get the desired result by doing bit operations + * on the least-significant byte of each value and ignoring the + * higher bytes (for 16-bit and 32-bit values). + */ + + switch (ploidy) { + case 1: + // Trivial case - haploid data is phased by default + for (j = 0; j < samples; ++j) { + if (*p) *p |= 1; //only if not unknown (.) + p += inc; + } + break; + case 2: + // Mostly trivial case - first is phased if second is. + for (j = 0; j < samples; ++j) { + *p |= (p[inc] & 1); + p += 2 * inc; + } + break; + default: + // Generic case - first is phased if all other alleles are. + for (j = 0; j < samples; ++j) { + uint8_t allphased = 1; + for (k = 1; k < ploidy; ++k) + allphased &= (p[inc * k]); + *p |= allphased; + p += ploidy * inc; + } + } + *q = p; + return 0; +} + static void bcf_record_check_err(const bcf_hdr_t *hdr, bcf1_t *rec, char *type, uint32_t *reports, int i) { if (*reports == 0 || hts_verbose >= HTS_LOG_DEBUG) @@ -1832,6 +1928,13 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { (1 << BCF_BT_FLOAT) | (1 << BCF_BT_CHAR)); int32_t max_id = hdr ? hdr->n[BCF_DT_ID] : 0; + /* set phasing for 1st allele as in v44 for versions upto v43, to have + consistent binary values irrespective of version; not run for v >= v44, + to retain explicit phasing in v44 and higher */ + int idgt = hdr ? + bcf_get_version(hdr, NULL) < VCF44 ? + bcf_hdr_id2int(hdr, BCF_DT_ID, "GT") : -1 : + -1; // Check for valid contig ID if (rec->rid < 0 @@ -1941,9 +2044,16 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { bcf_record_check_err(hdr, rec, "type", &reports, type); err |= BCF_ERR_TAG_INVALID; } - bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample; - if (end - ptr < bytes) goto bad_indiv; - ptr += bytes; + if (idgt >= 0 && idgt == key) { + // check first GT phasing bit and fix up if necessary + if (updatephasing(ptr, end, &ptr, rec->n_sample, num, type)) { + err |= BCF_ERR_TAG_INVALID; + } + } else { + bytes = ((size_t) num << bcf_type_shift[type]) * rec->n_sample; + if (end - ptr < bytes) goto bad_indiv; + ptr += bytes; + } } if (!err && rec->rlen < 0) { @@ -2018,7 +2128,9 @@ int bcf_subset_format(const bcf_hdr_t *hdr, bcf1_t *rec) int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) { - if (fp->format.format == vcf) return vcf_read(fp,h,v); + if (fp->format.format == vcf) return vcf_read(fp, h, v); + if (!h) + h = (const bcf_hdr_t *) bgzf_get_private_data(fp->fp.bgzf); int ret = bcf_read1_core(fp->fp.bgzf, v); if (ret == 0) ret = bcf_record_check(h, v); if ( ret!=0 || !h->keep_samples ) return ret; @@ -2028,8 +2140,9 @@ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end) { bcf1_t *v = (bcf1_t *) vv; + const bcf_hdr_t *hdr = (const bcf_hdr_t *) bgzf_get_private_data(fp); int ret = bcf_read1_core(fp, v); - if (ret == 0) ret = bcf_record_check(NULL, v); + if (ret == 0) ret = bcf_record_check(hdr, v); if (ret >= 0) *tid = v->rid, *beg = v->pos, *end = v->pos + v->rlen; return ret; @@ -3225,7 +3338,7 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, is_phased = (*t == '|'); if (*t != '|' && *t != '/') break; } - if (ver >= VCF44 && !phasingprfx) { + if (!phasingprfx) { //get GT in v44 way when no prefixed phasing /* no explicit phasing for 1st allele, set based on other alleles and ploidy */ if (ploidy == 1) { //implicitly phased @@ -6004,6 +6117,7 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1); } #undef BRANCH + return nsmpl*fmt->n; }