diff --git a/.gitignore b/.gitignore index ac72d4bf4..4ac78c986 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,7 @@ lib*.so.* /test/fieldarith /test/hfile /test/hts_endian +/test/longrefs/*.tmp.* /test/pileup /test/sam /test/tabix/*.tmp.* diff --git a/Makefile b/Makefile index d8c09e3fb..3d9528304 100644 --- a/Makefile +++ b/Makefile @@ -532,7 +532,7 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: - -rm -f test/*.tmp test/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* + -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h diff --git a/NEWS b/NEWS index b062f89dd..c7c548209 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,18 @@ Noteworthy changes in release a.b * Incompatible changes: Several functions and data types have been changed in this release, and the shared library soversion has been bumped. + - HTSlib now supports 64 bit reference positions. This means several + structures, function parameters, and return values have been made bigger + to allow larger values to be stored. While most code that uses + HTSlib interfaces should still build after this change, some alterations + may be needed - notably to printf() formats where the values of structure + members are being printed. + + Due to file format limitations, large positions are only supported + when reading and writing SAM and VCF files. + + See README.large_positions.md for more information. + - An extra field has been added to the kbitset_t struct so bitsets can be made smaller (and later enlarged) without involving memory allocation. diff --git a/README.large_positions.md b/README.large_positions.md new file mode 100644 index 000000000..f639c48d9 --- /dev/null +++ b/README.large_positions.md @@ -0,0 +1,231 @@ +# HTSlib 64 bit reference positions + +HTSlib version 1.10 onwards internally use 64 bit reference positions. 
This +is to support analysis of species like axolotl, tulip and marbled lungfish +which have, or are expected to have, chromosomes longer than two gigabases. + +# File format support + +Currently 64 bit positions can only be stored in SAM and VCF format files. +Binary BAM, CRAM and BCF cannot be used due to limitations in the formats +themselves. As SAM and VCF are text formats, they have no limit on the +size of numeric values. + +# Compatibility issues to check + +Various data structure members, function parameters, and return values have +been expanded from 32 to 64 bits. As a result, some changes may be needed to +code that uses the library, even if it does not support long references. + +## Variadic functions taking format strings + +The type of various structure members (e.g. `bam1_core_t::pos`) and return +values from some functions (e.g. `bam_cigar2rlen()`) have been changed to +`hts_pos_t`, which is a 64-bit signed integer. Using these in 32-bit +code will generally work (as long as the stored positions are within range), +however care needs to be taken when these values are passed directly +to functions like `printf()` which take a variable-length argument list and +a format string. + +Header file `htslib/hts.h` defines macro `PRIhts_pos` which can be +used in `printf()` format strings to get the correct format specifier for +an `hts_pos_t` value. 
Code that needs to print positions should be +changed from: + +```c +printf("Position is %d\n", bam->core.pos); +``` + +to: + +```c +printf("Position is %"PRIhts_pos"\n", bam->core.pos); +``` + +If for some reason compatibility with older versions of HTSlib (which do +not have `hts_pos_t` or `PRIhts_pos`) is needed, the value can be cast to +`int64_t` and printed as an explicitly 64-bit value: + +```c +#include <inttypes.h> // For PRId64 and int64_t + +printf("Position is %" PRId64 "\n", (int64_t) bam->core.pos); +``` + +Passing incorrect types to variadic functions like `printf()` can lead +to incorrect behaviour and security risks, so it is important to track down +and fix all of the places where this may happen. Modern C compilers like +gcc (version 3.0 onwards) and clang can check `printf()` and `scanf()` +parameter types for compatibility against the format string. To +enable this, build code with `-Wall` or `-Wformat` and fix all the +reported warnings. + +Where functions that take `printf`-style format strings are implemented, +they should use the appropriate gcc attributes to enable format string +checking. `htslib/hts_defs.h` includes macros `HTS_FORMAT` and +`HTS_PRINTF_FMT` which can be used to provide the attribute declaration +in a portable way. For example, `test/sam.c` uses them for a function +that prints error messages: + +``` +void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) fail(const char *fmt, ...) { /* ... */ } +``` + +## Implicit type conversions + +Conversion of signed `int` or `int32_t` to `hts_pos_t` will always work. + +Conversion of `hts_pos_t` to `int` or `int32_t` will work as long as the value +converted is within the range that can be stored in the destination. + +Code that casts unsigned `uint32_t` values to signed with the expectation +that the result may be negative will no longer work as `hts_pos_t` can store +values over UINT32_MAX. Such code should be changed to use signed values.
+ +Functions hts_parse_region() and hts_parse_reg64() return special value +`HTS_POS_MAX` for regions which extend to the end of the reference. +This value is slightly smaller than INT64_MAX, but should be larger than +any reference that is likely to be used. When cast to `int32_t` the +result should be `INT32_MAX`. + +# Upgrading code to work with 64 bit positions + +Variables used to store reference positions should be changed to +type `hts_pos_t`. Use `PRIhts_pos` in format strings when printing them. + +When converting positions stored in strings, use `strtoll()` in place of +`atoi()` or `strtol()` (which produces a 32 bit value on 64-bit Windows and +all 32-bit platforms). + +Programs which need to look up a reference sequence length from a `sam_hdr_t` +structure should use `sam_hdr_tid2len()` instead of the old +`sam_hdr_t::target_len` array (which is left as 32-bit for reasons of +compatibility). `sam_hdr_tid2len()` returns `hts_pos_t`, so works correctly +for large references. + +Various functions which take pointer arguments have new versions which +support `hts_pos_t *` arguments. Code supporting 64-bit positions should +use the new versions. These are: + +Original function | 64-bit version +------------------ | -------------------- +fai_fetch() | fai_fetch64() +fai_fetchqual() | fai_fetchqual64() +faidx_fetch_seq() | faidx_fetch_seq64() +faidx_fetch_qual() | faidx_fetch_qual64() +hts_parse_reg() | hts_parse_reg64() or hts_parse_region() +bam_plp_auto() | bam_plp64_auto() +bam_plp_next() | bam_plp64_next() +bam_mplp_auto() | bam_mplp64_auto() + +Limited support has been added for 64-bit INFO values in VCF files, for large +values in structural variant END tags. New functions `bcf_update_info_int64()` +and `bcf_get_info_int64()` can be used to set and fetch 64-bit INFO values. +They both take arrays of `int64_t`. `bcf_int64_missing` and +`bcf_int64_vector_end` can be used to set missing and vector end values in +these arrays. 
The INFO data is stored in the minimum size needed, so there +is no harm in using these functions to store smaller integer values. + +# Structure members that have changed size + +``` +File htslib/hts.h: + hts_pair32_t::begin + hts_pair32_t::end + + (typedef hts_pair_pos_t is provided as a better-named replacement for hts_pair32_t) + + hts_reglist_t::min_beg + hts_reglist_t::max_end + + hts_itr_t::beg + hts_itr_t::end + hts_itr_t::curr_beg + hts_itr_t::curr_end + +File htslib/regidx.h: + reg_t::start + reg_t::end + +File htslib/sam.h: + bam1_core_t::pos + bam1_core_t::mpos + bam1_core_t::isize + +File htslib/synced_bcf_reader.h: + bcf_sr_regions_t::start + bcf_sr_regions_t::end + bcf_sr_regions_t::prev_start + +File htslib/vcf.h: + bcf_idinfo_t::info + + bcf_info_t::v1::i + + bcf1_t::pos + bcf1_t::rlen +``` + +# Functions where parameters or the return value have changed size + +Functions are annotated as follows: + +* `[new]` The function has been added since version 1.9 +* `[parameters]` Function parameters have changed size +* `[return]` Function return value has changed size + +``` +File htslib/faidx.h: + + [new] fai_fetch64() + [new] fai_fetchqual64() + [new] faidx_fetch_seq64() + [new] faidx_fetch_qual64() + [new] fai_parse_region() + +File htslib/hts.h: + + [parameters] hts_idx_push() + [new] hts_parse_reg64() + [parameters] hts_itr_query() + [parameters] hts_reg2bin() + +File htslib/kstring.h: + + [new] kputll() + +File htslib/regidx.h: + + [parameters] regidx_overlap() + +File htslib/sam.h: + + [new] sam_hdr_tid2len() + [return] bam_cigar2rlen() + [return] bam_endpos() + [parameters] bam_itr_queryi() + [parameters] sam_itr_queryi() + [new] bam_plp64_next() + [new] bam_plp64_auto() + [new] bam_mplp64_auto() + [parameters] sam_cap_mapq() + [parameters] sam_prob_realn() + +File htslib/synced_bcf_reader.h: + + [parameters] bcf_sr_seek() + [parameters] bcf_sr_regions_overlap() + +File htslib/tbx.h: + + [parameters] tbx_readrec() + +File htslib/vcf.h: + + 
[parameters] bcf_readrec() + [new] bcf_update_info_int64() + [new] bcf_get_info_int64() + [return] bcf_dec_int1() + [return] bcf_dec_typed_int1() + +``` diff --git a/bcf_sr_sort.c b/bcf_sr_sort.c index bc31f438c..e9b20a62b 100644 --- a/bcf_sr_sort.c +++ b/bcf_sr_sort.c @@ -288,7 +288,7 @@ void debug_vbuf(sr_sort_t *srt) for (i=0; isr->nreaders; i++) { vcf_buf_t *buf = &srt->vcf_buf[i]; - fprintf(stderr,"\t%d", buf->rec[j] ? buf->rec[j]->pos+1 : 0); + fprintf(stderr,"\t%"PRIhts_pos, buf->rec[j] ? buf->rec[j]->pos+1 : 0); } fprintf(stderr,"\n"); } @@ -330,7 +330,7 @@ int bcf_sr_sort_add_active(sr_sort_t *srt, int idx) srt->active[srt->nactive - 1] = idx; return 0; // FIXME: check for errs in this function } -static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos) +static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { if ( !srt->grp_str2int ) { @@ -556,7 +556,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, return 0; // FIXME: check for errs in this function } -int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int min_pos) +int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t min_pos) { int i,j; assert( srt->nactive>0 ); diff --git a/bcf_sr_sort.h b/bcf_sr_sort.h index 0a31e13c8..c8bd787a1 100644 --- a/bcf_sr_sort.h +++ b/bcf_sr_sort.h @@ -90,7 +90,8 @@ typedef struct int moff, noff, *off, mcharp; char **charp; const char *chr; - int pos, nsr, msr; + hts_pos_t pos; + int nsr, msr; int pair; int nactive, mactive, *active; // list of readers with lines at the current pos } @@ -98,7 +99,7 @@ sr_sort_t; sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt); void bcf_sr_sort_reset(sr_sort_t *srt); -int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, int pos); +int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t pos); int bcf_sr_sort_set_active(sr_sort_t 
*srt, int i); int bcf_sr_sort_add_active(sr_sort_t *srt, int i); void bcf_sr_sort_destroy(sr_sort_t *srt); diff --git a/bgzf.c b/bgzf.c index 7cdf1d68c..c2caa0443 100644 --- a/bgzf.c +++ b/bgzf.c @@ -110,7 +110,8 @@ enum mtaux_cmd { // When multi-threaded bgzf_tell won't work, so we delay the hts_idx_push // until we've written the last block. typedef struct { - int tid, beg, end, is_mapped; // args for hts_idx_push + hts_pos_t beg, end; + int tid, is_mapped; // args for hts_idx_push uint64_t offset, block_number; } hts_idx_cache_entry; @@ -183,7 +184,7 @@ struct __bgzidx_t * Returns 0 on success, * -1 on failure */ -int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, int beg, int end, uint64_t offset, int is_mapped) { +int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped) { hts_idx_cache_entry *e; mtaux_t *mt = fp->mt; diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index a991766c6..276ad3c84 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -331,6 +331,26 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, return l > 0 ? 0 : -1; } +int cram_external_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t l; + char *cp; + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + l = safe_ltf8_get(cp, (char *)b->data + b->uncomp_size, (int64_t *)out); + b->idx += l; + *out_size = 1; + + return l > 0 ? 
0 : -1; +} + int cram_external_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { @@ -392,8 +412,10 @@ cram_codec *cram_external_decode_init(char *data, int size, return NULL; c->codec = E_EXTERNAL; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->decode = cram_external_decode_int; + else if (option == E_LONG) + c->decode = cram_external_decode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->decode = cram_external_decode_char; else @@ -422,6 +444,14 @@ int cram_external_encode_int(cram_slice *slice, cram_codec *c, return itf8_put_blk(c->out, *i32) >= 0 ? 0 : -1; } +int cram_external_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint64_t *i64 = (uint64_t *)in; + + /* Append one LTF8 value to c->out; report a failed append like cram_external_encode_int does. */ + return ltf8_put_blk(c->out, *i64) >= 0 ? 0 : -1; +} + int cram_external_encode_char(cram_slice *slice, cram_codec *c, char *in, int in_size) { BLOCK_APPEND(c->out, in, in_size); @@ -472,8 +502,10 @@ cram_codec *cram_external_encode_init(cram_stats *st, return NULL; c->codec = E_EXTERNAL; c->free = cram_external_encode_free; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->encode = cram_external_encode_int; + else if (option == E_LONG) + c->encode = cram_external_encode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->encode = cram_external_encode_char; else @@ -489,6 +521,24 @@ cram_codec *cram_external_encode_init(cram_stats *st, * --------------------------------------------------------------------------- * BETA */ +int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n = *out_size; + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + int
cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { int32_t *out_i = (int32_t *)out; int i, n = *out_size; @@ -545,8 +595,10 @@ cram_codec *cram_beta_decode_init(char *data, int size, return NULL; c->codec = E_BETA; - if (option == E_INT || option == E_LONG) + if (option == E_INT) c->decode = cram_beta_decode_int; + else if (option == E_LONG) + c->decode = cram_beta_decode_long; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->decode = cram_beta_decode_char; else { @@ -594,6 +646,18 @@ int cram_beta_encode_store(cram_codec *c, cram_block *b, return -1; } +int cram_beta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + int cram_beta_encode_int(cram_slice *slice, cram_codec *c, char *in, int in_size) { int *syms = (int *)in; @@ -637,6 +701,8 @@ cram_codec *cram_beta_encode_init(cram_stats *st, c->free = cram_beta_encode_free; if (option == E_INT) c->encode = cram_beta_encode_int; + else if (option == E_LONG) + c->encode = cram_beta_encode_long; else c->encode = cram_beta_encode_char; c->store = cram_beta_encode_store; @@ -962,6 +1028,56 @@ int cram_huffman_decode_int(cram_slice *slice, cram_codec *c, return 0; } +int cram_huffman_decode_long0(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n; + const cram_huffman_code * const codes = c->u.huffman.codes; + + /* Special case of 0 length codes */ + for (i = 0, n = *out_size; i < n; i++) { + out_i[i] = codes[0].symbol; + } + return 0; +} + +int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n, ncodes = c->u.huffman.ncodes; + const cram_huffman_code * const codes = c->u.huffman.codes; + + 
for (i = 0, n = *out_size; i < n; i++) { + int idx = 0; + int val = 0, len = 0, last_len = 0; + + // Now one bit at a time for remaining checks + for (;;) { + int dlen = codes[idx].len - last_len; + if (cram_not_enough_bits(in, dlen)) + return -1; + + //val <<= dlen; + //val |= get_bits_MSB(in, dlen); + //last_len = (len += dlen); + + last_len = (len += dlen); + for (; dlen; dlen--) GET_BIT_MSB(in, val); + + idx = val - codes[idx].p; + if (idx >= ncodes || idx < 0) + return -1; + + if (codes[idx].code == val && codes[idx].len == len) { + out_i[i] = codes[idx].symbol; + break; + } + } + } + + return 0; +} + /* * Initialises a huffman decoder from an encoding data stream. */ @@ -1011,8 +1127,16 @@ cram_codec *cram_huffman_decode_init(char *data, int size, } /* Read symbols and bit-lengths */ - for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { - l = safe_itf8_get(cp, data_end, &codes[i].symbol); + if (option == E_LONG) { + for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { + l = safe_ltf8_get(cp, data_end, &codes[i].symbol); + } + } else { + for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { + int32_t i32; + l = safe_itf8_get(cp, data_end, &i32); + codes[i].symbol = i32; + } } if (l < 1) @@ -1100,13 +1224,18 @@ cram_codec *cram_huffman_decode_init(char *data, int size, h->decode = cram_huffman_decode_char0; else h->decode = cram_huffman_decode_char; - } else if (option == E_BYTE_ARRAY_BLOCK) { - abort(); - } else { + } else if (option == E_LONG) { + if (h->u.huffman.codes[0].len == 0) + h->decode = cram_huffman_decode_long0; + else + h->decode = cram_huffman_decode_long; + } else if (option == E_INT) { if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_int0; else h->decode = cram_huffman_decode_int; + } else { + return NULL; } return (cram_codec *)h; @@ -1191,6 +1320,43 @@ int cram_huffman_encode_int(cram_slice *slice, cram_codec *c, return r; } +int cram_huffman_encode_long0(cram_slice *slice, cram_codec *c, + char *in, int in_size) { 
+ return 0; +} + +int cram_huffman_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int i, code, len, r = 0; + int64_t *syms = (int64_t *)in; + + while (in_size--) { + int64_t sym = *syms++; /* keep all 64 bits: an int would truncate and could match the wrong symbol */ + + if (sym >= -1 && sym < MAX_HUFF) { + i = c->u.e_huffman.val2code[sym+1]; + assert(c->u.e_huffman.codes[i].symbol == sym); + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; + } else { + /* Slow - use a lookup table for when sym < MAX_HUFFMAN_SYM? */ + for (i = 0; i < c->u.e_huffman.nvals; i++) { + if (c->u.e_huffman.codes[i].symbol == sym) + break; + } + if (i == c->u.e_huffman.nvals) + return -1; + + code = c->u.e_huffman.codes[i].code; + len = c->u.e_huffman.codes[i].len; + } + + r |= store_bits_MSB(c->out, code, len); + } + + return r; +} + void cram_huffman_encode_free(cram_codec *c) { if (!c) return; @@ -1230,8 +1396,14 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, } tp += itf8_put(tp, c->u.e_huffman.nvals); - for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].symbol); + if (c->u.e_huffman.option == E_LONG) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += ltf8_put(tp, codes[i].symbol); + } + } else { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += itf8_put(tp, codes[i].symbol); + } } tp += itf8_put(tp, c->u.e_huffman.nvals); @@ -1409,6 +1581,7 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, c->u.e_huffman.codes = codes; c->u.e_huffman.nvals = nvals; + c->u.e_huffman.option = option; c->free = cram_huffman_encode_free; if (option == E_BYTE || option == E_BYTE_ARRAY) { @@ -1416,11 +1589,16 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, c->encode = cram_huffman_encode_char0; else c->encode = cram_huffman_encode_char; - } else { + } else if (option == E_INT) { if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_int0; else c->encode = cram_huffman_encode_int; + } else if (option == E_LONG) { + if (c->u.e_huffman.codes[0].len
== 0) + c->encode = cram_huffman_encode_long0; + else + c->encode = cram_huffman_encode_long; } c->store = cram_huffman_encode_store; @@ -1980,6 +2158,8 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->store = cram_external_encode_store; if (c->decode == cram_external_decode_int) c->encode = cram_external_encode_int; + else if (c->decode == cram_external_decode_long) /* must chain with "else": a bare "if" sends the _int case to the final else */ + c->encode = cram_external_encode_long; else if (c->decode == cram_external_decode_char) c->encode = cram_external_encode_char; else @@ -2010,6 +2190,10 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { t->encode = cram_huffman_encode_int0; else if (c->decode == cram_huffman_decode_int) t->encode = cram_huffman_encode_int; + else if (c->decode == cram_huffman_decode_long0) + t->encode = cram_huffman_encode_long0; + else if (c->decode == cram_huffman_decode_long) + t->encode = cram_huffman_encode_long; else { free(t); return -1; @@ -2025,6 +2209,8 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->store = cram_beta_encode_store; if (c->decode == cram_beta_decode_int) c->encode = cram_beta_encode_int; + else if (c->decode == cram_beta_decode_long) + c->encode = cram_beta_encode_long; else if (c->decode == cram_beta_decode_char) c->encode = cram_beta_encode_char; else diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 59ce1313f..02b9f8e85 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -49,7 +49,7 @@ struct cram_codec; * appears.
*/ typedef struct { - int32_t symbol; + int64_t symbol; int32_t p; // next code start value, minus index to codes[] int32_t code; int32_t len; @@ -65,6 +65,7 @@ typedef struct { cram_huffman_code *codes; int nvals; int val2code[MAX_HUFF+1]; // value to code lookup for small values + int option; } cram_huffman_encoder; typedef struct { diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 7dcc24470..6140b585c 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -45,6 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -158,9 +159,25 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, endp = cp + b->uncomp_size; if (CRAM_MAJOR_VERS(fd->version) == 1) { + int32_t i32; cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id); - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_start); - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_span); +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. 
+ */ +#ifdef LARGE_POS + cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_start); + cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_span); +#else + cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_start=i32; + cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_span=i32; +#endif cp += safe_itf8_get(cp, endp, &hdr->num_records); cp += safe_itf8_get(cp, endp, &hdr->num_landmarks); if (hdr->num_landmarks < 0 || @@ -409,7 +426,12 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'R' && key[1] == 'L') { ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { - ds_id = DS_AP; type = E_INT; + ds_id = DS_AP; +#ifdef LARGE_POS + type = E_LONG; /* statement must end in ';' - a ',' before the closing '}' does not compile */ +#else + type = E_INT; +#endif } else if (key[0] == 'R' && key[1] == 'G') { ds_id = DS_RG; type = E_INT; } else if (key[0] == 'M' && key[1] == 'F') { @@ -417,9 +439,19 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'N' && key[1] == 'S') { ds_id = DS_NS; type = E_INT; } else if (key[0] == 'N' && key[1] == 'P') { - ds_id = DS_NP; type = E_INT; + ds_id = DS_NP; +#ifdef LARGE_POS + type = E_LONG; +#else + type = E_INT; +#endif } else if (key[0] == 'T' && key[1] == 'S') { - ds_id = DS_TS; type = E_INT; + ds_id = DS_TS; +#ifdef LARGE_POS + type = E_LONG; +#else + type = E_INT; +#endif } else if (key[0] == 'N' && key[1] == 'F') { ds_id = DS_NF; type = E_INT; } else if (key[0] == 'T' && key[1] == 'C') { @@ -978,8 +1010,16 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { if (b->content_type == MAPPED_SLICE) { cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id); - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span); +#ifdef LARGE_POS + cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); + cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span); +#else + int32_t i32; + cp +=
safe_itf8_get((char *)cp, (char *)cp_end, &i32); + hdr->ref_seq_start = i32; + cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); + hdr->ref_seq_span = i32; +#endif } cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_records); hdr->record_counter = 0; @@ -1090,7 +1130,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int has_MD, int has_NM) { int prev_pos = 0, f, r = 0, out_sz = 1; int seq_pos = 1; - int cig_len = 0, ref_pos = cr->apos; + int cig_len = 0; + int64_t ref_pos = cr->apos; int32_t fn, i32; enum cigar_op cig_op = BAM_CMATCH; uint32_t *cigar = s->cigar; @@ -1995,8 +2036,8 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { */ if (cr->tlen == INT_MIN) { int id1 = rec, id2 = rec; - int aleft = cr->apos, aright = cr->aend; - int tlen; + int64_t aleft = cr->apos, aright = cr->aend; + int64_t tlen; int ref = cr->ref_id; // number of segments starting at the same point. @@ -2235,7 +2276,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if ((fd->required_fields & SAM_SEQ) && s->ref == NULL && s->hdr->ref_seq_id >= 0 && !c->comp_hdr->no_ref) { - hts_log_error("Unable to fetch reference #%d %d..%d", + hts_log_error("Unable to fetch reference #%d %"PRId64"..%"PRId64"\n", s->hdr->ref_seq_id, s->hdr->ref_seq_start, s->hdr->ref_seq_start + s->hdr->ref_seq_span-1); return -1; @@ -2429,9 +2470,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_AP) { if (!c->comp_hdr->codecs[DS_AP]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_AP] ->decode(s, c->comp_hdr->codecs[DS_AP], blk, (char *)&cr->apos, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&i32, &out_sz); + cr->apos = i32; +#endif if (r) return r; if (c->comp_hdr->AP_delta) cr->apos += s->last_apos; @@ -2528,17 +2577,33 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_NP) { 
if (!c->comp_hdr->codecs[DS_NP]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_NP] ->decode(s, c->comp_hdr->codecs[DS_NP], blk, (char *)&cr->mate_pos, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&i32, &out_sz); + cr->mate_pos = i32; +#endif if (r) return r; } if (ds & CRAM_TS) { if (!c->comp_hdr->codecs[DS_TS]) return -1; +#ifdef LARGE_POS r |= c->comp_hdr->codecs[DS_TS] ->decode(s, c->comp_hdr->codecs[DS_TS], blk, (char *)&cr->tlen, &out_sz); +#else + int32_t i32; + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)&i32, &out_sz); + cr->tlen = i32; +#endif if (r) return r; } else { cr->tlen = INT_MIN; @@ -2609,7 +2674,8 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (!(bf & BAM_FUNMAP)) { if ((ds & CRAM_AP) && cr->apos <= 0) { - hts_log_error("Read has alignment position %d but no unmapped flag", + hts_log_error("Read has alignment position %"PRId64 + " but no unmapped flag", cr->apos); return -1; } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 35b701a80..cb573bd7c 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -92,11 +93,27 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, * the total size (stored as a variable length string). */ +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. 
+ */ + // Duplicated from container itself, and removed in 1.1 if (CRAM_MAJOR_VERS(fd->version) == 1) { r |= itf8_put_blk(cb, h->ref_seq_id); +#ifdef LARGE_POS + r |= ltf8_put_blk(cb, h->ref_seq_start); + r |= ltf8_put_blk(cb, h->ref_seq_span); +#else r |= itf8_put_blk(cb, h->ref_seq_start); r |= itf8_put_blk(cb, h->ref_seq_span); +#endif r |= itf8_put_blk(cb, h->num_records); r |= itf8_put_blk(cb, h->num_landmarks); for (i = 0; i < h->num_landmarks; i++) { @@ -535,8 +552,13 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { } cp += itf8_put(cp, s->hdr->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put(cp, s->hdr->ref_seq_start); + cp += ltf8_put(cp, s->hdr->ref_seq_span); +#else cp += itf8_put(cp, s->hdr->ref_seq_start); cp += itf8_put(cp, s->hdr->ref_seq_span); +#endif cp += itf8_put(cp, s->hdr->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) cp += itf8_put(cp, s->hdr->record_counter); @@ -574,7 +596,7 @@ static int cram_encode_slice_read(cram_fd *fd, cram_block_compression_hdr *h, cram_slice *s, cram_record *cr, - int *last_pos) { + int64_t *last_pos) { int r = 0; int32_t i32; unsigned char uc; @@ -595,12 +617,24 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); if (c->pos_sorted) { +#ifdef LARGE_POS + int64_t i64; + i64 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); +#else i32 = cr->apos - *last_pos; r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); +#endif *last_pos = cr->apos; } else { +#ifdef LARGE_POS + int64_t i64; + i64 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); +#else i32 = cr->apos; r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); +#endif } r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); @@ -612,11 +646,20 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], (char 
*)&cr->mate_ref_id, 1); +#ifdef LARGE_POS r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], (char *)&cr->mate_pos, 1); r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], (char *)&cr->tlen, 1); +#else + i32 = cr->mate_pos; + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&i32, 1); + i32 = cr->tlen; + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&i32, 1); +#endif } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], (char *)&cr->mate_line, 1); @@ -910,7 +953,8 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { */ static int cram_encode_slice(cram_fd *fd, cram_container *c, cram_block_compression_hdr *h, cram_slice *s) { - int rec, r = 0, last_pos; + int rec, r = 0; + int64_t last_pos; int embed_ref; enum cram_DS_ID id; @@ -1312,7 +1356,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Turn bams into cram_records and gather basic stats */ for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { cram_slice *s = c->slices[sn]; - int first_base = INT_MAX, last_base = INT_MIN; + int64_t first_base = INT64_MAX, last_base = INT64_MIN; int r1_start = r1; @@ -1488,8 +1532,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== AP ===\n"); if (c->pos_sorted) { h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), - c->stats[DS_AP], E_INT, NULL, - fd->version); + c->stats[DS_AP], +#ifdef LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); } else { int p[2] = {0, c->max_apos}; h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, @@ -1523,14 +1572,24 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TS ===\n"); h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), - c->stats[DS_TS], E_INT, NULL, - fd->version); + c->stats[DS_TS], +#ifdef LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); if 
(c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err; //fprintf(stderr, "=== NP ===\n"); h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), - c->stats[DS_NP], E_INT, NULL, - fd->version); + c->stats[DS_NP], +#ifdef LARGE_POS + E_LONG, +#else + E_INT, +#endif + NULL, fd->version); if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err; //fprintf(stderr, "=== NF ===\n"); @@ -2569,7 +2628,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { if (c->curr_slice == c->max_slice || (bam_ref(b) != c->curr_ref && !c->multi_seq)) { c->ref_seq_span = fd->last_base - c->ref_seq_start + 1; - hts_log_info("Flush container %d/%d..%d", + hts_log_info("Flush container %d/%"PRId64"..%"PRId64, c->ref_seq_id, c->ref_seq_start, c->ref_seq_start + c->ref_seq_span -1); @@ -2751,8 +2810,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, /* Copy and parse */ if (!(cr->flags & BAM_FUNMAP)) { uint32_t *cig_to, *cig_from; - int apos = cr->apos-1, spos = 0; - int MD_last = apos; // last position of edit in MD tag + int64_t apos = cr->apos-1, spos = 0; + int64_t MD_last = apos; // last position of edit in MD tag cr->cigar = s->ncigar; cr->ncigar = bam_cigar_len(b); @@ -3048,7 +3107,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (new == 0) { cram_record *p = &s->crecs[kh_val(s->pair[sec], k)]; - int aleft, aright, sign; + int64_t aleft, aright; + int sign; aleft = MIN(cr->apos, p->apos); aright = MAX(cr->aend, p->aend); @@ -3065,7 +3125,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/- // a small amount, if appropriate options set. 
if ((bam_ins_size(b) && - abs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || + llabs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || (!bam_ins_size(b) && !fd->tlen_zero)) goto detached; @@ -3087,7 +3147,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, !(fd->tlen_zero && p->ref_id == -1)) goto detached; - if ((p->tlen && abs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || + if ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || (!p->tlen && !fd->tlen_zero)) goto detached; diff --git a/cram/cram_index.c b/cram/cram_index.c index 3b8cef833..222bbee6d 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -544,7 +544,8 @@ static int cram_index_build_multiref(cram_fd *fd, off_t cpos, int32_t landmark, int sz) { - int i, ref = -2, ref_start = 0, ref_end; + int i, ref = -2; + int64_t ref_start = 0, ref_end; char buf[1024]; if (fd->mode != 'w') { @@ -571,7 +572,7 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", ref, ref_start, ref_end - ref_start + 1, (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) @@ -584,7 +585,7 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", ref, ref_start, ref_end - ref_start + 1, (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) @@ -616,7 +617,7 @@ int cram_index_slice(cram_fd *fd, if (s->hdr->ref_seq_id == -2) { ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz); } else { - sprintf(buf, "%d\t%d\t%d\t%"PRId64"\t%d\t%d\n", + sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", s->hdr->ref_seq_id, s->hdr->ref_seq_start, s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 
0 : -4; @@ -684,7 +685,7 @@ int cram_index_build(cram_fd *fd, const char *fn_base, const char *fn_idx) { off_t cpos, hpos; BGZF *fp; kstring_t fn_idx_str = {0}; - int32_t last_ref = -9, last_start = -9; + int64_t last_ref = -9, last_start = -9; // Useful for cram_index_build_multiref cram_set_option(fd, CRAM_OPT_REQUIRED_FIELDS, SAM_RNAME | SAM_POS | SAM_CIGAR); diff --git a/cram/cram_io.c b/cram/cram_io.c index ea10eced8..1b5c820dc 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -490,7 +490,7 @@ int ltf8_decode_crc(cram_fd *fd, int64_t *val_p, uint32_t *crc) { * * Returns the number of bytes written */ -int itf8_put_blk(cram_block *blk, int val) { +int itf8_put_blk(cram_block *blk, int32_t val) { char buf[5]; int sz; @@ -502,6 +502,18 @@ int itf8_put_blk(cram_block *blk, int val) { return -1; } +int ltf8_put_blk(cram_block *blk, int64_t val) { + char buf[9]; + int sz; + + sz = ltf8_put(buf, val); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + /* * Decodes a 32-bit little endian value from fd and stores in val. * @@ -1841,7 +1853,7 @@ static void sanitise_SQ_lines(cram_fd *fd) { // Should we also check MD5sums here to ensure the correct // reference was given? - hts_log_warning("Header @SQ length mismatch for ref %s, %d vs %d", + hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d", r->name, fd->header->hrecs->ref[i].len, (int)r->length); // Fixing the parsed @SQ header will make MD:Z: strings work @@ -2944,8 +2956,26 @@ cram_container *cram_read_container(cram_fd *fd) { crc = crc32(0L, (unsigned char *)&len, 4); } if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; +/* + * LARGE_POS used in this code is purely a debugging mechanism for testing + * whether the htslib API can cope with 64-bit quantities. 
These are + * possible in SAM, but not *yet* in BAM or CRAM. + * + * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. + * + * At some point it is expected these ifdefs will become a version check + * instead. + */ +#ifdef LARGE_POS + if ((s = ltf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; + if ((s = ltf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; +#else + int32_t i32; + if ((s = itf8_decode_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = i32; + if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i32; +#endif if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -3070,8 +3100,13 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) cp += itf8_put((char*)cp, 0); } else { cp += itf8_put((char*)cp, c->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put((char*)cp, c->ref_seq_start); + cp += ltf8_put((char*)cp, c->ref_seq_span); +#else cp += itf8_put((char*)cp, c->ref_seq_start); cp += itf8_put((char*)cp, c->ref_seq_span); +#endif } cp += itf8_put((char*)cp, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { @@ -3129,8 +3164,13 @@ int cram_write_container(cram_fd *fd, cram_container *c) { cp += itf8_put((char*)cp, 0); } else { cp += itf8_put((char*)cp, c->ref_seq_id); +#ifdef LARGE_POS + cp += ltf8_put((char*)cp, c->ref_seq_start); + cp += ltf8_put((char*)cp, c->ref_seq_span); +#else cp += itf8_put((char*)cp, c->ref_seq_start); cp += itf8_put((char*)cp, c->ref_seq_span); +#endif } cp += itf8_put((char*)cp, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { diff --git a/cram/cram_io.h b/cram/cram_io.h index 43ac7dcd5..5cb2b9b8a 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -377,7 +377,8 @@ static inline int safe_ltf8_get(const char *cp, const char *endp, * @return * Returns the number of bytes written */ -int 
itf8_put_blk(cram_block *blk, int val); +int itf8_put_blk(cram_block *blk, int32_t val); +int ltf8_put_blk(cram_block *blk, int64_t val); /*! Pulls a literal 32-bit value from a block. * diff --git a/cram/cram_samtools.c b/cram/cram_samtools.c index aab68df31..1f33eaeba 100644 --- a/cram/cram_samtools.c +++ b/cram/cram_samtools.c @@ -45,13 +45,13 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, const char *qname, size_t qname_len, int flag, int rname, // Ref ID - int pos, - int end, // aligned start/end coords + int64_t pos, + int64_t end, // aligned start/end coords int mapq, uint32_t ncigar, const uint32_t *cigar, int mrnm, // Mate Ref ID - int mpos, - int isize, + int64_t mpos, + int64_t isize, int len, const char *seq, const char *qual) { diff --git a/cram/cram_samtools.h b/cram/cram_samtools.h index 4bbc39b04..4bed1465d 100644 --- a/cram/cram_samtools.h +++ b/cram/cram_samtools.h @@ -80,13 +80,13 @@ int bam_construct_seq(bam_seq_t **bp, size_t extra_len, const char *qname, size_t qname_len, int flag, int rname, // Ref ID - int pos, - int end, // aligned start/end coords + int64_t pos, + int64_t end, // aligned start/end coords int mapq, uint32_t ncigar, const uint32_t *cigar, int mrnm, // Mate Ref ID - int mpos, - int isize, + int64_t mpos, + int64_t isize, int len, const char *seq, const char *qual); diff --git a/cram/cram_stats.c b/cram/cram_stats.c index 87adde009..1b107b687 100644 --- a/cram/cram_stats.c +++ b/cram/cram_stats.c @@ -39,6 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include #include "cram/cram.h" #include "cram/os.h" @@ -47,7 +48,7 @@ cram_stats *cram_stats_create(void) { return calloc(1, sizeof(cram_stats)); } -int cram_stats_add(cram_stats *st, int32_t val) { +int cram_stats_add(cram_stats *st, int64_t val) { st->nsamp++; //assert(val >= 0); @@ -75,7 +76,7 @@ int cram_stats_add(cram_stats *st, int32_t val) { return 0; } -void cram_stats_del(cram_stats *st, int32_t val) { +void cram_stats_del(cram_stats *st, int64_t val) { st->nsamp--; //assert(val >= 0); @@ -90,11 +91,11 @@ void cram_stats_del(cram_stats *st, int32_t val) { if (--kh_val(st->h, k) == 0) kh_del(m_i2i, st->h, k); } else { - hts_log_warning("Failed to remove val %d from cram_stats", val); + hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); st->nsamp++; } } else { - hts_log_warning("Failed to remove val %d from cram_stats", val); + hts_log_warning("Failed to remove val %"PRId64" from cram_stats", val); st->nsamp++; } } diff --git a/cram/cram_stats.h b/cram/cram_stats.h index d9b37a7c3..6a87fb1e8 100644 --- a/cram/cram_stats.h +++ b/cram/cram_stats.h @@ -36,8 +36,8 @@ extern "C" { #endif cram_stats *cram_stats_create(void); -int cram_stats_add(cram_stats *st, int32_t val); -void cram_stats_del(cram_stats *st, int32_t val); +int cram_stats_add(cram_stats *st, int64_t val); +void cram_stats_del(cram_stats *st, int64_t val); void cram_stats_dump(cram_stats *st); void cram_stats_free(cram_stats *st); diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 2cde6cfef..5c5fe4628 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -60,7 +60,7 @@ extern "C" { #endif // Generic hash-map integer -> integer -KHASH_MAP_INIT_INT(m_i2i, int) +KHASH_MAP_INIT_INT64(m_i2i, int) // Generic hash-set integer -> (existance) KHASH_SET_INIT_INT(s_i2i) @@ -281,8 +281,8 @@ struct cram_map; /* Compression header block */ typedef struct cram_block_compression_hdr { int32_t ref_seq_id; - int32_t ref_seq_start; - int32_t 
ref_seq_span; + int64_t ref_seq_start; + int64_t ref_seq_span; int32_t num_records; int32_t num_landmarks; int32_t *landmark; @@ -337,8 +337,8 @@ KHASH_MAP_INIT_INT(m_tagmap, cram_tag_map*) typedef struct cram_block_slice_hdr { enum cram_content_type content_type; int32_t ref_seq_id; /* if content_type == MAPPED_SLICE */ - int32_t ref_seq_start; /* if content_type == MAPPED_SLICE */ - int32_t ref_seq_span; /* if content_type == MAPPED_SLICE */ + int64_t ref_seq_start; /* if content_type == MAPPED_SLICE */ + int64_t ref_seq_span; /* if content_type == MAPPED_SLICE */ int32_t num_records; int64_t record_counter; int32_t num_blocks; @@ -362,8 +362,8 @@ struct ref_entry; typedef struct cram_container { int32_t length; int32_t ref_seq_id; - int32_t ref_seq_start; - int32_t ref_seq_span; + int64_t ref_seq_start; + int64_t ref_seq_span; int64_t record_counter; int64_t num_bases; int32_t num_records; @@ -385,10 +385,10 @@ typedef struct cram_container { int max_c_rec, curr_c_rec; // current and max recs per container int slice_rec; // rec no. for start of this slice int curr_ref; // current ref ID. -2 for no previous - int last_pos; // last record position + int64_t last_pos; // last record position struct cram_slice **slices, *slice; int pos_sorted; // boolean, 1=>position sorted data - int max_apos; // maximum position, used if pos_sorted==0 + int64_t max_apos; // maximum position, used if pos_sorted==0 int last_slice; // number of reads in last slice (0 for 1st) int multi_seq; // true if packing multi seqs per cont/slice int unsorted; // true is AP_delta is 0. 
@@ -422,14 +422,14 @@ typedef struct cram_record { int32_t flags; // BF int32_t cram_flags; // CF int32_t len; // RL - int32_t apos; // AP + int64_t apos; // AP int32_t rg; // RG int32_t name; // RN; idx to s->names_blk int32_t name_len; int32_t mate_line; // index to another cram_record int32_t mate_ref_id; - int32_t mate_pos; // NP - int32_t tlen; // TS + int64_t mate_pos; // NP + int64_t tlen; // TS // Auxiliary data int32_t ntags; // TC @@ -446,7 +446,7 @@ typedef struct cram_record { int32_t qual; // idx to s->qual_blk int32_t cigar; // idx to s->cigar int32_t ncigar; - int32_t aend; // alignment end + int64_t aend; // alignment end int32_t mqual; // MQ int32_t feature; // idx to s->feature @@ -543,7 +543,7 @@ typedef struct cram_slice { cram_block **block_by_id; /* State used during encoding/decoding */ - int last_apos, max_apos; + int64_t last_apos, max_apos; /* Array of decoded cram records */ cram_record *crecs; @@ -661,8 +661,8 @@ typedef struct cram_index { typedef struct { int refid; - int start; - int end; + int64_t start; + int64_t end; } cram_range; /*----------------------------------------------------------------------------- diff --git a/faidx.c b/faidx.c index 3c6ec5158..801129c24 100644 --- a/faidx.c +++ b/faidx.c @@ -693,7 +693,7 @@ faidx_t *fai_load_format(const char *fn, enum fai_format_options format) { static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, - uint64_t offset, int64_t beg, int64_t end, int *len) { + uint64_t offset, hts_pos_t beg, hts_pos_t end, hts_pos_t *len) { char *s; size_t l; int c = 0; @@ -739,7 +739,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } static int fai_get_val(const faidx_t *fai, const char *str, - int *len, faidx1_t *val, int64_t *fbeg, int64_t *fend) { + hts_pos_t *len, faidx1_t *val, hts_pos_t *fbeg, hts_pos_t *fend) { khiter_t iter; khash_t(s) *h; int id; @@ -770,7 +770,7 @@ static int fai_get_val(const faidx_t *fai, const char *str, } -char *fai_fetch(const faidx_t 
*fai, const char *str, int *len) +char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len) { faidx1_t val; int64_t beg, end; @@ -783,8 +783,15 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) return fai_retrieve(fai, &val, val.seq_offset, beg, end, len); } +char *fai_fetch(const faidx_t *fai, const char *str, int *len) +{ + hts_pos_t len64; + char *ret = fai_fetch64(fai, str, &len64); + *len = len64 < INT_MAX ? (int) len64 : INT_MAX; // clamp: a length above INT_MAX must not wrap into a bogus/negative int; negative error codes pass through + return ret; +} -char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { +char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) { faidx1_t val; int64_t beg, end; @@ -796,6 +803,12 @@ char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { return fai_retrieve(fai, &val, val.qual_offset, beg, end, len); } +char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { + hts_pos_t len64; + char *ret = fai_fetchqual64(fai, str, &len64); + *len = len64 < INT_MAX ? (int) len64 : INT_MAX; // clamp: a length above INT_MAX must not wrap into a bogus/negative int; negative error codes pass through + return ret; +} int faidx_fetch_nseq(const faidx_t *fai) { @@ -819,8 +832,7 @@ int faidx_seq_len(const faidx_t *fai, const char *seq) return kh_val(fai->hash, k).len; } - -static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, int *p_beg_i, int *p_end_i, int *len) { +static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, hts_pos_t *p_beg_i, hts_pos_t *p_end_i, hts_pos_t *len) { khiter_t iter; // Adjust position @@ -850,8 +862,7 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char * return 0; } - -char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; @@ -861,11 +872,18 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p } // Now retrieve the sequence - return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, (long) p_end_i +
1, len); + return fai_retrieve(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, len); } +char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +{ + hts_pos_t len64; + char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64); + *len = len64 < INT_MAX ? (int) len64 : INT_MAX; // clamp: a length above INT_MAX must not wrap into a bogus/negative int; negative error codes pass through + return ret; +} -char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; @@ -875,9 +893,16 @@ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int } // Now retrieve the sequence - return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, (long) p_end_i + 1, len); + return fai_retrieve(fai, &val, val.qual_offset, p_beg_i, p_end_i + 1, len); } +char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len) +{ + hts_pos_t len64; + char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64); + *len = len64 < INT_MAX ? (int) len64 : INT_MAX; // clamp: a length above INT_MAX must not wrap into a bogus/negative int; negative error codes pass through + return ret; +} int faidx_has_seq(const faidx_t *fai, const char *seq) { diff --git a/header.c b/header.c index cf5ca14ac..75ffb001c 100644 --- a/header.c +++ b/header.c @@ -38,6 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Hash table for removing multiple lines from the header KHASH_SET_INIT_STR(rm) +// Used for long refs in SAM files +KHASH_DECLARE(s2i, kh_cstr_t, int64_t) + typedef khash_t(rm) rmhash_t; static int sam_hdr_link_pg(sam_hdr_t *bh); @@ -140,7 +143,8 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, int nref = hrecs->nref; const char *name = NULL; const char *altnames = NULL; - int len = -1, r; + hts_pos_t len = -1; + int r; khint_t k; while (tag) { @@ -149,7 +153,7 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, name = tag->str+3; } else if (tag->str[0] == 'L' && tag->str[1] == 'N') { assert(tag->len >= 3); - len = atoi(tag->str+3); + len = strtoll(tag->str+3, NULL, 10); } else if (tag->str[0] == 'A' && tag->str[1] == 'N') { assert(tag->len >= 3); altnames = tag->str+3; @@ -180,7 +184,8 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, // Check lengths match; correct if not. if (len != hrecs->ref[nref].len) { char tmp[32]; - snprintf(tmp, sizeof(tmp), "%u", hrecs->ref[nref].len); + snprintf(tmp, sizeof(tmp), "%" PRIhts_pos, + hrecs->ref[nref].len); if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0) return -1; } @@ -921,7 +926,11 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, if (!bh->target_name[i]) return -1; } - bh->target_len[i] = hrecs->ref[i].len; + if (hrecs->ref[i].len < UINT32_MAX) { + bh->target_len[i] = hrecs->ref[i].len; + } else { + bh->target_len[i] = UINT32_MAX; + } } // Free up any names that have been removed @@ -991,7 +1000,17 @@ static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs, int r; hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]); if (!hrecs->ref[tid].name) goto fail; - hrecs->ref[tid].len = bh->target_len[tid]; + if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) { + hrecs->ref[tid].len = bh->target_len[tid]; + } else { + khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict; + k = kh_get(s2i, long_refs, hrecs->ref[tid].name); + if (k < 
kh_end(long_refs)) { + hrecs->ref[tid].len = kh_val(long_refs, k); + } else { + hrecs->ref[tid].len = UINT32_MAX; + } + } hrecs->ref[tid].ty = NULL; k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r); if (r < 0) goto fail; @@ -1038,7 +1057,7 @@ static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) { for (tid = 0; tid < hrecs->nref; tid++) { if (hrecs->ref[tid].ty == NULL) { - snprintf(len, sizeof(len), "%d", hrecs->ref[tid].len); + snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len); if (sam_hrecs_add(hrecs, "SQ", "SN", hrecs->ref[tid].name, "LN", len, NULL) != 0) @@ -1938,7 +1957,7 @@ const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) { return NULL; } -uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) { +hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) { sam_hrecs_t *hrecs; if (!h) @@ -1947,8 +1966,19 @@ uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) { if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) { return hrecs->ref[tid].len; } else { - if (tid < h->n_targets) - return h->target_len[tid]; + if (tid < h->n_targets) { + if (h->target_len[tid] < UINT32_MAX || !h->sdict) { + return h->target_len[tid]; + } else { + khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict; + khint_t k = kh_get(s2i, long_refs, h->target_name[tid]); + if (k < kh_end(long_refs)) { + return kh_val(long_refs, k); + } else { + return UINT32_MAX; + } + } + } } return 0; diff --git a/header.h b/header.h index 18c8ee89c..810a3dda1 100644 --- a/header.h +++ b/header.h @@ -122,7 +122,7 @@ typedef struct sam_hrec_type_s { /*! Parsed \@SQ lines */ typedef struct { const char *name; - uint32_t len; + hts_pos_t len; sam_hrec_type_t *ty; } sam_hrec_sq_t; diff --git a/hts.c b/hts.c index 6133f893b..ee43c5125 100644 --- a/hts.c +++ b/hts.c @@ -32,6 +32,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include #include #include @@ -1517,7 +1518,7 @@ KHASH_MAP_INIT_INT(bin, bins_t) typedef khash_t(bin) bidx_t; typedef struct { - int32_t n, m; + hts_pos_t n, m; uint64_t *offset; } lidx_t; @@ -1532,7 +1533,8 @@ struct __hts_idx_t { int tbi_n, last_tbi_tid; struct { uint32_t last_bin, save_bin; - int last_coor, last_tid, save_tid, finished; + hts_pos_t last_coor; + int last_tid, save_tid, finished; uint64_t last_off, save_off; uint64_t off_beg, off_end; uint64_t n_mapped, n_unmapped; @@ -1578,7 +1580,8 @@ static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) static inline int insert_to_l(lidx_t *l, int64_t _beg, int64_t _end, uint64_t offset, int min_shift) { - int i, beg, end; + int i; + hts_pos_t beg, end; beg = _beg >> min_shift; end = (_end - 1) >> min_shift; if (l->m < end + 1) { @@ -1732,7 +1735,7 @@ int hts_idx_finish(hts_idx_t *idx, uint64_t final_offset) return ret; } -int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped) +int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped) { int bin; int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); @@ -1773,12 +1776,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int idx->z.last_tid = tid; idx->z.last_bin = 0xffffffffu; } else if (tid >= 0 && idx->z.last_coor > beg) { // test if positions are out of order - hts_log_error("Unsorted positions on sequence #%d: %d followed by %d", tid+1, idx->z.last_coor+1, beg+1); + hts_log_error("Unsorted positions on sequence #%d: %"PRIhts_pos" followed by %"PRIhts_pos, tid+1, idx->z.last_coor+1, beg+1); return -1; } else if (end < beg) { // Malformed ranges are errors. (Empty ranges (beg==end) are unusual but acceptable.) 
- hts_log_error("Invalid record on sequence #%d: end %d < begin %d", tid+1, end, beg+1); + hts_log_error("Invalid record on sequence #%d: end %"PRId64" < begin %"PRId64, tid+1, end, beg+1); return -1; } if ( tid>=0 ) @@ -1828,14 +1831,14 @@ int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int } if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %d..%d cannot be stored in a csi index " + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a csi index " "with min_shift = %d, n_lvls = %d. Try using " "min_shift = 14, n_lvls >= %d", beg, end, idx->min_shift, idx->n_lvls, n_lvls); } else { - hts_log_error("Region %d..%d cannot be stored in a %s index. " + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a %s index. " "Try using a csi index with min_shift = 14, " "n_lvls >= %d", beg, end, idx_format_name(idx->fmt), @@ -2269,13 +2272,15 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) *** Iterator *** ****************/ +// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<= end) return 0; if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; if (itr->bins.n + n > itr->bins.m) { itr->bins.m = itr->bins.n + n; @@ -2290,7 +2295,8 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) { int l, t, s; - int b, e, i, j; + int i, j; + hts_pos_t b, e; hts_pair64_max_t *off; bidx_t *bidx; khint_t k; @@ -2396,7 +2402,7 @@ uint64_t hts_itr_off(const hts_idx_t* idx, int tid) { return off0; } -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) { int i, n_off, l, bin; hts_pair64_max_t *off; @@ -2527,7 +2533,8 @@ int 
hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) khint_t k; bidx_t *bidx; uint64_t min_off, max_off, t_off = (uint64_t)-1; - int tid, beg, end; + int tid; + hts_pos_t beg, end; hts_reglist_t *curr_reg; if (!idx || !iter || !iter->multi) @@ -2646,7 +2653,8 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; - int tid, beg, end, i, j, l, n_off = 0; + int tid, i, j, l, n_off = 0; + hts_pos_t beg, end; hts_reglist_t *curr_reg; hts_pair32_t *curr_intv; hts_pair64_max_t *off = NULL; @@ -2700,10 +2708,10 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) off[n_off].max = (uint64_t)tid<<32 | end; n_off++; } else { - hts_log_warning("Could not set offset end for region %d:%d-%d. Skipping", tid, beg, end); + hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end); } } else { - hts_log_warning("No index entry for region %d:%d-%d", tid, beg, end); + hts_log_warning("No index entry for region %d:%"PRIhts_pos"-%"PRIhts_pos"", tid, beg, end); } } } else { @@ -2957,7 +2965,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // No colon is simplest case; just check and return. if (colon == NULL) { - *beg = 0; *end = INT64_MAX; + *beg = 0; *end = HTS_POS_MAX; kputsn(s, s_len-quoted, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -2972,7 +2980,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Has a colon, but check whole name first. 
if (!quoted) { - *beg = 0; *end = INT64_MAX; + *beg = 0; *end = HTS_POS_MAX; kputsn(s, s_len, &ks); // convert to nul terminated string if (!ks.s) { *tid = -2; @@ -3023,7 +3031,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end if (*beg < 0) { if (isdigit(*hyphen) || *hyphen == '\0' || *hyphen == ',') { // interpret chr:-100 as chr:1-100 - *end = *beg==-1 ? INT64_MAX : -(*beg+1); + *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1); *beg = 0; return s_end; } else if (*hyphen == '-') { @@ -3035,7 +3043,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end } if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) { - *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : INT64_MAX; + *end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX; } else if (*hyphen == '-') { *end = hts_parse_decimal(hyphen+1, &hyphen, flags); if (*hyphen != '\0' && *hyphen != ',') { @@ -3048,7 +3056,7 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end } if (*end == 0) - *end = INT64_MAX; // interpret chr:100- as chr:100- + *end = HTS_POS_MAX; // interpret chr:100- as chr:100- if (*beg >= *end) return NULL; @@ -3057,19 +3065,19 @@ const char *hts_parse_region(const char *s, int *tid, int64_t *beg, int64_t *end // Next release we should mark this as deprecated? // Use hts_parse_region above instead. 
-const char *hts_parse_reg(const char *s, int *beg, int *end) +const char *hts_parse_reg64(const char *s, hts_pos_t *beg, hts_pos_t *end) { char *hyphen; const char *colon = strrchr(s, ':'); if (colon == NULL) { - *beg = 0; *end = INT_MAX; + *beg = 0; *end = HTS_POS_MAX; return s + strlen(s); } *beg = hts_parse_decimal(colon+1, &hyphen, HTS_PARSE_THOUSANDS_SEP) - 1; if (*beg < 0) *beg = 0; - if (*hyphen == '\0') *end = INT_MAX; + if (*hyphen == '\0') *end = HTS_POS_MAX; else if (*hyphen == '-') *end = hts_parse_decimal(hyphen+1, NULL, HTS_PARSE_THOUSANDS_SEP); else return NULL; @@ -3077,10 +3085,31 @@ const char *hts_parse_reg(const char *s, int *beg, int *end) return colon; } +const char *hts_parse_reg(const char *s, int *beg, int *end) +{ + int64_t beg64 = 0, end64 = 0; + const char *colon = hts_parse_reg64(s, &beg64, &end64); + if (beg64 > INT_MAX) { + hts_log_error("Position %"PRId64" too large", beg64); + return NULL; + } + if (end64 > INT_MAX) { + if (end64 == HTS_POS_MAX) { + end64 = INT_MAX; + } else { + hts_log_error("Position %"PRId64" too large", end64); + return NULL; + } + } + *beg = beg64; + *end = end64; + return colon; +} + hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec) { int tid; - int64_t beg, end; + hts_pos_t beg, end; if (strcmp(reg, ".") == 0) return itr_query(idx, HTS_IDX_START, 0, 0, readrec); @@ -3090,8 +3119,6 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP)) return NULL; - if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant - return itr_query(idx, tid, beg, end, readrec); } @@ -3152,7 +3179,8 @@ hts_itr_t *hts_itr_regions(const hts_idx_t *idx, hts_reglist_t *reglist, int cou int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) { - int ret, tid, beg, end; + int ret, tid; + hts_pos_t beg, 
end; if (iter == NULL || iter->finished) return -1; if (iter->read_rest) { if (iter->curr_off) { // seek to the start @@ -3196,7 +3224,8 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) int hts_itr_multi_next(htsFile *fd, hts_itr_t *iter, void *r) { void *fp; - int ret, tid, beg, end, i, cr, ci; + int ret, tid, i, cr, ci; + hts_pos_t beg, end; hts_reglist_t *found_reg; if (iter == NULL || iter->finished) return -1; diff --git a/hts_internal.h b/hts_internal.h index c4f6b611d..36a7fba69 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -91,7 +91,7 @@ void close_plugin(void *plugin); * Returns 0 on success, * -1 on failure */ -int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, int beg, int end, uint64_t offset, int is_mapped); +int bgzf_idx_push(BGZF *fp, hts_idx_t *hidx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped); #ifdef __cplusplus } diff --git a/htslib/faidx.h b/htslib/faidx.h index 48ae040e9..cf0e4e0b3 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -30,7 +30,7 @@ #define HTSLIB_FAIDX_H #include -#include "hts_defs.h" +#include "hts.h" #ifdef __cplusplus extern "C" { @@ -170,6 +170,7 @@ are reference names, quote using curly braces. Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. */ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); +char *fai_fetch64(const faidx_t *fai, const char *reg, hts_pos_t *len); /// Fetch the quality string for a region for FASTQ files /** @param fai Pointer to the faidx_t struct @@ -183,6 +184,7 @@ destroyed by end users by calling `free()` on it. Region names can be quoted with curly braces, as for fai_fetch(). */ char *fai_fetchqual(const faidx_t *fai, const char *reg, int *len); +char *fai_fetchqual64(const faidx_t *fai, const char *reg, hts_pos_t *len); /// Fetch the number of sequences /** @param fai Pointer to the faidx_t struct @@ -203,6 +205,19 @@ by end users by calling `free()` on it. 
*/ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +/// Fetch the sequence in a region +/** @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region; -2 if c_name not present, -1 general error + @return Pointer to the sequence; null on failure + +The returned sequence is allocated by `malloc()` family and should be destroyed +by end users by calling `free()` on it. +*/ +char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); + /// Fetch the quality string in a region for FASTQ files /** @param fai Pointer to the faidx_t struct @param c_name Region name @@ -216,6 +231,19 @@ by end users by calling `free()` on it. */ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +/// Fetch the quality string in a region for FASTQ files +/** @param fai Pointer to the faidx_t struct + @param c_name Region name + @param p_beg_i Beginning position number (zero-based) + @param p_end_i End position number (zero-based) + @param len Length of the region; -2 if c_name not present, -1 general error + @return Pointer to the sequence; null on failure + +The returned sequence is allocated by `malloc()` family and should be destroyed +by end users by calling `free()` on it. +*/ +char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len); + /// Query if sequence is present /** @param fai Pointer to the faidx_t struct @param seq Sequence name diff --git a/htslib/hts.h b/htslib/hts.h index 85847a8fe..6c430eeb0 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -30,8 +30,8 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include +#include -#include "hts_defs.h" #include "hts_log.h" #ifdef __cplusplus @@ -577,9 +577,26 @@ When REST or NONE is used, idx is also ignored and may be NULL. #define HTS_FMT_TBI 2 #define HTS_FMT_CRAI 3 +// Almost INT64_MAX, but when cast into a 32-bit int it's +// also INT_MAX instead of -1. This avoids bugs with old code +// using the new hts_pos_t data type. +#define HTS_POS_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) +#define HTS_POS_MIN INT64_MIN +#define PRIhts_pos PRId64 +typedef int64_t hts_pos_t; + +// For comparison with previous release: +// +// #define HTS_POS_MAX INT_MAX +// #define HTS_POS_MIN INT_MIN +// #define PRIhts_pos PRId32 +// typedef int32_t hts_pos_t; + typedef struct { - uint32_t beg, end; -} hts_pair32_t; + hts_pos_t beg, end; +} hts_pair_pos_t; + +typedef hts_pair_pos_t hts_pair32_t; // For backwards compatibility typedef struct { uint64_t u, v; @@ -592,21 +609,23 @@ typedef struct { typedef struct { const char *reg; - hts_pair32_t *intervals; + hts_pair_pos_t *intervals; int tid; uint32_t count; - uint32_t min_beg, max_end; + hts_pos_t min_beg, max_end; } hts_reglist_t; -typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, int *beg, int *end); +typedef int hts_readrec_func(BGZF *fp, void *data, void *r, int *tid, hts_pos_t *beg, hts_pos_t *end); typedef int hts_seek_func(void *fp, int64_t offset, int where); typedef int64_t hts_tell_func(void *fp); typedef struct { uint32_t read_rest:1, finished:1, is_cram:1, nocoor:1, multi:1, dummy:27; - int tid, beg, end, n_off, i, n_reg; + int tid, n_off, i, n_reg; + hts_pos_t beg, end; hts_reglist_t *reg_list; - int curr_tid, curr_beg, curr_end, curr_reg, curr_intv; + int curr_tid, curr_reg, curr_intv; + hts_pos_t curr_beg, curr_end; uint64_t curr_off, nocoor_off; hts_pair64_max_t *off; hts_readrec_func *readrec; @@ -658,7 +677,7 @@ void hts_idx_destroy(hts_idx_t *idx); The @p is_mapped parameter is used to update the n_mapped / n_unmapped counts stored in the 
meta-data bin. */ -int hts_idx_push(hts_idx_t *idx, int tid, int beg, int end, uint64_t offset, int is_mapped); +int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t offset, int is_mapped); /// Finish building an index /** @param idx Index @@ -844,6 +863,9 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); */ long long hts_parse_decimal(const char *str, char **strend, int flags); +typedef int (*hts_name2id_f)(void*, const char*); +typedef const char *(*hts_id2name_f)(void*, int); + /// Parse a "CHR:START-END"-style region string /** @param str String to be parsed @param beg Set on return to the 0-based start of the region @@ -851,10 +873,15 @@ long long hts_parse_decimal(const char *str, char **strend, int flags); @return Pointer to the colon or '\0' after the reference sequence name, or NULL if @a str could not be parsed. */ +const char *hts_parse_reg64(const char *str, hts_pos_t *beg, hts_pos_t *end); -typedef int (*hts_name2id_f)(void*, const char*); -typedef const char *(*hts_id2name_f)(void*, int); - +/// Parse a "CHR:START-END"-style region string +/** @param str String to be parsed + @param beg Set on return to the 0-based start of the region + @param end Set on return to the 1-based end of the region + @return Pointer to the colon or '\0' after the reference sequence name, + or NULL if @a str could not be parsed. 
+*/ const char *hts_parse_reg(const char *str, int *beg, int *end); /// Parse a "CHR:START-END"-style region string @@ -940,14 +967,14 @@ const char *hts_parse_region(const char *str, int *tid, int64_t *beg, int64_t *e @param readrec Callback to read a record from the input file @return An iterator on success; NULL on failure */ -hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); +hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec); /// Free an iterator /** @param iter Iterator to free */ void hts_itr_destroy(hts_itr_t *iter); -typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec); +typedef hts_itr_t *hts_itr_query_func(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec); /// Create a single-region iterator from a text region specification /** @param idx Index @@ -1148,7 +1175,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu void hts_md5_destroy(hts_md5_context *ctx); -static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls) +static inline int hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n_lvls) { int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7; for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l)) diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h index 3bf4a4630..ec02b081a 100644 --- a/htslib/hts_defs.h +++ b/htslib/hts_defs.h @@ -25,6 +25,8 @@ DEALINGS IN THE SOFTWARE. 
*/ #ifndef HTSLIB_HTS_DEFS_H #define HTSLIB_HTS_DEFS_H +#include // For __MINGW_PRINTF_FORMAT macro + #ifdef __clang__ #ifdef __has_attribute #define HTS_COMPILER_HAS(attribute) __has_attribute(attribute) diff --git a/htslib/kstring.h b/htslib/kstring.h index c440cd5e9..f817ed5dc 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -354,11 +354,11 @@ static inline int kputw(int c, kstring_t *s) return kputuw(x, s); } -static inline int kputl(long c, kstring_t *s) +static inline int kputll(long long c, kstring_t *s) { char buf[32]; int i, l = 0; - unsigned long x = c; + unsigned long long x = c; if (c < 0) x = -x; do { buf[l++] = x%10 + '0'; x /= 10; } while (x > 0); if (c < 0) buf[l++] = '-'; @@ -369,6 +369,10 @@ static inline int kputl(long c, kstring_t *s) return 0; } +static inline int kputl(long c, kstring_t *s) { + return kputll(c, s); +} + /* * Returns 's' split by delimiter, with *n being the number of components; * NULL on failue. diff --git a/htslib/regidx.h b/htslib/regidx.h index f2e0e00da..7ac2d3a7f 100644 --- a/htslib/regidx.h +++ b/htslib/regidx.h @@ -55,6 +55,7 @@ #include #include +#include "hts.h" #ifdef __cplusplus extern "C" { @@ -63,7 +64,7 @@ extern "C" { typedef struct _regidx_t regidx_t; typedef struct { - uint32_t start, end; + hts_pos_t start, end; } reg_t; typedef struct @@ -125,7 +126,7 @@ void regidx_destroy(regidx_t *idx); * Returns 0 if there is no overlap or 1 if overlap is found. The overlapping * regions can be iterated as shown in the example above. */ -int regidx_overlap(regidx_t *idx, const char *chr, uint32_t start, uint32_t end, regitr_t *itr); +int regidx_overlap(regidx_t *idx, const char *chr, hts_pos_t start, hts_pos_t end, regitr_t *itr); /* * regidx_insert() - add a new region. 
diff --git a/htslib/sam.h b/htslib/sam.h index 8e1615e61..39bd80cdf 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -74,7 +74,7 @@ typedef struct sam_hdr_t { const int8_t *cigar_tab HTS_DEPRECATED("Use bam_cigar_table[] instead"); char **target_name; char *text; - void *sdict HTS_DEPRECATED("Unused since 1.10"); + void *sdict; sam_hrecs_t *hrecs; uint32_t ref_count; } sam_hdr_t; @@ -169,24 +169,41 @@ extern const int8_t bam_cigar_table[256]; *** Alignment records *** *************************/ +/* + * Assumptions made here. While pos can be 64-bit, no sequence + * itself is that long, but due to ref skip CIGAR fields it + * may span more than that. (CIGAR itself is 28-bit len + 4 bit + * type, but in theory we can combine multiples together.) + * + * Mate position and insert size also need to be 64-bit, but + * we won't accept more than 32-bit for tid. + * + * The bam_core_t structure is the *in memory* layout and not + * the same as the on-disk format. 64-bit changes here permit + * SAM to work with very long chromosomes and permit BAM and CRAM + * to seamlessly update in the future without further API/ABI + * revisions. + */ + /*! @typedef @abstract Structure for core alignment information. 
- @field tid chromosome ID, defined by sam_hdr_t @field pos 0-based leftmost coordinate + @field tid chromosome ID, defined by sam_hdr_t @field bin bin calculated by bam_reg2bin() @field qual mapping quality - @field l_qname length of the query name - @field flag bitwise flag @field l_extranul length of extra NULs between qname & cigar (for alignment) + @field flag bitwise flag + @field l_qname length of the query name @field n_cigar number of CIGAR operations @field l_qseq length of the query sequence (read) @field mtid chromosome ID of next read in template, defined by sam_hdr_t @field mpos 0-based leftmost coordinate of next read in template + @field isize observed template length ("insert size") */ typedef struct { + hts_pos_t pos; int32_t tid; - int32_t pos; - uint16_t bin; + uint16_t bin; // NB: invalid on 64-bit pos uint8_t qual; uint8_t l_extranul; uint16_t flag; @@ -194,16 +211,17 @@ typedef struct { uint32_t n_cigar; int32_t l_qseq; int32_t mtid; - int32_t mpos; - int32_t isize; + hts_pos_t mpos; + hts_pos_t isize; } bam1_core_t; /*! @typedef @abstract Structure for one alignment. @field core core information about the alignment + @field id + @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux @field l_data current length of bam1_t::data @field m_data maximum length of bam1_t::data - @field data all variable-length data, concatenated; structure: qname-cigar-seq-qual-aux @field mempolicy memory handling policy, see bam_set_mempolicy() @discussion Notes: @@ -223,9 +241,9 @@ typedef struct { */ typedef struct { bam1_core_t core; - int l_data; - uint8_t *data; uint64_t id; + uint8_t *data; + int l_data; uint32_t m_data; uint32_t mempolicy:2, :30 /* Reserved */; } bam1_t; @@ -688,7 +706,7 @@ const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid); * Fetch the reference sequence length from the target length array, * using the numerical target id. 
*/ -uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid); +hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid); /// Alias of sam_hdr_name2tid(), for backwards compatibility. /*! @@ -946,7 +964,7 @@ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar); operations in @p cigar (these are the operations that "consume" reference bases). All other operations (including invalid ones) are ignored. */ -int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); +hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar); /*! @abstract Calculate the rightmost base position of an alignment on the @@ -959,7 +977,7 @@ int bam_cigar2rlen(int n_cigar, const uint32_t *cigar); For an unmapped read (either according to its flags or if it has no cigar string), we return b->core.pos + 1 by convention. */ -int32_t bam_endpos(const bam1_t *b); +hts_pos_t bam_endpos(const bam1_t *b); int bam_str2flag(const char *str); /** returns negative value on error */ char *bam_flag2str(int flag); /** The string must be freed by the user */ @@ -1084,7 +1102,7 @@ When using one of these values, @p beg and @p end are ignored. When using HTS_IDX_REST or HTS_IDX_NONE, NULL can be passed in to @p idx. 
*/ -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end); +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end); /// Create a SAM/BAM/CRAM iterator /** @param idx Index @@ -1485,6 +1503,8 @@ typedef struct __bam_mplp_t *bam_mplp_t; int bam_plp_push(bam_plp_t iter, const bam1_t *b); const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp); + const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp); void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt); void bam_plp_reset(bam_plp_t iter); @@ -1533,6 +1553,7 @@ typedef struct __bam_mplp_t *bam_mplp_t; void bam_mplp_destroy(bam_mplp_t iter); void bam_mplp_set_maxcnt(bam_mplp_t iter, int maxcnt); int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp); + int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp); void bam_mplp_reset(bam_mplp_t iter); void bam_mplp_constructor(bam_mplp_t iter, int (*func)(void *data, const bam1_t *b, bam_pileup_cd *cd)); @@ -1546,7 +1567,7 @@ typedef struct __bam_mplp_t *bam_mplp_t; * BAQ calculation and realignment * ***********************************/ -int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres); +int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres); /// Calculate BAQ scores /** @param b BAM record @@ -1588,7 +1609,7 @@ Depending on what previous processing happened, this may or may not be the correct thing to do. It would be wise to avoid this situation if possible. 
*/ -int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag); +int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag); #ifdef __cplusplus } diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index c047b29fa..b9585f1af 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -125,8 +125,9 @@ typedef struct _bcf_sr_regions_t char **seq_names; // sequence names int nseqs; // number of sequences (chromosomes) in the file int iseq; // current position: chr name, index to snames - int start, end; // current position: start, end of the region (0-based) - int prev_seq, prev_start; + hts_pos_t start, end; // current position: start, end of the region (0-based) + int prev_seq; + hts_pos_t prev_start; } bcf_sr_regions_t; @@ -241,7 +242,7 @@ int bcf_sr_next_line(bcf_srs_t *readers); * @seq: sequence name; NULL to seek to start * @pos: 0-based coordinate */ -int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos); +int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos); /** * bcf_sr_set_samples() - sets active samples @@ -336,7 +337,7 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg); * regions and more regions exist; -2 if not in the regions and there are no more * regions left. 
*/ -int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end); +int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end); /* * bcf_sr_regions_flush() - calls repeatedly regs->missed_reg_handler() until diff --git a/htslib/tbx.h b/htslib/tbx.h index 9119ab8a3..1180d575b 100644 --- a/htslib/tbx.h +++ b/htslib/tbx.h @@ -64,7 +64,7 @@ extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sa /* Internal helper function used by tbx_itr_next() */ BGZF *hts_get_bgzfp(htsFile *fp); - int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end); + int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end); tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf); /* diff --git a/htslib/vcf.h b/htslib/vcf.h index 31720d7f1..ccdf701ed 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -61,6 +61,7 @@ extern "C" { #define BCF_HT_INT 1 #define BCF_HT_REAL 2 #define BCF_HT_STR 3 +#define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! #define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 @@ -94,7 +95,7 @@ typedef struct { } bcf_hrec_t; typedef struct { - uint32_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2] + uint64_t info[3]; // stores Number:20, var:4, Type:4, ColType:4 in info[0..2] // for BCF_HL_FLT,INFO,FMT and contig length in info[0] for BCF_HL_CTG bcf_hrec_t *hrec[3]; int id; @@ -130,6 +131,7 @@ extern uint8_t bcf_type_shift[]; #define BCF_BT_INT8 1 #define BCF_BT_INT16 2 #define BCF_BT_INT32 3 +#define BCF_BT_INT64 4 // Unofficial, for internal use only. 
#define BCF_BT_FLOAT 5 #define BCF_BT_CHAR 7 @@ -155,9 +157,9 @@ typedef struct { typedef struct { int key; // key: numeric tag id, the corresponding string is bcf_hdr_t::id[BCF_DT_ID][$key].key - int type, len; // type: one of BCF_BT_* types; len: vector length, 1 for scalars + int type; // type: one of BCF_BT_* types union { - int32_t i; // integer value + int64_t i; // integer value float f; // float value } v1; // only set if $len==1; for easier access uint8_t *vptr; // pointer to data array in bcf1_t->shared.s, excluding the size+type and tag id bytes @@ -165,6 +167,7 @@ typedef struct { uint32_t vptr_off:31, // vptr offset, i.e., the size of the INFO key plus size+type bytes vptr_free:1; // indicates that vptr-vptr_off must be freed; set only when modified and the new // data block is bigger than the original + int len; // vector length, 1 for scalars } bcf_info_t; @@ -208,9 +211,9 @@ typedef struct { line must be formatted in vcf_format. */ typedef struct { + hts_pos_t pos; // POS + hts_pos_t rlen; // length of REF int32_t rid; // CHROM - int32_t pos; // POS - int32_t rlen; // length of REF float qual; // QUAL uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; @@ -427,7 +430,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) HTS_RESULT_USED; /** Helper function for the bcf_itr_next() macro; internal use, ignore it */ - int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, int *beg, int *end); + int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, hts_pos_t *beg, hts_pos_t *end); @@ -666,18 +669,18 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
int bcf_update_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); int bcf_add_id(const bcf_hdr_t *hdr, bcf1_t *line, const char *id); - /* + /** * bcf_update_info_*() - functions for updating INFO fields - * @hdr: the BCF header - * @line: VCF line to be edited - * @key: the INFO tag to be updated - * @values: pointer to the array of values. Pass NULL to remove the tag. - * @n: number of values in the array. When set to 0, the INFO tag is removed + * @param hdr: the BCF header + * @param line: VCF line to be edited + * @param key: the INFO tag to be updated + * @param values: pointer to the array of values. Pass NULL to remove the tag. + * @param n: number of values in the array. When set to 0, the INFO tag is removed + * @return 0 on success or negative value on error. * - * The @string in bcf_update_info_flag() is optional, @n indicates whether - * the flag is set or removed. + * The @p string in bcf_update_info_flag() is optional, + * @p n indicates whether the flag is set or removed. * - * Returns 0 on success or negative value on error. */ #define bcf_update_info_int32(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_INT) #define bcf_update_info_float(hdr,line,key,values,n) bcf_update_info((hdr),(line),(key),(values),(n),BCF_HT_REAL) @@ -685,6 +688,29 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define bcf_update_info_string(hdr,line,key,string) bcf_update_info((hdr),(line),(key),(string),1,BCF_HT_STR) int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type); + /// Set or update 64-bit integer INFO values + /** + * @param hdr: the BCF header + * @param line: VCF line to be edited + * @param key: the INFO tag to be updated + * @param values: pointer to the array of values. Pass NULL to remove the tag. + * @param n: number of values in the array. When set to 0, the INFO tag is removed + * @return 0 on success or negative value on error. 
+ * + * This function takes an int64_t values array as input. The data + * actually stored will be shrunk to the minimum size that can + * accept all of the values. + * + * INFO values outside of the range BCF_MIN_BT_INT32 to BCF_MAX_BT_INT32 + * can only be written to VCF files. + */ + static inline int bcf_update_info_int64(const bcf_hdr_t *hdr, bcf1_t *line, + const char *key, + const int64_t *values, int n) + { + return bcf_update_info(hdr, line, key, values, n, BCF_HT_LONG); + } + /* * bcf_update_format_*() - functions for updating FORMAT fields * @values: pointer to the array of values, the same number of elements @@ -752,22 +778,25 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). /** * bcf_get_info_*() - get INFO values, integers or floats - * @hdr: BCF header - * @line: BCF record - * @tag: INFO tag to retrieve - * @dst: *dst is pointer to a memory location, can point to NULL - * @ndst: pointer to the size of allocated memory + * @param hdr: BCF header + * @param line: BCF record + * @param tag: INFO tag to retrieve + * @param dst: *dst is pointer to a memory location, can point to NULL + * @param ndst: pointer to the size of allocated memory + * @return >=0 on success + * -1 .. no such INFO tag defined in the header + * -2 .. clash between types defined in the header and encountered in the VCF record + * -3 .. tag is not present in the VCF record + * -4 .. the operation could not be completed (e.g. out of memory) * - * Returns negative value on error or the number of written values - * (including missing values) on success. bcf_get_info_string() returns - * on success the number of characters written excluding the null- - * terminating byte. bcf_get_info_flag() returns 1 when flag is set or 0 - * if not. + * Returns negative value on error or the number of values (including + * missing values) put in *dst on success. 
bcf_get_info_string() returns + * on success the number of characters stored excluding the nul- + * terminating byte. bcf_get_info_flag() does not store anything in *dst + * but returns 1 if the flag is set or 0 if not. * - * List of return codes: - * -1 .. no such INFO tag defined in the header - * -2 .. clash between types defined in the header and encountered in the VCF record - * -3 .. tag is not present in the VCF record + * *dst will be reallocated if it is not big enough (i.e. *ndst is too + * small) or NULL on entry. The new size will be stored in *ndst. */ #define bcf_get_info_int32(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_INT) #define bcf_get_info_float(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_REAL) @@ -775,6 +804,33 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define bcf_get_info_flag(hdr,line,tag,dst,ndst) bcf_get_info_values(hdr,line,tag,(void**)(dst),ndst,BCF_HT_FLAG) int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, void **dst, int *ndst, int type); + /// Put integer INFO values into an int64_t array + /** + * @param hdr: BCF header + * @param line: BCF record + * @param tag: INFO tag to retrieve + * @param dst: *dst is pointer to a memory location, can point to NULL + * @param ndst: pointer to the size of allocated memory + * @return >=0 on success + * -1 .. no such INFO tag defined in the header + * -2 .. clash between types defined in the header and encountered in the VCF record + * -3 .. tag is not present in the VCF record + * -4 .. the operation could not be completed (e.g. out of memory) + * + * Returns negative value on error or the number of values (including + * missing values) put in *dst on success. + * + * *dst will be reallocated if it is not big enough (i.e. *ndst is too + * small) or NULL on entry. The new size will be stored in *ndst. 
+ */ + static inline int bcf_get_info_int64(const bcf_hdr_t *hdr, bcf1_t *line, + const char *tag, int64_t **dst, + int *ndst) + { + return bcf_get_info_values(hdr, line, tag, + (void **) dst, ndst, BCF_HT_LONG); + } + /** * bcf_get_format_*() - same as bcf_get_info*() above * @@ -876,8 +932,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). */ #define bcf_hdr_id2length(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>8 & 0xf) #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12) - #define bcf_hdr_id2type(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) - #define bcf_hdr_id2coltype(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) + #define bcf_hdr_id2type(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) + #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id<0 || bcf_hdr_id2coltype(hdr,type,int_id)==0xf) ? 0 : 1) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) /// Convert BCF FORMAT data to string form @@ -1067,10 +1123,12 @@ which works for both BCF and VCF. #define bcf_int8_vector_end (-127) /* INT8_MIN + 1 */ #define bcf_int16_vector_end (-32767) /* INT16_MIN + 1 */ #define bcf_int32_vector_end (-2147483647) /* INT32_MIN + 1 */ +#define bcf_int64_vector_end (-9223372036854775807LL) /* INT64_MIN + 1 */ #define bcf_str_vector_end 0 #define bcf_int8_missing (-128) /* INT8_MIN */ #define bcf_int16_missing (-32767-1) /* INT16_MIN */ #define bcf_int32_missing (-2147483647-1) /* INT32_MIN */ +#define bcf_int64_missing (-9223372036854775807LL - 1LL) /* INT64_MIN */ #define bcf_str_missing 0x07 // Limits on BCF values stored in given types. 
Max values are the same @@ -1200,7 +1258,7 @@ Cautious callers can detect invalid type codes by checking that *q has actually been updated. */ -static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) +static inline int64_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) { if (type == BCF_BT_INT8) { *q = (uint8_t*)p + 1; @@ -1211,6 +1269,9 @@ static inline int32_t bcf_dec_int1(const uint8_t *p, int type, uint8_t **q) } else if (type == BCF_BT_INT32) { *q = (uint8_t*)p + 4; return le_to_i32(p); + } else if (type == BCF_BT_INT64) { + *q = (uint8_t*)p + 8; + return le_to_i64(p); } else { // Invalid type. return 0; } @@ -1232,7 +1293,7 @@ the integer value. Cautious callers can detect invalid type codes by checking that *q has actually been updated. */ -static inline int32_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) +static inline int64_t bcf_dec_typed_int1(const uint8_t *p, uint8_t **q) { return bcf_dec_int1(p + 1, *p&0xf, q); } diff --git a/htslib_vars.mk b/htslib_vars.mk index 9b2ae8a10..0db722b0e 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -43,7 +43,7 @@ htslib_knetfile_h = $(HTSPREFIX)htslib/knetfile.h htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h -htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h +htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h) htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) htslib_synced_bcf_reader_h = $(HTSPREFIX)htslib/synced_bcf_reader.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_tbx_h) htslib_tbx_h = $(HTSPREFIX)htslib/tbx.h $(htslib_hts_h) diff --git a/realn.c b/realn.c index bc21f8083..78da5df16 100644 --- a/realn.c +++ b/realn.c @@ -35,12 +35,13 @@ DEALINGS IN THE SOFTWARE.
*/ #include "htslib/hts.h" #include "htslib/sam.h" -int sam_cap_mapq(bam1_t *b, const char *ref, int ref_len, int thres) +int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres) { uint8_t *seq = bam_get_seq(b), *qual = bam_get_qual(b); uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; - int i, x, y, mm, q, len, clip_l, clip_q; + int i, y, mm, q, len, clip_l, clip_q; + hts_pos_t x; double t; if (thres < 0) thres = 40; // set the default mm = q = len = clip_l = clip_q = 0; @@ -101,9 +102,10 @@ static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity, return 0; } -int sam_prob_realn(bam1_t *b, const char *ref, int ref_len, int flag) +int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) { - int k, i, bw, x, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; + int k, bw, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; + hts_pos_t i, x; uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; probaln_par_t conf = { 0.001, 0.1, 10 }; diff --git a/regidx.c b/regidx.c index c1d177d6e..10b5cccf8 100644 --- a/regidx.c +++ b/regidx.c @@ -57,7 +57,8 @@ struct _regidx_t // temporary data for index initialization kstring_t str; - int rid_prev, start_prev, end_prev; + int rid_prev; + hts_pos_t start_prev, end_prev; int payload_size; void *payload; }; @@ -155,7 +156,7 @@ int regidx_insert(regidx_t *idx, char *line) { if ( idx->start_prev > reg.start || (idx->start_prev==reg.start && idx->end_prev>reg.end) ) { - hts_log_error("The regions are not sorted: %s:%d-%d is before %s:%d-%d", + hts_log_error("The regions are not sorted: %s:%"PRIhts_pos"-%"PRIhts_pos" is before %s:%"PRIhts_pos"-%"PRIhts_pos, idx->str.s,idx->start_prev+1,idx->end_prev+1,idx->str.s,reg.start+1,reg.end+1); return -1; } @@ -243,7 +244,7 @@ void regidx_destroy(regidx_t *idx) free(idx); } -int regidx_overlap(regidx_t *idx, const char *chr, uint32_t 
from, uint32_t to, regitr_t *itr) +int regidx_overlap(regidx_t *idx, const char *chr, hts_pos_t from, hts_pos_t to, regitr_t *itr) { if ( itr ) itr->i = itr->n = 0; diff --git a/region.c b/region.c index d9679f79f..4b5dd4cfc 100644 --- a/region.c +++ b/region.c @@ -30,17 +30,21 @@ DEALINGS IN THE SOFTWARE. */ typedef struct reglist { uint32_t n, m; - uint64_t *a; + hts_pair_pos_t *a; int tid; } reglist_t; KHASH_MAP_INIT_INT(reg, reglist_t) typedef kh_reg_t reghash_t; -static int compare_uint64 (const void * a, const void * b) +static int compare_hts_pair_pos_t (const void *av, const void *bv) { - if (*(uint64_t *)a < *(uint64_t *)b) return -1; - if (*(uint64_t *)a > *(uint64_t *)b) return 1; + hts_pair_pos_t *a = (hts_pair_pos_t *) av; + hts_pair_pos_t *b = (hts_pair_pos_t *) bv; + if (a->beg < b->beg) return -1; + if (a->beg > b->beg) return 1; + if (a->end < b->end) return -1; + if (a->end > b->end) return 1; return 0; } @@ -54,7 +58,6 @@ static void reg_print(reghash_t *h) { khint_t k; uint32_t i; khint32_t key; - uint32_t beg, end; if (!h) { fprintf(stderr, "Hash table is empty!\n"); @@ -66,9 +69,8 @@ static void reg_print(reghash_t *h) { fprintf(stderr, "Region: key %u tid %d\n", key, p->tid); if ((p = &kh_val(h,k)) != NULL && p->n > 0) { for (i=0; in; i++) { - beg = (uint32_t)(p->a[i]>>32); - end = (uint32_t)(p->a[i]); - fprintf(stderr, "\tinterval[%d]: %d-%d\n", i, beg, end); + fprintf(stderr, "\tinterval[%d]: %"PRIhts_pos"-%"PRIhts_pos"\n", i, + p->a[i].beg, p->a[i].end); } } else { fprintf(stderr, "Region key %u has no intervals!\n", key); @@ -94,23 +96,30 @@ static int reg_compact(reghash_t *h) { if (!kh_exist(h,i) || !(p = &kh_val(h,i)) || !(p->n)) continue; - qsort(p->a, p->n, sizeof(uint64_t), compare_uint64); + qsort(p->a, p->n, sizeof(p->a[0]), compare_hts_pair_pos_t); for (new_n = 0, j = 1; j < p->n; j++) { - if ((uint32_t)p->a[new_n] < (uint32_t)(p->a[j]>>32)) { - p->a[++new_n] = p->a[j]; + if (p->a[new_n].end < p->a[j].beg) { + p->a[++new_n].beg = 
p->a[j].beg; + p->a[new_n].end = p->a[j].end; } else { - if ((uint32_t)p->a[new_n] < (uint32_t)p->a[j]) - p->a[new_n] = (p->a[new_n] & 0xFFFFFFFF00000000) | (uint32_t)(p->a[j]); + if (p->a[new_n].end < p->a[j].end) + p->a[new_n].end = p->a[j].end; } } - p->n = ++new_n; + ++new_n; + if (p->n > new_n) { + // Shrink array to required size. + hts_pair_pos_t *new_a = realloc(p->a, new_n * sizeof(p->a[0])); + if (new_a) p->a = new_a; + } + p->n = new_n; count++; } return count; } -static int reg_insert(reghash_t *h, int tid, unsigned int beg, unsigned int end) { +static int reg_insert(reghash_t *h, int tid, hts_pos_t beg, hts_pos_t end) { khint_t k; reglist_t *p; @@ -135,12 +144,13 @@ static int reg_insert(reghash_t *h, int tid, unsigned int beg, unsigned int end) if (p->n == p->m) { uint32_t new_m = p->m ? p->m<<1 : 4; if (new_m == 0) return -1; - uint64_t *new_a = realloc(p->a, new_m * sizeof(uint64_t)); + hts_pair_pos_t *new_a = realloc(p->a, new_m * sizeof(p->a[0])); if (new_a == NULL) return -1; p->m = new_m; p->a = new_a; } - p->a[p->n++] = (uint64_t)beg<<32 | end; + p->a[p->n].beg = beg; + p->a[p->n++].end = end; return 0; } @@ -174,9 +184,8 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr khint_t k; int i, l_count = 0, tid; - uint32_t j; const char *q; - int64_t beg, end; + hts_pos_t beg, end; /* First, transform the char array into a hash table */ h = kh_init(reg); @@ -207,9 +216,6 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr } } - if (beg > INT_MAX) beg = INT_MAX; // Remove when fully 64-bit compliant - if (end > INT_MAX) end = INT_MAX; // Remove when fully 64-bit compliant - if (reg_insert(h, tid, beg, end) != 0) { hts_log_error("Error when inserting region='%s' in the bed hash table at address=%p", argv[i], (void *) h); goto fail; @@ -230,21 +236,19 @@ hts_reglist_t *hts_reglist_create(char **argv, int argc, int *r_count, void *hdr continue; h_reglist[l_count].tid = p->tid; - 
h_reglist[l_count].intervals = calloc(p->n, sizeof(h_reglist[l_count].intervals[0])); - if(!(h_reglist[l_count].intervals)) { - hts_log_error("Could not allocate memory for intervals"); - goto fail; - } + h_reglist[l_count].intervals = p->a; h_reglist[l_count].count = p->n; - h_reglist[l_count].max_end = 0; + p->a = NULL; // As we stole it. - for (j = 0; j < p->n; j++) { - h_reglist[l_count].intervals[j].beg = (uint32_t)(p->a[j]>>32); - h_reglist[l_count].intervals[j].end = (uint32_t)(p->a[j] & 0xffffffffU); - - if (h_reglist[l_count].intervals[j].end > h_reglist[l_count].max_end) - h_reglist[l_count].max_end = h_reglist[l_count].intervals[j].end; + // After reg_compact(), list is ordered and non-overlapping, so... + if (p->n > 0) { + h_reglist[l_count].min_beg = h_reglist[l_count].intervals[0].beg; + h_reglist[l_count].max_end = h_reglist[l_count].intervals[p->n - 1].end; + } else { + h_reglist[l_count].min_beg = 0; + h_reglist[l_count].max_end = 0; } + l_count++; } reg_destroy(h); diff --git a/sam.c b/sam.c index 5d794de68..33081d5d3 100644 --- a/sam.c +++ b/sam.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include // Suppress deprecation message for cigar_tab, which we initialise #include "htslib/hts_defs.h" @@ -123,6 +124,8 @@ void sam_hdr_destroy(sam_hdr_t *bh) free(bh->text); if (bh->hrecs) sam_hrecs_free(bh->hrecs); + if (bh->sdict) + kh_destroy(s2i, (khash_t(s2i) *) bh->sdict); free(bh); } @@ -455,16 +458,17 @@ int bam_cigar2qlen(int n_cigar, const uint32_t *cigar) return l; } -int bam_cigar2rlen(int n_cigar, const uint32_t *cigar) +hts_pos_t bam_cigar2rlen(int n_cigar, const uint32_t *cigar) { - int k, l; + int k; + hts_pos_t l; for (k = l = 0; k < n_cigar; ++k) if (bam_cigar_type(bam_cigar_op(cigar[k]))&2) l += bam_cigar_oplen(cigar[k]); return l; } -int32_t bam_endpos(const bam1_t *b) +hts_pos_t bam_endpos(const bam1_t *b) { if (!(b->core.flag & BAM_FUNMAP) && b->core.n_cigar > 0) return b->core.pos + bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); @@ -556,12 +560,12 @@ int bam_read1(BGZF *fp, bam1_t *b) if (fp->is_be) { for (i = 0; i < 8; ++i) ed_swap_4p(x + i); } - c->tid = x[0]; c->pos = x[1]; + c->tid = x[0]; c->pos = (int32_t)x[1]; c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; c->l_extranul = (c->l_qname%4 != 0)? 
(4 - c->l_qname%4) : 0; c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; c->l_qseq = x[4]; - c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; + c->mtid = x[5]; c->mpos = (int32_t)x[6]; c->isize = (int32_t)x[7]; new_l_data = block_len - 32 + c->l_extranul; if (new_l_data > INT_MAX || c->l_qseq < 0 || c->l_qname < 1) return -4; @@ -608,6 +612,12 @@ int bam_write1(BGZF *fp, const bam1_t *b) return -1; } if (c->n_cigar > 0xffff) block_len += 16; // "16" for "CGBI", 4-byte tag length and 8-byte fake CIGAR + if (c->pos > INT_MAX || + c->mpos > INT_MAX || + c->isize < INT_MIN || c->isize > INT_MAX) { + hts_log_error("Positional data is too large for BAM format"); + return -1; + } x[0] = c->tid; x[1] = c->pos; x[2] = (uint32_t)c->bin<<16 | c->qual<<8 | (c->l_qname - c->l_extranul); @@ -688,9 +698,11 @@ static hts_idx_t *sam_index(htsFile *fp, int min_shift) h = sam_hdr_read(fp); if (h == NULL) return NULL; if (min_shift > 0) { - int64_t max_len = 0, s; - for (i = 0; i < h->n_targets; ++i) - if (max_len < h->target_len[i]) max_len = h->target_len[i]; + hts_pos_t max_len = 0, s; + for (i = 0; i < h->n_targets; ++i) { + hts_pos_t len = sam_hdr_tid2len(h, i); + if (max_len < len) max_len = len; + } max_len += 256; for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); fmt = HTS_FMT_CSI; @@ -828,7 +840,7 @@ int sam_idx_save(htsFile *fp) { return 0; } -static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; @@ -843,7 +855,7 @@ static int sam_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, i } // This is used only with read_rest=1 iterators, so need not set tid/beg/end. 
-static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = (htsFile *)fpv; bam1_t *b = bv; @@ -852,7 +864,7 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, int *b return ret; } -static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, int *beg, int *end) +static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = fpv; bam1_t *b = bv; @@ -975,7 +987,7 @@ hts_idx_t *sam_index_load(htsFile *fp, const char *fn) return index_load(fp, fn, NULL, HTS_IDX_SAVE_REMOTE); } -static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end, hts_readrec_func *readrec) +static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, hts_readrec_func *readrec) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; hts_itr_t *iter = (hts_itr_t *) calloc(1, sizeof(hts_itr_t)); @@ -1032,7 +1044,7 @@ static hts_itr_t *cram_itr_query(const hts_idx_t *idx, int tid, int beg, int end return iter; } -hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, int beg, int end) +hts_itr_t *sam_itr_queryi(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) { const hts_cram_idx_t *cidx = (const hts_cram_idx_t *) idx; if (idx == NULL) @@ -1203,6 +1215,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { const char *q, *r; char* sn = NULL; khash_t(s2i) *d = kh_init(s2i); + khash_t(s2i) *long_refs = NULL; if (!h || !d) goto error; @@ -1214,7 +1227,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) { has_SQ = 1; - int ln = -1; + hts_pos_t ln = -1; for (q = fp->line.s + 4;; ++q) { if (strncmp(q, "SN:", 3) == 0) { q += 3; @@ -1232,7 +1245,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { q = r; } else { if (strncmp(q, "LN:", 
3) == 0) - ln = strtol(q + 3, (char**)&q, 10); + ln = strtoll(q + 3, (char**)&q, 10); } while (*q != '\t' && *q != '\n' && *q != '\0') @@ -1251,7 +1264,24 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { hts_log_warning("Duplicated sequence '%s'", sn); free(sn); } else { - kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; + if (ln >= UINT32_MAX) { + // Stash away ref length that + // doesn't fit in target_len array + int k2; + if (!long_refs) { + long_refs = kh_init(s2i); + if (!long_refs) + goto error; + } + k2 = kh_put(s2i, long_refs, sn, &absent); + if (absent < 0) + goto error; + kh_val(long_refs, k2) = ln; + kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 + | UINT32_MAX); + } else { + kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; + } } } else { hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); @@ -1293,6 +1323,8 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) { char* tab = strchr(line.s, '\t'); + hts_pos_t ln; + if (tab == NULL) continue; @@ -1304,18 +1336,38 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (absent < 0) break; + ln = strtoll(tab, NULL, 10); + if (!absent) { hts_log_warning("Duplicated sequence '%s'", sn); free(sn); } else { - kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | atol(tab); + if (ln >= UINT32_MAX) { + // Stash away ref length that + // doesn't fit in target_len array + khint_t k2; + int absent = -1; + if (!long_refs) { + long_refs = kh_init(s2i); + if (!long_refs) + goto error; + } + k2 = kh_put(s2i, long_refs, sn, &absent); + if (absent < 0) + goto error; + kh_val(long_refs, k2) = ln; + kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 + | UINT32_MAX); + } else { + kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln; + } has_SQ = 1; } e |= kputs("@SQ\tSN:", &str) < 0; e |= kputsn(line.s, tab - line.s, &str) < 0; e |= kputs("\tLN:", &str) < 0; - e |= kputl(atol(tab), &str) < 0; + e |= kputll(ln, &str) < 0; e |= kputc('\n', &str) 
< 0; if (e) break; @@ -1352,6 +1404,9 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { } } + // Repurpose sdict to hold any references longer than UINT32_MAX + h->sdict = long_refs; + kh_destroy(s2i, d); if (str.l == 0) @@ -1367,6 +1422,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { sam_hdr_destroy(h); ks_free(&str); kh_destroy(s2i, d); + kh_destroy(s2i, long_refs); if (sn) free(sn); return NULL; } @@ -2759,7 +2815,7 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st r |= kputs(h->target_name[c->tid] , str); r |= kputc_('\t', str); } else r |= kputsn_("*\t", 2, str); - r |= kputw(c->pos + 1, str); r |= kputc_('\t', str); // pos + r |= kputll(c->pos + 1, str); r |= kputc_('\t', str); // pos r |= kputw(c->qual, str); r |= kputc_('\t', str); // qual if (c->n_cigar) { // cigar uint32_t *cigar = bam_get_cigar(b); @@ -2775,8 +2831,8 @@ static int sam_format1_append(const bam_hdr_t *h, const bam1_t *b, kstring_t *st r |= kputs(h->target_name[c->mtid], str); r |= kputc_('\t', str); } - r |= kputw(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos - r |= kputw(c->isize, str); r |= kputc_('\t', str); // template len + r |= kputll(c->mpos + 1, str); r |= kputc_('\t', str); // mate pos + r |= kputll(c->isize, str); r |= kputc_('\t', str); // template len if (c->l_qseq) { // seq and qual uint8_t *s = bam_get_seq(b); if (ks_resize(str, str->l+2+2*c->l_qseq) < 0) goto mem_err; @@ -3652,14 +3708,15 @@ char *bam_flag2str(int flag) *******************/ typedef struct { - int k, x, y, end; + int k, y; + hts_pos_t x, end; } cstate_t; static cstate_t g_cstate_null = { -1, 0, 0, 0 }; typedef struct __linkbuf_t { bam1_t b; - int32_t beg, end; + hts_pos_t beg, end; cstate_t s; struct __linkbuf_t *next; bam_pileup_cd cd; @@ -3710,7 +3767,7 @@ static inline void mp_free(mempool_t *mp, lbnode_t *p) s->x: the reference coordinate of the start of s->k s->y: the query coordiante of the start of s->k */ -static inline int resolve_cigar2(bam_pileup1_t 
*p, int32_t pos, cstate_t *s) +static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) { #define _cop(c) ((c)&BAM_CIGAR_MASK) #define _cln(c) ((c)>>BAM_CIGAR_SHIFT) @@ -3879,7 +3936,8 @@ typedef khash_t(olap_hash) olap_hash_t; struct __bam_plp_t { mempool_t *mp; lbnode_t *head, *tail; - int32_t tid, pos, max_tid, max_pos; + int32_t tid, max_tid; + hts_pos_t pos, max_pos; int is_eof, max_plp, error, maxcnt; uint64_t id; bam_pileup1_t *plp; @@ -3957,9 +4015,9 @@ void bam_plp_destructor(bam_plp_t plp, * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, * or -2 on error. */ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) { - int pos = *iref; + hts_pos_t pos = *iref; if ( pos < 0 ) return -1; *icig = 0; *iseq = 0; @@ -3992,7 +4050,7 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, int *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, int *icig, int *iseq, int *iref) +static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) { while ( *cigar < cigar_max ) { @@ -4021,21 +4079,21 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) { uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; - int a_icig = 0, a_iseq = 0; - int b_icig = 0, b_iseq = 0; + hts_pos_t a_icig = 0, a_iseq = 0; + hts_pos_t b_icig = 0, b_iseq = 0; uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); uint8_t *a_seq = bam_get_seq(a), *b_seq = bam_get_seq(b); - int iref = b->core.pos; - int a_iref = iref - a->core.pos; - int b_iref = iref - b->core.pos; + hts_pos_t iref = 
b->core.pos; + hts_pos_t a_iref = iref - a->core.pos; + hts_pos_t b_iref = iref - b->core.pos; int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); if ( a_ret<0 ) return a_ret<-1 ? -1:0; // no overlap or error int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error #if DBG - fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %d-%d\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, + fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); #endif @@ -4056,6 +4114,9 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) iref++; if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels + if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) + return -1; // Fell off end of sequence, bad CIGAR? + if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { #if DBG @@ -4105,7 +4166,7 @@ static int overlap_push(bam_plp_t iter, lbnode_t *node) // no overlap possible, unless some wild cigar if ( node->b.core.tid != node->b.core.mtid - || (abs(node->b.core.isize) >= 2*node->b.core.l_qseq + || (llabs(node->b.core.isize) >= 2*node->b.core.l_qseq && node->b.core.mpos >= node->end) // for those wild cigars ) return 0; @@ -4157,7 +4218,7 @@ static void overlap_remove(bam_plp_t iter, const bam1_t *b) // Prepares next pileup position in bam records collected by bam_plp_auto -> user func -> bam_plp_push. Returns // pointer to the piled records if next position is ready or NULL if there is not enough records in the // buffer yet (the current position is still the maximum position across all buffered reads). 
-const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +const bam_pileup1_t *bam_plp64_next(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) { if (iter->error) { *_n_plp = -1; return NULL; } *_n_plp = 0; @@ -4209,6 +4270,22 @@ const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_ return NULL; } +const bam_pileup1_t *bam_plp_next(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + hts_pos_t pos64 = 0; + const bam_pileup1_t *p = bam_plp64_next(iter, _tid, &pos64, _n_plp); + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + iter->error = 1; + *_n_plp = -1; + return NULL; + } + return p; +} + int bam_plp_push(bam_plp_t iter, const bam1_t *b) { if (iter->error) return -1; @@ -4258,11 +4335,11 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) return 0; } -const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +const bam_pileup1_t *bam_plp64_auto(bam_plp_t iter, int *_tid, hts_pos_t *_pos, int *_n_plp) { const bam_pileup1_t *plp; if (iter->func == 0 || iter->error) { *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; else { // no pileup line can be obtained; read alignments *_n_plp = 0; if (iter->is_eof) return 0; @@ -4272,7 +4349,7 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; // otherwise no pileup line can be returned; read the next alignment. 
} if ( ret < -1 ) { iter->error = ret; *_n_plp = -1; return 0; } @@ -4280,11 +4357,27 @@ const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_ *_n_plp = -1; return 0; } - if ((plp = bam_plp_next(iter, _tid, _pos, _n_plp)) != 0) return plp; + if ((plp = bam_plp64_next(iter, _tid, _pos, _n_plp)) != 0) return plp; return 0; } } +const bam_pileup1_t *bam_plp_auto(bam_plp_t iter, int *_tid, int *_pos, int *_n_plp) +{ + hts_pos_t pos64 = 0; + const bam_pileup1_t *p = bam_plp64_auto(iter, _tid, &pos64, _n_plp); + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + iter->error = 1; + *_n_plp = -1; + return NULL; + } + return p; +} + void bam_plp_reset(bam_plp_t iter) { overlap_remove(iter, NULL); @@ -4309,7 +4402,8 @@ void bam_plp_set_maxcnt(bam_plp_t iter, int maxcnt) struct __bam_mplp_t { int n; - uint64_t min, *pos; + int32_t min_tid, *tid; + hts_pos_t min_pos, *pos; bam_plp_t *iter; int *n_plp; const bam_pileup1_t **plp; @@ -4320,15 +4414,18 @@ bam_mplp_t bam_mplp_init(int n, bam_plp_auto_f func, void **data) int i; bam_mplp_t iter; iter = (bam_mplp_t)calloc(1, sizeof(struct __bam_mplp_t)); - iter->pos = (uint64_t*)calloc(n, sizeof(uint64_t)); + iter->pos = (hts_pos_t*)calloc(n, sizeof(hts_pos_t)); + iter->tid = (int32_t*)calloc(n, sizeof(int32_t)); iter->n_plp = (int*)calloc(n, sizeof(int)); iter->plp = (const bam_pileup1_t**)calloc(n, sizeof(bam_pileup1_t*)); iter->iter = (bam_plp_t*)calloc(n, sizeof(bam_plp_t)); iter->n = n; - iter->min = (uint64_t)-1; + iter->min_pos = HTS_POS_MAX; + iter->min_tid = (uint32_t)-1; for (i = 0; i < n; ++i) { iter->iter[i] = bam_plp_init(func, data[i]); - iter->pos[i] = iter->min; + iter->pos[i] = iter->min_pos; + iter->tid[i] = iter->min_tid; } return iter; } @@ -4352,28 +4449,45 @@ void bam_mplp_destroy(bam_mplp_t iter) { int i; for (i = 0; i < iter->n; ++i) bam_plp_destroy(iter->iter[i]); - free(iter->iter); free(iter->pos); 
free(iter->n_plp); free(iter->plp); + free(iter->iter); free(iter->pos); free(iter->tid); + free(iter->n_plp); free(iter->plp); free(iter); } -int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +int bam_mplp64_auto(bam_mplp_t iter, int *_tid, hts_pos_t *_pos, int *n_plp, const bam_pileup1_t **plp) { int i, ret = 0; - uint64_t new_min = (uint64_t)-1; + hts_pos_t new_min_pos = HTS_POS_MAX; + uint32_t new_min_tid = (uint32_t)-1; for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { - int tid, pos; - iter->plp[i] = bam_plp_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); + if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { + int tid; + hts_pos_t pos; + iter->plp[i] = bam_plp64_auto(iter->iter[i], &tid, &pos, &iter->n_plp[i]); if ( iter->iter[i]->error ) return -1; - iter->pos[i] = iter->plp[i] ? (uint64_t)tid<<32 | pos : 0; + if (iter->plp[i]) { + iter->tid[i] = tid; + iter->pos[i] = pos; + } else { + iter->tid[i] = 0; + iter->pos[i] = 0; + } + } + if (iter->plp[i]) { + if (iter->tid[i] < new_min_tid) { + new_min_tid = iter->tid[i]; + new_min_pos = iter->pos[i]; + } else if (iter->pos[i] < new_min_pos) { + new_min_pos = iter->pos[i]; + } } - if (iter->plp[i] && iter->pos[i] < new_min) new_min = iter->pos[i]; } - iter->min = new_min; - if (new_min == (uint64_t)-1) return 0; - *_tid = new_min>>32; *_pos = (uint32_t)new_min; + iter->min_pos = new_min_pos; + iter->min_tid = new_min_tid; + if (new_min_pos == HTS_POS_MAX) return 0; + *_tid = new_min_tid; *_pos = new_min_pos; for (i = 0; i < iter->n; ++i) { - if (iter->pos[i] == iter->min) { // FIXME: valgrind reports "uninitialised value(s) at this line" + if (iter->pos[i] == iter->min_pos && iter->tid[i] == iter->min_tid) { n_plp[i] = iter->n_plp[i], plp[i] = iter->plp[i]; ++ret; } else n_plp[i] = 0, plp[i] = 0; @@ -4381,13 +4495,31 @@ int bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_p return ret; } +int 
bam_mplp_auto(bam_mplp_t iter, int *_tid, int *_pos, int *n_plp, const bam_pileup1_t **plp) +{ + hts_pos_t pos64 = 0; + int ret = bam_mplp64_auto(iter, _tid, &pos64, n_plp, plp); + if (ret >= 0) { + if (pos64 < INT_MAX) { + *_pos = pos64; + } else { + hts_log_error("Position %"PRId64" too large", pos64); + *_pos = INT_MAX; + return -1; + } + } + return ret; +} + void bam_mplp_reset(bam_mplp_t iter) { int i; - iter->min = (uint64_t)-1; + iter->min_pos = HTS_POS_MAX; + iter->min_tid = (uint32_t)-1; for (i = 0; i < iter->n; ++i) { bam_plp_reset(iter->iter[i]); - iter->pos[i] = (uint64_t)-1; + iter->pos[i] = HTS_POS_MAX; + iter->tid[i] = (uint32_t)-1; iter->n_plp[i] = 0; iter->plp[i] = NULL; } diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 6b65e3133..315a4cf65 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include "htslib/synced_bcf_reader.h" @@ -38,11 +39,15 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/thread_pool.h" #include "bcf_sr_sort.h" -#define MAX_CSI_COOR 0x7fffffff // maximum indexable coordinate of .csi +// Maximum indexable coordinate of .csi, for default min_shift of 14. +// This comes out to about 17 Tbp. Limiting factor is the bin number, +// which is a uint32_t in CSI. The highest number of levels compatible +// with this is 10 (needs 31 bits). 
+#define MAX_CSI_COOR ((1LL << (14 + 30)) - 1) typedef struct { - uint32_t start, end; + hts_pos_t start, end; } region1_t; @@ -60,7 +65,7 @@ typedef struct } aux_t; -static int _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end); +static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end); static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); @@ -383,7 +388,7 @@ void debug_buffer(FILE *fp, bcf_sr_t *reader) for (j=0; j<=reader->nbuffer; j++) { bcf1_t *line = reader->buffer[j]; - fprintf(fp,"\t%p\t%s%s\t%s:%d\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); + fprintf(fp,"\t%p\t%s%s\t%s:%"PRIhts_pos"\t%s ", (void*)line,reader->fname,j==0?"*":" ",reader->header->id[BCF_DT_CTG][line->rid].key,line->pos+1,line->n_allele?line->d.allele[0]:""); int k; for (k=1; kn_allele; k++) fprintf(fp," %s", line->d.allele[k]); fprintf(fp,"\n"); @@ -419,11 +424,11 @@ static inline int has_filter(bcf_sr_t *reader, bcf1_t *line) return 0; } -static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end) +static int _reader_seek(bcf_sr_t *reader, const char *seq, hts_pos_t start, hts_pos_t end) { if ( end>=MAX_CSI_COOR ) { - hts_log_error("The coordinate is out of csi index limit: %d", end+1); + hts_log_error("The coordinate is out of csi index limit: %"PRIhts_pos, end+1); exit(1); } if ( reader->itr ) @@ -445,7 +450,7 @@ static int _reader_seek(bcf_sr_t *reader, const char *seq, int start, int end) reader->itr = bcf_itr_queryi(reader->bcf_idx,tid,start,end+1); } if (!reader->itr) { - hts_log_error("Could not seek: %s:%d-%d", seq, start + 1, end + 1); + hts_log_error("Could not seek: %s:%"PRIhts_pos"-%"PRIhts_pos, seq, start + 1, end + 1); assert(0); } return 0; @@ -580,7 +585,8 @@ static void _reader_shift_buffer(bcf_sr_t *reader) 
static int next_line(bcf_srs_t *files) { - int i, min_pos = INT_MAX; + int i; + hts_pos_t min_pos = HTS_POS_MAX; const char *chr = NULL; // Loop until next suitable line is found or all readers have finished @@ -605,7 +611,7 @@ static int next_line(bcf_srs_t *files) else if ( min_pos==files->readers[i].buffer[1]->pos ) bcf_sr_sort_add_active(&BCF_SR_AUX(files)->sort, i); } - if ( min_pos==INT_MAX ) + if ( min_pos==HTS_POS_MAX ) { if ( !files->regions ) break; continue; @@ -621,7 +627,7 @@ static int next_line(bcf_srs_t *files) for (i=0; inreaders; i++) if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos ) _reader_shift_buffer(&files->readers[i]); - min_pos = INT_MAX; + min_pos = HTS_POS_MAX; chr = NULL; continue; } @@ -671,7 +677,7 @@ static void bcf_sr_seek_start(bcf_srs_t *readers) } -int bcf_sr_seek(bcf_srs_t *readers, const char *seq, int pos) +int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos) { if ( !readers->regions ) return 0; bcf_sr_sort_reset(&BCF_SR_AUX(readers)->sort); @@ -766,7 +772,7 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file) // Add a new region into a list sorted by start,end. On input the coordinates // are 1-based, stored 0-based, inclusive. 
-static int _regions_add(bcf_sr_regions_t *reg, const char *chr, int start, int end) +static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end) { if ( start==-1 && end==-1 ) { @@ -827,7 +833,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; - int from, to; + hts_pos_t from, to; while ( 1 ) { while ( *ep && *ep!=',' && *ep!=':' ) ep++; @@ -879,7 +885,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) // ichr,ifrom,ito are 0-based; // returns -1 on error, 0 if the line is a comment line, 1 on success -static int _regions_parse_line(char *line, int ichr,int ifrom,int ito, char **chr,char **chr_end,int *from,int *to) +static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char **chr, char **chr_end, hts_pos_t *from, hts_pos_t *to) { if (ifrom < 0 || ito < 0) return -1; *chr_end = NULL; @@ -969,7 +975,8 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr while ( hts_getline(reg->file, KS_SEP_LINE, ®->line) > 0 ) { char *chr, *chr_end; - int from, to, ret; + hts_pos_t from, to; + int ret; ret = _regions_parse_line(reg->line.s, ichr,ifrom,abs(ito), &chr,&chr_end,&from,&to); if ( ret < 0 ) { @@ -1076,7 +1083,8 @@ int bcf_sr_regions_next(bcf_sr_regions_t *reg) // reading from tabix char *chr, *chr_end; - int ichr = 0, ifrom = 1, ito = 2, is_bed = 0, from, to; + int ichr = 0, ifrom = 1, ito = 2, is_bed = 0; + hts_pos_t from, to; if ( reg->tbx ) { ichr = reg->tbx->conf.sc-1; @@ -1195,7 +1203,7 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re return !(type & VCF_INDEL) ? 
1 : 0; } -int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, int start, int end) +int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end) { int iseq; if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence diff --git a/tabix.c b/tabix.c index 8888f1ee6..32e20caed 100644 --- a/tabix.c +++ b/tabix.c @@ -120,11 +120,11 @@ static char **parse_regions(char *regions_fname, char **argv, int argc, int *nre for (iseq=0; iseqss = line + b; intv->se = line + i; } else if (id == conf->bc) { // here ->beg is 0-based. - intv->beg = intv->end = strtol(line + b, &s, 0); + intv->beg = intv->end = strtoll(line + b, &s, 0); if ( s==line+b ) return -1; // expected int if (!(conf->preset&TBX_UCSC)) --intv->beg; else ++intv->end; @@ -103,7 +103,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) if ((conf->preset&0xffff) == TBX_GENERIC) { if (id == conf->ec) { - intv->end = strtol(line + b, &s, 0); + intv->end = strtoll(line + b, &s, 0); if ( s==line+b ) return -1; // expected int } } else if ((conf->preset&0xffff) == TBX_SAM) { @@ -131,7 +131,7 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) s = strstr(line + b, ";END="); if (s) s += 5; } - if (s) intv->end = strtol(s, &s, 0); + if (s) intv->end = strtoll(s, &s, 0); line[i] = c; } } @@ -172,7 +172,7 @@ static inline int get_intv(tbx_t *tbx, kstring_t *str, tbx_intv_t *intv, int is_ * -1 on EOF * <= -2 on error */ -int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, int *beg, int *end) +int tbx_readrec(BGZF *fp, void *tbxv, void *sv, int *tid, hts_pos_t *beg, hts_pos_t *end) { tbx_t *tbx = (tbx_t *) tbxv; kstring_t *s = (kstring_t *) sv; @@ -220,6 +220,44 @@ static int tbx_set_meta(tbx_t *tbx) return 0; } +// Minimal effort parser to extract reference length out of VCF header line +// This is used only used to adjust the number of levels if necessary, +// so not a major problem if 
it doesn't always work. +static void adjust_max_ref_len_vcf(const char *str, int64_t *max_ref_len) +{ + const char *ptr; + int64_t len; + if (strncmp(str, "##contig", 8) != 0) return; + ptr = strstr(str + 8, "length"); + if (!ptr) return; + for (ptr += 6; *ptr == ' ' || *ptr == '='; ptr++) {} + len = strtoll(ptr, NULL, 10); + if (*max_ref_len < len) *max_ref_len = len; +} + +// Same for sam files +static void adjust_max_ref_len_sam(const char *str, int64_t *max_ref_len) +{ + const char *ptr; + int64_t len; + if (strncmp(str, "@SQ", 3) != 0) return; + ptr = strstr(str + 3, "\tLN:"); + if (!ptr) return; + ptr += 4; + len = strtoll(ptr, NULL, 10); + if (*max_ref_len < len) *max_ref_len = len; +} + +// Adjusts number of levels if not big enough. This can happen for +// files with very large contigs. +static int adjust_n_lvls(int min_shift, int n_lvls, int64_t max_len) +{ + int64_t s = 1LL << (min_shift + n_lvls * 3); + max_len += 256; + for (; max_len > s; ++n_lvls, s <<= 3) {} + return n_lvls; +} + tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) { tbx_t *tbx; @@ -228,6 +266,7 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) int64_t lineno = 0; uint64_t last_off = 0; tbx_intv_t intv; + int64_t max_ref_len = 0; str.s = 0; str.l = str.m = 0; tbx = (tbx_t*)calloc(1, sizeof(tbx_t)); @@ -237,11 +276,23 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) else min_shift = 14, n_lvls = 5, fmt = HTS_FMT_TBI; while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { ++lineno; + if (str.s[0] == tbx->conf.meta_char && fmt == HTS_FMT_CSI) { + switch (tbx->conf.preset) { + case TBX_SAM: + adjust_max_ref_len_sam(str.s, &max_ref_len); break; + case TBX_VCF: + adjust_max_ref_len_vcf(str.s, &max_ref_len); break; + default: + break; + } + } if (lineno <= tbx->conf.line_skip || str.s[0] == tbx->conf.meta_char) { last_off = bgzf_tell(fp); continue; } if (first == 0) { + if (fmt == HTS_FMT_CSI) + n_lvls = adjust_n_lvls(min_shift, n_lvls, 
max_ref_len); tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); if (!tbx->idx) goto fail; first = 1; diff --git a/test/longrefs/index.expected1.vcf b/test/longrefs/index.expected1.vcf new file mode 100644 index 000000000..e0e7f91ad --- /dev/null +++ b/test/longrefs/index.expected1.vcf @@ -0,0 +1,6 @@ +1 10010000100 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000101 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000102 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000103 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000104 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000105 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,29 diff --git a/test/longrefs/index.expected2.vcf b/test/longrefs/index.expected2.vcf new file mode 100644 index 000000000..4898e2563 --- /dev/null +++ b/test/longrefs/index.expected2.vcf @@ -0,0 +1 @@ +1 10010000110 . G 0 . SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 diff --git a/test/longrefs/index.vcf b/test/longrefs/index.vcf new file mode 100644 index 000000000..54c8e03d3 --- /dev/null +++ b/test/longrefs/index.vcf @@ -0,0 +1,216 @@ +##fileformat=VCFv4.2 +##FILTER= +##reference=file:10_gig_at_front.fa +##contig= +##ALT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##ALT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT ERS220911 +1 10009999919 . G <*> 0 . DP=1;I16=1,0,0,0,26,676,0,0,60,3600,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,26 +1 10009999920 . T <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999921 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999922 . A <*> 0 . 
DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999923 . T <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999924 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999925 . C <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999926 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999927 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999928 . G <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999929 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999930 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999931 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999932 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999933 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999934 . T <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999935 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999936 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999937 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999938 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999939 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999940 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999941 . C <*> 0 . 
DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999942 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999943 . A <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999944 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999945 . G <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999946 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999947 . C <*> 0 . DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999948 . A <*> 0 . DP=1;I16=1,0,0,0,34,1156,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,34 +1 10009999949 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999950 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999951 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999952 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999953 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999954 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999955 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999956 . C <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999957 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999958 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999959 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999960 . T <*> 0 . 
DP=1;I16=1,0,0,0,35,1225,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,35 +1 10009999961 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999962 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999963 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999964 . A <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999965 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999966 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999967 . A <*> 0 . DP=1;I16=1,0,0,0,41,1681,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,41 +1 10009999968 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999969 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999970 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999971 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999972 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999973 . T <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999974 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999975 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999976 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999977 . G <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999978 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999979 . C <*> 0 . 
DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999980 . C <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999981 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999982 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999983 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999984 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999985 . T <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999986 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999987 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999988 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999989 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999990 . G <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999991 . A <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999992 . C <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999993 . A <*> 0 . DP=1;I16=1,0,0,0,36,1296,0,0,60,3600,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,36 +1 10009999994 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10009999995 . G <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10009999996 . G <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10009999997 . C <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10009999998 . G <*> 0 . 
DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10009999999 . A <*> 0 . DP=1;I16=1,0,0,0,31,961,0,0,60,3600,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,31 +1 10010000000 . A <*> 0 . DP=1;I16=1,0,0,0,33,1089,0,0,60,3600,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,33 +1 10010000001 . A <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000002 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000003 . C <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10010000004 . C <*> 0 . DP=1;I16=1,0,0,0,29,841,0,0,60,3600,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000005 . C <*> 0 . DP=1;I16=1,0,0,0,39,1521,0,0,60,3600,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,39 +1 10010000006 . G <*> 0 . DP=1;I16=1,0,0,0,38,1444,0,0,60,3600,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,38 +1 10010000007 . T <*> 0 . DP=1;I16=1,0,0,0,40,1600,0,0,60,3600,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,40 +1 10010000008 . C <*> 0 . DP=1;I16=1,0,0,0,37,1369,0,0,60,3600,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,37 +1 10010000009 . T <*> 0 . DP=1;I16=1,0,0,0,43,1849,0,0,60,3600,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,43 +1 10010000010 . C <*> 0 . DP=2;I16=1,1,0,0,59,2105,0,0,89,4441,0,0,8,64,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,59 +1 10010000011 . T <*> 0 . DP=2;I16=1,1,0,0,76,2888,0,0,89,4441,0,0,8,50,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000012 . A <*> 0 . DP=2;I16=1,1,0,0,77,2965,0,0,89,4441,0,0,8,40,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000013 . C <*> 0 . DP=2;I16=1,1,0,0,66,2250,0,0,89,4441,0,0,8,34,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,66 +1 10010000014 . A <*> 0 . DP=2;I16=1,1,0,0,67,2285,0,0,89,4441,0,0,8,32,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,67 +1 10010000015 . A <*> 0 . DP=2;I16=1,1,0,0,69,2385,0,0,89,4441,0,0,8,34,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,65 +1 10010000016 . T <*> 0 . DP=2;I16=1,1,0,0,75,2817,0,0,89,4441,0,0,8,40,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,68 +1 10010000017 . A <*> 0 . 
DP=2;I16=1,1,0,0,67,2285,0,0,89,4441,0,0,8,50,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,58 +1 10010000018 . A <*> 0 . DP=2;I16=1,1,0,0,64,2120,0,0,89,4441,0,0,8,64,0,0;QS=1,0;MQSB=1;MQ0F=0 PL 0,6,55 +1 10010000019 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000020 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000021 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000022 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000023 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000024 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000025 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000026 . T <*> 0 . DP=1;I16=0,1,0,0,29,841,0,0,29,841,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000027 . A <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000028 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000029 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000030 . A <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000031 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000032 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000033 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000034 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000035 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000036 . G <*> 0 . 
DP=1;I16=0,1,0,0,42,1764,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000037 . C <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000038 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000039 . T <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000040 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000041 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000042 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000043 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000044 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000045 . T <*> 0 . DP=1;I16=0,1,0,0,42,1764,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000046 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000047 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000048 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000049 . T <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000050 . G <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000051 . C <*> 0 . DP=1;I16=0,1,0,0,16,256,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,16 +1 10010000052 . T <*> 0 . DP=1;I16=0,1,0,0,31,961,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000053 . T <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000054 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000055 . T <*> 0 . 
DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000056 . A <*> 0 . DP=1;I16=0,1,0,0,22,484,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,22 +1 10010000057 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000058 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000059 . C <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000060 . C <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000061 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000062 . A <*> 0 . DP=1;I16=0,1,0,0,34,1156,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000063 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000064 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000065 . T <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000066 . A <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000067 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000068 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000069 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000070 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000071 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000072 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000073 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000074 . G <*> 0 . 
DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000075 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000076 . C <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000077 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000078 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000079 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000080 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000081 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000082 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000083 . G <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000084 . G <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,25,625,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000085 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,24,576,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000086 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,23,529,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000087 . G <*> 0 . DP=1;I16=0,1,0,0,40,1600,0,0,29,841,0,0,22,484,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000088 . A <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,21,441,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000089 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,20,400,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000090 . T <*> 0 . DP=1;I16=0,1,0,0,39,1521,0,0,29,841,0,0,19,361,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000091 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,18,324,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000092 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,17,289,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000093 . T <*> 0 . 
DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,16,256,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000094 . C <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,15,225,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000095 . C <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,14,196,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000096 . A <*> 0 . DP=1;I16=0,1,0,0,41,1681,0,0,29,841,0,0,13,169,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000097 . A <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,12,144,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000098 . G <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,11,121,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000099 . C <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,10,100,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000100 . C <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,9,81,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000101 . T <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,8,64,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000102 . T <*> 0 . DP=1;I16=0,1,0,0,36,1296,0,0,29,841,0,0,7,49,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000103 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,6,36,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000104 . G <*> 0 . DP=1;I16=0,1,0,0,38,1444,0,0,29,841,0,0,5,25,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000105 . A <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,4,16,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000106 . G <*> 0 . DP=1;I16=0,1,0,0,37,1369,0,0,29,841,0,0,3,9,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000107 . G <*> 0 . DP=1;I16=0,1,0,0,33,1089,0,0,29,841,0,0,2,4,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000108 . C <*> 0 . DP=1;I16=0,1,0,0,32,1024,0,0,29,841,0,0,1,1,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000109 . A <*> 0 . DP=1;I16=0,1,0,0,35,1225,0,0,29,841,0,0,0,0,0,0;QS=1,0;MQ0F=0 PL 0,3,29 +1 10010000110 . G 0 . 
SVTYPE=DEL;SVLEN=-890;END=10010001000 PL 0,1,45 diff --git a/test/longrefs/longref.sam b/test/longrefs/longref.sam new file mode 100644 index 000000000..a2611f675 --- /dev/null +++ b/test/longrefs/longref.sam @@ -0,0 +1,96 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 +SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 +SRR065390.28043279 16 CHROMOSOME_I 10000000004 0 9M1I90M * 0 0 TCTTCCGATCTCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####A>=7A6DD=@AA?>AAABC@CAABDBCBBABDADBADCABBBDCDCDCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCC AS:i:-26 XS:i:-26 XN:i:0 XM:i:6 XO:i:1 XG:i:1 YT:Z:UU MD:Z:1A0A0G2T1A0G89 NM:i:7 +SRR065390.29270443 16 CHROMOSOME_I 
10000000006 1 100M * 0 0 AGCCTAAGCCGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###################################@:88@@>B>C>CCCCA@CCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10T2G86 NM:i:2 +SRR065390.1364843 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ####################@=A=8@:>@;@@=>>B8?C6CCCCCCCCCCACCCCBBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.10190875 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################@@@@@@;>BBB?>A6BAB?BB=BAB@?:A.<===@7:4::>8D@BABBACCCCAB@CCCDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.13556211 0 CHROMOSOME_I 10000000011 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGATTGGAAGAGCGGCTAAGCAGGAACGCCGGGCTCGATCTCAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBCDCCB>BBBBB########################################### AS:i:-50 XS:i:-50 XN:i:0 XM:i:25 XO:i:0 XG:i:0 YT:Z:UU MD:Z:57C0C1A0A0G0C0C0T0A0A1C6C0T0A1G1C0T0A0A1C2A0A0G0C2A3 NM:i:25 +SRR065390.20524775 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?9<8B=?@C8A<@?@C8CBDCCC=CCCCC??@CCDCCCCCCCCCCCCCCCCCCCCDCCCCCCCDCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20580336 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?:>@?@=>@=0<:CB>@B=DCADB@CCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 
YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22573273 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################################A9;?@CBBDBA>BB;ABDB>AAA;=>=0943@########### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20870114 0 CHROMOSOME_I 10000000012 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCABCCCC=BBBCA@B>B?D;B=>9?############################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3863623 16 CHROMOSOME_I 10000000012 1 100M * 0 0 CGCCTACGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##############################?@BB>B@BCABBB?DC@DADC@DCDCACDCBCCCCCCCCCCC@CCCCCCCCCCCCCCC1CCCCCCCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A5A5A87 NM:i:3 +SRR065390.1659845 0 CHROMOSOME_I 10000000013 0 100M * 0 0 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAACCTAAGCCTAAGCCCAACCCTAAGACCGAGACCGAGACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCAB@CCC######################################### AS:i:-22 XS:i:-22 XN:i:0 XM:i:11 XO:i:0 XG:i:0 YT:Z:UU MD:Z:60G14T2G6C1T0A2C1T0A2C1T0 NM:i:11 +SRR065390.1567418 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CACAGCCTACGTCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #########################################?:8A@<@>>BBB8>BBB@BBBB>@:??::87688:?:::?@<@@97866@?>@@;;>:< AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T0A6A1C88 NM:i:4 +SRR065390.4996386 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CCAAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###################################@@@@A=BB@C>>DCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-22 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:1T5T92 NM:i:2 +SRR065390.14822977 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CGAAGCCAGAGCCTAGGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################B:B?:==2>6@B@@C>?>A@CB5@??@28C@CCCBC@CC?CC?A@CC:CBCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T0A6A84 NM:i:4 +SRR065390.15148736 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CTGAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########################CCBC<=C;9??<;==C@BCCCCC=CCCCACACACCBBCCCCCCCCCCCCCCCCCBCCCCCCCCCCCBCA?CCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A4T92 NM:i:2 +SRR065390.18089757 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CTGAGCCTGAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################A212.0:?.>8?BB?B<@@C?CCBCB;DCCCACDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A5A91 NM:i:2 +SRR065390.25601994 16 CHROMOSOME_I 10000000015 17 100M * 0 0 ATAAGCCTAATCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####################???DD?BD?BDBB>ACBDBDDBDDDBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C9G89 NM:i:2 +SRR065390.29400981 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A<:?C>>BCABABC?AD>BDADDDBDBBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-18 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T2G95 NM:i:2 +SRR065390.29022479 0 CHROMOSOME_I 10000000167 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAATCCTATGCATAAACCTAAACAGAATCAAAAGAAAAATCCAATCT CCCCCCCCCACCCCCBCCCC?CCCCCCCD;?D?D################################ AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:94C0T0A3 NM:i:3 
+SRR065390.23298396 16 CHROMOSOME_I 10000000167 1 100M * 0 0 AAGCCTCGGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC #####################A@><>B==BC@CCBB?BA'@>>;>>DADDDBDBADB?B6@7=;;7DBD?B<8=AA:4-9<@@1:@A################################ AS:i:-2 XS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 YT:Z:UU MD:Z:98C1 NM:i:1 +SRR065390.23263331 0 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCBCCCDCCDCDDDBBDA=B@BB@B>B>AB?@?BB>;;ACC>CAA@;9<5@############## AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.1428659 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC #######?DB@;>BBB::>:D=>D?BDDBBBBCCAC@DCCBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.9270489 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########?4=>@BAA>BB>AA@====3BBBBB;B?@C==CCC?@CCC?CCC?ACCCCCBCCCBCCCCBCCCCCCCCCCCCCC=BCCCCCACCCDCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.9538669 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########@=?6??@B;BA@@@?.@?@@;D>A;DB@DBBBD>@DDDBADCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15525407 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ####################@37:0BC@@C@ACCAB?@CCACCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 
+SRR065390.18387934 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##########################@@A@4BDDBB@ACABB@8BCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.27778447 16 CHROMOSOME_I 10000000168 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###############@@B=;>89<>/8?<8@>=ABDCCDCC@CCACB@@C@9ACCCC;CCCC@CCAAB@@CCCCCBCCCCCCCBCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4767844 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCTAAGCCTAACCCCA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCCCCDCC=CCBA=BCCACCBCC<@@@A@>A?D<5/772AA####################### AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:80T14G2T1 NM:i:3 +SRR065390.6036148 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCTCAGACCA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCBCCC=C########################################## AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:86T6A2C1T1 NM:i:4 +SRR065390.7523697 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTATACCTATGCATA 8773399<;8BBB>BAA<A################################### AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:84C3A4A5A0 NM:i:4 +SRR065390.21777229 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCCAAGCCCAAGACCAAGCCAAGACCCC CCCCCCCCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCCCBDABAA@48@############################################# AS:i:-18 XS:i:-18 XN:i:0 XM:i:9 XO:i:0 XG:i:0 YT:Z:UU MD:Z:74T5T3C1T5T1A0G2T0A0 NM:i:9 +SRR065390.22082412 0 
CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTTGGCCGCAGCCTCAGCCTGAACAGA CCCCACACCCCCCCC??:??@CCCC@9A>9?AA@AC>@CA@B-73>8=53@=:=A?><=>49778?################ AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:88A5A5 NM:i:2 +SRR065390.32243033 0 CHROMOSOME_I 10000000170 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTACGACTA CCCCCCCCCCCACC@CCACCCCCCCCCCCCCCCCC@CADCCBBD@BB>=?A@9C@?C>A88?>8A?:@CCCCCCCCC:?>;:CCC?BCCCCACCCCCCCCCC AS:i:-39 XS:i:-39 XN:i:0 XM:i:18 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C0T0A0A2C0T2G1C0T0A0A0G0C1T0A1G1C64C10 NM:i:18 +SRR065390.28296401 16 CHROMOSOME_I 10000000171 1 100M * 0 0 CTAAGCCTAAGCCTAAGGCTAAGCCTAAACCCACGCCTAGGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################################################BAADDDBBDDCCDCCCCCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:17C10G2T1A5A3T56 NM:i:6 +SRR065390.1242089 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC A=@@?=?=8A3BB>>B@B>BAB@B@B77//8<;>5:@@@B6ABA@BA<@BB5):5;*83736?;;;@@=;6B>??##################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3872193 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCBCCCC@DCACD=ABCB@BCDDA@BA=BBB@C??@;:0A>?>B>?)?#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14566073 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCBCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCCCAB=?CCCA6?AACABCCAC=1B@A@;B<@A@@;>?@>8BB?B#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 
+SRR065390.18391952 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCDCCCAADCCB?CBABD=A>?BB5:??:B;>?@AA?>3?;@(8>=>>/(5500;+@@6 AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.18719419 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCAC@@C@@B@DBBDBB################################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23668023 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCC@@ABDB@@BBB>DBABB@D@BDBAABAB>B>AA@??9:8>>A:255@###### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23826980 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCDBCDDBDDDABBBBDDBBBBBBB>D?#################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.28024258 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCBACDDBC>DDBDB>BBBBB;?@BBB3@???=0<=>@@:@################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30039772 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCDACDBBDDDDDBBBDBBD>BBAADAABAAC??B??######################### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5345749 16 CHROMOSOME_I 
10000000173 1 100M * 0 0 GACCCAGACCCCGCGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##########################################@BA=>AAA@;AAAA@AA9AAAA@BAA@:=@@@4A=?A@AAAAA:B@@BBBBB@>>>>> AS:i:-18 XS:i:-18 XN:i:0 XM:i:9 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A1G2T0A1G2T0A0A9T76 NM:i:9 +SRR065390.16932911 16 CHROMOSOME_I 10000000173 1 100M * 0 0 AACCCTAAACCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ###############B?BAA;;9>0A1BAAA@=CA*@CCCCACCCC@@?CAAB>AC=C?CCCCBCCBBCBCCCABCCBCA@CCCCCCBCCCCC?BCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2G5G5G85 NM:i:3 +SRR065390.17106354 0 CHROMOSOME_I 10000000173 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCACC?CCADCCAC@BB@CBB@C?@A@@A>=B?BAABBABB6A>BBB:BBA=?DD??;D/<71; AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22716808 0 CHROMOSOME_I 10000000174 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCGAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCABBBBB?################################### AS:i:-2 XS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 YT:Z:UU MD:Z:94T5 NM:i:1 +SRR065390.12986460 0 CHROMOSOME_I 10000000176 1 100M * 0 0 CCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCATAATCGTAAGACTAAGAGCAAGCCTCAGCATA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCA?CCA############################### AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:92T2G4 NM:i:2 +SRR065390.14729559 16 CHROMOSOME_I 10000000176 1 100M * 0 0 CCTACGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA #########################?(4<=B@;BBBBCB?>BCCA?DCCACCCCCC@C;BBB??B<;9=C@BCAACBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2T0A0A6G88 NM:i:4 +SRR065390.26023345 0 CHROMOSOME_I 
10000000177 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTCAGCCGAA CCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCA?CDADABDBDDBDDBAB>>BBBB@;>@BBB?A>CBBB<>>B@@4@?>>?0ABD@@###### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.6149508 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCDDCCBD=CCDB@@DABAB=ABB??>>@BB=BCBAB>>D;A?><>AA>?A==+@A AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.6618950 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCDCCCCCDCCBCAACBBCBB@DADABBDAB?CBB@B;?BB=B>>>?:? AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.7246333 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCACCCDCCCCCCCCCCCDCCBCD@CBBDCADADADBDABBDBDABDBCBBA>BAB>>AC9A################## AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8266146 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTCAGCCGAGGCCTACGC CDCCCCCCCCCCCCCCCCCBCCCCCCDCCCCCCACDCCCCCDACBDCABCB@A=ABBB@BBD@DB?B################################# AS:i:-10 XS:i:-10 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:80G3A4T1A5A2 NM:i:5 +SRR065390.8986893 0 CHROMOSOME_I 10000000179 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCA@CCCCD=CCCDAABBDB>BDDBDB;BB@@B=@BDB:.A>>BB:@################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 diff --git a/test/longrefs/longref_itr.expected.sam 
b/test/longrefs/longref_itr.expected.sam new file mode 100644 index 000000000..6aca06706 --- /dev/null +++ b/test/longrefs/longref_itr.expected.sam @@ -0,0 +1,26 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 +SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 diff --git a/test/longrefs/longref_multi.expected.sam b/test/longrefs/longref_multi.expected.sam new file mode 100644 index 000000000..997ead54c --- /dev/null +++ b/test/longrefs/longref_multi.expected.sam @@ -0,0 +1,46 @@ +@SQ SN:CHROMOSOME_I LN:10001009800 +SRR065390.14978392 16 CHROMOSOME_I 10000000002 1 27M1D73M * 0 0 
CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.921023 16 CHROMOSOME_I 10000000003 12 100M * 0 0 CTAAGCCTAAATCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################################???88:;98768700000<>:BBA?BBAB?BBBBBBBB>B>BB::;?:00000 AS:i:-6 XS:i:-13 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10G0C10G77 NM:i:3 +SRR065390.1871511 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 0:BB@>B<=B@???@=8@B>BB@CA@DACDCBBCCCA@CCCCACCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3743423 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##################?6@:7<=@3=@ABAAB>BDBBABADABDDDBDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.4251890 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########@BB=BCBBC?B>B;>B@@ADBBB@DBBBBDCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.5238868 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA @,=@@D8D;?BBB>;?BBB==BB@D;>D>BBB>BBDDBA@@BCCB@=BACBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.8289592 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A?@C9@@BC=AABDD@A@DC@CB=@BA?6@CCAAC@+CCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.14497557 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ######@:@@.>=><;;B>AB>>BB?B=>B=BD>BDADDD>CCDDDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.15617929 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA D?;;D>?C>CBAAACD@BB?B>BBDB>@BBDDBDC@CBDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.16049575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #######################@??BB8BBB@@:AB@BDBCCDCBDCCCCACCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.17838261 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################@>=?B@DCBDB>@D>DBADCDDD>CC@DCCCCBCCACCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22711273 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################B<@=<:6/0307==72@@=?788==;AAA:@CCAACCC?CCAACCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22922978 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##########################B=B>A@BBBC??=@=A@AC<><<8>C6CCCCC8CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23087186 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ############@:73???@6;D?B>:>BBA?B<>B@B>@B>@>BCDCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23506653 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############A/=A5::87@:=>6@AA>@CDBA@ABCB?BC>CD>DDBDC@CCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.23791575 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCCCACCCCAACCCTTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############################B4;:=B@>A@BCB@@ABCCBB@BCC@CCDCCDCCDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-12 XS:i:-12 XN:i:0 XM:i:6 XO:i:0 XG:i:0 YT:Z:UU MD:Z:7T0A1G2T2G3A79 NM:i:6 +SRR065390.25911768 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ##############@8B@B?9=:A?=@DDB>;B6?DDBCABABB@DDCCBDBDCCDACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26055380 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################DAA><0=>=B;?BACDBDABCBBC@CACACACACCACCCCCCCCCCCCCCCCCCCCCCBCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.26121674 16 CHROMOSOME_I 10000000003 1 100M * 0 0 
CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################?:AA::@DAAA>B??@A4@=BBBBDDBDBDCCBDDBCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.30352568 16 CHROMOSOME_I 10000000003 7 100M * 0 0 CTAGGGCTAACCCTCAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #################################################################A>>5A?CCC@CCCCCCCCCC?CC:C@A@==@A@A@ AS:i:-10 XS:i:-19 XN:i:0 XM:i:5 XO:i:0 XG:i:0 YT:Z:UU MD:Z:3A1C4G3A37G47 NM:i:5 +SRR065390.31341126 16 CHROMOSOME_I 10000000003 1 100M * 0 0 CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################?AD?D@BCAABBBD@=DBCDBAACCDCAABCDCCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.33653624 16 CHROMOSOME_I 10000000003 17 100M * 0 0 CTAATCCTAGGCCTAAGCCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################??8?000-+0000,@ABBBB@B:B@B>BB????>>>@@?::?6?>>;>>@ACCCCBCCBACCCC AS:i:-6 XS:i:-19 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:4G4A9T80 NM:i:3 +SRR065390.28043279 16 CHROMOSOME_I 10000000004 0 9M1I90M * 0 0 TCTTCCGATCTCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####A>=7A6DD=@AA?>AAABC@CAABDBCBBABDADBADCABBBDCDCDCACDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCC AS:i:-26 XS:i:-26 XN:i:0 XM:i:6 XO:i:1 XG:i:1 YT:Z:UU MD:Z:1A0A0G2T1A0G89 NM:i:7 +SRR065390.29270443 16 CHROMOSOME_I 10000000006 1 100M * 0 0 AGCCTAAGCCGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ###################################@:88@@>B>C>CCCCA@CCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:10T2G86 NM:i:2 +SRR065390.1364843 16 CHROMOSOME_I 
10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ####################@=A=8@:>@;@@=>>B8?C6CCCCCCCCCCACCCCBBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.10190875 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################@@@@@@;>BBB?>A6BAB?BB=BAB@?:A.<===@7:4::>8D@BABBACCCCAB@CCCDDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.13556211 0 CHROMOSOME_I 10000000011 0 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGATTGGAAGAGCGGCTAAGCAGGAACGCCGGGCTCGATCTCAGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBCDCCB>BBBBB########################################### AS:i:-50 XS:i:-50 XN:i:0 XM:i:25 XO:i:0 XG:i:0 YT:Z:UU MD:Z:57C0C1A0A0G0C0C0T0A0A1C6C0T0A1G1C0T0A0A1C2A0A0G0C2A3 NM:i:25 +SRR065390.20524775 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?9<8B=?@C8A<@?@C8CBDCCC=CCCCC??@CCDCCCCCCCCCCCCCCCCCCCCDCCCCCCCDCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.20580336 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ############################?:>@?@=>@=0<:CB>@B=DCADB@CCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.22573273 16 CHROMOSOME_I 10000000011 1 100M * 0 0 AAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC ##################################A9;?@CBBDBA>BB;ABDB>AAA;=>=0943@########### AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 
+SRR065390.20870114 0 CHROMOSOME_I 10000000012 1 100M * 0 0 AGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCABCCCC=BBBCA@B>B?D;B=>9?############################ AS:i:0 XS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 YT:Z:UU MD:Z:100 NM:i:0 +SRR065390.3863623 16 CHROMOSOME_I 10000000012 1 100M * 0 0 CGCCTACGCCTACGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC ##############################?@BB>B@BCABBB?DC@DADC@DCDCACDCBCCCCCCCCCCC@CCCCCCCCCCCCCCC1CCCCCCCCCCC AS:i:-6 XS:i:-6 XN:i:0 XM:i:3 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0A5A5A87 NM:i:3 +SRR065390.1659845 0 CHROMOSOME_I 10000000013 0 100M * 0 0 GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAACCTAAGCCTAAGCCCAACCCTAAGACCGAGACCGAGACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCAB@CCC######################################### AS:i:-22 XS:i:-22 XN:i:0 XM:i:11 XO:i:0 XG:i:0 YT:Z:UU MD:Z:60G14T2G6C1T0A2C1T0A2C1T0 NM:i:11 +SRR065390.1567418 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CACAGCCTACGTCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #########################################?:8A@<@>>BBB8>BBB@BBBB>@:??::87688:?:::?@<@@97866@?>@@;;>:< AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T0A6A1C88 NM:i:4 +SRR065390.4996386 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CCAAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###################################@@@@A=BB@C>>DCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-22 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T5T92 NM:i:2 +SRR065390.14822977 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CGAAGCCAGAGCCTAGGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ####################################B:B?:==2>6@B@@C>?>A@CB5@??@28C@CCCBC@CC?CC?A@CC:CBCCCCCCCCCCCCCC AS:i:-8 XS:i:-8 XN:i:0 XM:i:4 XO:i:0 XG:i:0 YT:Z:UU 
MD:Z:1T5T0A6A84 NM:i:4 +SRR065390.15148736 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CTGAGCCGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###########################CCBC<=C;9??<;==C@BCCCCC=CCCCACACACCBBCCCCCCCCCCCCCCCCCBCCCCCCCCCCCBCA?CCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A4T92 NM:i:2 +SRR065390.18089757 16 CHROMOSOME_I 10000000015 1 100M * 0 0 CTGAGCCTGAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ########################A212.0:?.>8?BB?B<@@C?CCBCB;DCCCACDCCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:2A5A91 NM:i:2 +SRR065390.25601994 16 CHROMOSOME_I 10000000015 17 100M * 0 0 ATAAGCCTAATCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #####################???DD?BD?BDBB>ACBDBDDBDDDBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-21 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C9G89 NM:i:2 +SRR065390.29400981 16 CHROMOSOME_I 10000000015 17 100M * 0 0 CGAACCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA ###############################A<:?C>>BCABABC?AD>BDADDDBDBBDBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-4 XS:i:-18 XN:i:0 XM:i:2 XO:i:0 XG:i:0 YT:Z:UU MD:Z:1T2G95 NM:i:2 diff --git a/test/sam.c b/test/sam.c index 7cbfd179e..bbd759fb9 100644 --- a/test/sam.c +++ b/test/sam.c @@ -31,6 +31,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include +#include // Suppress message for faidx_fetch_nseq(), which we're intentionally testing #include "htslib/hts_defs.h" @@ -1148,12 +1150,15 @@ static void samrecord_layout(void) size_t bam1_t_size, bam1_t_size2; - bam1_t_size = (36 + sizeof(int) + 4 + sizeof (char *) + sizeof(uint64_t) - + sizeof(uint32_t)); + assert(sizeof(hts_pos_t) == 8 || sizeof(hts_pos_t) == 4); + int core_size = sizeof(hts_pos_t) == 8 ? 
48 : 36; + bam1_t_size = (core_size + sizeof(int) + sizeof(char *) + sizeof(uint64_t) + + 2 * sizeof(uint32_t)); bam1_t_size2 = bam1_t_size + 4; // Account for padding on some platforms - if (sizeof (bam1_core_t) != 36) - fail("sizeof bam1_core_t is %zu, expected 36", sizeof (bam1_core_t)); + if (sizeof (bam1_core_t) != core_size) + fail("sizeof bam1_core_t is %zu, expected %d", + sizeof (bam1_core_t), core_size); if (sizeof (bam1_t) != bam1_t_size && sizeof (bam1_t) != bam1_t_size2) fail("sizeof bam1_t is %zu, expected either %zu or %zu", @@ -1167,6 +1172,166 @@ static void samrecord_layout(void) "test/sam_alignment.tmp.sam_", "w", NULL); } +static void check_big_ref(int parse_header) +{ + static const char sam_text[] = "data:," + "@HD\tVN:1.4\n" + "@SQ\tSN:large#1\tLN:5000000000\n" + "@SQ\tSN:small#1\tLN:100\n" + "@SQ\tSN:large#2\tLN:9223372034707292158\n" + "@SQ\tSN:small#2\tLN:1\n" + "r1\t0\tlarge#1\t4999999000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "r2\t0\tsmall#1\t1\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "r3\t0\tlarge#2\t9223372034707292000\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n" + "p1\t99\tlarge#2\t1\t50\t8M\t=\t9223372034707292150\t9223372034707292158\tACGTACGT\tabcdefgh\n" + "p1\t147\tlarge#2\t9223372034707292150\t50\t8M\t=\t1\t-9223372034707292158\tACGTACGT\tabcdefgh\n" + "r4\t0\tsmall#2\t2\t50\t8M\t*\t0\t0\tACGTACGT\tabcdefgh\n"; + const hts_pos_t expected_lengths[] = { + 5000000000LL, 100LL, 9223372034707292158LL, 1LL + }; + const int expected_tids[] = { + 0, 1, 2, 2, 2, 3 + }; + const int expected_mtid[] = { + -1, -1, -1, 2, 2, -1 + }; + const hts_pos_t expected_positions[] = { + 4999999000LL - 1, 1LL - 1, 9223372034707292000LL - 1, 1LL - 1, + 9223372034707292150LL - 1, 2LL - 1 + }; + const hts_pos_t expected_mpos[] = { + -1, -1, -1, 9223372034707292150LL - 1, 1LL - 1, -1 + }; + samFile *in = NULL, *out = NULL; + sam_hdr_t *header = NULL; + bam1_t *aln = bam_init1(); + const int num_refs = sizeof(expected_lengths) / sizeof(expected_lengths[0]); + 
const int num_align = sizeof(expected_tids) / sizeof(expected_tids[0]); + const char *outfname = "test/sam_big_ref.tmp.sam_"; + int i, r; + char buffer[sizeof(sam_text) + 1024]; + FILE *inf = NULL; + size_t bytes; + + if (!aln) { + fail("Out of memory"); + goto cleanup; + } + + in = sam_open(sam_text, "r"); + if (!in) { + fail("Opening SAM file"); + goto cleanup; + } + out = sam_open(outfname, "w"); + if (!out) { + fail("Opening output SAM file \"%s\"", outfname); + goto cleanup; + } + header = sam_hdr_read(in); + if (!header) { + fail("Reading SAM header"); + goto cleanup; + } + if (parse_header) { + // This will force the reader to be parsed + if (sam_hdr_count_lines(header, "SQ") != num_refs) { + fail("Wrong number of SQ lines in header"); + goto cleanup; + } + } + for (i = 0; i < num_refs; i++) { + hts_pos_t ln = sam_hdr_tid2len(header, i); + if (ln != expected_lengths[i]) { + fail("Wrong length for ref %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + i, expected_lengths[i], ln); + goto cleanup; + } + } + if (sam_hdr_write(out, header) < 0) { + fail("Failed to write SAM header"); + goto cleanup; + } + i = 0; + while ((r = sam_read1(in, header, aln)) >= 0) { + if (i >= num_align) { + fail("Too many alignment records.\n"); + goto cleanup; + } + if (aln->core.tid != expected_tids[i]) { + fail("Wrong tid for record %d : expected %d got %d\n", + i, expected_tids[i], aln->core.tid); + goto cleanup; + } + if (aln->core.mtid != expected_mtid[i]) { + fail("Wrong mate tid for record %d : expected %d got %d\n", + i, expected_mtid[i], aln->core.mtid); + goto cleanup; + } + if (aln->core.pos != expected_positions[i]) { + fail("Wrong position for record %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + i, expected_positions[i], aln->core.pos); + } + if (aln->core.mpos != expected_mpos[i]) { + fail("Wrong mate position for record %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + i, expected_mpos[i], aln->core.mpos); + } + if (sam_write1(out, header, aln) 
< 0) { + fail("Failed to write alignment record %d\n", i); + goto cleanup; + } + i++; + } + if (r < -1) { + fail("Error reading SAM alignment\n"); + goto cleanup; + } + if (i < num_align) { + fail("Not enough alignment records\n"); + goto cleanup; + } + r = sam_close(in); in = NULL; + if (r < 0) { + fail("sam_close(in)"); + goto cleanup; + } + r = sam_close(out); out = NULL; + if (r < 0) { + fail("sam_close(out)"); + goto cleanup; + } + + inf = fopen(outfname, "r"); + if (!inf) { + fail("Opening \"%s\"", outfname); + goto cleanup; + } + bytes = fread(buffer, 1, sizeof(buffer), inf); + if (bytes != sizeof(sam_text) - 7 + || memcmp(buffer, sam_text + 6, bytes - 7) != 0) { + fail("Output file does not match original version"); + fprintf(stderr, + "---------- Expected:\n%.*s\n" + "++++++++++ Got:\n%.*s\n" + "====================\n", + (int) sizeof(sam_text) - 7, sam_text + 6, + (int) bytes, buffer); + goto cleanup; + } + + cleanup: + bam_destroy1(aln); + sam_hdr_destroy(header); + if (in) sam_close(in); + if (out) sam_close(out); + if (inf) fclose(inf); + unlink(outfname); + return; +} + static void faidx1(const char *filename) { int n, n_exp = 0, n_fq_exp = 0; @@ -1575,6 +1740,8 @@ int main(int argc, char **argv) test_text_file("test/fastqs.fq", 500); check_enum1(); check_cigar_tab(); + check_big_ref(0); + check_big_ref(1); test_mempolicy(); for (i = 1; i < argc; i++) faidx1(argv[i]); diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index ebe93904a..ee0aadedc 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -31,6 +31,7 @@ #include #include #include +#include #include void error(const char *format, ...) 
@@ -103,7 +104,7 @@ int main(int argc, char *argv[]) { if ( !bcf_sr_has_line(sr,i) ) continue; bcf1_t *rec = bcf_sr_get_line(sr, i); - printf("%s:%d", bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); + printf("%s:%"PRIhts_pos, bcf_seqname(bcf_sr_get_header(sr,i),rec),rec->pos+1); break; } diff --git a/test/test-parse-reg.c b/test/test-parse-reg.c index 404e98ddf..74bb3187f 100644 --- a/test/test-parse-reg.c +++ b/test/test-parse-reg.c @@ -47,6 +47,10 @@ #include #include +#ifndef INT64_32_MAX +#define INT64_32_MAX ((((int64_t)INT_MAX)<<32)|INT_MAX) +#endif + void reg_expected(sam_hdr_t *hdr, const char *reg, int flags, char *reg_exp, int tid_exp, int64_t beg_exp, int64_t end_exp) { const char *reg_out; @@ -87,26 +91,26 @@ int reg_test(char *fn) { // 5 chr1,chr3 // Check range extensions. - reg_expected(hdr, "chr1", 0, "", 0, 0, INT64_MAX); - reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT64_MAX); + reg_expected(hdr, "chr1", 0, "", 0, 0, INT64_32_MAX); + reg_expected(hdr, "chr1:50", 0, "", 0, 49, INT64_32_MAX); reg_expected(hdr, "chr1:50", HTS_PARSE_ONE_COORD, "", 0, 49, 50); reg_expected(hdr, "chr1:50-100", 0, "", 0, 49, 100); - reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT64_MAX); + reg_expected(hdr, "chr1:50-", 0, "", 0, 49, INT64_32_MAX); reg_expected(hdr, "chr1:-50", 0, "", 0, 0, 50); // Check quoting fprintf(stderr, "Expected error: "); reg_expected(hdr, "chr1:100-200", 0, NULL, 0, 0, 0); // ambiguous reg_expected(hdr, "{chr1}:100-200", 0, "", 0, 99, 200); - reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT64_MAX); + reg_expected(hdr, "{chr1:100-200}", 0, "", 2, 0, INT64_32_MAX); reg_expected(hdr, "{chr1:100-200}:100-200", 0, "", 2, 99, 200); reg_expected(hdr, "{chr2:100-200}:100-200", 0, "", 3, 99, 200); reg_expected(hdr, "chr2:100-200:100-200", 0, "", 3, 99, 200); - reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT64_MAX); + reg_expected(hdr, "chr2:100-200", 0, "", 3, 0, INT64_32_MAX); // Check numerics - reg_expected(hdr, "chr3", 0, "", 4, 0, 
INT64_MAX); - reg_expected(hdr, "chr3:", 0, "", 4, 0, INT64_MAX); + reg_expected(hdr, "chr3", 0, "", 4, 0, INT64_32_MAX); + reg_expected(hdr, "chr3:", 0, "", 4, 0, INT64_32_MAX); reg_expected(hdr, "chr3:1000-1500", 0, "", 4, 999, 1500); reg_expected(hdr, "chr3:1,000-1,500", 0, "", 4, 999, 1500); reg_expected(hdr, "chr3:1k-1.5K", 0, "", 4, 999, 1500); @@ -114,11 +118,11 @@ int reg_test(char *fn) { reg_expected(hdr, "chr3:1e3-15e2", 0, "", 4, 999, 1500); // Check list mode - reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT64_MAX); + reg_expected(hdr, "chr1,chr3", HTS_PARSE_LIST, "chr3", 0, 0, INT64_32_MAX); fprintf(stderr, "Expected error: "); reg_expected(hdr, "chr1:100-200,chr3", HTS_PARSE_LIST, NULL, 0, 0, 0); // ambiguous - reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT64_MAX); - reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT64_MAX); + reg_expected(hdr, "{chr1,chr3}", HTS_PARSE_LIST, "", 5, 0, INT64_32_MAX); + reg_expected(hdr, "{chr1,chr3},chr1", HTS_PARSE_LIST, "chr1", 5, 0, INT64_32_MAX); // incorrect usage; first reg is valid (but not what user expects). reg_expected(hdr, "chr3:1,000-1,500", HTS_PARSE_LIST | HTS_PARSE_ONE_COORD, "000-1,500", 4, 0, 1); diff --git a/test/test.pl b/test/test.pl index 8b84ca3c5..ca0d766c1 100755 --- a/test/test.pl +++ b/test/test.pl @@ -263,6 +263,11 @@ sub test_compare } } + if (exists($args{fix_newlines})) { + $exp =~ s/\015\012/\n/g; + $out =~ s/\015\012/\n/g; + } + if ( $exp ne $out ) { failed($opts,$test,"The outputs differ:\n\t\t$exp_fn\n\t\t$out_fn"); @@ -631,6 +636,42 @@ sub test_view } else { failed($opts, "no_hdr_sq tests", "$test_view_failures subtests failed"); } + + # File with large (> 2Gbases) positions + # Only works for SAM at the moment, but we can still round-trip it. 
+ print "test_view testing large (> 2Gbases) positions:\n"; + $test_view_failures = 0; + testv $opts, "./test_view $tv_args -z -p longrefs/longref.tmp.sam.gz -x longrefs/longref.tmp.sam.gz.csi.otf -m 14 longrefs/longref.sam"; + testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.sam.gz"; + testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + + # Build index and compare with on-the-fly one made earlier. + test_compare $opts, "$$opts{path}/test_index -c longrefs/longref.tmp.sam.gz", "longrefs/longref.tmp.sam.gz.csi.otf", "longrefs/longref.tmp.sam.gz.csi", gz=>1; + + # Large position iterator tests + testv $opts, "./test_view $tv_args -p longrefs/longref_itr.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003"; + testv $opts, "./compare_sam.pl longrefs/longref_itr.expected.sam longrefs/longref_itr.tmp.sam"; + testv $opts, "./test_view $tv_args -M -p longrefs/longref_multi.tmp.sam longrefs/longref.tmp.sam.gz CHROMOSOME_I:10000000000-10000000003 CHROMOSOME_I:10000000100-10000000110"; + testv $opts, "./compare_sam.pl longrefs/longref_multi.expected.sam longrefs/longref_multi.tmp.sam"; + + # VCF round trip + unlink("longrefs/index.tmp.vcf.gz.csi"); # To stop vcf_hdr_read from reading a stale index + testv $opts, "./test_view $tv_args -z -p longrefs/index.tmp.vcf.gz -x longrefs/index.tmp.vcf.gz.csi.otf -m 14 longrefs/index.vcf"; + testv $opts, "./test_view $tv_args -p longrefs/index.tmp.vcf_ longrefs/index.tmp.vcf.gz"; + testv $opts, "cmp longrefs/index.vcf longrefs/index.tmp.vcf_"; + + # Build index and compare with on-the-fly one made earlier. 
+ test_compare $opts, "$$opts{path}/test_index -c longrefs/index.tmp.vcf.gz", "longrefs/index.tmp.vcf.gz.csi.otf", "longrefs/index.tmp.vcf.gz.csi", gz=>1; + + # test_view can't do indexed look-ups on vcf, but we can use tabix + test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000100-10010000105 > longrefs/index.tmp.tabix1.vcf", "longrefs/index.expected1.vcf", "longrefs/index.tmp.tabix1.vcf", fix_newlines => 1; + test_compare $opts, "$$opts{bin}/tabix longrefs/index.tmp.vcf.gz 1:10010000120-10010000130 > longrefs/index.tmp.tabix2.vcf", "longrefs/index.expected2.vcf", "longrefs/index.tmp.tabix2.vcf", fix_newlines => 1; + + if ($test_view_failures == 0) { + passed($opts, "large position tests"); + } else { + failed($opts, "large position tests", "$test_view_failures subtests failed"); + } } # Tests CRAM's ability to correctly preserve MD and NM, irrespective of whether diff --git a/vcf.c b/vcf.c index 1ace26b26..c2228312f 100644 --- a/vcf.c +++ b/vcf.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include #include "htslib/vcf.h" @@ -498,18 +499,23 @@ static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_i static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) { // contig - int i,j, ret, replacing = 0; + int i, ret, replacing = 0; khint_t k; char *str; if ( !strcmp(hrec->key, "contig") ) { + hts_pos_t len = 0; hrec->type = BCF_HL_CTG; // Get the contig ID ($str) and length ($j) i = bcf_hrec_find_key(hrec,"length"); - if ( i<0 ) j = 0; - else if ( sscanf(hrec->vals[i],"%d",&j)!=1 ) return 0; + if ( i<0 ) len = 0; + else { + char *end = hrec->vals[i]; + len = strtoll(hrec->vals[i], &end, 10); + if (end == hrec->vals[i] || len < 0) return 0; + } i = bcf_hrec_find_key(hrec,"ID"); if ( i<0 ) return 0; @@ -547,7 +553,7 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) kh_val(d, k) = bcf_idinfo_def; kh_val(d, k).id = idx; - kh_val(d, k).info[0] = j; + kh_val(d, k).info[0] = len; kh_val(d, k).hrec[0] = hrec; if (bcf_hdr_set_idx(hdr, BCF_DT_CTG, kh_key(d,k), &kh_val(d,k)) < 0) { if (!replacing) { @@ -1180,21 +1186,27 @@ void bcf_destroy(bcf1_t *v) static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) { - uint32_t x[8]; + union { + uint32_t i; + float f; + } x[8]; ssize_t ret; if ((ret = bgzf_read(fp, x, 32)) != 32) { if (ret == 0) return -1; return -2; } bcf_clear1(v); - if (x[0] < 24) return -2; - x[0] -= 24; // to exclude six 32-bit integers - if (ks_resize(&v->shared, x[0]) != 0) return -2; - if (ks_resize(&v->indiv, x[1]) != 0) return -2; - memcpy(v, x + 2, 16); - v->n_allele = x[6]>>16; v->n_info = x[6]&0xffff; - v->n_fmt = x[7]>>24; v->n_sample = x[7]&0xffffff; - v->shared.l = x[0], v->indiv.l = x[1]; + if (x[0].i < 24) return -2; + x[0].i -= 24; // to exclude six 32-bit integers + if (ks_resize(&v->shared, x[0].i) != 0) return -2; + if (ks_resize(&v->indiv, x[1].i) != 0) return -2; + v->rid = x[2].i; + v->pos = x[3].i; + v->rlen = x[4].i; + v->qual = x[5].f; + 
v->n_allele = x[6].i>>16; v->n_info = x[6].i&0xffff; + v->n_fmt = x[7].i>>24; v->n_sample = x[7].i&0xffffff; + v->shared.l = x[0].i, v->indiv.l = x[1].i; // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; @@ -1436,7 +1448,7 @@ int bcf_read(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) return bcf_subset_format(h,v); } -int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, int *beg, int *end) +int bcf_readrec(BGZF *fp, void *null, void *vv, int *tid, hts_pos_t *beg, hts_pos_t *end) { bcf1_t *v = (bcf1_t *) vv; int ret; @@ -1684,7 +1696,7 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) } if ( bcf_hdr_nsamples(h)!=v->n_sample ) { - hts_log_error("Broken VCF record, the number of columns at %s:%d does not match the number of samples (%d vs %d)", + hts_log_error("Broken VCF record, the number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); return -1; } @@ -1704,12 +1716,18 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) bcf1_sync(v); // check if the BCF record was modified BGZF *fp = hfp->fp.bgzf; - uint32_t x[8]; - x[0] = v->shared.l + 24; // to include six 32-bit integers - x[1] = v->indiv.l; - memcpy(x + 2, v, 16); - x[6] = (uint32_t)v->n_allele<<16 | v->n_info; - x[7] = (uint32_t)v->n_fmt<<24 | v->n_sample; + union { + uint32_t i; + float f; + } x[8]; + x[0].i = v->shared.l + 24; // to include six 32-bit integers + x[1].i = v->indiv.l; + x[2].i = v->rid; + x[3].i = v->pos; + x[4].i = v->rlen; + x[5].f = v->qual; + x[6].i = (uint32_t)v->n_allele<<16 | v->n_info; + x[7].i = (uint32_t)v->n_fmt<<24 | v->n_sample; if ( bgzf_write(fp, x, 32) != 32 ) return -1; if ( bgzf_write(fp, v->shared.s, v->shared.l) != v->shared.l ) return -1; if ( bgzf_write(fp, v->indiv.s, v->indiv.l) != v->indiv.l ) return -1; @@ -2009,6 +2027,24 @@ int 
bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) return 0; // FIXME: check for errs in this function } +static int bcf_enc_long1(kstring_t *s, int64_t x) { + uint32_t e = 0; + if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) + return bcf_enc_int1(s, x); + if (x == bcf_int64_vector_end) { + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(bcf_int8_vector_end, s) < 0; + } else if (x == bcf_int64_missing) { + e |= bcf_enc_size(s, 1, BCF_BT_INT8); + e |= kputc(bcf_int8_missing, s) < 0; + } else { + e |= bcf_enc_size(s, 1, BCF_BT_INT64); + e |= ks_expand(s, 8); + if (e == 0) { u64_to_le(x, (uint8_t *) s->s + s->l); s->l += 8; } + } + return e == 0 ? 0 : -1; +} + static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { uint8_t *p; size_t i; @@ -2132,7 +2168,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p char *end = s->s + s->l; if ( q>=end ) { - hts_log_error("FORMAT column with no sample columns starting at %s:%d", s->s, v->pos+1); + hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", s->s, v->pos+1); v->errcode |= BCF_ERR_NCOLS; return -1; } @@ -2148,7 +2184,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; - hts_log_error("FORMAT column at %s:%d lists more identifiers than htslib can handle", + hts_log_error("FORMAT column at %s:%"PRIhts_pos" lists more identifiers than htslib can handle", bcf_seqname(h,v), v->pos+1); return -1; } @@ -2220,7 +2256,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p j++; if ( j>=v->n_fmt ) { - hts_log_error("Incorrect number of FORMAT fields at %s:%d", + hts_log_error("Incorrect number of FORMAT fields at %s:%"PRIhts_pos"", h->id[BCF_DT_CTG][v->rid].key, v->pos+1); v->errcode |= BCF_ERR_NCOLS; return -1; @@ -2327,7 +2363,7 @@ static 
int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } else { char buffer[8]; - hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%d", + hts_log_error("Invalid character '%s' in '%s' FORMAT field at %s:%"PRIhts_pos"", dump_char(buffer, *t), h->id[BCF_DT_ID][z->key].key, bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_CHAR; return -1; @@ -2386,14 +2422,14 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if ( v->n_sample!=bcf_hdr_nsamples(h) ) { - hts_log_error("Number of columns at %s:%d does not match the number of samples (%d vs %d)", + hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", bcf_seqname(h,v), v->pos+1, v->n_sample, bcf_hdr_nsamples(h)); v->errcode |= BCF_ERR_NCOLS; return -1; } if ( v->indiv.l > 0xffffffff ) { - hts_log_error("The FORMAT at %s:%d is too long", bcf_seqname(h,v), v->pos+1); + hts_log_error("The FORMAT at %s:%"PRIhts_pos" is too long", bcf_seqname(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // Error recovery: return -1 if this is a critical error or 0 if we want to ignore the FORMAT and proceed @@ -2453,7 +2489,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) } v->rid = kh_val(d, k).id; } else if (i == 1) { // POS - v->pos = atoi(p) - 1; + v->pos = strtoll(p, NULL, 10) - 1; } else if (i == 2) { // ID if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); else bcf_enc_size(str, 0, BCF_BT_CHAR); @@ -2588,29 +2624,39 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) val_a = z; } if ((y>>4&0xf) == BCF_HT_INT) { - for (i = 0, t = val; i < n_val; ++i, ++t) + // Allow first value only to be 64 bit + // (for large END value) + int64_t v64 = strtoll(val, &te, 10); + if ( te==val ) { // conversion failed + val_a[0] = bcf_int32_missing; + v64 = bcf_int64_missing; + } else { + val_a[0] = v64 >= BCF_MIN_BT_INT32 && v64 <= BCF_MAX_BT_INT32 ? 
v64 : bcf_int32_missing; + } + for (t = te; *t && *t != ','; t++); + if (*t == ',') ++t; + for (i = 1; i < n_val; ++i, ++t) { val_a[i] = strtol(t, &te, 10); if ( te==t ) // conversion failed - { val_a[i] = bcf_int32_missing; - while ( *te && *te!=',' ) te++; - } - t = te; + for (t = te; *t && *t != ','; t++); } - bcf_enc_vint(str, n_val, val_a, -1); - if (strcmp(key, "END") == 0) v->rlen = val_a[0] - v->pos; + if (n_val == 1) { + bcf_enc_long1(str, v64); + } else { + bcf_enc_vint(str, n_val, val_a, -1); + } + if (strcmp(key, "END") == 0) + v->rlen = v64 - v->pos; } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)val_a; for (i = 0, t = val; i < n_val; ++i, ++t) { val_f[i] = strtod(t, &te); if ( te==t ) // conversion failed - { bcf_float_set_missing(val_f[i]); - while ( *te && *te!=',' ) te++; - } - t = te; + for (t = te; *t && *t != ','; t++); } bcf_enc_vfloat(str, n_val, val_f); } @@ -2673,6 +2719,7 @@ static inline uint8_t *bcf_unpack_info_core1(uint8_t *ptr, bcf_info_t *info) else if (info->type == BCF_BT_INT32) info->v1.i = le_to_i32(ptr); else if (info->type == BCF_BT_FLOAT) info->v1.f = le_to_float(ptr); else if (info->type == BCF_BT_INT16) info->v1.i = le_to_i16(ptr); + else if (info->type == BCF_BT_INT64) info->v1.i = le_to_i64(ptr); } ptr += info->len << bcf_type_shift[info->type]; info->vptr_len = ptr - info->vptr; @@ -2753,7 +2800,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) int i; bcf_unpack((bcf1_t*)v, BCF_UN_ALL); kputs(h->id[BCF_DT_CTG][v->rid].key, s); // CHROM - kputc('\t', s); kputw(v->pos + 1, s); // POS + kputc('\t', s); kputll(v->pos + 1, s); // POS kputc('\t', s); kputs(v->d.id ? 
v->d.id : ".", s); // ID kputc('\t', s); // REF if (v->n_allele > 0) kputs(v->d.allele[0], s); @@ -2798,6 +2845,7 @@ int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s) case BCF_BT_INT8: if ( z->v1.i==bcf_int8_missing ) kputc('.', s); else kputw(z->v1.i, s); break; case BCF_BT_INT16: if ( z->v1.i==bcf_int16_missing ) kputc('.', s); else kputw(z->v1.i, s); break; case BCF_BT_INT32: if ( z->v1.i==bcf_int32_missing ) kputc('.', s); else kputw(z->v1.i, s); break; + case BCF_BT_INT64: if ( z->v1.i==bcf_int64_missing ) kputc('.', s); else kputll(z->v1.i, s); break; case BCF_BT_FLOAT: if ( bcf_float_is_missing(z->v1.f) ) kputc('.', s); else kputd(z->v1.f, s); break; case BCF_BT_CHAR: kputc(z->v1.i, s); break; default: hts_log_error("Unexpected type %d", z->type); exit(1); break; @@ -2903,26 +2951,41 @@ int bcf_hdr_id2int(const bcf_hdr_t *h, int which, const char *id) *** BCF indexing *** ********************/ +// Calculate number of index levels given min_shift and the header contig +// list. Also returns number of contigs in *nids_out. +static int idx_calc_n_lvls_ids(const bcf_hdr_t *h, int min_shift, + int starting_n_lvls, int *nids_out) +{ + int n_lvls, i, nids = 0; + int64_t max_len = 0, s; + + for (i = 0; i < h->n[BCF_DT_CTG]; ++i) + { + if ( !h->id[BCF_DT_CTG][i].val ) continue; + if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) + max_len = h->id[BCF_DT_CTG][i].val->info[0]; + nids++; + } + if ( !max_len ) max_len = (1LL<<31) - 1; // In case contig line is broken. 
+ max_len += 256; + s = 1LL << (min_shift + starting_n_lvls * 3); + for (n_lvls = starting_n_lvls; max_len > s; ++n_lvls, s <<= 3); + + if (nids_out) *nids_out = nids; + return n_lvls; +} + hts_idx_t *bcf_index(htsFile *fp, int min_shift) { - int n_lvls, i; + int n_lvls; bcf1_t *b = NULL; hts_idx_t *idx = NULL; bcf_hdr_t *h; - int64_t max_len = 0, s; int r; h = bcf_hdr_read(fp); if ( !h ) return NULL; int nids = 0; - for (i = 0; i < h->n[BCF_DT_CTG]; ++i) - { - if ( !h->id[BCF_DT_CTG][i].val ) continue; - if ( max_len < h->id[BCF_DT_CTG][i].val->info[0] ) max_len = h->id[BCF_DT_CTG][i].val->info[0]; - nids++; - } - if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. - max_len += 256; - for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); + n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids); idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); if (!idx) goto fail; b = bcf_init1(); @@ -3012,23 +3075,17 @@ int bcf_index_build(const char *fn, int min_shift) // Initialise fp->idx for the current format type. // This must be called after the header has been written but no other data. static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { - int n_lvls, i, fmt; - int64_t max_len = 0; - - for (i = 0; i < h->n[BCF_DT_CTG]; i++) { - if (!h->id[BCF_DT_CTG][i].val) continue; - if (max_len < h->id[BCF_DT_CTG][i].val->info[0]) - max_len = h->id[BCF_DT_CTG][i].val->info[0]; - } - if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. 
- max_len += 256; + int n_lvls, fmt; if (min_shift == 0) { min_shift = 14; n_lvls = 5; fmt = HTS_FMT_TBI; } else { - n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3; + // Set initial n_lvls to match tbx_index() + int starting_n_lvls = (TBX_MAX_SHIFT - min_shift + 2) / 3; + // Increase if necessary + n_lvls = idx_calc_n_lvls_ids(h, min_shift, starting_n_lvls, NULL); fmt = HTS_FMT_CSI; } @@ -3058,8 +3115,6 @@ static int vcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fn // This must be called after the header has been written but no other data. int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { int n_lvls, nids = 0; - int64_t max_len = 0, s; - int i; if (fp->format.format == vcf) return vcf_idx_init(fp, h, min_shift, fnidx); @@ -3067,15 +3122,7 @@ int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { if (!min_shift) min_shift = 14; - for (i = 0; i < h->n[BCF_DT_CTG]; i++) { - if (!h->id[BCF_DT_CTG][i].val) continue; - if (max_len < h->id[BCF_DT_CTG][i].val->info[0]) - max_len = h->id[BCF_DT_CTG][i].val->info[0]; - nids++; - } - if ( !max_len ) max_len = ((int64_t)1<<31) - 1; // In case contig line is broken. 
- max_len += 256; - for (n_lvls = 0, s = 1< s; ++n_lvls, s <<= 3); + n_lvls = idx_calc_n_lvls_ids(h, min_shift, 0, &nids); fp->idx = hts_idx_init(nids, HTS_FMT_CSI, bgzf_tell(fp->fp.bgzf), min_shift, n_lvls); if (!fp->idx) return -1; @@ -3739,6 +3786,14 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v else bcf_enc_vchar(&str, strlen((char*)values), (char*)values); } + else if ( type==BCF_HT_LONG ) + { + if (n != 1) { + hts_log_error("Only storing a single BCF_HT_LONG value is supported"); + abort(); + } + bcf_enc_long1(&str, *(int64_t *) values); + } else { hts_log_error("The type %d not implemented yet", type); @@ -3780,7 +3835,11 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v } line->unpacked |= BCF_UN_INFO; - if ( n==1 && !strcmp("END",key) ) line->rlen = ((int32_t*)values)[0] - line->pos; + if ( n==1 && !strcmp("END",key) ) { + assert(type == BCF_HT_INT || type == BCF_HT_LONG); + int64_t end = type == BCF_HT_INT ? *(int32_t *) values : *(int64_t *) values; + line->rlen = end - line->pos; + } return 0; } @@ -4123,7 +4182,7 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi { int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header - if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=type ) return -2; // expected different type + if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); @@ -4147,7 +4206,15 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi } // Make sure the buffer is big enough - int size1 = type==BCF_HT_INT ? 
sizeof(int32_t) : sizeof(float); + int size1; + switch (type) { + case BCF_HT_INT: size1 = sizeof(int32_t); break; + case BCF_HT_LONG: size1 = sizeof(int64_t); break; + case BCF_HT_REAL: size1 = sizeof(float); break; + default: + hts_log_error("Unexpected output type %d", type); + return -2; + } if ( *ndst < info->len ) { *ndst = info->len; @@ -4168,11 +4235,28 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi ret = j; \ } while (0) switch (info->type) { - case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; - case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; - case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; + case BCF_BT_INT8: + if (type == BCF_HT_LONG) { + BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); + } + break; + case BCF_BT_INT16: + if (type == BCF_HT_LONG) { + BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); + } + break; + case BCF_BT_INT32: + if (type == BCF_HT_LONG) { + BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); break; + } else { + BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; + } case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); 
break; - default: hts_log_error("Unexpected type %d", info->type); exit(1); + default: hts_log_error("Unexpected type %d", info->type); return -2; } #undef BRANCH return ret; // set by BRANCH diff --git a/vcfutils.c b/vcfutils.c index 008dbe6f5..b2a477b4a 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -23,6 +23,7 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include +#include #include "htslib/vcfutils.h" #include "htslib/kbitset.h" @@ -64,12 +65,12 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t); break; case BCF_BT_INT16: BRANCH_INT(int16_t); break; case BCF_BT_INT32: BRANCH_INT(int32_t); break; - default: hts_log_error("Unexpected type %d at %s:%d", ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, ac_type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT if ( anid[BCF_DT_CTG][line->rid].key, line->pos+1); + hts_log_error("Incorrect AN/AC counts at %s:%"PRIhts_pos, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); } ac[0] = an - nac; @@ -98,7 +99,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( p[ial]>>1 > line->n_allele ) \ { \ - hts_log_error("Incorrect allele (\"%d\") in %s at %s:%d", (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Incorrect allele (\"%d\") in %s at %s:%"PRIhts_pos, (p[ial]>>1)-1, header->samples[i], header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ exit(1); \ } \ ac[(p[ial]>>1)-1]++; \ @@ -109,7 +110,7 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) case BCF_BT_INT8: BRANCH_INT(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH_INT(int16_t, bcf_int16_vector_end); break; case 
BCF_BT_INT32: BRANCH_INT(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected type %d at %s:%d", fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; + default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt_gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); exit(1); break; } #undef BRANCH_INT return 1; @@ -188,7 +189,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) if ( p[ial]==vector_end ) break; /* smaller ploidy */ \ if ( bcf_gt_is_missing(p[ial]) ) continue; /* missing allele */ \ if ( (p[ial]>>1)-1 >= line->n_allele ) { \ - hts_log_error("Allele index is out of bounds at %s:%d", header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ + hts_log_error("Allele index is out of bounds at %s:%"PRIhts_pos, header->id[BCF_DT_CTG][line->rid].key, line->pos+1); \ ret = -1; \ goto clean; \ } \ @@ -200,7 +201,7 @@ int bcf_trim_alleles(const bcf_hdr_t *header, bcf1_t *line) case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_vector_end); break; case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_vector_end); break; case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_vector_end); break; - default: hts_log_error("Unexpected GT %d at %s:%d", + default: hts_log_error("Unexpected GT %d at %s:%"PRIhts_pos, gt->type, header->id[BCF_DT_CTG][line->rid].key, line->pos + 1); goto clean; } @@ -265,7 +266,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int nR_new = line->n_allele-nrm; if ( nR_new<=0 ) // should not be able to remove reference allele { - hts_log_error("Cannot remove reference allele at %s:%d [%d]", + hts_log_error("Cannot remove reference allele at %s:%"PRIhts_pos" [%d]", bcf_seqname(header,line), line->pos+1, nR_new); goto err; } @@ -296,7 +297,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access INFO/%s at %s:%d [%d]", + hts_log_error("Could not access 
INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -334,7 +335,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( j==1 && s == '.' ) continue; // missing if ( j!=nexp ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j); goto err; } @@ -365,7 +366,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( n==1 && s == '.' ) continue; // missing if ( n!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, n); goto err; } @@ -374,7 +375,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%d [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -406,7 +407,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nA_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), 
bcf_seqname(header,line), line->pos+1, nA_ori, nret); goto err; } @@ -418,7 +419,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nR_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nR_ori, nret); goto err; } @@ -450,7 +451,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nret!=nG_ori ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nG_ori, nret); goto err; } @@ -484,7 +485,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_info(header, line, bcf_hdr_int2id(header,BCF_DT_ID,info->key), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update INFO/%s at %s:%d [%d]", + hts_log_error("Could not update INFO/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -510,7 +511,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int al = bcf_gt_allele(ptr[j]); if ( !( al<nR_ori && map[al]>=0 ) ) { - hts_log_error("Problem updating genotypes at %s:%d [ al<nR_ori && map[al]>=0 :: al=%d,nR_ori=%d,map[al]=%d ]", + hts_log_error("Problem updating genotypes at %s:%"PRIhts_pos" [ al<nR_ori && map[al]>=0 :: al=%d,nR_ori=%d,map[al]=%d ]", bcf_seqname(header,line), line->pos+1, al, nR_ori, map[al]); goto err; } @@ -521,7 +522,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_genotypes(header, line,
(void*)dat, nret*line->n_sample); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/GT at %s:%d [%d]", + hts_log_error("Could not update FORMAT/GT at %s:%"PRIhts_pos" [%d]", bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -548,7 +549,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb mdat_bytes = mdat * size; if ( nret<0 ) { - hts_log_error("Could not access FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not access FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -589,7 +590,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( k_src==1 && s == '.' ) continue; // missing if ( k_src!=nexp ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=%c=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=%c=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src); goto err; } @@ -614,7 +615,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( nexp==1 && s == '.' 
) continue; // missing if ( nexp!=nG_ori && nexp!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(diploid) or %d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d(diploid) or %d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nR_ori, nexp); goto err; } @@ -659,7 +660,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb } if ( k_src!=nR_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d(haploid), but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d(haploid), but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, k_src); goto err; } @@ -671,7 +672,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)str.s, str.l, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; } @@ -707,7 +708,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nA_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=A=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=A=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nA_ori, nori); goto err; } @@ -719,7 +720,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nR_ori ) { - hts_log_error("Unexpected number of 
values in FORMAT/%s at %s:%d; expected Number=R=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nR_ori, nori); goto err; } @@ -755,7 +756,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( nori!=nG_ori ) { - hts_log_error("Unexpected number of values in FORMAT/%s at %s:%d; expected Number=G=%d, but found %d", + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=G=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nG_ori, nori); goto err; } @@ -808,7 +809,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nret = bcf_update_format(header, line, bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), (void*)dat, ndat, type); if ( nret<0 ) { - hts_log_error("Could not update FORMAT/%s at %s:%d [%d]", + hts_log_error("Could not update FORMAT/%s at %s:%"PRIhts_pos" [%d]", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname(header,line), line->pos+1, nret); goto err; }