From 7c1d3ccd1ae10dd81cbcfc84d1dd984534cf2caa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 4 Jul 2023 15:49:44 +0100 Subject: [PATCH 1/3] The first stage of vcf_parse_format speed improvements. This simply turns the monolithic vcf_parse_format functions into a series of numbers sub-functions whose primary is to localise the variables to that code block and to make it easier to see the structure of the tasks being performed. There is no code optimisation here and the main algorithm is unchanged, so this is just moving of code from 1 function to multiple functions. However it makes the next commit easier to understand as we're not trying to see a delta mixed in with restructuring. An unexpected consequence however of making the variables local to their blocks is that it also speeds up the code. The first step in separating this code into functions was simply adding curly braces around each code segment and moving the function-global variables into their respective blocks. The before/after benchmarkjs on 100,000 lines of a multi-sample G1K VCF are ("perf stat" cycle counts): ORIG LOCALISED gcc7 29335942870 27353443704 gcc13 31974757094 31452452908 clang13 31290989382 29508665020 Benchmarked again after moving to actual functions, but the difference was tiny in comparison.) --- vcf.c | 146 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 32 deletions(-) diff --git a/vcf.c b/vcf.c index 9e589f993..4dbac2580 100644 --- a/vcf.c +++ b/vcf.c @@ -2633,22 +2633,11 @@ static inline int align_mem(kstring_t *s) return e == 0 ? 0 : -1; } -// p,q is the start and the end of the FORMAT field #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */ -static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) -{ - if ( !bcf_hdr_nsamples(h) ) return 0; - - static int extreme_val_warned = 0; - char *r, *t; - int j, l, m, g, overflow = 0; - khint_t k; - ks_tokaux_t aux1; - vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; - kstring_t *mem = (kstring_t*)&h->mem; - fmt_aux_t fmt[MAX_N_FMT]; - mem->l = 0; +// detect FORMAT "." +static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q) { char *end = s->s + s->l; if ( q>=end ) { @@ -2661,10 +2650,19 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "." { v->n_sample = bcf_hdr_nsamples(h); - return 0; + return 1; } - // get format information from the dictionary + return 0; +} + +// get format information from the dictionary +static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { + const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; + char *t; + int j; + ks_tokaux_t aux1; for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; @@ -2674,7 +2672,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } *(char*)aux1.p = 0; - k = kh_get(vdict, d, t); + khint_t k = kh_get(vdict, d, t); if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { if ( t[0]=='.' && t[1]==0 ) { @@ -2706,10 +2704,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; v->n_fmt++; } - // compute max + return 0; +} + +// compute max +static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { int n_sample_ori = -1; - r = q + 1; // r: position in the format string - l = 0, m = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles + char *r = q + 1; // r: position in the format string + int l = 0, m = 1, g = 1, j; + v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles + char *end = s->s + s->l; while ( rmem; + + int j; for (j = 0; j < v->n_fmt; ++j) { fmt_aux_t *f = &fmt[j]; if ( !f->max_m ) f->max_m = 1; // omitted trailing format field @@ -2804,11 +2817,25 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } mem->l += v->n_sample * f->size; } - for (j = 0; j < v->n_fmt; ++j) - fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; - // fill the sample fields; at beginning of the loop, t points to the first char of a format - n_sample_ori = -1; - t = q + 1; m = 0; // m: sample id + + { + int j; + for (j = 0; j < v->n_fmt; ++j) + fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; + } + + return 0; +} + +// fill the sample fields; at beginning of the loop +static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { + static int extreme_val_warned = 0; + int n_sample_ori = -1; + // t points to the first char of a format + char *t = q + 1; + int m = 0; // m: sample id + char *end = s->s + s->l; while ( ty>>4&0xf) == BCF_HT_STR) { + int l; if (z->is_gt) { // genotypes int32_t is_phased = 0; uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); uint32_t unreadable = 0; uint32_t max = 0; - overflow = 0; + int overflow = 0; for (l = 0;; ++t) { if (*t == '.') { ++t, x[l++] = is_phased; @@ -2867,16 +2895,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; } else { char *x = (char*)z->buf + z->size * (size_t)m; - for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t; + for (l = 0; *t != ':' && *t; ++t) x[l++] = *t; for (; l < z->size; ++l) x[l] = 0; } } else if ((z->y>>4&0xf) == BCF_HT_INT) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); + int l; for (l = 0;; ++t) { if (*t == '.') { x[l++] = bcf_int32_missing, ++t; // ++t to skip "." } else { - overflow = 0; + int overflow = 0; char *te; long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); if ( te==t || overflow || tmp_valBCF_MAX_BT_INT32 ) @@ -2897,11 +2926,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; } else if ((z->y>>4&0xf) == BCF_HT_REAL) { float *x = (float*)(z->buf + z->size * (size_t)m); + int l; for (l = 0;; ++t) { if (*t == '.' && !isdigit_c(t[1])) { bcf_float_set_missing(x[l++]), ++t; // ++t to skip "." } else { - overflow = 0; + int overflow = 0; char *te; float tmp_val = hts_str2dbl(t, &te, &overflow); if ( (te==t || overflow) && !extreme_val_warned ) @@ -2940,6 +2970,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (; j < v->n_fmt; ++j) { // fill end-of-vector values fmt_aux_t *z = &fmt[j]; + int l; if ((z->y>>4&0xf) == BCF_HT_STR) { if (z->is_gt) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); @@ -2964,7 +2995,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p m++; t++; } - // write individual genotype information + return 0; +} + +// write individual genotype information +static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { kstring_t *str = &v->indiv; int i; if (v->n_sample > 0) { @@ -2988,6 +3024,11 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } } + return 0; +} + +// validity checking +static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) { if ( v->n_sample!=bcf_hdr_nsamples(h) ) { hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", @@ -3008,6 +3049,47 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p return 0; } +// p,q is the start and the end of the FORMAT field +static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) +{ + if ( !bcf_hdr_nsamples(h) ) return 0; + kstring_t *mem = (kstring_t*)&h->mem; + mem->l = 0; + + fmt_aux_t fmt[MAX_N_FMT]; + + // detect FORMAT "." + int ret; // +ve = ok, -ve = err + if ((ret = vcf_parse_format_empty1(s, h, v, p, q))) + return ret ? 0 : -1; + + // get format information from the dictionary + if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0) + return -1; + + // compute max + if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0) + return -1; + + // allocate memory for arrays + if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0) + return -1; + + // fill the sample fields; at beginning of the loop + if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0) + return -1; + + // write individual genotype information + if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0) + return -1; + + // validity checking + if (vcf_parse_format_check7(h, v) < 0) + return -1; + + return 0; +} + static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) { // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has // been already printed, but will enable tools like vcfcheck to proceed. From 231897568c45c5821c6515ca88c98d1885060f97 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 4 Jul 2023 17:06:42 +0100 Subject: [PATCH 2/3] Further VCF reading speeds optimisations. This is the main meat of the VCF read speedup, following on from the previous code refactoring. Combined timings on testing GNOMAD very INFO heavy single-sample file, a many-sample (approx 4000) FORMAT rich file for different compilers, and the GIAB HG002 VCF truth set are: INFO heavy (15-29% speedup) (34-39% speedup) dev(s) PR(s) dev(s) PR(s) clang13 6.29 5.34 2.84 1.85 gcc13 6.74 5.22 2.93 1.93 gcc7 7.96 5.65 3.25 1.98 FORMAT heavy (6-19% speedup) (18-22% speedup) dev PR dev PR clang13 9.17 8.58 5.45 4.48 gcc13 9.88 8.04 5.08 3.95 gcc7 9.12 8.33 4.87 3.98 GIAB HG002 (28-29% speedup) (33-37% speedup) dev PR dev PR clang13 12.88 9.30 5.12 3.29 gcc13 12.04 8.60 4.74 3.19 gcc7 12.87 9.37 5.32 3.34 (Tested on Intel Xeon) Gold 6142 and an AMD Zen4 respectively) Bigger speedups (see first message in PR) were seen on some older hardware. Specific optimisations along with estimates of their benefit include, in approximate order of writing / testing: - Adding consts and caching of bcf_hdr_nsamples(h). No difference on system gcc (gcc7) and clang13, but a couple percent gain on gcc13. - Remove the need for most calls to hts_str2uint by recognising that most GT numbers are single digits. This was 4-5% saving for gcc and 9-10% on clang. - Remove kputc calls in bcf_enc_vint / bcf_enc_size, avoiding repeated ks_resize checking. This is a further ~10% speedup. - Unrolling in bcf_enc_vint to encourage SIMD. - Improve speed of bgzf_getline and kstrrok via memchr/strchr. In tabix timings indexing VCF, bgzf_getline change is 9-22% quicker with clang 13 and 19-25% quicker with gcc 7. I did investigate a manually unrolled 64-bit search, before I remembered the existance of memchr (doh!). This is often faster on clang (17-19%), but marginally slower on gcc. The actual speed up to this function however is considerably more (3-4x quicker). For interest, I include the equivalent code here, as it may be useful in other contexts: #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff // 64-bit unrolled delim detection #define haszero(x) (((x)-0x0101010101010101UL)&~(x)&0x8080808080808080UL) // Quicker broadcast on clang than bit shuffling in delim union { uint64_t d8; uint8_t d[8]; } u; memset(u.d, delim, 8); const uint64_t d8 = u.d8; uint64_t *b8 = (uint64_t *)(&buf[fp->block_offset]); const int l8 = (fp->block_length-fp->block_offset)/8; for (l = 0; l < (l8 & ~3); l+=4) { if (haszero(b8[l+0] ^ d8)) break; if (haszero(b8[l+1] ^ d8)) { l++; break; } if (haszero(b8[l+2] ^ d8)) { l+=2; break; } if (haszero(b8[l+3] ^ d8)) { l+=3; break; } } l *= 8; for (l += fp->block_offset; l < fp->block_length && buf[l] != delim; l++); The analogous kstrtok change is using strchr+strlen instead of memchr as we don't know the string end. This makes kstrtok around 150% quicker when parsing a single sample VCF. When not finding aux->sep in the string, strchr returns NULL rather than end of string, so we need an additional strlen to set aux->p. However there is also glibc's strchrnul which solves this in a single call. This makes kstrtok another 40% quicker on this test, but overall it's not a big bottleneck any more. - Use strchr in vcf_parse_info. This is a major speed increase over manual searching on Linux. TODO: is this just glibc? Eg libmusl speeds, etc? Other OSes? It saves about 33% of time in vcf_parse (vcf_parse_info inlined to it) with gcc. Even more with clang. The total speed gain on a single sample VCF view (GIAB truth set) is 12-19% fewer cycles: - Minor "GT" check improvement. This has no real affect on gcc13 and clang13, but the system gcc (gcc7) speeds up single sample VCF decoding by 7% - Speed up the unknown value check (strcmp(p, "."). Helps gcc7 the most (9%), with gcc13/clang13 in the 3-4% gains. - Speed up vcf_parse_format_max3. This is the first parse through the FORMAT fields. Ideally we'd merge this and fill5 (the other parse through), but that is harder due to the data pivot / rotate. For now we just optimise the existing code path. Instead of a laborious switch character by character, we have an initial tight loop to find the first meta-character and then a switch to do char dependant code. This is 5% to 13% speed up depending on data set. - Remove kputc and minimise resize for bcf_enc_int1. 3-8% speedup depending on data / compiler. - Use memcmp instead of strcmp for "END" and ensure we have room. Also memset over explicit nulling of arrays. - Force BCF header dicts to be larger than needed. This is a tactic to reduce hash collisions due to the use of overly simple hash functions. It seems to typically be around 3-8% speed gain. - Restructure of main vcf_parse function. This can speed things up by 6-7% on basic single-sample files. The previous loop caused lots of branch prediction misses due to the counter 'i' being used to do 8 different parts of code depending on token number. Additionally it's got better error checking now as previously running out of tokens early just did a return 0 rather than complaining about missing columns. --- bgzf.c | 8 +- htslib/vcf.h | 85 +++++--- kstring.c | 13 +- vcf.c | 559 +++++++++++++++++++++++++++++++++++++-------------- 4 files changed, 476 insertions(+), 189 deletions(-) diff --git a/bgzf.c b/bgzf.c index 45f2b1150..5ef433c20 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2280,7 +2280,13 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str) if (fp->block_length == 0) { state = -1; break; } } unsigned char *buf = fp->uncompressed_block; - for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); + + // Equivalent to a naive byte by byte search from + // buf + block_offset to buf + block_length. + void *e = memchr(&buf[fp->block_offset], delim, + fp->block_length - fp->block_offset); + l = e ? (unsigned char *)e - buf : fp->block_length; + if (l < fp->block_length) state = 1; l -= fp->block_offset; if (ks_expand(str, l + 2) < 0) { state = -3; break; } diff --git a/htslib/vcf.h b/htslib/vcf.h index 83659ae12..70cf95372 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1522,26 +1522,37 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) static inline int bcf_enc_size(kstring_t *s, int size, int type) { - uint32_t e = 0; - uint8_t x[4]; - if (size >= 15) { - e |= kputc(15<<4|type, s) < 0; - if (size >= 128) { - if (size >= 32768) { - i32_to_le(size, x); - e |= kputc(1<<4|BCF_BT_INT32, s) < 0; - e |= kputsn((char*)&x, 4, s) < 0; - } else { - i16_to_le(size, x); - e |= kputc(1<<4|BCF_BT_INT16, s) < 0; - e |= kputsn((char*)&x, 2, s) < 0; - } + // Most common case is first + if (size < 15) { + if (ks_resize(s, s->l + 1) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + *p++ = (size<<4) | type; + s->l++; + return 0; + } + + if (ks_resize(s, s->l + 6) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + *p++ = 15<<4|type; + + if (size < 128) { + *p++ = 1<<4|BCF_BT_INT8; + *p++ = size; + s->l += 3; + } else { + if (size < 32768) { + *p++ = 1<<4|BCF_BT_INT16; + i16_to_le(size, p); + s->l += 4; } else { - e |= kputc(1<<4|BCF_BT_INT8, s) < 0; - e |= kputc(size, s) < 0; + *p++ = 1<<4|BCF_BT_INT32; + i32_to_le(size, p); + s->l += 6; } - } else e |= kputc(size<<4|type, s) < 0; - return e == 0 ? 0 : -1; + } + return 0; } static inline int bcf_enc_inttype(long x) @@ -1553,27 +1564,35 @@ static inline int bcf_enc_inttype(long x) static inline int bcf_enc_int1(kstring_t *s, int32_t x) { - uint32_t e = 0; - uint8_t z[4]; + if (ks_resize(s, s->l + 5) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + if (x == bcf_int32_vector_end) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(bcf_int8_vector_end, s) < 0; + // An inline implementation of bcf_enc_size with size==1 and + // memory allocation already accounted for. + *p = (1<<4) | BCF_BT_INT8; + p[1] = bcf_int8_vector_end; + s->l+=2; } else if (x == bcf_int32_missing) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(bcf_int8_missing, s) < 0; + *p = (1<<4) | BCF_BT_INT8; + p[1] = bcf_int8_missing; + s->l+=2; } else if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(x, s) < 0; + *p = (1<<4) | BCF_BT_INT8; + p[1] = x; + s->l+=2; } else if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) { - i16_to_le(x, z); - e |= bcf_enc_size(s, 1, BCF_BT_INT16); - e |= kputsn((char*)&z, 2, s) < 0; + *p = (1<<4) | BCF_BT_INT16; + i16_to_le(x, p+1); + s->l+=3; } else { - i32_to_le(x, z); - e |= bcf_enc_size(s, 1, BCF_BT_INT32); - e |= kputsn((char*)&z, 4, s) < 0; + *p = (1<<4) | BCF_BT_INT32; + i32_to_le(x, p+1); + s->l+=5; } - return e == 0 ? 0 : -1; + + return 0; } /// Return the value of a single typed integer. diff --git a/kstring.c b/kstring.c index 71facf975..f8e0f9f3d 100644 --- a/kstring.c +++ b/kstring.c @@ -204,8 +204,17 @@ char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux) for (p = start; *p; ++p) if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; } else { - for (p = start; *p; ++p) - if (*p == aux->sep) break; + // Using strchr is fast for next token, but slower for + // last token due to extra pass from strlen. Overall + // on a VCF parse this func was 146% faster with // strchr. + // Equiv to: + // for (p = start; *p; ++p) if (*p == aux->sep) break; + + // NB: We could use strchrnul() here from glibc if detected, + // which is ~40% faster again, but it's not so portable. + // i.e. p = (uint8_t *)strchrnul((char *)start, aux->sep); + uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep); + p = p2 ? p2 : start + strlen((char *)start); } aux->p = (const char *) p; // end of token if (*p == 0) aux->finished = 1; // no more tokens diff --git a/vcf.c b/vcf.c index 4dbac2580..ce54dc227 100644 --- a/vcf.c +++ b/vcf.c @@ -46,9 +46,29 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/khash_str2int.h" #include "htslib/kstring.h" #include "htslib/sam.h" - #include "htslib/khash.h" + +#if 0 +// This helps on Intel a bit, often 6-7% faster VCF parsing. +// Conversely sometimes harms AMD Zen4 as ~9% slower. +// Possibly related to IPC differences. However for now it's just a +// curiousity we ignore and stick with the simpler code. +// +// Left here as a hint for future explorers. +static inline int xstreq(const char *a, const char *b) { + while (*a && *a == *b) + a++, b++; + return *a == *b; +} + +#define KHASH_MAP_INIT_XSTR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq) + +KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t) +#else KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) +#endif + typedef khash_t(vdict) vdict_t; KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*) @@ -1370,8 +1390,12 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) bcf_hdr_t *h; h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t)); if (!h) return NULL; - for (i = 0; i < 3; ++i) + for (i = 0; i < 3; ++i) { if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail; + // Supersize the hash to make collisions very unlikely + static int dsize[3] = {16384,16384,2048}; // info, contig, format + if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail; + } bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t)); if ( !aux ) goto fail; @@ -2463,25 +2487,64 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) { int32_t max = INT32_MIN, min = INT32_MAX; int i; - if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL); - else if (n == 1) bcf_enc_int1(s, a[0]); - else { + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_int1(s, a[0]); + } else { if (wsize <= 0) wsize = n; - for (i = 0; i < n; ++i) { - if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue; + + // Equivalent to: + // for (i = 0; i < n; ++i) { + // if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) + // continue; + // if (max < a[i]) max = a[i]; + // if (min > a[i]) min = a[i]; + // } + int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN}; + int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX}; + for (i = 0; i < (n&~3); i+=4) { + // bcf_int32_missing == INT32_MIN and + // bcf_int32_vector_end == INT32_MIN+1. + // We skip these, but can mostly avoid explicit checking + if (max4[0] < a[i+0]) max4[0] = a[i+0]; + if (max4[1] < a[i+1]) max4[1] = a[i+1]; + if (max4[2] < a[i+2]) max4[2] = a[i+2]; + if (max4[3] < a[i+3]) max4[3] = a[i+3]; + if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0]; + if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1]; + if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2]; + if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3]; + } + min = min4[0]; + if (min > min4[1]) min = min4[1]; + if (min > min4[2]) min = min4[2]; + if (min > min4[3]) min = min4[3]; + max = max4[0]; + if (max < max4[1]) max = max4[1]; + if (max < max4[2]) max = max4[2]; + if (max < max4[3]) max = max4[3]; + for (; i < n; ++i) { if (max < a[i]) max = a[i]; - if (min > a[i]) min = a[i]; + if (min > a[i] && a[i] > INT32_MIN+1) min = a[i]; } + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { - bcf_enc_size(s, wsize, BCF_BT_INT8); - for (i = 0; i < n; ++i) - if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s); - else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s); - else kputc(a[i], s); + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + s->l += n; } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT16); - ks_resize(s, s->l + n * sizeof(int16_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { @@ -2495,8 +2558,9 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) s->l += n * sizeof(int16_t); } else { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT32); - ks_resize(s, s->l + n * sizeof(int32_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { i32_to_le(a[i], p); @@ -2506,7 +2570,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) } } - return 0; // FIXME: check for errs in this function + return 0; } #ifdef VCF_ALLOW_INT64 @@ -2616,13 +2680,36 @@ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) ********************/ typedef struct { - int key, max_m, size, offset; - uint32_t is_gt:1, max_g:31; - uint32_t max_l; - uint32_t y; - uint8_t *buf; + int key; // Key for h->id[BCF_DT_ID][key] vdict + int max_m; // number of elements in field array (ie commas) + int size; // field size (max_l or max_g*4 if is_gt) + int offset; // offset of buf into h->mem + uint32_t is_gt:1, // is genotype + max_g:31; // maximum number of genotypes + uint32_t max_l; // length of field + uint32_t y; // h->id[0][fmt[j].key].val->info[BCF_HL_FMT] + uint8_t *buf; // Pointer into h->mem } fmt_aux_t; +// fmt_aux_t field notes: +// max_* are biggest sizes of the various FORMAT fields across all samples. +// We use these after pivoting the data to ensure easy random access +// of a specific sample. +// +// max_m is only used for type BCF_HT_REAL or BCF_HT_INT +// max_g is only used for is_gt == 1 (will be BCF_HT_STR) +// max_l is only used for is_gt == 0 (will be BCF_HT_STR) +// +// These are computed in vcf_parse_format_max3 and used in +// vcf_parse_format_alloc4 to get the size. +// +// size is computed from max_g, max_l, max_m and is_gt. Once computed +// the max values are never accessed again. +// +// In theory all 4 vars could be coalesced into a single variable, but this +// significantly harms speed (even if done via a union). It's about 25-30% +// slower. + static inline int align_mem(kstring_t *s) { int e = 0; @@ -2637,8 +2724,8 @@ static inline int align_mem(kstring_t *s) // detect FORMAT "." static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q) { - char *end = s->s + s->l; + const char *p, const char *q) { + const char *end = s->s + s->l; if ( q>=end ) { hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1); @@ -2658,11 +2745,12 @@ static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, // get format information from the dictionary static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { + const char *p, const char *q, fmt_aux_t *fmt) { const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; char *t; int j; ks_tokaux_t aux1; + for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; @@ -2700,7 +2788,7 @@ static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0; fmt[j].key = kh_val(d, k).id; - fmt[j].is_gt = !strcmp(t, "GT"); + fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]); fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; v->n_fmt++; } @@ -2714,7 +2802,8 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *r = q + 1; // r: position in the format string int l = 0, m = 1, g = 1, j; v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles - char *end = s->s + s->l; + const char *end = s->s + s->l; + while ( rmax_m < m) f->max_m = m; if (f->max_l < l) f->max_l = l; if (f->is_gt && f->max_g < g) f->max_g = g; @@ -2764,7 +2871,7 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, break; } if ( r>=end ) break; - r++; l++; + r++; } end_for: v->n_sample++; @@ -2777,23 +2884,25 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, // allocate memory for arrays static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { + const char *p, const char *q, + fmt_aux_t *fmt) { kstring_t *mem = (kstring_t*)&h->mem; int j; for (j = 0; j < v->n_fmt; ++j) { fmt_aux_t *f = &fmt[j]; if ( !f->max_m ) f->max_m = 1; // omitted trailing format field + if ((f->y>>4&0xf) == BCF_HT_STR) { f->size = f->is_gt? f->max_g << 2 : f->max_l; } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) { f->size = f->max_m << 2; - } else - { + } else { hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; return -1; } + if (align_mem(mem) < 0) { hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; @@ -2827,15 +2936,17 @@ static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, return 0; } -// fill the sample fields; at beginning of the loop +// Fill the sample fields static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { - static int extreme_val_warned = 0; + const char *p, const char *q, fmt_aux_t *fmt) { + static int extreme_val_warned = 0; int n_sample_ori = -1; - // t points to the first char of a format - char *t = q + 1; + // At beginning of the loop t points to the first char of a format + const char *t = q + 1; int m = 0; // m: sample id - char *end = s->s + s->l; + const int nsamples = bcf_hdr_nsamples(h); + + const char *end = s->s + s->l; while ( ty>>4&0xf; if (!z->buf) { hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; return -1; } - if ((z->y>>4&0xf) == BCF_HT_STR) { + + if (htype == BCF_HT_STR) { int l; - if (z->is_gt) { // genotypes + if (z->is_gt) { + // Genotypes. + // ([|/])+... where is [0-9]+ or ".". int32_t is_phased = 0; uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); uint32_t unreadable = 0; @@ -2873,9 +2988,19 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if (*t == '.') { ++t, x[l++] = is_phased; } else { - char *tt = t; - uint32_t val = hts_str2uint(t, &t, sizeof(val) * CHAR_MAX - 2, &overflow); - unreadable |= tt == t; + const char *tt = t; + uint32_t val; + // Or "v->n_allele < 10", but it doesn't + // seem to be any faster and this feels safer. + if (*t >= '0' && *t <= '9' && + !(t[1] >= '0' && t[1] <= '9')) { + val = *t++ - '0'; + } else { + val = hts_str2uint(t, (char **)&t, + sizeof(val) * CHAR_MAX - 2, + &overflow); + unreadable |= tt == t; + } if (max < val) max = val; x[l++] = (val + 1) << 1 | is_phased; } @@ -2892,13 +3017,20 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, return -1; } if ( !l ) x[l++] = 0; // An empty field, insert missing value - for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; + for (; l < z->size>>2; ++l) + x[l] = bcf_int32_vector_end; + } else { + // Otherwise arbitrary strings char *x = (char*)z->buf + z->size * (size_t)m; - for (l = 0; *t != ':' && *t; ++t) x[l++] = *t; - for (; l < z->size; ++l) x[l] = 0; + for (l = 0; *t != ':' && *t; ++t) + x[l++] = *t; + if (z->size > l) + memset(&x[l], 0, (z->size-l) * sizeof(*x)); } - } else if ((z->y>>4&0xf) == BCF_HT_INT) { + + } else if (htype == BCF_HT_INT) { + // One or more integers in an array int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); int l; for (l = 0;; ++t) { @@ -2912,7 +3044,8 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, { if ( !extreme_val_warned ) { - hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1); + hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, + h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1); extreme_val_warned = 1; } tmp_val = bcf_int32_missing; @@ -2922,9 +3055,13 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } if (*t != ',') break; } - if ( !l ) x[l++] = bcf_int32_missing; - for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; - } else if ((z->y>>4&0xf) == BCF_HT_REAL) { + if ( !l ) + x[l++] = bcf_int32_missing; + for (; l < z->size>>2; ++l) + x[l] = bcf_int32_vector_end; + + } else if (htype == BCF_HT_REAL) { + // One of more floating point values in an array float *x = (float*)(z->buf + z->size * (size_t)m); int l; for (l = 0;; ++t) { @@ -2944,10 +3081,13 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } if (*t != ',') break; } - if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value - for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); + if ( !l ) + // An empty field, insert missing value + bcf_float_set_missing(x[l++]); + for (; l < z->size>>2; ++l) + bcf_float_set_vector_end(x[l]); } else { - hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); + hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; return -1; } @@ -2968,24 +3108,28 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } } - for (; j < v->n_fmt; ++j) { // fill end-of-vector values + // fill end-of-vector values + for (; j < v->n_fmt; ++j) { fmt_aux_t *z = &fmt[j]; + const int htype = z->y>>4&0xf; int l; - if ((z->y>>4&0xf) == BCF_HT_STR) { + if (htype == BCF_HT_STR) { if (z->is_gt) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); if (z->size) x[0] = bcf_int32_missing; for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; } else { char *x = (char*)z->buf + z->size * (size_t)m; - if ( z->size ) x[0] = '.'; - for (l = 1; l < z->size; ++l) x[l] = 0; + if ( z->size ) { + x[0] = '.'; + memset(&x[1], 0, (z->size-1) * sizeof(*x)); + } } - } else if ((z->y>>4&0xf) == BCF_HT_INT) { + } else if (htype == BCF_HT_INT) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); x[0] = bcf_int32_missing; for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; - } else if ((z->y>>4&0xf) == BCF_HT_REAL) { + } else if (htype == BCF_HT_REAL) { float *x = (float*)(z->buf + z->size * (size_t)m); bcf_float_set_missing(x[0]); for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); @@ -3000,7 +3144,7 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, // write individual genotype information static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { + const char *p, const char *q, fmt_aux_t *fmt) { kstring_t *str = &v->indiv; int i; if (v->n_sample > 0) { @@ -3050,7 +3194,8 @@ static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) { } // p,q is the start and the end of the FORMAT field -static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) +static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q) { if ( !bcf_hdr_nsamples(h) ) return 0; kstring_t *mem = (kstring_t*)&h->mem; @@ -3067,6 +3212,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0) return -1; + // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is + // stored as per-type arrays AAA... BBB... CCC... This is basically + // a data rotation or pivot. + + // The size of elements in the array grow to their maximum needed, + // permitting fast random access. This means however we have to first + // scan the whole FORMAT line to find the maximum of each type, and + // then scan it again to find the store the data. + // We break this down into compute-max, allocate, fill-out-buffers + + // TODO: ? + // The alternative would be to pivot on the first pass, with fixed + // size entries for numerics and concatenated strings otherwise, also + // tracking maximum sizes. Then on a second pass we reallocate and + // copy the data again to a uniformly sized array. Two passes through + // memory, but without doubling string parsing. + // compute max if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0) return -1; @@ -3174,25 +3336,50 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->n_info = 0; if (*(q-1) == ';') *(q-1) = 0; - for (r = key = p;; ++r) { - int c; - char *val, *end; - if (*r != ';' && *r != '=' && *r != 0) continue; - if (v->n_info == UINT16_MAX) { - hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, - bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_LIMITS; - goto fail; + + // Parsing consists of processing key=value; or key; so we only need + // to track the next = and ; symbols, plus end of string. + // We could process 1 char at a time, but this is inefficient compared + // to strchr which can process word by word. Hence even doing two strchrs + // is quicker than byte by byte processing. + char *next_equals = strchr(p, '='); + char *next_semicolon = strchr(p, ';'); + r = p; + while (*r) { + // Look for key=value or just key. + char *val, *end, *from; + key = r; + if (next_equals && (!next_semicolon || next_equals < next_semicolon)) { + // key=value; + *next_equals = 0; + from = val = next_equals+1; + // Prefetching d->keys[hash] helps here provided we avoid + // computing hash twice (needs API change), but not universally. + // It may be significant for other applications though so it's + // something to consider for the future. + } else { + // key; + val = NULL; + from = key; + } + + // Update location of next ; and if used = + if (next_semicolon) { + end = next_semicolon; + r = end+1; + next_semicolon = strchr(end+1, ';'); + if (val) + next_equals = strchr(end, '='); + } else { + // find nul location, starting from key or val. + r = end = from + strlen(from); } - val = end = 0; - c = *r; *r = 0; - if (c == '=') { - val = r + 1; - for (end = val; *end != ';' && *end != 0; ++end); - c = *end; *end = 0; - } else end = r; - if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO + + *end = 0; + + // We've now got key and val (maybe NULL), so process it k = kh_get(vdict, d, key); + // 15 is default (unknown) type. See bcf_idinfo_def at top if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) { hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key); @@ -3294,7 +3481,8 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p } else { bcf_enc_vint(str, n_val, a_val, -1); } - if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && strcmp(key, "END") == 0) + if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && + memcmp(key, "END", 4) == 0) { if ( val1 <= v->pos ) { @@ -3320,9 +3508,6 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p bcf_enc_vfloat(str, n_val, val_f); } } - if (c == 0) break; - r = end; - key = r + 1; } free(a_val); @@ -3335,94 +3520,162 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { - int i = 0, ret = -2, overflow = 0; + int ret = -2, overflow = 0; char *p, *q, *r, *t; kstring_t *str; khint_t k; ks_tokaux_t aux; +//#define NOT_DOT(p) strcmp((p), ".") +//#define NOT_DOT(p) (!(*p == '.' && !p[1])) +//#define NOT_DOT(p) ((*p) != '.' || (p)[1]) +//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2)) +#define NOT_DOT(p) (memcmp(p, ".\0", 2)) + if (!s || !h || !v || !(s->s)) return ret; // Assumed in lots of places, but we may as well spot this early assert(sizeof(float) == sizeof(int32_t)); + // Ensure string we parse has space to permit some over-flow when during + // parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over + // the more straight forward looking strcmp, giving a speed advantage. + if (ks_resize(s, s->l+4) < 0) + return -1; + + // Force our memory to be initialised so we avoid the technicality of + // undefined behaviour in using a 4-byte memcmp. (The reality is this + // almost certainly is never detected by the compiler so has no impact, + // but equally so this code has minimal (often beneficial) impact on + // performance too.) + s->s[s->l+0] = 0; + s->s[s->l+1] = 0; + s->s[s->l+2] = 0; + s->s[s->l+3] = 0; + bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t)); - for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) { - q = (char*)aux.p; - *q = 0; - if (i == 0) { // CHROM - vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; - k = kh_get(vdict, d, p); - if (k == kh_end(d)) - { - hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p); - v->errcode = BCF_ERR_CTG_UNDEF; - if ((k = fix_chromosome(h, d, p)) == kh_end(d)) { - hts_log_error("Could not add dummy header for contig '%s'", p); - v->errcode |= BCF_ERR_CTG_INVALID; + + // CHROM + if (!(p = kstrtok(s->s, "\t", &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; + k = kh_get(vdict, d, p); + if (k == kh_end(d)) { + hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p); + v->errcode = BCF_ERR_CTG_UNDEF; + if ((k = fix_chromosome(h, d, p)) == kh_end(d)) { + hts_log_error("Could not add dummy header for contig '%s'", p); + v->errcode |= BCF_ERR_CTG_INVALID; + goto err; + } + } + v->rid = kh_val(d, k).id; + + // POS + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + overflow = 0; + char *tmp = p; + v->pos = hts_str2uint(p, &p, 63, &overflow); + if (overflow) { + hts_log_error("Position value '%s' is too large", tmp); + goto err; + } else if ( *p ) { + hts_log_error("Could not parse the position '%s'", tmp); + goto err; + } else { + v->pos -= 1; + } + if (v->pos >= INT32_MAX) + v->unpacked |= BCF_IS_64BIT; + + // ID + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p); + else bcf_enc_size(str, 0, BCF_BT_CHAR); + + // REF + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + bcf_enc_vchar(str, q - p, p); + v->n_allele = 1, v->rlen = q - p; + + // ALT + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + for (r = t = p;; ++r) { + if (*r == ',' || *r == 0) { + if (v->n_allele == UINT16_MAX) { + hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, + bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_LIMITS; goto err; } + bcf_enc_vchar(str, r - t, t); + t = r + 1; + ++v->n_allele; } - v->rid = kh_val(d, k).id; - } else if (i == 1) { // POS - overflow = 0; - char *tmp = p; - v->pos = hts_str2uint(p, &p, 63, &overflow); - if (overflow) { - hts_log_error("Position value '%s' is too large", tmp); - goto err; - } else if ( *p ) { - hts_log_error("Could not parse the position '%s'", tmp); - goto err; - } else { - v->pos -= 1; - } - if (v->pos >= INT32_MAX) - v->unpacked |= BCF_IS_64BIT; - } else if (i == 2) { // ID - if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); - else bcf_enc_size(str, 0, BCF_BT_CHAR); - } else if (i == 3) { // REF - bcf_enc_vchar(str, q - p, p); - v->n_allele = 1, v->rlen = q - p; - } else if (i == 4) { // ALT - if (strcmp(p, ".")) { - for (r = t = p;; ++r) { - if (*r == ',' || *r == 0) { - if (v->n_allele == UINT16_MAX) { - hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, - bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_LIMITS; - goto err; - } - bcf_enc_vchar(str, r - t, t); - t = r + 1; - ++v->n_allele; - } - if (r == q) break; - } - } - } else if (i == 5) { // QUAL - if (strcmp(p, ".")) v->qual = atof(p); - else bcf_float_set_missing(v->qual); - if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR - } else if (i == 6) { // FILTER - if (strcmp(p, ".")) { - if (vcf_parse_filter(str, h, v, p, q)) goto err; - } else bcf_enc_vint(str, 0, 0, -1); - if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT - } else if (i == 7) { // INFO - if (strcmp(p, ".")) { - if (vcf_parse_info(str, h, v, p, q)) goto err; - } - if ( v->max_unpack && !(v->max_unpack>>3) ) goto end; - } else if (i == 8) {// FORMAT - return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; + if (r == q) break; + } + } + + // QUAL + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) v->qual = atof(p); + else bcf_float_set_missing(v->qual); + if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR + + // FILTER + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + if (vcf_parse_filter(str, h, v, p, q)) { + goto err; + } + } else bcf_enc_vint(str, 0, 0, -1); + if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT + + // INFO + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + if (vcf_parse_info(str, h, v, p, q)) { + goto err; } } + if ( v->max_unpack && !(v->max_unpack>>3) ) goto end; + + // FORMAT; optional + p = kstrtok(0, 0, &aux); + if (p) { + *(q = (char*)aux.p) = 0; + + return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; + } else { + return 0; + } end: ret = 0; From ac70212aefe0d27662dc48bcce4302057dd5f507 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 9 Aug 2023 15:19:59 +0100 Subject: [PATCH 3/3] Revert most of the vcf_parse_info improvements. The two we keep are the internal while loop to find the next ; or = instead of iterating back in the outer for loop, and memcmp instead of strcmp for "END". The strchr changes do help glibc on excessively long INFO tokens, seen in the GIAB truth set and GNOMAD files, but they have no impact on most mainstream VCF outputs. Furthermore, other C libraries, such as MUSL, are considerably slowed down by the use of strchr. Hence this isn't a particularly robust or warranted change. --- vcf.c | 65 ++++++++++++++++++++--------------------------------------- 1 file changed, 22 insertions(+), 43 deletions(-) diff --git a/vcf.c b/vcf.c index ce54dc227..afd60fcaa 100644 --- a/vcf.c +++ b/vcf.c @@ -3336,50 +3336,26 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->n_info = 0; if (*(q-1) == ';') *(q-1) = 0; - - // Parsing consists of processing key=value; or key; so we only need - // to track the next = and ; symbols, plus end of string. - // We could process 1 char at a time, but this is inefficient compared - // to strchr which can process word by word. Hence even doing two strchrs - // is quicker than byte by byte processing. - char *next_equals = strchr(p, '='); - char *next_semicolon = strchr(p, ';'); - r = p; - while (*r) { - // Look for key=value or just key. - char *val, *end, *from; - key = r; - if (next_equals && (!next_semicolon || next_equals < next_semicolon)) { - // key=value; - *next_equals = 0; - from = val = next_equals+1; - // Prefetching d->keys[hash] helps here provided we avoid - // computing hash twice (needs API change), but not universally. - // It may be significant for other applications though so it's - // something to consider for the future. - } else { - // key; - val = NULL; - from = key; - } - - // Update location of next ; and if used = - if (next_semicolon) { - end = next_semicolon; - r = end+1; - next_semicolon = strchr(end+1, ';'); - if (val) - next_equals = strchr(end, '='); - } else { - // find nul location, starting from key or val. - r = end = from + strlen(from); + for (r = key = p;; ++r) { + int c; + char *val, *end; + while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++; + if (v->n_info == UINT16_MAX) { + hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, + bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_LIMITS; + goto fail; } + val = end = NULL; + c = *r; *r = 0; + if (c == '=') { + val = r + 1; - *end = 0; - - // We've now got key and val (maybe NULL), so process it + for (end = val; *end != ';' && *end != 0; ++end); + c = *end; *end = 0; + } else end = r; + if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO k = kh_get(vdict, d, key); - // 15 is default (unknown) type. See bcf_idinfo_def at top if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) { hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key); @@ -3481,8 +3457,8 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p } else { bcf_enc_vint(str, n_val, a_val, -1); } - if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && - memcmp(key, "END", 4) == 0) + if (n_val==1 && (val1!=bcf_int32_missing || is_int64) + && memcmp(key, "END", 4) == 0) { if ( val1 <= v->pos ) { @@ -3508,6 +3484,9 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p bcf_enc_vfloat(str, n_val, val_f); } } + if (c == 0) break; + r = end; + key = r + 1; } free(a_val);