From 7c1d3ccd1ae10dd81cbcfc84d1dd984534cf2caa Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 4 Jul 2023 15:49:44 +0100
Subject: [PATCH 1/3] The first stage of vcf_parse_format speed improvements.

This simply turns the monolithic vcf_parse_format functions into a
series of numbers sub-functions whose primary is to localise the
variables to that code block and to make it easier to see the
structure of the tasks being performed.

There is no code optimisation here and the main algorithm is
unchanged, so this is just moving of code from 1 function to multiple
functions.  However it makes the next commit easier to understand as
we're not trying to see a delta mixed in with restructuring.

An unexpected consequence however of making the variables local to
their blocks is that it also speeds up the code.

The first step in separating this code into functions was simply
adding curly braces around each code segment and moving the
function-global variables into their respective blocks.  The
before/after benchmarkjs on 100,000 lines of a multi-sample G1K VCF
are ("perf stat" cycle counts):

        ORIG            LOCALISED
gcc7    29335942870     27353443704
gcc13   31974757094     31452452908
clang13 31290989382     29508665020

Benchmarked again after moving to actual functions, but the difference
was tiny in comparison.)
---
 vcf.c | 146 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 114 insertions(+), 32 deletions(-)

diff --git a/vcf.c b/vcf.c
index 9e589f993..4dbac2580 100644
--- a/vcf.c
+++ b/vcf.c
@@ -2633,22 +2633,11 @@ static inline int align_mem(kstring_t *s)
     return e == 0 ? 0 : -1;
 }
 
-// p,q is the start and the end of the FORMAT field
 #define MAX_N_FMT 255   /* Limited by size of bcf1_t n_fmt field */
-static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
-{
-    if ( !bcf_hdr_nsamples(h) ) return 0;
-
-    static int extreme_val_warned = 0;
-    char *r, *t;
-    int j, l, m, g, overflow = 0;
-    khint_t k;
-    ks_tokaux_t aux1;
-    vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
-    kstring_t *mem = (kstring_t*)&h->mem;
-    fmt_aux_t fmt[MAX_N_FMT];
-    mem->l = 0;
 
+// detect FORMAT "."
+static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                   char *p, char *q) {
     char *end = s->s + s->l;
     if ( q>=end )
     {
@@ -2661,10 +2650,19 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
     if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "."
     {
         v->n_sample = bcf_hdr_nsamples(h);
-        return 0;
+        return 1;
     }
 
-    // get format information from the dictionary
+    return 0;
+}
+
+// get format information from the dictionary
+static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                  char *p, char *q, fmt_aux_t *fmt) {
+    const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
+    char *t;
+    int j;
+    ks_tokaux_t aux1;
     for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
         if (j >= MAX_N_FMT) {
             v->errcode |= BCF_ERR_LIMITS;
@@ -2674,7 +2672,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         }
 
         *(char*)aux1.p = 0;
-        k = kh_get(vdict, d, t);
+        khint_t k = kh_get(vdict, d, t);
         if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) {
             if ( t[0]=='.' && t[1]==0 )
             {
@@ -2706,10 +2704,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
         v->n_fmt++;
     }
-    // compute max
+    return 0;
+}
+
+// compute max
+static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                 char *p, char *q, fmt_aux_t *fmt) {
     int n_sample_ori = -1;
-    r = q + 1;  // r: position in the format string
-    l = 0, m = g = 1, v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
+    char *r = q + 1;  // r: position in the format string
+    int l = 0, m = 1, g = 1, j;
+    v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
+    char *end = s->s + s->l;
     while ( r<end )
     {
         // can we skip some samples?
@@ -2767,7 +2772,15 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         r++;
     }
 
-    // allocate memory for arrays
+    return 0;
+}
+
+// allocate memory for arrays
+static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                   char *p, char *q, fmt_aux_t *fmt) {
+    kstring_t *mem = (kstring_t*)&h->mem;
+
+    int j;
     for (j = 0; j < v->n_fmt; ++j) {
         fmt_aux_t *f = &fmt[j];
         if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
@@ -2804,11 +2817,25 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         }
         mem->l += v->n_sample * f->size;
     }
-    for (j = 0; j < v->n_fmt; ++j)
-        fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
-    // fill the sample fields; at beginning of the loop, t points to the first char of a format
-    n_sample_ori = -1;
-    t = q + 1; m = 0;   // m: sample id
+
+    {
+        int j;
+        for (j = 0; j < v->n_fmt; ++j)
+            fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset;
+    }
+
+    return 0;
+}
+
+// fill the sample fields; at beginning of the loop
+static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                  char *p, char *q, fmt_aux_t *fmt) {
+    static int extreme_val_warned = 0; 
+    int n_sample_ori = -1;
+    // t points to the first char of a format
+    char *t = q + 1;
+    int m = 0;   // m: sample id
+    char *end = s->s + s->l;
     while ( t<end )
     {
         // can we skip some samples?
@@ -2824,7 +2851,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         }
         if ( m == bcf_hdr_nsamples(h) ) break;
 
-        j = 0; // j-th format field, m-th sample
+        int j = 0; // j-th format field, m-th sample
         while ( t < end )
         {
             fmt_aux_t *z = &fmt[j++];
@@ -2835,12 +2862,13 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                 return -1;
             }
             if ((z->y>>4&0xf) == BCF_HT_STR) {
+                int l;
                 if (z->is_gt) { // genotypes
                     int32_t is_phased = 0;
                     uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
                     uint32_t unreadable = 0;
                     uint32_t max = 0;
-                    overflow = 0;
+                    int overflow = 0;
                     for (l = 0;; ++t) {
                         if (*t == '.') {
                             ++t, x[l++] = is_phased;
@@ -2867,16 +2895,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                     for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
                 } else {
                     char *x = (char*)z->buf + z->size * (size_t)m;
-                    for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t;
+                    for (l = 0; *t != ':' && *t; ++t) x[l++] = *t;
                     for (; l < z->size; ++l) x[l] = 0;
                 }
             } else if ((z->y>>4&0xf) == BCF_HT_INT) {
                 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
+                int l;
                 for (l = 0;; ++t) {
                     if (*t == '.') {
                         x[l++] = bcf_int32_missing, ++t; // ++t to skip "."
                     } else {
-                        overflow = 0;
+                        int overflow = 0;
                         char *te;
                         long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow);
                         if ( te==t || overflow || tmp_val<BCF_MIN_BT_INT32 || tmp_val>BCF_MAX_BT_INT32 )
@@ -2897,11 +2926,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
                 for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
             } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
                 float *x = (float*)(z->buf + z->size * (size_t)m);
+                int l;
                 for (l = 0;; ++t) {
                     if (*t == '.' && !isdigit_c(t[1])) {
                         bcf_float_set_missing(x[l++]), ++t; // ++t to skip "."
                     } else {
-                        overflow = 0;
+                        int overflow = 0;
                         char *te;
                         float tmp_val = hts_str2dbl(t, &te, &overflow);
                         if ( (te==t || overflow) && !extreme_val_warned )
@@ -2940,6 +2970,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
 
         for (; j < v->n_fmt; ++j) { // fill end-of-vector values
             fmt_aux_t *z = &fmt[j];
+            int l;
             if ((z->y>>4&0xf) == BCF_HT_STR) {
                 if (z->is_gt) {
                     int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
@@ -2964,7 +2995,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         m++; t++;
     }
 
-    // write individual genotype information
+    return 0;
+}
+
+// write individual genotype information
+static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                                char *p, char *q, fmt_aux_t *fmt) {
     kstring_t *str = &v->indiv;
     int i;
     if (v->n_sample > 0) {
@@ -2988,6 +3024,11 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
         }
     }
 
+    return 0;
+}
+
+// validity checking
+static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
     if ( v->n_sample!=bcf_hdr_nsamples(h) )
     {
         hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)",
@@ -3008,6 +3049,47 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
     return 0;
 }
 
+// p,q is the start and the end of the FORMAT field
+static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
+{
+    if ( !bcf_hdr_nsamples(h) ) return 0;
+    kstring_t *mem = (kstring_t*)&h->mem;
+    mem->l = 0;
+
+    fmt_aux_t fmt[MAX_N_FMT];
+
+    // detect FORMAT "."
+    int ret; // +ve = ok, -ve = err
+    if ((ret = vcf_parse_format_empty1(s, h, v, p, q)))
+        return ret ? 0 : -1;
+
+    // get format information from the dictionary
+    if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // compute max
+    if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // allocate memory for arrays
+    if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // fill the sample fields; at beginning of the loop
+    if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // write individual genotype information
+    if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0)
+        return -1;
+
+    // validity checking
+    if (vcf_parse_format_check7(h, v) < 0)
+        return -1;
+
+    return 0;
+}
+
 static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) {
     // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has
     // been already printed, but will enable tools like vcfcheck to proceed.

From 231897568c45c5821c6515ca88c98d1885060f97 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Tue, 4 Jul 2023 17:06:42 +0100
Subject: [PATCH 2/3] Further VCF reading speeds optimisations.

This is the main meat of the VCF read speedup, following on from the
previous code refactoring.  Combined timings on testing GNOMAD very
INFO heavy single-sample file, a many-sample (approx 4000) FORMAT rich
file for different compilers, and the GIAB HG002 VCF truth set are:

INFO heavy          (15-29% speedup)    (34-39% speedup)
                    dev(s)  PR(s)       dev(s)  PR(s)
    clang13         6.29    5.34        2.84    1.85
    gcc13           6.74    5.22        2.93    1.93
    gcc7            7.96    5.65        3.25    1.98

FORMAT heavy        (6-19% speedup)     (18-22% speedup)
                    dev     PR          dev     PR
    clang13         9.17    8.58        5.45    4.48
    gcc13           9.88    8.04        5.08    3.95
    gcc7            9.12    8.33        4.87    3.98

GIAB HG002          (28-29% speedup)    (33-37% speedup)
                    dev     PR          dev     PR
    clang13         12.88   9.30        5.12    3.29
    gcc13           12.04   8.60        4.74    3.19
    gcc7            12.87   9.37        5.32    3.34

(Tested on Intel Xeon) Gold 6142 and an AMD Zen4 respectively)

Bigger speedups (see first message in PR) were seen on some older
hardware.

Specific optimisations along with estimates of their benefit include,
in approximate order of writing / testing:

- Adding consts and caching of bcf_hdr_nsamples(h).
  No difference on system gcc (gcc7) and clang13, but a couple percent
  gain on gcc13.

- Remove the need for most calls to hts_str2uint by recognising that
  most GT numbers are single digits.  This was 4-5% saving for gcc and
  9-10% on clang.

- Remove kputc calls in bcf_enc_vint / bcf_enc_size, avoiding repeated
  ks_resize checking.  This is a further ~10% speedup.

- Unrolling in bcf_enc_vint to encourage SIMD.

- Improve speed of bgzf_getline and kstrrok via memchr/strchr.

  In tabix timings indexing VCF, bgzf_getline change is 9-22% quicker
  with clang 13 and 19-25% quicker with gcc 7.  I did investigate a
  manually unrolled 64-bit search, before I remembered the existance of
  memchr (doh!).  This is often faster on clang (17-19%), but marginally
  slower on gcc.  The actual speed up to this function however is
  considerably more (3-4x quicker).

  For interest, I include the equivalent code here, as it may be useful in
  other contexts:

    #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff
    // 64-bit unrolled delim detection
    #define haszero(x) (((x)-0x0101010101010101UL)&~(x)&0x8080808080808080UL)
            // Quicker broadcast on clang than bit shuffling in delim
            union {
                uint64_t d8;
                uint8_t d[8];
            } u;
            memset(u.d, delim, 8);
            const uint64_t d8 = u.d8;

            uint64_t *b8 = (uint64_t *)(&buf[fp->block_offset]);
            const int l8 = (fp->block_length-fp->block_offset)/8;
            for (l = 0; l < (l8 & ~3); l+=4) {
                if (haszero(b8[l+0] ^ d8))
                    break;
                if (haszero(b8[l+1] ^ d8)) {
                    l++;
                    break;
                }
                if (haszero(b8[l+2] ^ d8)) {
                    l+=2;
                    break;
                }
                if (haszero(b8[l+3] ^ d8)) {
                    l+=3;
                    break;
                }
            }
            l *= 8;
            for (l += fp->block_offset;
                 l < fp->block_length && buf[l] != delim;
                 l++);

  The analogous kstrtok change is using strchr+strlen instead of memchr
  as we don't know the string end. This makes kstrtok around 150%
  quicker when parsing a single sample VCF.

  When not finding aux->sep in the string, strchr returns NULL rather
  than end of string, so we need an additional strlen to set aux->p.
  However there is also glibc's strchrnul which solves this in a single
  call.  This makes kstrtok another 40% quicker on this test, but
  overall it's not a big bottleneck any more.

- Use strchr in vcf_parse_info.

  This is a major speed increase over manual searching on Linux.
  TODO: is this just glibc?  Eg libmusl speeds, etc?  Other OSes?

  It saves about 33% of time in vcf_parse (vcf_parse_info inlined to it)
  with gcc.  Even more with clang.  The total speed gain on a single
  sample VCF view (GIAB truth set) is 12-19% fewer cycles:

- Minor "GT" check improvement.  This has no real affect on gcc13 and
  clang13, but the system gcc (gcc7) speeds up single sample VCF decoding by 7%

- Speed up the unknown value check (strcmp(p, ".").  Helps gcc7 the
  most (9%), with gcc13/clang13 in the 3-4% gains.

- Speed up vcf_parse_format_max3.

  This is the first parse through the FORMAT fields.  Ideally we'd merge
  this and fill5 (the other parse through), but that is harder due to
  the data pivot / rotate.

  For now we just optimise the existing code path.  Instead of a
  laborious switch character by character, we have an initial tight loop
  to find the first meta-character and then a switch to do char
  dependant code.

  This is 5% to 13% speed up depending on data set.

- Remove kputc and minimise resize for bcf_enc_int1.
  3-8% speedup depending on data / compiler.

- Use memcmp instead of strcmp for "END" and ensure we have room.
  Also memset over explicit nulling of arrays.

- Force BCF header dicts to be larger than needed.

  This is a tactic to reduce hash collisions due to the use of overly
  simple hash functions.  It seems to typically be around 3-8% speed gain.

- Restructure of main vcf_parse function.

  This can speed things up by 6-7% on basic single-sample files.

  The previous loop caused lots of branch prediction misses due to the
  counter 'i' being used to do 8 different parts of code depending on
  token number.  Additionally it's got better error checking now as
  previously running out of tokens early just did a return 0 rather than
  complaining about missing columns.
---
 bgzf.c       |   8 +-
 htslib/vcf.h |  85 +++++---
 kstring.c    |  13 +-
 vcf.c        | 559 +++++++++++++++++++++++++++++++++++++--------------
 4 files changed, 476 insertions(+), 189 deletions(-)

diff --git a/bgzf.c b/bgzf.c
index 45f2b1150..5ef433c20 100644
--- a/bgzf.c
+++ b/bgzf.c
@@ -2280,7 +2280,13 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str)
             if (fp->block_length == 0) { state = -1; break; }
         }
         unsigned char *buf = fp->uncompressed_block;
-        for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l);
+
+        // Equivalent to a naive byte by byte search from
+        // buf + block_offset to buf + block_length.
+        void *e = memchr(&buf[fp->block_offset], delim,
+                         fp->block_length - fp->block_offset);
+        l = e ? (unsigned char *)e - buf : fp->block_length;
+
         if (l < fp->block_length) state = 1;
         l -= fp->block_offset;
         if (ks_expand(str, l + 2) < 0) { state = -3; break; }
diff --git a/htslib/vcf.h b/htslib/vcf.h
index 83659ae12..70cf95372 100644
--- a/htslib/vcf.h
+++ b/htslib/vcf.h
@@ -1522,26 +1522,37 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str)
 
 static inline int bcf_enc_size(kstring_t *s, int size, int type)
 {
-    uint32_t e = 0;
-    uint8_t x[4];
-    if (size >= 15) {
-        e |= kputc(15<<4|type, s) < 0;
-        if (size >= 128) {
-            if (size >= 32768) {
-                i32_to_le(size, x);
-                e |= kputc(1<<4|BCF_BT_INT32, s) < 0;
-                e |= kputsn((char*)&x, 4, s) < 0;
-            } else {
-                i16_to_le(size, x);
-                e |= kputc(1<<4|BCF_BT_INT16, s) < 0;
-                e |= kputsn((char*)&x, 2, s) < 0;
-            }
+    // Most common case is first
+    if (size < 15) {
+        if (ks_resize(s, s->l + 1) < 0)
+            return -1;
+        uint8_t *p = (uint8_t *)s->s + s->l;
+        *p++ = (size<<4) | type;
+        s->l++;
+        return 0;
+    }
+
+    if (ks_resize(s, s->l + 6) < 0)
+        return -1;
+    uint8_t *p = (uint8_t *)s->s + s->l;
+    *p++ = 15<<4|type;
+
+    if (size < 128) {
+        *p++ = 1<<4|BCF_BT_INT8;
+        *p++ = size;
+        s->l += 3;
+    } else {
+        if (size < 32768) {
+            *p++ = 1<<4|BCF_BT_INT16;
+            i16_to_le(size, p);
+            s->l += 4;
         } else {
-            e |= kputc(1<<4|BCF_BT_INT8, s) < 0;
-            e |= kputc(size, s) < 0;
+            *p++ = 1<<4|BCF_BT_INT32;
+            i32_to_le(size, p);
+            s->l += 6;
         }
-    } else e |= kputc(size<<4|type, s) < 0;
-    return e == 0 ? 0 : -1;
+    }
+    return 0;
 }
 
 static inline int bcf_enc_inttype(long x)
@@ -1553,27 +1564,35 @@ static inline int bcf_enc_inttype(long x)
 
 static inline int bcf_enc_int1(kstring_t *s, int32_t x)
 {
-    uint32_t e = 0;
-    uint8_t z[4];
+    if (ks_resize(s, s->l + 5) < 0)
+        return -1;
+    uint8_t *p = (uint8_t *)s->s + s->l;
+
     if (x == bcf_int32_vector_end) {
-        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
-        e |= kputc(bcf_int8_vector_end, s) < 0;
+        // An inline implementation of bcf_enc_size with size==1 and
+        // memory allocation already accounted for.
+        *p = (1<<4) | BCF_BT_INT8;
+        p[1] = bcf_int8_vector_end;
+        s->l+=2;
     } else if (x == bcf_int32_missing) {
-        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
-        e |= kputc(bcf_int8_missing, s) < 0;
+        *p = (1<<4) | BCF_BT_INT8;
+        p[1] = bcf_int8_missing;
+        s->l+=2;
     } else if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) {
-        e |= bcf_enc_size(s, 1, BCF_BT_INT8);
-        e |= kputc(x, s) < 0;
+        *p = (1<<4) | BCF_BT_INT8;
+        p[1] = x;
+        s->l+=2;
     } else if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) {
-        i16_to_le(x, z);
-        e |= bcf_enc_size(s, 1, BCF_BT_INT16);
-        e |= kputsn((char*)&z, 2, s) < 0;
+        *p = (1<<4) | BCF_BT_INT16;
+        i16_to_le(x, p+1);
+        s->l+=3;
     } else {
-        i32_to_le(x, z);
-        e |= bcf_enc_size(s, 1, BCF_BT_INT32);
-        e |= kputsn((char*)&z, 4, s) < 0;
+        *p = (1<<4) | BCF_BT_INT32;
+        i32_to_le(x, p+1);
+        s->l+=5;
     }
-    return e == 0 ? 0 : -1;
+
+    return 0;
 }
 
 /// Return the value of a single typed integer.
diff --git a/kstring.c b/kstring.c
index 71facf975..f8e0f9f3d 100644
--- a/kstring.c
+++ b/kstring.c
@@ -204,8 +204,17 @@ char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux)
 		for (p = start; *p; ++p)
 			if (aux->tab[*p>>6]>>(*p&0x3f)&1) break;
 	} else {
-		for (p = start; *p; ++p)
-			if (*p == aux->sep) break;
+		// Using strchr is fast for next token, but slower for
+		// last token due to extra pass from strlen.  Overall
+		// on a VCF parse this func was 146% faster with // strchr.
+		// Equiv to:
+		// for (p = start; *p; ++p) if (*p == aux->sep) break;
+
+		// NB: We could use strchrnul() here from glibc if detected,
+		// which is ~40% faster again, but it's not so portable.
+		// i.e.   p = (uint8_t *)strchrnul((char *)start, aux->sep);
+		uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep);
+		p = p2 ? p2 : start + strlen((char *)start);
 	}
 	aux->p = (const char *) p; // end of token
 	if (*p == 0) aux->finished = 1; // no more tokens
diff --git a/vcf.c b/vcf.c
index 4dbac2580..ce54dc227 100644
--- a/vcf.c
+++ b/vcf.c
@@ -46,9 +46,29 @@ DEALINGS IN THE SOFTWARE.  */
 #include "htslib/khash_str2int.h"
 #include "htslib/kstring.h"
 #include "htslib/sam.h"
-
 #include "htslib/khash.h"
+
+#if 0
+// This helps on Intel a bit, often 6-7% faster VCF parsing.
+// Conversely sometimes harms AMD Zen4 as ~9% slower.
+// Possibly related to IPC differences.  However for now it's just a
+// curiousity we ignore and stick with the simpler code.
+//
+// Left here as a hint for future explorers.
+static inline int xstreq(const char *a, const char *b) {
+    while (*a && *a == *b)
+        a++, b++;
+    return *a == *b;
+}
+
+#define KHASH_MAP_INIT_XSTR(name, khval_t) \
+  KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq)
+
+KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t)
+#else
 KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t)
+#endif
+
 typedef khash_t(vdict) vdict_t;
 
 KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*)
@@ -1370,8 +1390,12 @@ bcf_hdr_t *bcf_hdr_init(const char *mode)
     bcf_hdr_t *h;
     h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t));
     if (!h) return NULL;
-    for (i = 0; i < 3; ++i)
+    for (i = 0; i < 3; ++i) {
         if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail;
+        // Supersize the hash to make collisions very unlikely
+        static int dsize[3] = {16384,16384,2048}; // info, contig, format
+        if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail;
+    }
 
     bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t));
     if ( !aux ) goto fail;
@@ -2463,25 +2487,64 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
 {
     int32_t max = INT32_MIN, min = INT32_MAX;
     int i;
-    if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL);
-    else if (n == 1) bcf_enc_int1(s, a[0]);
-    else {
+    if (n <= 0) {
+        return bcf_enc_size(s, 0, BCF_BT_NULL);
+    } else if (n == 1) {
+        return bcf_enc_int1(s, a[0]);
+    } else {
         if (wsize <= 0) wsize = n;
-        for (i = 0; i < n; ++i) {
-            if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue;
+
+        // Equivalent to:
+        // for (i = 0; i < n; ++i) {
+        //     if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end )
+        //         continue;
+        //     if (max < a[i]) max = a[i];
+        //     if (min > a[i]) min = a[i];
+        // }
+        int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN};
+        int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX};
+        for (i = 0; i < (n&~3); i+=4) {
+            // bcf_int32_missing    == INT32_MIN and
+            // bcf_int32_vector_end == INT32_MIN+1.
+            // We skip these, but can mostly avoid explicit checking
+            if (max4[0] < a[i+0]) max4[0] = a[i+0];
+            if (max4[1] < a[i+1]) max4[1] = a[i+1];
+            if (max4[2] < a[i+2]) max4[2] = a[i+2];
+            if (max4[3] < a[i+3]) max4[3] = a[i+3];
+            if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0];
+            if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1];
+            if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2];
+            if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3];
+        }
+        min = min4[0];
+        if (min > min4[1]) min = min4[1];
+        if (min > min4[2]) min = min4[2];
+        if (min > min4[3]) min = min4[3];
+        max = max4[0];
+        if (max < max4[1]) max = max4[1];
+        if (max < max4[2]) max = max4[2];
+        if (max < max4[3]) max = max4[3];
+        for (; i < n; ++i) {
             if (max < a[i]) max = a[i];
-            if (min > a[i]) min = a[i];
+            if (min > a[i] && a[i] > INT32_MIN+1) min = a[i];
         }
+
         if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) {
-            bcf_enc_size(s, wsize, BCF_BT_INT8);
-            for (i = 0; i < n; ++i)
-                if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s);
-                else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s);
-                else kputc(a[i], s);
+            if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 ||
+                ks_resize(s, s->l + n) < 0)
+                return -1;
+            uint8_t *p = (uint8_t *) s->s + s->l;
+            for (i = 0; i < n; ++i, p++) {
+                if ( a[i]==bcf_int32_vector_end )   *p = bcf_int8_vector_end;
+                else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing;
+                else *p = a[i];
+            }
+            s->l += n;
         } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) {
             uint8_t *p;
-            bcf_enc_size(s, wsize, BCF_BT_INT16);
-            ks_resize(s, s->l + n * sizeof(int16_t));
+            if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 ||
+                ks_resize(s, s->l + n * sizeof(int16_t)) < 0)
+                return -1;
             p = (uint8_t *) s->s + s->l;
             for (i = 0; i < n; ++i)
             {
@@ -2495,8 +2558,9 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
             s->l += n * sizeof(int16_t);
         } else {
             uint8_t *p;
-            bcf_enc_size(s, wsize, BCF_BT_INT32);
-            ks_resize(s, s->l + n * sizeof(int32_t));
+            if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 ||
+                ks_resize(s, s->l + n * sizeof(int32_t)) < 0)
+                return -1;
             p = (uint8_t *) s->s + s->l;
             for (i = 0; i < n; ++i) {
                 i32_to_le(a[i], p);
@@ -2506,7 +2570,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize)
         }
     }
 
-    return 0; // FIXME: check for errs in this function
+    return 0;
 }
 
 #ifdef VCF_ALLOW_INT64
@@ -2616,13 +2680,36 @@ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr)
  ********************/
 
 typedef struct {
-    int key, max_m, size, offset;
-    uint32_t is_gt:1, max_g:31;
-    uint32_t max_l;
-    uint32_t y;
-    uint8_t *buf;
+    int key;            // Key for h->id[BCF_DT_ID][key] vdict
+    int max_m;          // number of elements in field array (ie commas)
+    int size;           // field size (max_l or max_g*4 if is_gt)
+    int offset;         // offset of buf into h->mem
+    uint32_t is_gt:1,   // is genotype
+             max_g:31;  // maximum number of genotypes
+    uint32_t max_l;     // length of field
+    uint32_t y;         // h->id[0][fmt[j].key].val->info[BCF_HL_FMT]
+    uint8_t *buf;       // Pointer into h->mem
 } fmt_aux_t;
 
+// fmt_aux_t field notes:
+// max_* are biggest sizes of the various FORMAT fields across all samples.
+// We use these after pivoting the data to ensure easy random access
+// of a specific sample.
+//
+// max_m is only used for type BCF_HT_REAL or BCF_HT_INT
+// max_g is only used for is_gt == 1 (will be BCF_HT_STR)
+// max_l is only used for is_gt == 0 (will be BCF_HT_STR)
+//
+// These are computed in vcf_parse_format_max3 and used in
+// vcf_parse_format_alloc4 to get the size.
+//
+// size is computed from max_g, max_l, max_m and is_gt.  Once computed
+// the max values are never accessed again.
+//
+// In theory all 4 vars could be coalesced into a single variable, but this
+// significantly harms speed (even if done via a union).  It's about 25-30%
+// slower.
+
 static inline int align_mem(kstring_t *s)
 {
     int e = 0;
@@ -2637,8 +2724,8 @@ static inline int align_mem(kstring_t *s)
 
 // detect FORMAT "."
 static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
-                                   char *p, char *q) {
-    char *end = s->s + s->l;
+                                   const char *p, const char *q) {
+    const char *end = s->s + s->l;
     if ( q>=end )
     {
         hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1);
@@ -2658,11 +2745,12 @@ static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
 
 // get format information from the dictionary
 static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
-                                  char *p, char *q, fmt_aux_t *fmt) {
+                                  const char *p, const char *q, fmt_aux_t *fmt) {
     const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID];
     char *t;
     int j;
     ks_tokaux_t aux1;
+
     for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) {
         if (j >= MAX_N_FMT) {
             v->errcode |= BCF_ERR_LIMITS;
@@ -2700,7 +2788,7 @@ static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
         }
         fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0;
         fmt[j].key = kh_val(d, k).id;
-        fmt[j].is_gt = !strcmp(t, "GT");
+        fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]);
         fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT];
         v->n_fmt++;
     }
@@ -2714,7 +2802,8 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
     char *r = q + 1;  // r: position in the format string
     int l = 0, m = 1, g = 1, j;
     v->n_sample = 0;  // m: max vector size, l: max field len, g: max number of alleles
-    char *end = s->s + s->l;
+    const char *end = s->s + s->l;
+
     while ( r<end )
     {
         // can we skip some samples?
@@ -2732,7 +2821,23 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
         // collect fmt stats: max vector size, length, number of alleles
         j = 0;  // j-th format field
         fmt_aux_t *f = fmt;
+        static char meta[256] = {
+            // \0 \t , / : |
+            1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+            0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+        };
+
+        char *r_start = r;
         for (;;) {
+            // Quickly skip ahead to an appropriate meta-character
+            while (!meta[(unsigned char)*r]) r++;
+
             switch (*r) {
             case ',':
                 m++;
@@ -2746,8 +2851,10 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
             case '\t':
                 *r = 0; // fall through
 
+            default: // valid due to while loop above.
             case '\0':
             case ':':
+                l = r - r_start; r_start = r;
                 if (f->max_m < m) f->max_m = m;
                 if (f->max_l < l) f->max_l = l;
                 if (f->is_gt && f->max_g < g) f->max_g = g;
@@ -2764,7 +2871,7 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                 break;
             }
             if ( r>=end ) break;
-            r++; l++;
+            r++;
         }
     end_for:
         v->n_sample++;
@@ -2777,23 +2884,25 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
 
 // allocate memory for arrays
 static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
-                                   char *p, char *q, fmt_aux_t *fmt) {
+                                   const char *p, const char *q,
+                                   fmt_aux_t *fmt) {
     kstring_t *mem = (kstring_t*)&h->mem;
 
     int j;
     for (j = 0; j < v->n_fmt; ++j) {
         fmt_aux_t *f = &fmt[j];
         if ( !f->max_m ) f->max_m = 1;  // omitted trailing format field
+
         if ((f->y>>4&0xf) == BCF_HT_STR) {
             f->size = f->is_gt? f->max_g << 2 : f->max_l;
         } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) {
             f->size = f->max_m << 2;
-        } else
-        {
+        } else {
             hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
             v->errcode |= BCF_ERR_TAG_INVALID;
             return -1;
         }
+
         if (align_mem(mem) < 0) {
             hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1);
             v->errcode |= BCF_ERR_LIMITS;
@@ -2827,15 +2936,17 @@ static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
     return 0;
 }
 
-// fill the sample fields; at beginning of the loop
+// Fill the sample fields
 static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
-                                  char *p, char *q, fmt_aux_t *fmt) {
-    static int extreme_val_warned = 0; 
+                                  const char *p, const char *q, fmt_aux_t *fmt) {
+    static int extreme_val_warned = 0;
     int n_sample_ori = -1;
-    // t points to the first char of a format
-    char *t = q + 1;
+    // At beginning of the loop t points to the first char of a format
+    const char *t = q + 1;
     int m = 0;   // m: sample id
-    char *end = s->s + s->l;
+    const int nsamples = bcf_hdr_nsamples(h);
+
+    const char *end = s->s + s->l;
     while ( t<end )
     {
         // can we skip some samples?
@@ -2849,21 +2960,25 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                 continue;
             }
         }
-        if ( m == bcf_hdr_nsamples(h) ) break;
+        if ( m == nsamples ) break;
 
         int j = 0; // j-th format field, m-th sample
         while ( t < end )
         {
             fmt_aux_t *z = &fmt[j++];
+            const int htype = z->y>>4&0xf;
             if (!z->buf) {
                 hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos,
                               z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
                 v->errcode |= BCF_ERR_LIMITS;
                 return -1;
             }
-            if ((z->y>>4&0xf) == BCF_HT_STR) {
+
+            if (htype == BCF_HT_STR) {
                 int l;
-                if (z->is_gt) { // genotypes
+                if (z->is_gt) {
+                    // Genotypes.
+                    // <val>([|/]<val>)+... where <val> is [0-9]+ or ".".
                     int32_t is_phased = 0;
                     uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m);
                     uint32_t unreadable = 0;
@@ -2873,9 +2988,19 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                         if (*t == '.') {
                             ++t, x[l++] = is_phased;
                         } else {
-                            char *tt = t;
-                            uint32_t val = hts_str2uint(t, &t, sizeof(val) * CHAR_MAX - 2, &overflow);
-                            unreadable |= tt == t;
+                            const char *tt = t;
+                            uint32_t val;
+                            // Or "v->n_allele < 10", but it doesn't
+                            // seem to be any faster and this feels safer.
+                            if (*t >= '0' && *t <= '9' &&
+                                !(t[1] >= '0' && t[1] <= '9')) {
+                                val = *t++ - '0';
+                            } else {
+                                val = hts_str2uint(t, (char **)&t,
+                                                   sizeof(val) * CHAR_MAX - 2,
+                                                   &overflow);
+                                unreadable |= tt == t;
+                            }
                             if (max < val) max = val;
                             x[l++] = (val + 1) << 1 | is_phased;
                         }
@@ -2892,13 +3017,20 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                         return -1;
                     }
                     if ( !l ) x[l++] = 0;   // An empty field, insert missing value
-                    for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
+                    for (; l < z->size>>2; ++l)
+                        x[l] = bcf_int32_vector_end;
+
                 } else {
+                    // Otherwise arbitrary strings
                     char *x = (char*)z->buf + z->size * (size_t)m;
-                    for (l = 0; *t != ':' && *t; ++t) x[l++] = *t;
-                    for (; l < z->size; ++l) x[l] = 0;
+                    for (l = 0; *t != ':' && *t; ++t)
+                        x[l++] = *t;
+                    if (z->size > l)
+                        memset(&x[l], 0, (z->size-l) * sizeof(*x));
                 }
-            } else if ((z->y>>4&0xf) == BCF_HT_INT) {
+
+            } else if (htype == BCF_HT_INT) {
+                // One or more integers in an array
                 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
                 int l;
                 for (l = 0;; ++t) {
@@ -2912,7 +3044,8 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                         {
                             if ( !extreme_val_warned )
                             {
-                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
+                                hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos,
+                                                h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1);
                                 extreme_val_warned = 1;
                             }
                             tmp_val = bcf_int32_missing;
@@ -2922,9 +3055,13 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                     }
                     if (*t != ',') break;
                 }
-                if ( !l ) x[l++] = bcf_int32_missing;
-                for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
-            } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
+                if ( !l )
+                    x[l++] = bcf_int32_missing;
+                for (; l < z->size>>2; ++l)
+                    x[l] = bcf_int32_vector_end;
+
+            } else if (htype == BCF_HT_REAL) {
+                // One of more floating point values in an array
                 float *x = (float*)(z->buf + z->size * (size_t)m);
                 int l;
                 for (l = 0;; ++t) {
@@ -2944,10 +3081,13 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
                     }
                     if (*t != ',') break;
                 }
-                if ( !l ) bcf_float_set_missing(x[l++]);    // An empty field, insert missing value
-                for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
+                if ( !l )
+                    // An empty field, insert missing value
+                    bcf_float_set_missing(x[l++]);
+                for (; l < z->size>>2; ++l)
+                    bcf_float_set_vector_end(x[l]);
             } else {
-                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1);
+                hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1);
                 v->errcode |= BCF_ERR_TAG_INVALID;
                 return -1;
             }
@@ -2968,24 +3108,28 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
             }
         }
 
-        for (; j < v->n_fmt; ++j) { // fill end-of-vector values
+        // fill end-of-vector values
+        for (; j < v->n_fmt; ++j) {
             fmt_aux_t *z = &fmt[j];
+            const int htype = z->y>>4&0xf;
             int l;
-            if ((z->y>>4&0xf) == BCF_HT_STR) {
+            if (htype == BCF_HT_STR) {
                 if (z->is_gt) {
                     int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
                     if (z->size) x[0] = bcf_int32_missing;
                     for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
                 } else {
                     char *x = (char*)z->buf + z->size * (size_t)m;
-                    if ( z->size ) x[0] = '.';
-                    for (l = 1; l < z->size; ++l) x[l] = 0;
+                    if ( z->size ) {
+                        x[0] = '.';
+                        memset(&x[1], 0, (z->size-1) * sizeof(*x));
+                    }
                 }
-            } else if ((z->y>>4&0xf) == BCF_HT_INT) {
+            } else if (htype == BCF_HT_INT) {
                 int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m);
                 x[0] = bcf_int32_missing;
                 for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end;
-            } else if ((z->y>>4&0xf) == BCF_HT_REAL) {
+            } else if (htype == BCF_HT_REAL) {
                 float *x = (float*)(z->buf + z->size * (size_t)m);
                 bcf_float_set_missing(x[0]);
                 for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]);
@@ -3000,7 +3144,7 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
 
 // write individual genotype information
 static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
-                                char *p, char *q, fmt_aux_t *fmt) {
+                                const char *p, const char *q, fmt_aux_t *fmt) {
     kstring_t *str = &v->indiv;
     int i;
     if (v->n_sample > 0) {
@@ -3050,7 +3194,8 @@ static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) {
 }
 
 // p,q is the start and the end of the FORMAT field
-static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q)
+static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v,
+                            char *p, char *q)
 {
     if ( !bcf_hdr_nsamples(h) ) return 0;
     kstring_t *mem = (kstring_t*)&h->mem;
@@ -3067,6 +3212,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p
     if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0)
         return -1;
 
+    // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is
+    // stored as per-type arrays AAA... BBB... CCC...  This is basically
+    // a data rotation or pivot.
+
+    // The size of elements in the array grow to their maximum needed,
+    // permitting fast random access.  This means however we have to first
+    // scan the whole FORMAT line to find the maximum of each type, and
+    // then scan it again to find the store the data.
+    // We break this down into compute-max, allocate, fill-out-buffers
+
+    // TODO: ?
+    // The alternative would be to pivot on the first pass, with fixed
+    // size entries for numerics and concatenated strings otherwise, also
+    // tracking maximum sizes.  Then on a second pass we reallocate and
+    // copy the data again to a uniformly sized array.  Two passes through
+    // memory, but without doubling string parsing.
+
     // compute max
     if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0)
         return -1;
@@ -3174,25 +3336,50 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
 
     v->n_info = 0;
     if (*(q-1) == ';') *(q-1) = 0;
-    for (r = key = p;; ++r) {
-        int c;
-        char *val, *end;
-        if (*r != ';' && *r != '=' && *r != 0) continue;
-        if (v->n_info == UINT16_MAX) {
-            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
-                          bcf_seqname_safe(h,v), v->pos+1);
-            v->errcode |= BCF_ERR_LIMITS;
-            goto fail;
+
+    // Parsing consists of processing key=value; or key; so we only need
+    // to track the next = and ; symbols, plus end of string.
+    // We could process 1 char at a time, but this is inefficient compared
+    // to strchr which can process word by word.  Hence even doing two strchrs
+    // is quicker than byte by byte processing.
+    char *next_equals = strchr(p, '=');
+    char *next_semicolon = strchr(p, ';');
+    r = p;
+    while (*r) {
+        // Look for key=value or just key.
+        char *val, *end, *from;
+        key = r;
+        if (next_equals && (!next_semicolon || next_equals < next_semicolon)) {
+            // key=value;
+            *next_equals = 0;
+            from = val = next_equals+1;
+            // Prefetching d->keys[hash] helps here provided we avoid
+            // computing hash twice (needs API change), but not universally.
+            // It may be significant for other applications though so it's
+            // something to consider for the future.
+        } else {
+            // key;
+            val = NULL;
+            from = key;
+        }
+
+        // Update location of next ; and if used =
+        if (next_semicolon) {
+            end = next_semicolon;
+            r = end+1;
+            next_semicolon = strchr(end+1, ';');
+            if (val)
+                next_equals = strchr(end, '=');
+        } else {
+            // find nul location, starting from key or val.
+            r = end = from + strlen(from);
         }
-        val = end = 0;
-        c = *r; *r = 0;
-        if (c == '=') {
-            val = r + 1;
-            for (end = val; *end != ';' && *end != 0; ++end);
-            c = *end; *end = 0;
-        } else end = r;
-        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
+
+        *end = 0;
+
+        // We've now got key and val (maybe NULL), so process it
         k = kh_get(vdict, d, key);
+        // 15 is default (unknown) type. See bcf_idinfo_def at top
         if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
         {
             hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
@@ -3294,7 +3481,8 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
                 } else {
                     bcf_enc_vint(str, n_val, a_val, -1);
                 }
-                if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && strcmp(key, "END") == 0)
+                if (n_val==1 && (val1!=bcf_int32_missing || is_int64) &&
+                    memcmp(key, "END", 4) == 0)
                 {
                     if ( val1 <= v->pos )
                     {
@@ -3320,9 +3508,6 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
                 bcf_enc_vfloat(str, n_val, val_f);
             }
         }
-        if (c == 0) break;
-        r = end;
-        key = r + 1;
     }
 
     free(a_val);
@@ -3335,94 +3520,162 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
 
 int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v)
 {
-    int i = 0, ret = -2, overflow = 0;
+    int ret = -2, overflow = 0;
     char *p, *q, *r, *t;
     kstring_t *str;
     khint_t k;
     ks_tokaux_t aux;
 
+//#define NOT_DOT(p) strcmp((p), ".")
+//#define NOT_DOT(p) (!(*p == '.' && !p[1]))
+//#define NOT_DOT(p) ((*p) != '.' || (p)[1])
+//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2))
+#define NOT_DOT(p) (memcmp(p, ".\0", 2))
+
     if (!s || !h || !v || !(s->s))
         return ret;
 
     // Assumed in lots of places, but we may as well spot this early
     assert(sizeof(float) == sizeof(int32_t));
 
+    // Ensure string we parse has space to permit some over-flow when during
+    // parsing.  Eg to do memcmp(key, "END", 4) in vcf_parse_info over
+    // the more straight forward looking strcmp, giving a speed advantage.
+    if (ks_resize(s, s->l+4) < 0)
+        return -1;
+
+    // Force our memory to be initialised so we avoid the technicality of
+    // undefined behaviour in using a 4-byte memcmp.  (The reality is this
+    // almost certainly is never detected by the compiler so has no impact,
+    // but equally so this code has minimal (often beneficial) impact on
+    // performance too.)
+    s->s[s->l+0] = 0;
+    s->s[s->l+1] = 0;
+    s->s[s->l+2] = 0;
+    s->s[s->l+3] = 0;
+
     bcf_clear1(v);
     str = &v->shared;
     memset(&aux, 0, sizeof(ks_tokaux_t));
-    for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) {
-        q = (char*)aux.p;
-        *q = 0;
-        if (i == 0) { // CHROM
-            vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
-            k = kh_get(vdict, d, p);
-            if (k == kh_end(d))
-            {
-                hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
-                v->errcode = BCF_ERR_CTG_UNDEF;
-                if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
-                    hts_log_error("Could not add dummy header for contig '%s'", p);
-                    v->errcode |= BCF_ERR_CTG_INVALID;
+
+    // CHROM
+    if (!(p = kstrtok(s->s, "\t", &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG];
+    k = kh_get(vdict, d, p);
+    if (k == kh_end(d)) {
+        hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p);
+        v->errcode = BCF_ERR_CTG_UNDEF;
+        if ((k = fix_chromosome(h, d, p)) == kh_end(d)) {
+            hts_log_error("Could not add dummy header for contig '%s'", p);
+            v->errcode |= BCF_ERR_CTG_INVALID;
+            goto err;
+        }
+    }
+    v->rid = kh_val(d, k).id;
+
+    // POS
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    overflow = 0;
+    char *tmp = p;
+    v->pos = hts_str2uint(p, &p, 63, &overflow);
+    if (overflow) {
+        hts_log_error("Position value '%s' is too large", tmp);
+        goto err;
+    } else if ( *p ) {
+        hts_log_error("Could not parse the position '%s'", tmp);
+        goto err;
+    } else {
+        v->pos -= 1;
+    }
+    if (v->pos >= INT32_MAX)
+        v->unpacked |= BCF_IS_64BIT;
+
+    // ID
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p);
+    else bcf_enc_size(str, 0, BCF_BT_CHAR);
+
+    // REF
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    bcf_enc_vchar(str, q - p, p);
+    v->n_allele = 1, v->rlen = q - p;
+
+    // ALT
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) {
+        for (r = t = p;; ++r) {
+            if (*r == ',' || *r == 0) {
+                if (v->n_allele == UINT16_MAX) {
+                    hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
+                                  bcf_seqname_safe(h,v), v->pos+1);
+                    v->errcode |= BCF_ERR_LIMITS;
                     goto err;
                 }
+                bcf_enc_vchar(str, r - t, t);
+                t = r + 1;
+                ++v->n_allele;
             }
-            v->rid = kh_val(d, k).id;
-        } else if (i == 1) { // POS
-            overflow = 0;
-            char *tmp = p;
-            v->pos = hts_str2uint(p, &p, 63, &overflow);
-            if (overflow) {
-                hts_log_error("Position value '%s' is too large", tmp);
-                goto err;
-            } else if ( *p ) {
-                hts_log_error("Could not parse the position '%s'", tmp);
-                goto err;
-            } else {
-                v->pos -= 1;
-            }
-            if (v->pos >= INT32_MAX)
-                v->unpacked |= BCF_IS_64BIT;
-        } else if (i == 2) { // ID
-            if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p);
-            else bcf_enc_size(str, 0, BCF_BT_CHAR);
-        } else if (i == 3) { // REF
-            bcf_enc_vchar(str, q - p, p);
-            v->n_allele = 1, v->rlen = q - p;
-        } else if (i == 4) { // ALT
-            if (strcmp(p, ".")) {
-                for (r = t = p;; ++r) {
-                    if (*r == ',' || *r == 0) {
-                        if (v->n_allele == UINT16_MAX) {
-                            hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos,
-                                          bcf_seqname_safe(h,v), v->pos+1);
-                            v->errcode |= BCF_ERR_LIMITS;
-                            goto err;
-                        }
-                        bcf_enc_vchar(str, r - t, t);
-                        t = r + 1;
-                        ++v->n_allele;
-                    }
-                    if (r == q) break;
-                }
-            }
-        } else if (i == 5) { // QUAL
-            if (strcmp(p, ".")) v->qual = atof(p);
-            else bcf_float_set_missing(v->qual);
-            if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
-        } else if (i == 6) { // FILTER
-            if (strcmp(p, ".")) {
-                if (vcf_parse_filter(str, h, v, p, q)) goto err;
-            } else bcf_enc_vint(str, 0, 0, -1);
-            if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
-        } else if (i == 7) { // INFO
-            if (strcmp(p, ".")) {
-                if (vcf_parse_info(str, h, v, p, q)) goto err;
-            }
-            if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
-        } else if (i == 8) {// FORMAT
-            return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
+            if (r == q) break;
+        }
+    }
+
+    // QUAL
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) v->qual = atof(p);
+    else bcf_float_set_missing(v->qual);
+    if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR
+
+    // FILTER
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) {
+        if (vcf_parse_filter(str, h, v, p, q)) {
+            goto err;
+        }
+    } else bcf_enc_vint(str, 0, 0, -1);
+    if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT
+
+    // INFO
+    if (!(p = kstrtok(0, 0, &aux)))
+        goto err;
+    *(q = (char*)aux.p) = 0;
+
+    if (NOT_DOT(p)) {
+        if (vcf_parse_info(str, h, v, p, q)) {
+            goto err;
         }
     }
+    if ( v->max_unpack && !(v->max_unpack>>3) ) goto end;
+
+    // FORMAT; optional
+    p = kstrtok(0, 0, &aux);
+    if (p) {
+        *(q = (char*)aux.p) = 0;
+
+        return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2;
+    } else {
+        return 0;
+    }
 
  end:
     ret = 0;

From ac70212aefe0d27662dc48bcce4302057dd5f507 Mon Sep 17 00:00:00 2001
From: James Bonfield <jkb@sanger.ac.uk>
Date: Wed, 9 Aug 2023 15:19:59 +0100
Subject: [PATCH 3/3] Revert most of the vcf_parse_info improvements.

The two we keep are the internal while loop to find the next ; or =
instead of iterating back in the outer for loop, and memcmp instead of
strcmp for "END".

The strchr changes do help glibc on excessively long INFO tokens, seen
in the GIAB truth set and GNOMAD files, but they have no impact on
most mainstream VCF outputs.  Furthermore, other C libraries, such as
MUSL, are considerably slowed down by the use of strchr.  Hence this
isn't a particularly robust or warranted change.
---
 vcf.c | 65 ++++++++++++++++++++---------------------------------------
 1 file changed, 22 insertions(+), 43 deletions(-)

diff --git a/vcf.c b/vcf.c
index ce54dc227..afd60fcaa 100644
--- a/vcf.c
+++ b/vcf.c
@@ -3336,50 +3336,26 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
 
     v->n_info = 0;
     if (*(q-1) == ';') *(q-1) = 0;
-
-    // Parsing consists of processing key=value; or key; so we only need
-    // to track the next = and ; symbols, plus end of string.
-    // We could process 1 char at a time, but this is inefficient compared
-    // to strchr which can process word by word.  Hence even doing two strchrs
-    // is quicker than byte by byte processing.
-    char *next_equals = strchr(p, '=');
-    char *next_semicolon = strchr(p, ';');
-    r = p;
-    while (*r) {
-        // Look for key=value or just key.
-        char *val, *end, *from;
-        key = r;
-        if (next_equals && (!next_semicolon || next_equals < next_semicolon)) {
-            // key=value;
-            *next_equals = 0;
-            from = val = next_equals+1;
-            // Prefetching d->keys[hash] helps here provided we avoid
-            // computing hash twice (needs API change), but not universally.
-            // It may be significant for other applications though so it's
-            // something to consider for the future.
-        } else {
-            // key;
-            val = NULL;
-            from = key;
-        }
-
-        // Update location of next ; and if used =
-        if (next_semicolon) {
-            end = next_semicolon;
-            r = end+1;
-            next_semicolon = strchr(end+1, ';');
-            if (val)
-                next_equals = strchr(end, '=');
-        } else {
-            // find nul location, starting from key or val.
-            r = end = from + strlen(from);
+    for (r = key = p;; ++r) {
+        int c;
+        char *val, *end;
+        while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++;
+        if (v->n_info == UINT16_MAX) {
+            hts_log_error("Too many INFO entries at %s:%"PRIhts_pos,
+                          bcf_seqname_safe(h,v), v->pos+1);
+            v->errcode |= BCF_ERR_LIMITS;
+            goto fail;
         }
+        val = end = NULL;
+        c = *r; *r = 0;
+        if (c == '=') {
+            val = r + 1;
 
-        *end = 0;
-
-        // We've now got key and val (maybe NULL), so process it
+            for (end = val; *end != ';' && *end != 0; ++end);
+            c = *end; *end = 0;
+        } else end = r;
+        if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; }  // faulty VCF, ";;" in the INFO
         k = kh_get(vdict, d, key);
-        // 15 is default (unknown) type. See bcf_idinfo_def at top
         if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15)
         {
             hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key);
@@ -3481,8 +3457,8 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
                 } else {
                     bcf_enc_vint(str, n_val, a_val, -1);
                 }
-                if (n_val==1 && (val1!=bcf_int32_missing || is_int64) &&
-                    memcmp(key, "END", 4) == 0)
+                if (n_val==1 && (val1!=bcf_int32_missing || is_int64)
+                    && memcmp(key, "END", 4) == 0)
                 {
                     if ( val1 <= v->pos )
                     {
@@ -3508,6 +3484,9 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p
                 bcf_enc_vfloat(str, n_val, val_f);
             }
         }
+        if (c == 0) break;
+        r = end;
+        key = r + 1;
     }
 
     free(a_val);