diff --git a/Makefile b/Makefile index 0bbb078d6..99d18a660 100644 --- a/Makefile +++ b/Makefile @@ -872,7 +872,7 @@ test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) test/test_faidx.o: test/test_faidx.c config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) -test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) +test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_vcfutils_h) $(htslib_kbitset_h) $(htslib_kstring_h) $(htslib_kseq_h) test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_hts_defs_h) $(htslib_synced_bcf_reader_h) $(htslib_hts_h) $(htslib_vcf_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) diff --git a/htslib/vcf.h b/htslib/vcf.h index 105d70d70..36d45ad82 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -65,11 +65,21 @@ extern "C" { #define BCF_HT_STR 3 #define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! -#define BCF_VL_FIXED 0 // variable length -#define BCF_VL_VAR 1 -#define BCF_VL_A 2 -#define BCF_VL_G 3 -#define BCF_VL_R 4 +// Values for INFO / FORMAT "Number=" fields +#define BCF_VL_FIXED 0 ///< Integer defining a fixed number of items +#define BCF_VL_VAR 1 ///< Generic variable length ("Number=.") +#define BCF_VL_A 2 ///< One value per alternate allele +#define BCF_VL_G 3 ///< One value for each possible genotype +#define BCF_VL_R 4 ///< One value for each allele, including the reference + +// The following was introduced in VCFv4.4 and is only valid for FORMAT header lines +#define BCF_VL_P 5 ///< One value for each allele value defined in GT + +// The following were introduced in VCFv4.5 and are only valid for FORMAT header lines +#define BCF_VL_LA 6 ///< As BCF_VL_A, but only alt alleles listed in LAA are considered present +#define BCF_VL_LG 7 ///< As BCF_VL_G, but only alt alleles listed in LAA are considered present +#define BCF_VL_LR 8 ///< As BCF_VL_R, but only alt alleles listed in LAA are considered present +#define BCF_VL_M 9 ///< One value for each possible base modification for the corresponding ChEBI ID /* === Dictionary === diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index 90dbc080b..2de39c72e 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -30,6 +30,8 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/hts.h" #include "../htslib/vcf.h" +#include "../htslib/vcfutils.h" +#include "../htslib/kbitset.h" #include "../htslib/kstring.h" #include "../htslib/kseq.h" @@ -802,6 +804,246 @@ void test_rlen_values(void) hts_set_log_level(logging); } +static void test_vl_types(void) +{ + const char *test_vcf = "data:," + "##fileformat=VCFv4.5\n" + "##FILTER=\n" + "##contig=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tAAA\n"; + + typedef struct { + const char *id; + int type; + int expected_vl_code; + int expected_number; + } expected_types_t; + + expected_types_t expected[] = { + { "FIXED_1_INFO", BCF_HL_INFO, BCF_VL_FIXED, 1 }, + { "FIXED_4_INFO", BCF_HL_INFO, BCF_VL_FIXED, 4 }, + { "VL_DOT_INFO", BCF_HL_INFO, BCF_VL_VAR, 0xfffff }, + { "VL_A_INFO", BCF_HL_INFO, BCF_VL_A, 0xfffff }, + { "VL_G_INFO", BCF_HL_INFO, BCF_VL_G, 0xfffff }, + { "VL_R_INFO", BCF_HL_INFO, BCF_VL_R, 0xfffff }, + { "FIXED_1_FMT", BCF_HL_FMT, BCF_VL_FIXED, 1 }, + { "FIXED_4_FMT", BCF_HL_FMT, BCF_VL_FIXED, 4 }, + { "VL_DOT_FMT", BCF_HL_FMT, BCF_VL_VAR, 0xfffff }, + { "VL_A_FMT", BCF_HL_FMT, BCF_VL_A, 0xfffff }, + { "VL_G_FMT", BCF_HL_FMT, BCF_VL_G, 0xfffff }, + { "VL_R_FMT", BCF_HL_FMT, BCF_VL_R, 0xfffff }, + { "VL_P_FMT", BCF_HL_FMT, BCF_VL_P, 0xfffff }, + { "VL_LA_FMT", BCF_HL_FMT, BCF_VL_LA, 0xfffff }, + { "VL_LG_FMT", BCF_HL_FMT, BCF_VL_LG, 0xfffff }, + { "VL_LR_FMT", BCF_HL_FMT, BCF_VL_LR, 0xfffff }, + { "VL_M_FMT", BCF_HL_FMT, BCF_VL_M, 0xfffff }, + }; + + htsFile *fp = hts_open(test_vcf, "r"); + if (!fp) error("Failed to open test data\n"); + bcf_hdr_t *hdr = bcf_hdr_read(fp); + if (!hdr) error("Failed to read BCF header\n"); + size_t i; + for (i = 0; i < sizeof(expected)/sizeof(expected[0]); i++) + { + int id_num = bcf_hdr_id2int(hdr, BCF_DT_ID, expected[i].id); + if (id_num < 0) + { + error("Couldn't look up VCF header ID %s\n", expected[i].id); + } + int vl_code = bcf_hdr_id2length(hdr, expected[i].type, id_num); + if (vl_code != expected[i].expected_vl_code) + { + const char *length_types[] = { + "BCF_VL_FIXED", "BCF_VL_VAR", "BCF_VL_A", "BCF_VL_G", + "BCF_VL_R", "BCF_VL_P", "BCF_VL_LA", "BCF_VL_LG", + "BCF_VL_LR", "BCF_VL_M" + }; + if (vl_code >= 0 && vl_code < sizeof(length_types)/sizeof(length_types[0])) + { + error("Unexpected length code for %s: expected %s got %s\n", + expected[i].id, + length_types[expected[i].expected_vl_code], + length_types[vl_code]); + } + else + { + error("Unexpected length code for %s: expected %s got %d\n", + expected[i].id, + length_types[expected[i].expected_vl_code], vl_code); + } + } + int num = bcf_hdr_id2number(hdr, expected[i].type, id_num); + if (num != expected[i].expected_number) + { + error("Unexpected number for %s: expected %d%s got %d%s\n", + expected[i].id, + expected[i].expected_number, + (expected[i].expected_number == 0xfffff + ? " (= code for not fixed)" : ""), + num, + num == 0xfffff ? " (= code for not fixed)" : ""); + } + } + bcf_hdr_destroy(hdr); + bcf_close(fp); +} + +static int read_vcf_line(const char *line, bcf_hdr_t *hdr, bcf1_t *rec, + kstring_t *kstr) +{ + int ret; + if (kputsn(line, strlen(line), ks_clear(kstr)) < 0) + return -1; + + ret = vcf_parse(kstr, hdr, rec); + if (ret != 0) { + fprintf(stderr, "vcf_parse(\"%s\", hdr, rec) returned %d\n", + ks_c_str(kstr), ret); + } + return ret; +} + +static void chomp(kstring_t *kstr) +{ + if (kstr->l < 1) + return; + if (kstr->s[kstr->l - 1] == '\n') + kstr->s[--kstr->l] = '\0'; +} + +// Test bcf_remove_allele_set() +void test_bcf_remove_allele_set(void) +{ + char header[] = "##fileformat=VCFv4.5\n" + "##FILTER=\n" + "##contig=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##INFO=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##FORMAT=\n" + "##ALT=\n" + "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tAAA\tBBB\tCCC\n"; + const char * inputs[] = { + "5\t110285\t.\tT\tC,<*>\t.\tPASS\tAC=1,0;AD=6,5,0;AF=0.99,0.01;VL_A_STR_INFO=alt_c,alt_nonref;VL_R_STR_INFO=ref,alt_c,alt_nonref\tGT:AD:EC:PL:VL_A_STR_FMT:VL_G_STR_FMT:VL_R_STR_FMT\t.:.:.:.:.:.:.\t0/1:6,5,0:4,0:114,0,15,35,73,113:alt_c,alt_nonref:gt_00,gt_01,gt_11,gt_02,gt_12,gt_22:ref,alt_c,alt_nonref\t.:.:.:.:.:.:.", + "5\t110290\t.\tT\tC,A\t.\tPASS\tAC=90,1;AD=6,5,6;AF=0.009,0.0001;VL_A_STR_INFO=alt_c,alt_a;VL_R_STR_INFO=ref,alt_c,alt_a\tGT:LAA:LAD:LEC:LPL:VL_LA_STR_FMT:VL_LG_STR_FMT:VL_LR_STR_FMT\t0/0:.:3:.:0:.:gt_00:ref\t0/1:1,2:3,2,0:44,27:114,0,15,35,73,113:alt_c,alt_a:gt_00,gt_01,gt_11,gt_02,gt_12,gt_22:ref,alt_c,alt_a\t1/1:1:0,3:46:110,15,0:alt_c:gt_00,gt_01,gt_11:ref,alt_c", + "5\t110350\t.\tT\t,\t.\tPASS\tIMPRECISE;SVLEN=100,200;CIEND=-50,50,-25,25;CIPOS=-10,10,-20,20\tGT\t0/1\t0/1\t0/1", + "5\t110500\t.\tT\t,\t.\tPASS\tIMPRECISE;SVLEN=50,100;CILEN=0,25,-25,25;CN=2,4;CICN=-0.5,1,-1.5,1.5\tGT\t0/1\t0/1\t0/1", + "5\t110700\t.\tA\t,\t.\tPASS\tMEINFO=AluY,1,260,+,FLAM_C,1,110,-;METRANS=1,94820,95080,+,1,129678,129788,-\tGT\t0/1\t0/1\t0/1", + "5\t112000\t.\tC\t,\t.\tPASS\tRN=2,1;RUS=CAG,TTG,CA;RUL=3,3,2;RB=12,6,6;RUC=4,2,3;RUB=3,3,3,3,3,3,2,2,2;SVLEN=18,6", + "5\t113000\t.\tT\tC,A\t.\tPASS\tAC=90,1;AD=6,5,6;AF=0.009,0.0001;VL_A_STR_INFO=alt_c,alt_a;VL_R_STR_INFO=ref,alt_c,alt_a\tGT:LAA:LAD:LEC:LPL:VL_LA_STR_FMT:VL_LG_STR_FMT:VL_LR_STR_FMT\t0/0:.:3:.:0:.:gt_00:ref\t0/1:1,2:3,2,0:44,27:114,0,15,35,73,113:alt_c,alt_a:gt_00,gt_01,gt_11,gt_02,gt_12,gt_22:ref,alt_c,alt_a\t1/1:1:0,3:46:110,15,0:alt_c:gt_00,gt_01,gt_11:ref,alt_c", + "5\t114000\t.\tT\tC,A\t.\tPASS\tAC=90,1;AD=6,5,6;AF=0.009,0.0001;VL_A_STR_INFO=alt_c,alt_a;VL_R_STR_INFO=ref,alt_c,alt_a\tGT:LAA:LAD:LEC:LPL:VL_LA_STR_FMT:VL_LG_STR_FMT:VL_LR_STR_FMT\t0/0:.:3:.:0:.:gt_00:ref\t0/1:1,2:3,2,0:44,27:114,0,15,35,73,113:alt_c,alt_a:gt_00,gt_01,gt_11,gt_02,gt_12,gt_22:ref,alt_c,alt_a\t1/1:1:0,3:46:110,15,0:alt_c:gt_00,gt_01,gt_11:ref,alt_c", + "5\t115000\t.\tC\t,\t.\tPASS\tRN=2,1;RUS=CAG,TTG,CA;RUL=3,3,2;RB=12,6,6;RUC=4,2,3;RUB=3,3,3,3,3,3,2,2,2;SVLEN=18,6", + }; + const char * expected[] = { + // 2nd allele removed + "5\t110285\t.\tT\tC\t.\tPASS\tAC=1;AD=6,5;AF=0.99;VL_A_STR_INFO=alt_c;VL_R_STR_INFO=ref,alt_c\tGT:AD:EC:PL:VL_A_STR_FMT:VL_G_STR_FMT:VL_R_STR_FMT\t.:.:.:.:.:.:.\t0/1:6,5:4:114,0,15:alt_c:gt_00,gt_01,gt_11:ref,alt_c\t.:.:.:.:.:.:.", + "5\t110290\t.\tT\tC\t.\tPASS\tAC=90;AD=6,5;AF=0.009;VL_A_STR_INFO=alt_c;VL_R_STR_INFO=ref,alt_c\tGT:LAA:LAD:LEC:LPL:VL_LA_STR_FMT:VL_LG_STR_FMT:VL_LR_STR_FMT\t0/0:.:3:.:0:.:gt_00:ref\t0/1:1:3,2:44:114,0,15:alt_c:gt_00,gt_01,gt_11:ref,alt_c\t1/1:1:0,3:46:110,15,0:alt_c:gt_00,gt_01,gt_11:ref,alt_c", + "5\t110350\t.\tT\t\t.\tPASS\tIMPRECISE;SVLEN=100;CIEND=-50,50;CIPOS=-10,10\tGT\t0/1\t0/1\t0/1", + "5\t110500\t.\tT\t\t.\tPASS\tIMPRECISE;SVLEN=50;CILEN=0,25;CN=2;CICN=-0.5,1\tGT\t0/1\t0/1\t0/1", + "5\t110700\t.\tA\t\t.\tPASS\tMEINFO=AluY,1,260,+;METRANS=1,94820,95080,+\tGT\t0/1\t0/1\t0/1", + "5\t112000\t.\tC\t\t.\tPASS\tRN=2;RUS=CAG,TTG;RUL=3,3;RB=12,6;RUC=4,2;RUB=3,3,3,3,3,3;SVLEN=18", + // 1st allele removed + "5\t113000\t.\tT\tA\t.\tPASS\tAC=1;AD=6,6;AF=0.0001;VL_A_STR_INFO=alt_a;VL_R_STR_INFO=ref,alt_a\tGT:LAA:LAD:LEC:LPL:VL_LA_STR_FMT:VL_LG_STR_FMT:VL_LR_STR_FMT\t0/0:.:3:.:0:.:gt_00:ref\t0/.:1:3,0:27:114,35,113:alt_a:gt_00,gt_02,gt_22:ref,alt_a\t./.:.:0:.:110:.:gt_00:ref", + // Both alleles removed + "5\t114000\t.\tT\t.\t.\tPASS\tAD=6;VL_R_STR_INFO=ref\tGT:LAA:LAD:LEC:LPL:VL_LA_STR_FMT:VL_LG_STR_FMT:VL_LR_STR_FMT\t0/0:.:3:.:0:.:gt_00:ref\t0/.:.:3:.:114:.:gt_00:ref\t./.:.:0:.:110:.:gt_00:ref", + "5\t115000\t.\tC\t.\t.\tPASS\t.", + }; + + kstring_t kstr = KS_INITIALIZE; + + bcf_hdr_t *hdr = bcf_hdr_init("r"); + bcf1_t *rec = bcf_init(); + struct kbitset_t *rm_set = kbs_init(3); + size_t i; + + if (!hdr) + error("bcf_hdr_init() failed\n"); + + if (!rec) + error("bcf_init() failed\n"); + + if (!rm_set) + error("kbs_init() failed\n"); + + check0(ks_resize(&kstr, 1000)); + check0(bcf_hdr_parse(hdr, header)); + for (i = 0; i < sizeof(inputs)/sizeof(inputs[0]); i++) + { + check0(read_vcf_line(inputs[i], hdr, rec, &kstr)); + kbs_clear(rm_set); + if (rec->pos == 113000 - 1) { + kbs_insert(rm_set, 1); + } else if (rec->pos >= 114000 - 1) { + kbs_insert(rm_set, 1); + kbs_insert(rm_set, 2); + } else { + kbs_insert(rm_set, 2); + } + check0(bcf_remove_allele_set(hdr, rec, rm_set)); + check0(vcf_format(hdr, rec, ks_clear(&kstr))); + chomp(&kstr); + if (strcmp(expected[i], ks_c_str(&kstr)) != 0) + { + error("bcf_remove_allele_set() output differs\nExpected:\n%s\nGot:\n%s\n", + expected[i], ks_c_str(&kstr)); + } + } + bcf_destroy(rec); + bcf_hdr_destroy(hdr); + ks_free(&kstr); + kbs_destroy(rm_set); +} + int main(int argc, char **argv) { char *fname = argc>1 ? argv[1] : "rmme.bcf"; @@ -815,9 +1057,11 @@ int main(int argc, char **argv) iterator(fname); // additional tests. quiet unless there's a failure. + test_vl_types(); test_get_info_values(fname); test_invalid_end_tag(); test_open_format(); test_rlen_values(); + test_bcf_remove_allele_set(); return 0; } diff --git a/vcf.c b/vcf.c index a03cfde7c..a274a8cd6 100644 --- a/vcf.c +++ b/vcf.c @@ -913,14 +913,20 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) } else if ( !strcmp(hrec->keys[i], "Number") ) { + int is_fmt = hrec->type == BCF_HL_FMT; if ( !strcmp(hrec->vals[i],"A") ) var = BCF_VL_A; else if ( !strcmp(hrec->vals[i],"R") ) var = BCF_VL_R; else if ( !strcmp(hrec->vals[i],"G") ) var = BCF_VL_G; else if ( !strcmp(hrec->vals[i],".") ) var = BCF_VL_VAR; + else if ( is_fmt && !strcmp(hrec->vals[i],"P") ) var = BCF_VL_P; + else if ( is_fmt && !strcmp(hrec->vals[i],"LA") ) var = BCF_VL_LA; + else if ( is_fmt && !strcmp(hrec->vals[i],"LR") ) var = BCF_VL_LR; + else if ( is_fmt && !strcmp(hrec->vals[i],"LG") ) var = BCF_VL_LG; + else if ( is_fmt && !strcmp(hrec->vals[i],"M") ) var = BCF_VL_M; else { - sscanf(hrec->vals[i],"%d",&num); - var = BCF_VL_FIXED; + if (sscanf(hrec->vals[i],"%d",&num) == 1) + var = BCF_VL_FIXED; } if (var != BCF_VL_FIXED) num = 0xfffff; } @@ -931,7 +937,7 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) *hrec->key == 'I' ? "An" : "A", hrec->key); type = BCF_HT_STR; } - if (var == -1) { + if (var == UINT32_MAX) { hts_log_warning("%s %s field has no Number defined. Assuming '.'", *hrec->key == 'I' ? "An" : "A", hrec->key); var = BCF_VL_VAR; @@ -1230,26 +1236,117 @@ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, co return kh_val(d, k).hrec[type==BCF_HL_CTG?0:type]; } +// Check the VCF header is correctly formatted as per the specification. +// Note the code that calls this doesn't bother to check return values and +// we have so many broken VCFs in the wild that for now we just reprt a +// warning and continue anyway. So currently this is a void function. void bcf_hdr_check_sanity(bcf_hdr_t *hdr) { - static int PL_warned = 0, GL_warned = 0; + int version = bcf_get_version(hdr, NULL); - if ( !PL_warned ) - { - int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "PL"); - if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G ) - { - hts_log_warning("PL should be declared as Number=G"); - PL_warned = 1; + struct tag { + char name[10]; + char type_str[3]; + int type; + int version; + }; + + struct tag info_tags[] = { + {"AD", "R", BCF_VL_R, VCF_DEF}, + {"ADF", "R", BCF_VL_R, VCF_DEF}, + {"ADR", "R", BCF_VL_R, VCF_DEF}, + {"AC", "A", BCF_VL_A, VCF_DEF}, + {"AF", "A", BCF_VL_A, VCF_DEF}, + {"CIGAR", "A", BCF_VL_A, VCF_DEF}, + {"AA", "1", BCF_VL_FIXED, VCF_DEF}, + {"AN", "1", BCF_VL_FIXED, VCF_DEF}, + {"BQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"DB", "0", BCF_VL_FIXED, VCF_DEF}, + {"DP", "1", BCF_VL_FIXED, VCF_DEF}, + {"END", "1", BCF_VL_FIXED, VCF_DEF}, + {"H2", "0", BCF_VL_FIXED, VCF_DEF}, + {"H3", "0", BCF_VL_FIXED, VCF_DEF}, + {"MQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"MQ0", "1", BCF_VL_FIXED, VCF_DEF}, + {"NS", "1", BCF_VL_FIXED, VCF_DEF}, + {"SB", "4", BCF_VL_FIXED, VCF_DEF}, + {"SOMATIC", "0", BCF_VL_FIXED, VCF_DEF}, + {"VALIDATED", "0", BCF_VL_FIXED, VCF_DEF}, + {"1000G", "0", BCF_VL_FIXED, VCF_DEF}, + }; + static int info_warned[sizeof(info_tags)/sizeof(*info_tags)] = {0}; + + struct tag fmt_tags[] = { + {"AD", "R", BCF_VL_R, VCF_DEF}, + {"ADF", "R", BCF_VL_R, VCF_DEF}, + {"ADR", "R", BCF_VL_R, VCF_DEF}, + {"EC", "A", BCF_VL_A, VCF_DEF}, + {"GL", "G", BCF_VL_G, VCF_DEF}, + {"GP", "G", BCF_VL_G, VCF_DEF}, + {"PL", "G", BCF_VL_G, VCF_DEF}, + {"PP", "G", BCF_VL_G, VCF_DEF}, + {"DP", "1", BCF_VL_FIXED, VCF_DEF}, + {"LEN", "1", BCF_VL_FIXED, VCF_DEF}, + {"FT", "1", BCF_VL_FIXED, VCF_DEF}, + {"GQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"GT", "1", BCF_VL_FIXED, VCF_DEF}, + {"HQ", "2", BCF_VL_FIXED, VCF_DEF}, + {"MQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"PQ", "1", BCF_VL_FIXED, VCF_DEF}, + {"PS", "1", BCF_VL_FIXED, VCF_DEF}, + {"PSL", "P", BCF_VL_P, VCF44}, + {"PSO", "P", BCF_VL_P, VCF44}, + {"PSQ", "P", BCF_VL_P, VCF44}, + {"LGL", "LG", BCF_VL_LG, VCF45}, + {"LGP", "LG", BCF_VL_LG, VCF45}, + {"LPL", "LG", BCF_VL_LG, VCF45}, + {"LPP", "LG", BCF_VL_LG, VCF45}, + {"LEC", "LA", BCF_VL_LA, VCF45}, + {"LAD", "LR", BCF_VL_LR, VCF45}, + {"LADF", "LR", BCF_VL_LR, VCF45}, + {"LADR", "LR", BCF_VL_LR, VCF45}, + }; + static int fmt_warned[sizeof(fmt_tags)/sizeof(*fmt_tags)] = {0}; + + // Check INFO tag types. We shouldn't really permit ".", but it's + // commonly misused so we let it slide unless it's a new tag and the + // file format claims to be new also. We also cannot distinguish between + // Number=1 and Number=2, but we at least report the correct term if we + // get, say, Number=G in its place. + int i; + for (i = 0; i < sizeof(info_tags)/sizeof(*info_tags); i++) { + if (info_warned[i]) + continue; + int id = bcf_hdr_id2int(hdr, BCF_DT_ID, info_tags[i].name); + if (bcf_hdr_idinfo_exists(hdr, BCF_HL_INFO, id) && + bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != info_tags[i].type && + bcf_hdr_id2length(hdr, BCF_HL_INFO, id) != BCF_VL_VAR) { + hts_log_warning("%s should be declared as Number=%s", + info_tags[i].name, info_tags[i].type_str); + info_warned[i] = 1; } } - if ( !GL_warned ) - { - int id = bcf_hdr_id2int(hdr, BCF_DT_ID, "GL"); - if ( bcf_hdr_idinfo_exists(hdr,BCF_HL_FMT,id) && bcf_hdr_id2length(hdr,BCF_HL_FMT,id)!=BCF_VL_G ) - { - hts_log_warning("GL should be declared as Number=G"); - GL_warned = 1; + + // Check FORMAT tag types. + for (i = 0; i < sizeof(fmt_tags)/sizeof(*fmt_tags); i++) { + if (fmt_warned[i]) + continue; + int id = bcf_hdr_id2int(hdr, BCF_DT_ID, fmt_tags[i].name); + if (bcf_hdr_idinfo_exists(hdr, BCF_HL_FMT, id) && + bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != fmt_tags[i].type) { + // Permit "Number=." if this tag predates the vcf version it is + // defined within. This is a common tactic for callers to use + // new tags with older formats in order to avoid parsing failures + // with some software. + // We don't care for 4.3 and earlier as that's more of a wild-west + // and it's not abnormal to see incorrect usage of Number=. there. + if ((version < VCF44 && + bcf_hdr_id2length(hdr, BCF_HL_FMT, id) != BCF_VL_VAR) || + (version >= VCF44 && version >= fmt_tags[i].version)) { + hts_log_warning("%s should be declared as Number=%s", + fmt_tags[i].name, fmt_tags[i].type_str); + fmt_warned[i] = 1; + } } } } diff --git a/vcfutils.c b/vcfutils.c index 890c50a16..3e6abb6c8 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -1,6 +1,6 @@ /* vcfutils.c -- allele-related utility functions. - Copyright (C) 2012-2018, 2020-2022 Genome Research Ltd. + Copyright (C) 2012-2018, 2020-2022, 2025 Genome Research Ltd. Author: Petr Danecek @@ -251,10 +251,427 @@ int bcf_remove_alleles(const bcf_hdr_t *header, bcf1_t *line, int rm_mask) return 0; // FIXME: check for errs in this function } +static inline int is_special_info_type(const char *name) +{ + // These types have Number=. but are really either two or four times + // the number of ALT fields. + switch (*name) + { + case 'C': + if (name[1] == 'I' && // Confidence intervals + (strcmp(name + 2, "CN") == 0 + || strcmp(name + 2, "END") == 0 + || strcmp(name + 2, "LEN") == 0 + || strcmp(name + 2, "POS") == 0)) + return 2; + break; + case 'M': + if (name[1] == 'E' && // Mobile elements + (strcmp(name, "MEINFO") == 0 + || strcmp(name, "METRANS") == 0)) + return 4; + break; + default: + break; + } + return 1; +} + +static int32_t get_int32_info_value(const bcf_info_t *info, + size_t index) +{ + size_t len = info->len > 0 ? info->len : 0; + if (index >= len) + return bcf_int32_missing; + + int32_t val; + float f; + switch (info->type) + { + case BCF_BT_INT8: + val = le_to_i8(info->vptr + index); + return (val > bcf_int8_vector_end + ? val : bcf_int32_vector_end - (bcf_int8_vector_end - val)); + break; + case BCF_BT_INT16: + val = le_to_i16(info->vptr + index * sizeof(int16_t)); + return (val > bcf_int16_vector_end + ? val : bcf_int32_vector_end - (bcf_int16_vector_end - val)); + case BCF_BT_INT32: + return le_to_i32(info->vptr + index * sizeof(int32_t)); + case BCF_BT_FLOAT: + f = le_to_float(info->vptr + index * sizeof(float)); + if (bcf_float_is_missing(f)) + return bcf_int32_missing; + if (bcf_float_is_vector_end(f)) + return bcf_int32_vector_end; + return f; + default: + break; + } + return bcf_int32_missing; +} + +static int32_t get_rn_value(const bcf_info_t *rn, size_t index) +{ + // If RN tag is not present, default to one repeat per allele. + int32_t val = rn ? get_int32_info_value(rn, index) : 1; + // Treat MISSING or illegal values as 0. + return val >= 0 ? val : 0; +} + +// If info->len becomes 1, the single value needs to be put in info->v1 +// in case the value is accessed through that route instead of an array lookup. +static void set_info_v1(bcf_info_t *info) +{ + switch (info->type) + { + case BCF_BT_INT8: + info->v1.i = le_to_i8(info->vptr); + break; + case BCF_BT_INT16: + info->v1.i = le_to_i16(info->vptr); + break; + case BCF_BT_INT32: + info->v1.i = le_to_i32(info->vptr); + break; + case BCF_BT_INT64: + info->v1.i = le_to_i64(info->vptr); + break; + case BCF_BT_FLOAT: + info->v1.f = le_to_float(info->vptr); + break; + default: + break; + } +} + +static int fixup_info_length_code(bcf_info_t *info) +{ + // Recreate INFO type encoding stored before info->vptr + uint8_t buf[24], *ptr = buf; + ptrdiff_t new_len; + int type = bcf_enc_inttype(info->key); + *ptr++ = (1 << 4) | type; + i32_to_le(info->key, ptr); + ptr += 1 << bcf_type_shift[type]; + type = bcf_enc_inttype(info->len); + if (info->len < 15) + { + *ptr++ = (info->len << 4) | info->type; + } + else + { + *ptr++ = 0xf0 | info->type; + type = bcf_enc_inttype(info->len); + *ptr++ = (1 << 4) | type; + i32_to_le(info->len, ptr); + ptr += 1 << bcf_type_shift[type]; + } + + new_len = ptr - buf; + + if (new_len == info->vptr_off) + { + // Happy case, length hasn't changed + memcpy(info->vptr - info->vptr_off, buf, new_len); + } + else if (new_len < info->vptr_off) + { + // Shrinkage - need to adjust location of following data + // Most likely to happen if length has gone from >= 15 to < 15. + ptrdiff_t adjust = info->vptr_off - new_len; + memcpy(info->vptr - info->vptr_off, buf, new_len); + memmove(info->vptr - adjust, info->vptr, info->vptr_len); + info->vptr -= adjust; + info->vptr_off -= adjust; + } + else + { + // Grown - this shouldn't happen as we are removing entries, + // but just in case... + uint8_t *new_info = malloc(info->vptr_len + new_len); + if (!new_info) + return -1; + memcpy(new_info, buf, new_len); + memcpy(new_info + new_len, info->vptr, info->vptr_len); + if (info->vptr_free) + free(info->vptr - info->vptr_off); + info->vptr_off = new_len; + info->vptr = new_info + new_len; + info->vptr_free = 1; + } + return 0; +} + +static int mark_for_removal(bcf_info_t *info) +{ + if (info->vptr_free) + { + free(info->vptr - info->vptr_off); + info->vptr_free = 0; + } + info->vptr = NULL; + info->vptr_off = info->vptr_len = 0; + return 0; +} + +// Remove integer CNV:TR-related tag data +// Returns 0 on success +// 1 if the tag had an unexpected type or number of elements +// -1 if it failed to allocate memory +static int trim_int_cnv_tr_int_tags(bcf_info_t *info, + const bcf_hdr_t *header, + const struct kbitset_t *rm_set, + const char *id, + const bcf_info_t *rn, + const bcf_info_t *ruc, + size_t num_alt_orig, + size_t orig_total) +{ + size_t count = *id == 'C' ? 2 : 1; + int type = bcf_hdr_id2type(header, BCF_HL_INFO, info->key); + int vlen = bcf_hdr_id2length(header, BCF_HL_INFO, info->key); + size_t allele, unit = 0, orig_pos = 0, new_pos = 0; + const uint32_t element_sizes[8] = { 0, 1, 2, 4, 0, 4, 0, 0 }; + size_t element_size = element_sizes[info->type & 0x7]; + int new_total = 0; + + // Give up in these cases + if (// Wrong type + (type != BCF_HT_INT && type != BCF_HT_REAL) + || element_size == 0 + // Not Number=. + || vlen != BCF_VL_VAR + // Unexpected number of items + || info->len != orig_total + // Might fall of the end of the stored data + || info->vptr_len < orig_total * element_size * count) + return 1; + + for (allele = 0; allele < num_alt_orig; allele++) + { + int32_t n_repeats = get_rn_value(rn, allele); + int32_t n_items = n_repeats; + size_t byte_len; + + if (ruc) // For the RUB tag, need to add up items in RUC + { + n_items = 0; + while (n_repeats-- > 0) + { + int32_t n_units = get_int32_info_value(ruc, unit++); + n_items += n_units >= 0 ? n_units : 0; + } + } + + byte_len = n_items * element_size * count; + + if (kbs_exists(rm_set, allele + 1)) // Skip + { + orig_pos += byte_len; + continue; + } + + if (new_pos < orig_pos) // Shuffle data down + memmove(info->vptr + new_pos, info->vptr + orig_pos, byte_len); + + orig_pos += byte_len; + new_pos += byte_len; + new_total += n_items; + } + + if (new_total == 0) + return mark_for_removal(info); + + info->vptr_len = new_pos; + info->len = new_total; + if (info->len == 1) + set_info_v1(info); + return fixup_info_length_code(info); +} + +// Remove string CNV:TR-related tag data +// Returns 0 on success +// 1 if the tag had an unexpected type +// -1 if it failed to allocate memory +static int trim_int_cnv_tr_str_tags(bcf_info_t *info, + const bcf_hdr_t *header, + const struct kbitset_t *rm_set, + const bcf_info_t *rn, + size_t num_alt_orig, + size_t orig_total) +{ + int type = bcf_hdr_id2type(header, BCF_HL_INFO, info->key); + int vlen = bcf_hdr_id2length(header, BCF_HL_INFO, info->key); + size_t allele, orig_pos = 0, new_pos = 0; + + // Give up in these cases + if ( // Wrong type + type != BCF_HT_STR + || info->type != BCF_BT_CHAR + // Not Number=. + || vlen != BCF_VL_VAR) + return 1; + + for (allele = 0; allele < num_alt_orig; allele++) + { + int32_t n_items = get_rn_value(rn, allele); + uint8_t *start = info->vptr + orig_pos; + uint8_t *end = start; + uint8_t *lim = info->vptr + info->vptr_len; + + while (n_items-- > 0) + { + while (end < lim && *end != '\0' && *end != ',') ++end; + if (end == lim || *end == '\0') + break; + end++; + } + + if (kbs_exists(rm_set, allele + 1)) // Skip + { + orig_pos += end - start; + continue; + } + + if (new_pos < orig_pos) // Shuffle data down + memmove(info->vptr + new_pos, info->vptr + orig_pos, end - start); + + orig_pos += end - start; + new_pos += end - start; + } + if (new_pos == 0) + return mark_for_removal(info); + + if (new_pos < orig_pos) + { + info->vptr[new_pos] = '\0'; + // Dropping items at the end can leave a trailing comma. Remove if + // present. + if (new_pos > 0 && info->vptr[new_pos - 1] == ',') + info->vptr[--new_pos] = '\0'; + info->len = new_pos; + info->vptr_len = new_pos; + return fixup_info_length_code(info); + } + return 0; +} + +static int fixup_cnv_tr_info_tags(const bcf_hdr_t *header, bcf1_t *line, + size_t num_alt_orig, + const struct kbitset_t *rm_set) +{ + /* + Tags for alleles (tandem repeats): + RN : Repeat number. Number=A, handled as other tags of this type + RUS : Repeat unit sequence. Number for each allele is in RN. + RUL : Repeat unit length. Number for each allele is in RN. + RB : Repeat sequence length. Number for each allele is in RN. + RUC : Repeat unit count. Number for each allele is in RN. + CIRB : Confidence interval around RB. Two items for each in RB. + CIRUC : Confidence interval around RUC. Two items for each in RUC. + RUB : Number of bases in each repeat unit. Number for each repeat unit + in RUC. + */ + + bcf_info_t *rn = bcf_get_info(header, line, "RN"); + bcf_info_t *ruc = bcf_get_info(header, line, "RUC"); + + int64_t orig_total_repeats = 0; + int64_t orig_total_units = 0; + size_t allele, unit = 0; + uint32_t i; + + // Get total number of values for RUS etc., and RUB so the counts found + // later can be checked. + for (allele = 0; allele < num_alt_orig; allele++) + { + int32_t n_repeats = get_rn_value(rn, allele); + orig_total_repeats += n_repeats; + if (ruc) + { + while (n_repeats-- > 0) + { + int32_t n_units = get_int32_info_value(ruc, unit++); + orig_total_units += n_units >= 0 ? n_units : 0; + } + } + } + + // Find any INFO tags that might need to be fixed up + for (i = 0; i < line->n_info; i++) + { + bcf_info_t *info = &line->d.info[i]; + const char *id = bcf_hdr_int2id(header, BCF_DT_ID, info->key); + uint8_t *orig_ptr = info->vptr - info->vptr_off; + if (*id != 'C' && *id != 'R') + continue; + // Ignore RUC here, as it's needed intact to process RUB + if (strcmp(id, "RB") == 0 + || strcmp(id, "RUL") == 0 + || strcmp(id, "CIRB") == 0 + || strcmp(id, "CIRUC") == 0) + { + int res = trim_int_cnv_tr_int_tags(info, header, rm_set, id, rn, + NULL, num_alt_orig, + orig_total_repeats); + // res could be > 0 if the tag had an unexpected type or number of + // values. Currently these are ignored, so left unchanged, but we + // may want to warn or treat them as errors instead. + if (res < 0) + return res; + } + else if (strcmp(id, "RUS") == 0) + { + int res = trim_int_cnv_tr_str_tags(info, header, rm_set, rn, + num_alt_orig, + orig_total_repeats); + if (res < 0) + return res; + } + else if (ruc && strcmp(id, "RUB") == 0) + { + int res = trim_int_cnv_tr_int_tags(info, header, rm_set, id, rn, + ruc, num_alt_orig, + orig_total_units); + if (res < 0) + return res; + } + + // Check if the info tag was removed, or storage had to be reallocated. + // The latter can only happen if the length code stored before info->vptr + // was too small, which should hopefully never be the case. + if (!info->vptr || info->vptr - info->vptr_off != orig_ptr) + line->d.shared_dirty |= BCF1_DIRTY_INF; + } + // Now do RUC, if present + if (ruc) + { + int res = trim_int_cnv_tr_int_tags(ruc, header, rm_set, "RUC", rn, NULL, + num_alt_orig, orig_total_repeats); + if (res < 0) + return res; + } + return 0; +} + int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kbitset_t *rm_set) { - int *map = (int*) calloc(line->n_allele, sizeof(int)); + const uint32_t vl_a_g_r = 1U << BCF_VL_A | 1U << BCF_VL_G | 1U << BCF_VL_R; + const uint32_t vl_la_lg_lr = 1U << BCF_VL_LA | 1U << BCF_VL_LG | 1U << BCF_VL_LR; + const uint32_t vl_a_g_r_la_lg_lr = vl_a_g_r | vl_la_lg_lr; + const uint32_t vl_a_r = 1U << BCF_VL_A | 1U << BCF_VL_R; + const uint32_t vl_la_lr = 1U << BCF_VL_LA | 1U << BCF_VL_LR; + const uint32_t vl_a_r_la_lr = vl_a_r | vl_la_lr; + const char *cardinalities[] = { + "fixed", ".", "A", "G", "R", "P", "LA", "LG", "LR", "M" + }; + int *map = malloc(line->n_allele * sizeof(int)); + int *laa = NULL, *laa_map = NULL, *lr_orig = NULL; uint8_t *dat = NULL; + int num_laa, laa_size = 0, laa_map_stride = 0; + int have_cnv_tr = 0; bcf_unpack(line, BCF_UN_ALL); @@ -263,12 +680,17 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb kputs(line->d.allele[0], &str); int nrm = 0, i,j; // i: ori alleles, j: new alleles + map[0] = 0; for (i=1, j=1; in_allele; i++) { + if (strcmp(line->d.allele[i], "") == 0) + have_cnv_tr = 1; + if ( kbs_exists(rm_set, i) ) { // remove this allele line->d.allele[i] = NULL; + map[i] = -1; nrm++; continue; } @@ -295,6 +717,15 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb bcf_update_alleles_str(header, line, str.s); + if (have_cnv_tr) + { + if (fixup_cnv_tr_info_tags(header, line, nA_ori, rm_set) < 0) + { + hts_log_error("Out of memory"); + goto err; + } + } + // remove from Number=G, Number=R and Number=A INFO fields. int mdat = 0, ndat = 0, mdat_bytes = 0, nret; for (i=0; in_info; i++) @@ -302,7 +733,15 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb bcf_info_t *info = &line->d.info[i]; int vlen = bcf_hdr_id2length(header,BCF_HL_INFO,info->key); - if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change + // Handle INFO types with Number=. but really a multiple of Number=A + int multiple = (vlen == BCF_VL_VAR + ? is_special_info_type(bcf_hdr_int2id(header, BCF_DT_ID, + info->key)) + : 1); + if (multiple > 1) + vlen = BCF_VL_A; + + if ((vl_a_g_r & (1 << vlen)) == 0) continue; // no need to change int type = bcf_hdr_id2type(header,BCF_HL_INFO,info->key); if ( type==BCF_HT_FLAG ) continue; @@ -329,7 +768,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int nexp, inc = 0; if ( vlen==BCF_VL_A ) { - nexp = nA_ori; + nexp = nA_ori * multiple; inc = 1; } else @@ -338,7 +777,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { if ( !*se ) break; while ( *se && *se!=',' ) se++; - if ( kbs_exists(rm_set, j+inc) ) + if ( kbs_exists(rm_set, j / multiple + inc) ) { if ( *se ) se++; ss = se; @@ -352,8 +791,8 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if ( j==1 && s == '.' ) continue; // missing if ( j!=nexp ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=%c=%d, but found %d", - bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname_safe(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, j); + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=%s=%d, but found %d", + bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname_safe(header,line), line->pos+1, cardinalities[vlen], nexp, j); goto err; } } @@ -422,14 +861,16 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int inc = 0, ntop; if ( vlen==BCF_VL_A ) { - if ( nret!=nA_ori ) + if ( nret!=nA_ori * multiple ) { - hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=A=%d, but found %d", - bcf_hdr_int2id(header,BCF_DT_ID,info->key), bcf_seqname_safe(header,line), line->pos+1, nA_ori, nret); + hts_log_error("Unexpected number of values in INFO/%s at %s:%"PRIhts_pos"; expected Number=%s=%d, but found %d", + bcf_hdr_int2id(header,BCF_DT_ID,info->key), + bcf_seqname_safe(header,line), line->pos+1, + multiple == 1 ? "A" : ".", nA_ori, nret); goto err; } - ntop = nA_ori; - ndat = nA_new; + ntop = nA_ori * multiple; + ndat = nA_new * multiple; inc = 1; } else @@ -452,7 +893,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb for (j=0; j=0 ) ) + if ( !( al=-1 ) ) { - hts_log_error("Problem updating genotypes at %s:%"PRIhts_pos" [ al=0 :: al=%d,nR_ori=%d,map[al]=%d ]", - bcf_seqname_safe(header,line), line->pos+1, al, nR_ori, map[al]); + hts_log_error("Problem updating genotypes at %s:%"PRIhts_pos" [ al=-1 :: al=%d,nR_ori=%d,map[al]=%d ]", + bcf_seqname_safe(header,line), line->pos+1, al, + nR_ori, al < nR_ori ? map[al] : -1); goto err; } - // if an allele other than the reference is mapped to 0, it has been removed, + // if an allele is mapped to -1, it has been removed, // so translate it to 'missing', while preserving the phasing bit - ptr[j] = ((al>0 && !map[al]) ? bcf_gt_missing : (map[al]+1)<<1) | (ptr[j]&1); + ptr[j] = (map[al] < 0 ? bcf_gt_missing : (map[al]+1)<<1) | (ptr[j]&1); } ptr += nret; } @@ -548,6 +990,92 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb } } + // Do we have local alleles? + num_laa = bcf_get_format_int32(header, line, "LAA", &laa, &laa_size); + if (num_laa < -1 && num_laa != -3) + goto err; + if (num_laa > 0) + { + // Go through LAA values removing any in rm_set. + // At the same time, make a map showing which have been removed + // and the location of the remaining ones in the new list. + int num_laa_vals = num_laa / line->n_sample; + laa_map_stride = num_laa_vals + 1; + int max_k = 0; + laa_map = malloc(sizeof(*laa_map) * laa_map_stride * line->n_sample); + if (!laa_map) + goto err; + lr_orig = malloc(sizeof(*lr_orig) * line->n_sample); + if (!lr_orig) + goto err; + int laa_changed = 0; + for (i = 0; i < line->n_sample; i++) + { + int *sample_laa = &laa[i * num_laa_vals]; + int *sample_laa_map = &laa_map[i * laa_map_stride]; + int k; + sample_laa_map[0] = 0; + for (j = 0, k = 0; j < num_laa_vals; j++) + { + if (sample_laa[j] == bcf_int32_vector_end + || sample_laa[j] == bcf_int32_missing) + break; + int allele = (sample_laa[j] > 0 && sample_laa[j] < nR_ori) + ? sample_laa[j] : 0; + if (!allele || map[allele] < 0) + { + sample_laa_map[j + 1] = -1; + laa_changed = 1; + continue; + } + if (allele != map[allele]) + laa_changed = 1; + sample_laa[k] = map[allele]; + sample_laa_map[j + 1] = ++k; + } + + lr_orig[i] = j + 1; + + if (max_k < k) + max_k = k; + + for (; j < num_laa_vals; j++) + sample_laa_map[j + 1] = -1; + + for (; k < num_laa_vals; k++) + sample_laa[k] = k > 0 ? bcf_int32_vector_end : bcf_int32_missing; + } + if (laa_changed) + { + if (max_k < num_laa_vals) + { + // Max number of items has shrunk, so consolidate. + if (max_k > 0) { + for (i = 1; i < line->n_sample; i++) + { + memmove(&laa[i * max_k], + &laa[i * num_laa_vals], + max_k * sizeof(laa[0])); + } + num_laa = line->n_sample * max_k; + } else { + // No values left - all referenced alleles must have been + // removed. Store MISSING to prevent the LAA tag from + // also being removed (which would invalidate LAD, + // LPL etc.) + assert(num_laa >= line->n_sample); + for (i = 0; i < line->n_sample; i++) + laa[i] = bcf_int32_missing; + num_laa = line->n_sample; + } + } + // Push back new LAA values + if (bcf_update_format_int32(header, line, + "LAA", laa, num_laa) < 0) + goto err; + } + } + // Remove from Number=G, Number=R and Number=A FORMAT fields. // Assuming haploid or diploid GTs for (i=0; in_fmt; i++) @@ -555,7 +1083,8 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb bcf_fmt_t *fmt = &line->d.fmt[i]; int vlen = bcf_hdr_id2length(header,BCF_HL_FMT,fmt->id); - if ( vlen!=BCF_VL_A && vlen!=BCF_VL_G && vlen!=BCF_VL_R ) continue; // no need to change + if ((vl_a_g_r_la_lg_lr & (1 << vlen)) == 0) continue; // no need to change + int is_local = (vl_la_lg_lr & (1 << vlen)) != 0; int type = bcf_hdr_id2type(header,BCF_HL_FMT,fmt->id); if ( type==BCF_HT_FLAG ) continue; @@ -578,25 +1107,46 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb { int size = nret/line->n_sample; // number of bytes per sample str.l = 0; - if ( vlen==BCF_VL_A || vlen==BCF_VL_R ) + if (vl_a_r_la_lr & (1 << vlen)) { - int nexp, inc = 0; - if ( vlen==BCF_VL_A ) - { + int nexp = 0, inc = 0; + switch (vlen) { + case BCF_VL_A: nexp = nA_ori; inc = 1; - } - else + break; + case BCF_VL_R: nexp = nR_ori; + break; + case BCF_VL_LA: + inc = 1; + // fall through + case BCF_VL_LR: + if (!laa_map) + { + hts_log_error("No LAA data at %s:%"PRIhts_pos + "; required by FORMAT/%s with Number=%s", + bcf_seqname_safe(header,line), line->pos+1, + bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), + cardinalities[vlen]); + goto err; + } + break; + default: + break; + } for (j=0; jn_sample; j++) { char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss, s = ss[0]; int k_src = 0, k_dst = 0, l = str.l; + int *sample_map = is_local ? &laa_map[j * laa_map_stride] : map; + if (is_local) + nexp = lr_orig[j] - inc; for (k_src=0; k_src=se || !*ptr) break; while ( ptrid), bcf_seqname_safe(header,line), line->pos+1, vlen==BCF_VL_A ? 'A' : 'R', nexp, k_src); + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=%s=%d, but found %d", + bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), + bcf_seqname_safe(header,line), line->pos+1, + cardinalities[vlen], nexp, k_src); goto err; } l = str.l - l; - for (; ln_sample; j++) { char *ss = ((char*)dat) + j*size, *se = ss + size, *ptr = ss, s = ss[0]; int k_src = 0, k_dst = 0, l = str.l; int nexp = 0; // diploid or haploid? + int sample_nR_ori = is_local ? lr_orig[j] : nR_ori; + int sample_nG_ori = is_local + ? sample_nR_ori * (sample_nR_ori + 1) / 2 + : nG_ori; + int *sample_map = is_local ? &laa_map[j * laa_map_stride] : map; while ( ptrid), bcf_seqname_safe(header,line), line->pos+1, nG_ori, nR_ori, nexp); + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=%s=%d(diploid) or %d(haploid), but found %d", + bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), + bcf_seqname_safe(header,line), line->pos+1, + cardinalities[vlen], + sample_nG_ori, sample_nR_ori, nexp); goto err; } ptr = ss; - if ( nexp==nG_ori ) // diploid + if ( nexp==1 && s == '.' ) // missing + { + kputc('.', &str); + } + else if ( nexp==sample_nG_ori ) // diploid { int ia, ib; - for (ia=0; ia=se || !*ptr ) break; while ( ptr=se || !*ptr ) break; while ( ptrid), bcf_seqname_safe(header,line), line->pos+1, nR_ori, k_src); + hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=%s=%d(haploid), but found %d", + bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), + bcf_seqname_safe(header,line), + line->pos+1, + cardinalities[vlen], + nR_ori, k_src); goto err; } - l = str.l - l; - for (; lid), (void*)str.s, str.l, type); @@ -720,11 +1288,13 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb if (all_missing) continue; // could remove this FORMAT tag? } - if ( vlen==BCF_VL_A || vlen==BCF_VL_R || (vlen==BCF_VL_G && nori==nR_ori) ) // Number=A, R or haploid Number=G + if ( (vl_a_r_la_lr & (1 << vlen)) != 0 + || (vlen==BCF_VL_G && nori==nR_ori) ) { - int inc = 0, nnew; - if ( vlen==BCF_VL_A ) - { + // Number=A, R, LA, LR or haploid Number=G + int inc = 0, nnew = 0; + switch (vlen) { + case BCF_VL_A: if ( nori!=nA_ori ) { hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=A=%d, but found %d", @@ -734,20 +1304,40 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb ndat = nA_new*line->n_sample; nnew = nA_new; inc = 1; - } - else - { + break; + case BCF_VL_R: if ( nori!=nR_ori ) { hts_log_error("Unexpected number of values in FORMAT/%s at %s:%"PRIhts_pos"; expected Number=R=%d, but found %d", bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), bcf_seqname_safe(header,line), line->pos+1, nR_ori, nori); goto err; } + // fall through + case BCF_VL_G: ndat = nR_new*line->n_sample; nnew = nR_new; + break; + case BCF_VL_LA: + inc = 1; + // fall through + case BCF_VL_LR: + if (!laa_map) + { + hts_log_error("No LAA data at %s:%"PRIhts_pos + "; required by FORMAT/%s with Number=%s", + bcf_seqname_safe(header,line), line->pos+1, + bcf_hdr_int2id(header,BCF_DT_ID,fmt->id), + cardinalities[vlen]); + goto err; + } + nnew = nori; + ndat = nori * line->n_sample; + break; + default: + break; } - #define BRANCH(type_t,is_vector_end,set_missing) \ + #define BRANCH(type_t,is_vector_end,set_missing,set_vector_end) \ { \ for (j=0; jn_sample; j++) \ { \ @@ -755,38 +1345,57 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb type_t *ptr_dst = ((type_t*)dat) + j*nnew; \ int size = sizeof(type_t); \ int k_src, k_dst = 0; \ - for (k_src=0; k_srcid), bcf_seqname_safe(header,line), line->pos+1, nG_ori, nori); goto err; } + if (is_local) + nG_new = nori; ndat = nG_new*line->n_sample; - #define BRANCH(type_t,is_vector_end) \ + #define BRANCH(type_t,is_vector_end,set_vector_end) \ { \ for (j=0; jn_sample; j++) \ { \ @@ -795,38 +1404,54 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int size = sizeof(type_t); \ int ia, ib, k_dst = 0, k_src; \ int nset = 0; /* haploid or diploid? */ \ - for (k_src=0; k_src