diff --git a/Makefile b/Makefile index 3d335f2bc..946f193db 100644 --- a/Makefile +++ b/Makefile @@ -40,9 +40,10 @@ OBJS = main.o vcfindex.o tabix.o \ vcfcall.o mcall.o vcmp.o gvcf.o reheader.o convert.o vcfconvert.o tsv2vcf.o \ vcfcnv.o vcfhead.o HMM.o consensus.o ploidy.o bin.o hclust.o version.o \ regidx.o smpl_ilist.o csq.o vcfbuf.o \ - mpileup.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o read_consensus.o bam_sample.o \ + mpileup.o mpileup2.o bam2bcf.o bam2bcf_indel.o bam2bcf_iaux.o read_consensus.o bam_sample.o \ vcfsort.o cols.o extsort.o dist.o abuf.o \ - ccall.o em.o prob1.o kmin.o str_finder.o + ccall.o em.o prob1.o kmin.o str_finder.o \ + mpileup2/mpileup.o PLUGIN_OBJS = vcfplugin.o prefix = /usr/local @@ -277,6 +278,8 @@ cols.o: cols.c cols.h regidx.o: regidx.c $(htslib_hts_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_khash_str2int_h) regidx.h consensus.o: consensus.c $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_synced_bcf_reader_h) $(htslib_kseq_h) $(htslib_bgzf_h) regidx.h $(bcftools_h) rbuf.h $(filter_h) mpileup.o: mpileup.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_hts_os_h) regidx.h $(bcftools_h) $(bam2bcf_h) $(bam_sample_h) $(gvcf_h) +mpileup2.o: mpileup2.c $(htslib_sam_h) $(htslib_faidx_h) $(htslib_kstring_h) $(htslib_khash_str2int_h) $(htslib_hts_os_h) $(bcftools_h) mpileup2/mpileup.h +mpileup2/mpileup.o: mpileup2/mpileup.h mpileup2/mpileup.c bam2bcf.o: bam2bcf.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(bam2bcf_h) mw.h bam2bcf_indel.o: bam2bcf_indel.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) str_finder.h bam2bcf_iaux.o: bam2bcf_iaux.c $(htslib_hts_h) $(htslib_sam_h) $(htslib_khash_str2int_h) $(bam2bcf_h) $(htslib_ksort_h) str_finder.h read_consensus.h cigar_state.h diff --git a/bam2bcf.h b/bam2bcf.h index 955c022bf..9b90fc3e7 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -108,7 +108,8 @@ plp_cd_t; typedef struct __bcf_callaux_t { - int fmt_flag, ambig_reads; + uint32_t fmt_flag; + int ambig_reads; int capQ, min_baseQ, max_baseQ, delta_baseQ; int openQ, extQ, tandemQ; // for indels uint32_t min_support, max_support; // for collecting indel candidates diff --git a/main.c b/main.c index a0213589f..9211b3ef2 100644 --- a/main.c +++ b/main.c @@ -61,6 +61,7 @@ int count_plugins(void); int main_consensus(int argc, char *argv[]); int main_csq(int argc, char *argv[]); int main_mpileup(int argc, char *argv[]); +int main_mpileup2(int argc, char *argv[]); int main_sort(int argc, char *argv[]); typedef struct @@ -174,6 +175,10 @@ static cmd_t cmds[] = .alias = "mpileup", .help = "multi-way pileup producing genotype likelihoods" }, + { .func = main_mpileup2, + .alias = "mpileup2", + .help = "-new version of mpileup (experimental)" // do not advertise yet + }, #if USE_GPL { .func = main_polysomy, .alias = "polysomy", diff --git a/mpileup.c b/mpileup.c index 9b21b1873..b71e31005 100644 --- a/mpileup.c +++ b/mpileup.c @@ -47,9 +47,6 @@ DEALINGS IN THE SOFTWARE. */ #include "bam_sample.h" #include "gvcf.h" -#define MPLP_BCF 1 -#define MPLP_VCF (1<<1) -#define MPLP_NO_COMP (1<<2) #define MPLP_NO_ORPHAN (1<<3) #define MPLP_REALN (1<<4) #define MPLP_NO_INDEL (1<<5) @@ -1078,7 +1075,7 @@ int read_file_list(const char *file_list,int *n,char **argv[]) continue; \ } -void parse_format_flag(uint32_t *flag, const char *str) +static void parse_format_flag(uint32_t *flag, const char *str) { int i, n_tags; char **tags = hts_readlist(str, 0, &n_tags); @@ -1562,19 +1559,6 @@ int main_mpileup(int argc, char *argv[]) fprintf(stderr,"[warning] The -a DP option is required with --gvcf, switching on.\n"); mplp.fmt_flag |= B2B_FMT_DP; } - if ( mplp.flag&(MPLP_BCF|MPLP_VCF|MPLP_NO_COMP) ) - { - if ( mplp.flag&MPLP_VCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_VCF; - else mplp.output_type = FT_VCF_GZ; - } - else if ( mplp.flag&MPLP_BCF ) - { - if ( mplp.flag&MPLP_NO_COMP ) mplp.output_type = FT_BCF; - else mplp.output_type = FT_BCF_GZ; - } - } if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); diff --git a/mpileup2.c b/mpileup2.c new file mode 100644 index 000000000..a53106233 --- /dev/null +++ b/mpileup2.c @@ -0,0 +1,614 @@ +/* mpileup2.c -- mpileup2, new version of mpileup + + Copyright (C) 2022-2023 Genome Research Ltd. + + Author: Petr Danecek + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bcftools.h" +#include "mpileup2/mpileup.h" +#include "bam2bcf.h" +#include "gvcf.h" + +#define MPLP_NO_ORPHAN (1<<3) +#define MPLP_REALN (1<<4) +#define MPLP_NO_INDEL (1<<5) +#define MPLP_REDO_BAQ (1<<6) +#define MPLP_ILLUMINA13 (1<<7) +#define MPLP_IGNORE_RG (1<<8) +#define MPLP_PRINT_POS (1<<9) +#define MPLP_PRINT_MAPQ (1<<10) +#define MPLP_PER_SAMPLE (1<<11) +#define MPLP_SMART_OVERLAPS (1<<12) +#define MPLP_REALN_PARTIAL (1<<13) + +typedef struct +{ + int argc; + char **argv, *output_fname; + int flag; + int output_type, clevel, max_read_len; + int record_cmd_line; + mpileup_t *mplp; + bcf_callaux_t bca; + gvcf_t *gvcf; +} +args_t; + +static int run_mpileup(args_t *args) +{ + int ret, nsmpl = mpileup_get_val(args->mplp,int,N_SAMPLES); + while ( (ret=mpileup_next(args->mplp))==1 ) + { +xxxx hts_pos_t pos = mpileup_get_val(args->mplp,int,PLP_POS); + int *nreads = mpileup_get_val(args->mplp,int,N_READS); + char *bases = mpileup_get_val(args->mplp,char,BASES); + int i,j,k=0; + for (i=0; implp, int, SKIP_ALL_SET)); + char *tmp_skip_any_unset = bam_flag2str(mpileup_get_val(args->mplp, int, SKIP_ANY_UNSET)); + char *tmp_skip_all_unset = bam_flag2str(mpileup_get_val(args->mplp, int, SKIP_ALL_UNSET)); + char *tmp_skip_any_set = bam_flag2str(mpileup_get_val(args->mplp, int, SKIP_ANY_SET)); + + fprintf(fp, + "\n" + "Usage: bcftools mpileup2 [OPTIONS] in1.bam [in2.bam [...]]\n" + "\n" + "Input options:\n" + " -6, --illumina1.3+ Quality is in the Illumina-1.3+ encoding\n" + " -A, --count-orphans Do not discard anomalous read pairs\n" + " -b, --bam-list FILE List of input BAM filenames, one per line\n" + " -B, --no-BAQ Disable BAQ (per-Base Alignment Quality)\n" + " -C, --adjust-MQ INT Adjust mapping quality [0]\n" + " -D, --full-BAQ Apply BAQ everywhere, not just in problematic regions\n" + " -d, --max-depth INT Max raw per-sample depth; avoids excessive memory usage [%d]\n", mpileup_get_val(args->mplp, int, MAX_DP_PER_SAMPLE)); + fprintf(fp, + " -E, --redo-BAQ Recalculate BAQ on the fly, ignore existing BQs\n" + " -f, --fasta-ref FILE Faidx indexed reference sequence file\n" + " --no-reference Do not require fasta reference file\n" + " -G, --read-groups FILE Select or exclude read groups listed in the file\n" + " -q, --min-MQ INT Skip alignments with mapQ smaller than INT [%d]\n", mpileup_get_val(args->mplp, int, MIN_MQ)); + fprintf(fp, + " -Q, --min-BQ INT Skip bases with baseQ/BAQ smaller than INT [%d]\n", mpileup_get_val(args->mplp, int, MIN_BQ)); + fprintf(fp, + " --max-BQ INT Limit baseQ/BAQ to no more than INT [%d]\n", mpileup_get_val(args->mplp, int, MAX_BQ)); + fprintf(fp, + " --delta-BQ INT Use neighbour_qual + INT if less than qual [%d]\n", mpileup_get_val(args->mplp, int, DELTA_BQ)); + fprintf(fp, + " -r, --regions REG[,...] Comma separated list of regions in which pileup is generated\n" + " -R, --regions-file FILE Restrict to regions listed in a file\n" + " --ignore-RG Ignore RG tags (one BAM = one sample)\n" + " --ls, --skip-all-set STR|INT Skip reads with all of the bits set []\n"); + fprintf(fp, + " --ns, --skip-any-set STR|INT Skip reads with any of the bits set [%s]\n", tmp_skip_any_set); + fprintf(fp, + " --lu, --skip-all-unset STR|INT Skip reads with all of the bits unset []\n" + " --nu, --skip-any-unset STR|INT Skip reads with any of the bits unset []\n"); + fprintf(fp, + " -s, --samples LIST Comma separated list of samples to include\n" + " -S, --samples-file FILE File of samples to include\n" + " -t, --targets REG[,...] Similar to -r but streams rather than index-jumps\n" + " -T, --targets-file FILE Similar to -R but streams rather than index-jumps\n" + " -x, --ignore-overlaps Disable read-pair overlap detection\n" + " --seed INT Random number seed used for sampling deep regions [0]\n" + "\n" + "Output options:\n" + " -a, --annotate LIST Optional tags to output; '\\?' to list available tags []\n" + " -g, --gvcf INT[,...] Group non-variant sites into gVCF blocks according\n" + " To minimum per-sample DP\n" + " --no-version Do not append version and command line to the header\n" + " -o, --output FILE Write output to FILE [standard output]\n" + " -O, --output-type TYPE 'b' compressed BCF; 'u' uncompressed BCF;\n" + " 'z' compressed VCF; 'v' uncompressed VCF; 0-9 compression level [v]\n" + " --threads INT Use multithreading with INT worker threads [0]\n" + "\n" + "SNP/INDEL genotype likelihoods options:\n" + " -X, --config STR Specify platform specific profiles (see below)\n" + " -e, --ext-prob INT Phred-scaled gap extension seq error probability [%d]\n", args->bca.extQ); + fprintf(fp, + " -F, --gap-frac FLOAT Minimum fraction of gapped reads [%f]\n", mpileup_get_val(args->mplp, float, MIN_REALN_FRAC)); + fprintf(fp, + " -h, --tandem-qual INT Coefficient for homopolymer errors [%d]\n", args->bca.tandemQ); + fprintf(fp, + " -I, --skip-indels Do not perform indel calling\n" + " -L, --max-idepth INT Subsample to maximum per-sample depth for INDEL calling [%d]\n", mpileup_get_val(args->mplp, int, MAX_REALN_DP)); + fprintf(fp, + " -m, --min-ireads INT Minimum number gapped reads for indel candidates [%d]\n", mpileup_get_val(args->mplp, int, MIN_REALN_DP)); + fprintf(fp, + " -M, --max-read-len INT Maximum length of read to pass to BAQ algorithm [%d]\n", mpileup_get_val(args->mplp, int, MAX_REALN_LEN)); + fprintf(fp, + " -o, --open-prob INT Phred-scaled gap open seq error probability [%d]\n", args->bca.openQ); + fprintf(fp, + " -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n" + " -P, --platforms STR Comma separated list of platforms for indels [all]\n" + " --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n"); + fprintf(fp, + " --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", args->bca.indel_bias); + fprintf(fp, + " --indel-size INT Approximate maximum indel size considered [%d]\n", args->bca.indel_win_size); + fprintf(fp,"\n"); + fprintf(fp, + "Configuration profiles activated with -X, --config:\n" + " 1.12: -Q13 -h100 -m1 -F0.002\n" + " illumina: [ default values ]\n" + " ont: -B -Q5 --max-BQ 30 -I [also try eg |bcftools call -P0.01]\n" + " pacbio-ccs: -D -Q5 --max-BQ 50 -F0.1 -o25 -e1 --delta-BQ 10 -M99999\n" + "\n" + "Notes: Assuming diploid individuals.\n" + "\n" + "Example:\n" + " # See also http://samtools.github.io/bcftools/howtos/variant-calling.html\n" + " bcftools mpileup -Ou -f reference.fa alignments.bam | bcftools call -mv -Ob -o calls.bcf\n" + "\n"); + + free(tmp_skip_all_set); + free(tmp_skip_any_unset); + free(tmp_skip_all_unset); + free(tmp_skip_any_set); +} + +static void clean_up(args_t *args) +{ + mpileup_destroy(args->mplp); + free(args); +} + +int main_mpileup2(int argc, char *argv[]) +{ + int c, ret = 0; + args_t *args = (args_t*) calloc(1,sizeof(args_t)); + args->argc = argc; args->argv = argv; + args->flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL | MPLP_SMART_OVERLAPS; + args->bca.openQ = 40; + args->bca.extQ = 20; + args->bca.tandemQ = 500; + + args->mplp = mpileup_alloc(); + mpileup_set(args->mplp, MAX_DP_PER_SAMPLE, 250); + mpileup_set(args->mplp, MIN_MQ, 0); + mpileup_set(args->mplp, MAX_BQ, 60); + mpileup_set(args->mplp, DELTA_BQ, 30); + mpileup_set(args->mplp, MIN_REALN_FRAC, 0.05); + mpileup_set(args->mplp, MIN_REALN_DP, 2); + mpileup_set(args->mplp, MAX_REALN_DP, 250); + mpileup_set(args->mplp, MAX_REALN_LEN, 500); + mpileup_set(args->mplp, SKIP_ANY_SET, BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP); + + hts_srand48(0); + +#if 0 + const char *file_list = NULL; + char **fn = NULL; + int nfiles = 0, use_orphan = 0, noref = 0; + mplp_conf_t mplp; + memset(&mplp, 0, sizeof(mplp_conf_t)); + mplp.max_baseQ = 60; + mplp.delta_baseQ = 30; + mplp.capQ_thres = 0; + mplp.max_depth = 250; mplp.max_indel_depth = 250; + mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 500; + mplp.min_frac = 0.05; mplp.indel_bias = 1.0; mplp.min_support = 2; + mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_REALN_PARTIAL + | MPLP_SMART_OVERLAPS; + mplp.argc = argc; mplp.argv = argv; + mplp.rflag_skip_any_set = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; + mplp.output_fname = NULL; + mplp.output_type = FT_VCF; + mplp.record_cmd_line = 1; + mplp.n_threads = 0; + // the default to be changed in future, see also parse_format_flag() + //mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; + //mplp.ambig_reads = B2B_DROP; + mplp.max_read_len = 500; + mplp.indel_win_size = 110; + mplp.clevel = -1; +#endif + + static const struct option lopts[] = + { + {"nu", required_argument, NULL, 16}, + {"lu", required_argument, NULL, 17}, + {"rf", required_argument, NULL, 17}, // old --rf, --incl-flags = --lu, --skip-all-unset + {"ns", required_argument, NULL, 18}, + {"ff", required_argument, NULL, 18}, // old --ff, --excl-flags = --ns, --skip-any-set + {"ls", required_argument, NULL, 19}, + {"skip-any-unset", required_argument, NULL, 16}, + {"skip-all-unset", required_argument, NULL, 17}, + {"skip-any-set", required_argument, NULL, 18}, + {"skip-all-set", required_argument, NULL, 19}, + {"output", required_argument, NULL, 3}, + {"open-prob", required_argument, NULL, 4}, + {"ignore-RG", no_argument, NULL, 5}, + {"ignore-rg", no_argument, NULL, 5}, + {"gvcf", required_argument, NULL, 'g'}, + {"no-reference", no_argument, NULL, 7}, + {"no-version", no_argument, NULL, 8}, + {"threads",required_argument,NULL,9}, + {"illumina1.3+", no_argument, NULL, '6'}, + {"count-orphans", no_argument, NULL, 'A'}, + {"bam-list", required_argument, NULL, 'b'}, + {"no-BAQ", no_argument, NULL, 'B'}, + {"no-baq", no_argument, NULL, 'B'}, + {"full-BAQ", no_argument, NULL, 'D'}, + {"full-baq", no_argument, NULL, 'D'}, + {"adjust-MQ", required_argument, NULL, 'C'}, + {"adjust-mq", required_argument, NULL, 'C'}, + {"max-depth", required_argument, NULL, 'd'}, + {"redo-BAQ", no_argument, NULL, 'E'}, + {"redo-baq", no_argument, NULL, 'E'}, + {"fasta-ref", required_argument, NULL, 'f'}, + {"read-groups", required_argument, NULL, 'G'}, + {"region", required_argument, NULL, 'r'}, + {"regions", required_argument, NULL, 'r'}, + {"regions-file", required_argument, NULL, 'R'}, + {"targets", required_argument, NULL, 't'}, + {"targets-file", required_argument, NULL, 'T'}, + {"min-MQ", required_argument, NULL, 'q'}, + {"min-mq", required_argument, NULL, 'q'}, + {"min-BQ", required_argument, NULL, 'Q'}, + {"min-bq", required_argument, NULL, 'Q'}, + {"max-bq", required_argument, NULL, 11}, + {"max-BQ", required_argument, NULL, 11}, + {"delta-BQ", required_argument, NULL, 12}, + {"ignore-overlaps", no_argument, NULL, 'x'}, + {"output-type", required_argument, NULL, 'O'}, + {"samples", required_argument, NULL, 's'}, + {"samples-file", required_argument, NULL, 'S'}, + {"annotate", required_argument, NULL, 'a'}, + {"ext-prob", required_argument, NULL, 'e'}, + {"gap-frac", required_argument, NULL, 'F'}, + {"indel-bias", required_argument, NULL, 10}, + {"indel-size", required_argument, NULL, 15}, + {"tandem-qual", required_argument, NULL, 'h'}, + {"skip-indels", no_argument, NULL, 'I'}, + {"max-idepth", required_argument, NULL, 'L'}, + {"min-ireads", required_argument, NULL, 'm'}, + {"per-sample-mF", no_argument, NULL, 'p'}, + {"per-sample-mf", no_argument, NULL, 'p'}, + {"platforms", required_argument, NULL, 'P'}, + {"max-read-len", required_argument, NULL, 'M'}, + {"config", required_argument, NULL, 'X'}, + {"mwu-u", no_argument, NULL, 'U'}, + {"seed", required_argument, NULL, 13}, + {"ambig-reads", required_argument, NULL, 14}, + {"ar", required_argument, NULL, 14}, + {NULL, 0, NULL, 0} + }; + while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) { + switch (c) { + case 'x': mpileup_set(args->mplp, SMART_OVERLAPS, 0); break; + case 16 : + { + int flag = bam_str2flag(optarg); + if ( flag<0 ) { fprintf(stderr,"Could not parse --nu %s\n", optarg); return 1; } + mpileup_set(args->mplp, SKIP_ANY_UNSET, flag); + break; + } + case 17 : + { + int flag = bam_str2flag(optarg); + if ( flag<0 ) { fprintf(stderr,"Could not parse --lu %s\n", optarg); return 1; } + mpileup_set(args->mplp, SKIP_ALL_UNSET, flag); + break; + } + case 18 : + { + int flag = bam_str2flag(optarg); + if ( flag<0 ) { fprintf(stderr,"Could not parse --ns %s\n", optarg); return 1; } + mpileup_set(args->mplp, SKIP_ANY_SET, flag); + break; + } + case 19 : + { + int flag = bam_str2flag(optarg); + if ( flag<0 ) { fprintf(stderr,"Could not parse --ls %s\n", optarg); return 1; } + mpileup_set(args->mplp, SKIP_ALL_SET, flag); + break; + } + case 3 : args->output_fname = optarg; break; + case 4 : args->bca.openQ = atoi(optarg); break; + case 5 : mpileup_set(args->mplp, SMPL_IGNORE_RG, 1); break; + case 'g': + args->gvcf = gvcf_init(optarg); + if ( !args->gvcf ) error("Could not parse: --gvcf %s\n", optarg); + break; + case 'f': mpileup_set(args->mplp, FASTA_REF, optarg); break; + case 7 : error("todo: --no-reference"); /*noref = 1;*/ break; + case 8 : args->record_cmd_line = 0; break; + case 9 : error("todo: --threads\n"); /* note this was used *only* for output VCF compression, practically useless, moreover -Ou is recommended... */ break; + case 'd': mpileup_set(args->mplp, MAX_DP_PER_SAMPLE, atoi(optarg)); break; + case 'r': mpileup_set(args->mplp, REGIONS, optarg); break; + case 'R': mpileup_set(args->mplp, REGIONS_FNAME, optarg); break; + case 't': mpileup_set(args->mplp, TARGETS, optarg); break; + case 'T': mpileup_set(args->mplp, TARGETS_FNAME, optarg); break; + case 'P': error("todo: --platforms\n"); /* this was never used: mplp.pl_list = strdup(optarg); */ break; + case 'p': args->flag |= MPLP_PER_SAMPLE; break; + case 'B': args->flag &= ~MPLP_REALN; break; + case 'D': args->flag &= ~MPLP_REALN_PARTIAL; break; + case 'I': args->flag |= MPLP_NO_INDEL; break; + case 'E': args->flag |= MPLP_REDO_BAQ; break; + case '6': args->flag |= MPLP_ILLUMINA13; break; + case 's': mpileup_set(args->mplp, SAMPLES, optarg); break; + case 'S': mpileup_set(args->mplp, SAMPLES_FNAME, optarg); break; + case 'O': + switch (optarg[0]) { + case 'b': args->output_type = FT_BCF_GZ; break; + case 'u': args->output_type = FT_BCF; break; + case 'z': args->output_type = FT_VCF_GZ; break; + case 'v': args->output_type = FT_VCF; break; + default: + { + char *tmp; + args->clevel = strtol(optarg,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("The output type \"%s\" not recognised\n", optarg); + } + } + if ( optarg[1] ) + { + char *tmp; + args->clevel = strtol(optarg+1,&tmp,10); + if ( *tmp || args->clevel<0 || args->clevel>9 ) error("Could not parse argument: --output-type %s\n", optarg+1); + } + break; + case 'C': mpileup_set(args->mplp, ADJUST_MQ, atoi(optarg)); break; + case 'q': mpileup_set(args->mplp, MIN_MQ, atoi(optarg)); break; + case 'Q': mpileup_set(args->mplp, MIN_BQ, atoi(optarg)); break; + case 11: mpileup_set(args->mplp, MAX_BQ, atoi(optarg)); break; + case 12: mpileup_set(args->mplp, DELTA_BQ, atoi(optarg)); break; + case 'b': if ( mpileup_set(args->mplp, BAM_FNAME, optarg)!=0 ) clean_up(args); break; + case 'o': + /* this option used to be -o INT and -o FILE. In the new code disable -o as an alias of --open-prob as its use is very rare */ + args->output_fname = optarg; + break; + case 'e': args->bca.extQ = atoi(optarg); break; + case 'h': args->bca.tandemQ = atoi(optarg); break; + case 10: // --indel-bias (inverted so higher => more indels called) + if (atof(optarg) < 1e-2) + args->bca.indel_bias = 1/1e2; + else + args->bca.indel_bias = 1/atof(optarg); + break; + case 15: { + char *tmp; + args->bca.indel_win_size = strtol(optarg,&tmp,10); + if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg); + if ( args->bca.indel_win_size < 110 ) + { + args->bca.indel_win_size = 110; + fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",args->bca.indel_win_size); + } + } + break; + case 'A': args->flag &= ~MPLP_NO_ORPHAN; break; + case 'F': mpileup_set(args->mplp, MIN_REALN_FRAC, atof(optarg)); break; + case 'm': mpileup_set(args->mplp, MIN_REALN_DP, atoi(optarg)); break; + case 'L': mpileup_set(args->mplp, MAX_REALN_DP, atoi(optarg)); break; + case 'G': mpileup_set(args->mplp, READ_GROUPS_FNAME, optarg); break; + case 'a': + if (optarg[0]=='?') { + list_annotations(stderr); + return 1; + } + parse_format_flag(&args->bca.fmt_flag,optarg); + break; + case 'M': args->max_read_len = atoi(optarg); break; + case 'X': + if (strcasecmp(optarg, "pacbio-ccs") == 0) { + mpileup_set(args->mplp, MIN_REALN_FRAC, 0.1); + mpileup_set(args->mplp, MIN_BQ, 5); + mpileup_set(args->mplp, MAX_BQ, 50); + mpileup_set(args->mplp, DELTA_BQ, 10); + args->bca.openQ = 25; + args->bca.extQ = 1; + args->flag |= MPLP_REALN_PARTIAL; + mpileup_set(args->mplp, MAX_REALN_LEN, 99999); + } else if (strcasecmp(optarg, "ont") == 0) { + fprintf(stderr, "For ONT it may be beneficial to also run bcftools call with " + "a higher -P, eg -P0.01 or -P 0.1\n"); + mpileup_set(args->mplp, MIN_BQ, 5); + mpileup_set(args->mplp, MAX_BQ, 30); + args->flag &= ~MPLP_REALN; + args->flag |= MPLP_NO_INDEL; + } else if (strcasecmp(optarg, "1.12") == 0) { + // 1.12 and earlier + mpileup_set(args->mplp, MIN_REALN_FRAC, 0.002); + args->bca.min_support = 1; + mpileup_set(args->mplp, MIN_BQ, 13); + args->bca.tandemQ = 100; + args->flag &= ~MPLP_REALN_PARTIAL; + args->flag |= MPLP_REALN; + } else if (strcasecmp(optarg, "illumina") == 0) { + args->flag |= MPLP_REALN_PARTIAL; + } else { + fprintf(stderr, "Unknown configuration name '%s'\n" + "Please choose from 1.12, illumina, pacbio-ccs or ont\n", + optarg); + return 1; + } + break; + case 13: hts_srand48(atoi(optarg)); break; + case 14: + error("todo: --ambig-reads\n"); + // if ( !strcasecmp(optarg,"drop") ) .ambig_reads = B2B_DROP; + // else if ( !strcasecmp(optarg,"incAD") ) mplp.ambig_reads = B2B_INC_AD; + // else if ( !strcasecmp(optarg,"incAD0") ) mplp.ambig_reads = B2B_INC_AD0; + // else error("The option to --ambig-reads not recognised: %s\n",optarg); + break; + default: + fprintf(stderr,"Invalid option: '%c'\n", c); + return 1; + } + } + + if ( args->gvcf && !(args->bca.fmt_flag&B2B_FMT_DP) ) args->bca.fmt_flag |= B2B_FMT_DP; + if ( argc==1 ) + { + print_usage(args, stderr); + clean_up(args); + return 1; + } + int i; + for (i=0; implp, BAM, argv[optind+i])!=0 ) clean_up(args); + mpileup_init(args->mplp); + + ret = run_mpileup(args); + + clean_up(args); + return ret; +} diff --git a/mpileup2/mpileup.c b/mpileup2/mpileup.c new file mode 100644 index 000000000..afef6cf8e --- /dev/null +++ b/mpileup2/mpileup.c @@ -0,0 +1,925 @@ +/* mileup2.c -- mpileup v2.0 API + + Copyright (C) 2022-2023 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#include +#include +#include +#include +#include +#include +#include +//#include // fixme: itr_loop and itr->seq is not working for some reason +#include "regidx.h" +#include +#include "mpileup.h" +#include "bam_sample.h" + +typedef struct +{ + int icig; // position in the cigar string + int iseq; // the cigar[icig] operation refers to seq[iseq] + hts_pos_t ref_pos; // ref coordinate of the first base (if the current icig op consumes ref) +} +cigar_state_t; + +typedef struct +{ + bam1_t *bam; + cigar_state_t cstate; + uint32_t + next:31, // index of the next node, rbuf->tail's next points to itself + is_set:1; // if is_set==0 && next==0, the node is unitialized in a newly allocated block +} +rb_node_t; + +// Reads from a small window around the current position, per sample, not per BAM +typedef struct +{ + int tid; + hts_pos_t pos; // current position + int len; // keep reads overlapping pos+len + rb_node_t *buf; // the reads, unordered and with gaps, linked through 'next' indexes + int n, m; // the number of buffer elements used and allocated + int head, tail; // the index of the first and last used record + int empty; // the index of the first unused record + int nins, ndel, nbase; +} +read_buffer_t; + +typedef struct +{ + samFile *fp; + bam_hdr_t *hdr; + hts_idx_t *idx; + hts_itr_t *iter; + char *fname; + int bam_id; + bam1_t *cached_rec; + int cached_smpl; + int32_t tid; + hts_pos_t pos; +} +bam_aux_t; + +struct mpileup_t_ +{ + char *fasta_ref, *targets, *targets_fname, *samples, *samples_fname, *read_groups_fname; + int smart_overlaps, smpl_ignore_rg, max_dp_per_sample, adjust_mq, min_mq, min_bq, max_bq, delta_bq; + int min_realn_dp, max_realn_dp, max_realn_len, skip_any_unset, skip_all_unset, skip_any_set, skip_all_set; + float min_realn_frac; + int nbam, nbam_names; + char **bam_names; + regidx_t *regions_idx; + regitr_t *regions_itr; + int nregions; + bam_aux_t *bam; + bam_hdr_t *hdr; + bam_smpl_t *bsmpl; + read_buffer_t *read_buf; // read buffer for each sample, nsmpl elements + int nsmpl; + int32_t tid; + hts_pos_t pos; + bam_pileup1_t **plp; + int *nplp; + kstring_t tmp_str; +}; + +static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file); +static int mplp_add_region(mpileup_t *mplp, char *regions, int is_file); +static void read_buffer_clear(read_buffer_t *rbuf); +static void read_buffer_reset(read_buffer_t *rbuf); + +mpileup_t *mpileup_alloc(void) +{ + mpileup_t *mplp = (mpileup_t*) calloc(1,sizeof(mpileup_t)); + mpileup_set(mplp,MAX_DP_PER_SAMPLE,250); + mpileup_set(mplp,MIN_BQ,1); + mpileup_set(mplp,MAX_BQ,60); + mpileup_set(mplp,DELTA_BQ,30); + mpileup_set(mplp,MIN_REALN_FRAC,0.05); + mpileup_set(mplp,MIN_REALN_DP,2); + mpileup_set(mplp,MAX_REALN_DP,250); + mpileup_set(mplp,MAX_REALN_LEN,500); + mpileup_set(mplp,SKIP_ANY_SET,BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP); + mplp->pos = -1; + mplp->tid = -1; + return mplp; +} + +int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, ...) +{ + va_list args; + switch (key) + { + case FASTA_REF: + free(mplp->fasta_ref); + va_start(args, key); + mplp->fasta_ref = strdup(va_arg(args,char*)); + va_end(args); + return mplp->fasta_ref ? 0 : -1; + + case BAM: + { + va_start(args, key); + char *bam = va_arg(args,char*); + va_end(args); + return mplp_add_bam(mplp,bam,0); + } + + case BAM_FNAME: + { + va_start(args, key); + char *bam = va_arg(args,char*); + va_end(args); + return mplp_add_bam(mplp,bam,1); + } + + case REGIONS: + { + va_start(args, key); + char *regions = va_arg(args,char*); + va_end(args); + return mplp_add_region(mplp,regions,0); + } + + case REGIONS_FNAME: + { + va_start(args, key); + char *regions = va_arg(args,char*); + va_end(args); + return mplp_add_region(mplp,regions,1); + } + + case TARGETS: + free(mplp->targets); + va_start(args, key); + mplp->targets = strdup(va_arg(args,char*)); + va_end(args); + return mplp->targets ? 0 : -1; + + case TARGETS_FNAME: + free(mplp->targets_fname); + va_start(args, key); + mplp->targets_fname = strdup(va_arg(args,char*)); + va_end(args); + return mplp->targets_fname ? 0 : -1; + + case SAMPLES: + free(mplp->samples); + va_start(args, key); + mplp->samples = strdup(va_arg(args,char*)); + va_end(args); + return mplp->samples ? 0 : -1; + + case SAMPLES_FNAME: + free(mplp->samples_fname); + va_start(args, key); + mplp->samples_fname = strdup(va_arg(args,char*)); + va_end(args); + return mplp->samples_fname ? 0 : -1; + + case READ_GROUPS_FNAME: + free(mplp->read_groups_fname); + va_start(args, key); + mplp->read_groups_fname = strdup(va_arg(args,char*)); + va_end(args); + return mplp->read_groups_fname ? 0 : -1; + + case SMART_OVERLAPS: + va_start(args, key); + mplp->smart_overlaps = va_arg(args,int); + va_end(args); + return 0; + + case SMPL_IGNORE_RG: + va_start(args, key); + mplp->smpl_ignore_rg = va_arg(args,int); + va_end(args); + return 0; + + case SKIP_ANY_UNSET: + va_start(args, key); + mplp->skip_any_unset = va_arg(args,int); + va_end(args); + return 0; + + case SKIP_ALL_UNSET: + va_start(args, key); + mplp->skip_all_unset = va_arg(args,int); + va_end(args); + return 0; + + case SKIP_ANY_SET: + va_start(args, key); + mplp->skip_any_set = va_arg(args,int); + va_end(args); + return 0; + + case SKIP_ALL_SET: + va_start(args, key); + mplp->skip_all_set = va_arg(args,int); + va_end(args); + return 0; + + case MAX_DP_PER_SAMPLE: + va_start(args, key); + mplp->max_dp_per_sample = va_arg(args,int); + va_end(args); + return 0; + + case ADJUST_MQ: + va_start(args, key); + mplp->adjust_mq = va_arg(args,int); + va_end(args); + return 0; + + case MIN_MQ: + va_start(args, key); + mplp->min_mq = va_arg(args,int); + va_end(args); + return 0; + + case MIN_BQ: + va_start(args, key); + mplp->min_bq = va_arg(args,int); + va_end(args); + return 0; + + case MAX_BQ: + va_start(args, key); + mplp->max_bq = va_arg(args,int); + va_end(args); + return 0; + + case DELTA_BQ: + va_start(args, key); + mplp->delta_bq = va_arg(args,int); + va_end(args); + return 0; + + case MIN_REALN_FRAC: + va_start(args, key); + mplp->min_realn_frac = va_arg(args,double); + va_end(args); + return 0; + + case MIN_REALN_DP: + va_start(args, key); + mplp->min_realn_dp = va_arg(args,int); + va_end(args); + return 0; + + case MAX_REALN_DP: + va_start(args, key); + mplp->max_realn_dp = va_arg(args,int); + va_end(args); + return 0; + + case MAX_REALN_LEN: + va_start(args, key); + mplp->max_realn_len = va_arg(args,int); + va_end(args); + return 0; + + default: + hts_log_error("Todo: mpileup_set key=%d",(int)key); + return -1; + break; + } + return 0; +} + +void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key) +{ + switch (key) + { + case SMART_OVERLAPS: return &mplp->smart_overlaps; break; + case SMPL_IGNORE_RG: return &mplp->smpl_ignore_rg; break; + case MAX_DP_PER_SAMPLE: return &mplp->max_dp_per_sample; break; + case ADJUST_MQ: return &mplp->adjust_mq; break; + case MIN_MQ: return &mplp->min_mq; break; + case MIN_BQ: return &mplp->min_bq; break; + case MAX_BQ: return &mplp->max_bq; break; + case DELTA_BQ: return &mplp->delta_bq; break; + case MIN_REALN_FRAC: return &mplp->min_realn_frac; break; + case MIN_REALN_DP: return &mplp->min_realn_dp; break; + case MAX_REALN_DP: return &mplp->max_realn_dp; break; + case MAX_REALN_LEN: return &mplp->max_realn_len; break; + case SKIP_ANY_UNSET: return &mplp->skip_any_unset; break; + case SKIP_ALL_UNSET: return &mplp->skip_all_unset; break; + case SKIP_ANY_SET: return &mplp->skip_any_set; break; + case SKIP_ALL_SET: return &mplp->skip_all_set; break; + default: hts_log_error("Todo: mpileup_get key=%d",(int)key); return NULL; break; + } + return NULL; +} + +void mpileup_destroy(mpileup_t *mplp) +{ + int i; + for (i=0; inbam; i++) + { + sam_close(mplp->bam[i].fp); + if ( mplp->bam[i].cached_rec ) bam_destroy1(mplp->bam[i].cached_rec); + if ( mplp->bam[i].idx ) hts_idx_destroy(mplp->bam[i].idx); + if ( mplp->bam[i].iter ) hts_itr_destroy(mplp->bam[i].iter); + } + for (i=0; inbam_names; i++) free(mplp->bam_names[i]); + free(mplp->bam); + free(mplp->bam_names); + for (i=0; insmpl; i++) read_buffer_clear(&mplp->read_buf[i]); + free(mplp->read_buf); + free(mplp->plp); + free(mplp->nplp); + bam_hdr_destroy(mplp->hdr); + if ( mplp->bsmpl ) bam_smpl_destroy(mplp->bsmpl); + free(mplp->fasta_ref); + free(mplp->targets); + free(mplp->targets_fname); + if ( mplp->regions_idx ) regidx_destroy(mplp->regions_idx); + if ( mplp->regions_itr ) regitr_destroy(mplp->regions_itr); + free(mplp->samples); + free(mplp->samples_fname); + free(mplp->read_groups_fname); + free(mplp->tmp_str.s); + free(mplp); +} + +static int is_url(const char *str) +{ + static const char uri_scheme_chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+.-"; + return str[strspn(str, uri_scheme_chars)] == ':'; +} + +static int mplp_add_region(mpileup_t *mplp, char *regions, int is_file) +{ + if ( mplp->regions_idx ) return -1; // regions can be added only once + if ( is_file ) + mplp->regions_idx = regidx_init(regions,NULL,NULL,0,NULL); + else + { + mplp->regions_idx = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); + if ( !mplp->regions_idx || regidx_insert_list(mplp->regions_idx,regions,',')!=0 ) + { + hts_log_error("Could not initialize the regions: %s",regions); + return -1; + } + } + if ( !mplp->regions_idx ) + { + hts_log_error("Could not initialize the regions: %s",regions); + return -1; + } + mplp->nregions = regidx_nregs(mplp->regions_idx); + mplp->regions_itr = regitr_init(mplp->regions_idx); + regitr_loop(mplp->regions_itr); // region iterator now positioned at the first region + return 0; +} +static int mplp_add_bam(mpileup_t *mplp, char *bam, int is_file) +{ + struct stat sbuf; + if ( is_file ) + { + int i, nnames = 0; + char **names = hts_readlist(bam, 1, &nnames); + for (i=0; ibam_names,(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); + if ( !tmp ) + { + hts_log_error("Could not allocate %zu bytes",(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); + break; + } + mplp->bam_names = tmp; + mplp->bam_names[mplp->nbam_names++] = strdup(names[i]); + } + for (i=0; ibam_names,(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); + if ( !tmp ) + { + hts_log_error("Could not allocate %zu bytes",(mplp->nbam_names+1)*sizeof(*mplp->bam_names)); + return -1; + } + mplp->bam_names = tmp; + mplp->bam_names[mplp->nbam_names++] = strdup(bam); + return 0; +} + +static int mplp_check_header_contigs(mpileup_t *mplp, bam_hdr_t *hdr) +{ + static int warned = 0; + if ( !warned ) hts_log_error("todo: mplp_check_header_contigs"); + warned = 1; + return 0; +} +int mpileup_init(mpileup_t *mplp) +{ + mplp->bsmpl = bam_smpl_init(); + if ( mplp->smpl_ignore_rg ) bam_smpl_ignore_readgroups(mplp->bsmpl); + if ( mplp->read_groups_fname ) bam_smpl_add_readgroups(mplp->bsmpl, mplp->read_groups_fname, 1); + if ( mplp->samples && !bam_smpl_add_samples(mplp->bsmpl,mplp->samples,0) ) + { + hts_log_error("Could not parse samples: %s", mplp->samples); + return -1; + } + if ( mplp->samples_fname && !bam_smpl_add_samples(mplp->bsmpl,mplp->samples_fname,0) ) + { + hts_log_error("Could not read samples: %s", mplp->samples_fname); + return -1; + } + + mplp->bam = (bam_aux_t*)calloc(mplp->nbam_names,sizeof(*mplp->bam)); + if ( !mplp->bam ) + { + hts_log_error("Could not allocate %zu bytes",mplp->nbam_names*sizeof(*mplp->bam)); + return -1; + } + int i; + for (i=0; inbam_names; i++) + { + char *fname = mplp->bam_names[i]; + bam_aux_t *baux = &mplp->bam[mplp->nbam]; + baux->cached_smpl = -1; + baux->fname = fname; + baux->fp = sam_open(fname,"rb"); + if ( !baux->fp ) + { + hts_log_error("Failed to open %s: %s",fname,strerror(errno)); + return -1; + } + if ( hts_set_opt(baux->fp, CRAM_OPT_DECODE_MD, 1) ) + { + hts_log_error("Failed to set CRAM_OPT_DECODE_MD value: %s\n",fname); + return -1; + } + if ( mplp->fasta_ref && hts_set_fai_filename(baux->fp,mplp->fasta_ref)!=0 ) + { + hts_log_error("Failed to process %s: %s\n",mplp->fasta_ref, strerror(errno)); + return -1; + } + // baux->conf = conf; + // baux->ref = &mp_ref; + bam_hdr_t *hdr = sam_hdr_read(baux->fp); + if ( !hdr ) + { + hts_log_error("Failed to read the header of %s\n",fname); + return -1; + } + baux->bam_id = bam_smpl_add_bam(mplp->bsmpl,hdr->text,fname); + if ( baux->bam_id < 0 ) + { + // no usable read groups in this bam, can be skipped + sam_close(baux->fp); + bam_hdr_destroy(hdr); + } + mplp->nbam++; + + // iterators + if ( mplp->regions_idx ) + { + hts_idx_t *idx = sam_index_load(baux->fp,baux->fname); + if ( !idx ) + { + hts_log_error("Failed to load index: %s\n",fname); + return 1; + } + mplp->tmp_str.l = 0; + //ksprintf(&mplp->tmp_str,"%s:%"PRIhts_pos"-%"PRIhts_pos,mplp->regions_itr->seq,mplp->regions_itr->beg+1,mplp->regions_itr->end+1); + ksprintf(&mplp->tmp_str,"%s:%u-%u",mplp->regions_itr->seq,mplp->regions_itr->beg+1,mplp->regions_itr->end+1); + baux->iter = sam_itr_querys(idx, hdr, mplp->tmp_str.s); + if ( !baux->iter ) + { + baux->iter = sam_itr_querys(idx, hdr, mplp->regions_itr->seq); + if ( baux->iter ) + { + hts_log_error("[Failed to parse the region: %s",mplp->tmp_str.s); + return 1; + } + hts_log_error("The sequence \"%s\" not found: %s",mplp->regions_itr->seq,fname); + return 1; + } + if ( mplp->nregions==1 ) // no need to keep the index in memory + hts_idx_destroy(idx); + else + baux->idx = idx; + } + + if ( !mplp->hdr ) mplp->hdr = hdr; + else + { + int ret = mplp_check_header_contigs(mplp,hdr); + bam_hdr_destroy(hdr); + if ( ret!=0 ) return -1; + } + baux->hdr = mplp->hdr; + baux->tid = -1; + } + bam_smpl_get_samples(mplp->bsmpl,&mplp->nsmpl); + if ( !mplp->nsmpl ) + { + hts_log_error("Failed to initialize samples"); + return -1; + } + mplp->read_buf = (read_buffer_t*) calloc(mplp->nsmpl,sizeof(*mplp->read_buf)); + mplp->plp = (bam_pileup1_t**) calloc(mplp->nsmpl,sizeof(*mplp->plp)); + mplp->nplp = (int*) calloc(mplp->nsmpl,sizeof(*mplp->nplp)); + + return 0; +} + + +// Clear the per-sample buffer, destroying all its data +static void read_buffer_clear(read_buffer_t *rbuf) +{ + int i; + for (i=0; im; i++) + { + if ( rbuf->buf[i].bam ) bam_destroy1(rbuf->buf[i].bam); + } + free(rbuf->buf); + memset(rbuf,0,sizeof(*rbuf)); +} + +// Reset the buffer to be ready for the next region +static void read_buffer_reset(read_buffer_t *rbuf) +{ + int i; + for (i=0; im; i++) rbuf->buf[i].is_set = 0; + rbuf->n = 0; + rbuf->tid = 0; + rbuf->pos = 0; + rbuf->empty = 0; +} + +/* + * Returns + * 0 .. on success (pushed onto the buffer, swapped rec for NULL or an empty record) + * 1 .. cannot push (different tid or not ready for the current position yet) + * 2 .. cannot push (past the current position) + * -1 .. error + */ +static int read_buffer_push(read_buffer_t *rbuf, hts_pos_t pos, bam1_t **rec_ptr) +{ +static int warned = 0; +if ( !warned ) hts_log_error("read_buffer_push - left softclip vs read start"); +warned = 1; + + bam1_t *rec = *rec_ptr; + if ( rec->core.tid != rbuf->tid ) return 1; + if ( rbuf->len < rec->core.l_qseq ) rbuf->len = rec->core.l_qseq; + if ( rec->core.pos + rbuf->len < pos ) return 2; + if ( rec->core.pos > pos + rbuf->len ) return 2; +//fprintf(stderr,"pushing %d: %d+%d in %d-%d\n",rbuf->n,(int)rec->core.pos,(int)rec->core.l_qseq,(int)pos,rbuf->len); + if ( rbuf->n==rbuf->m ) + { + // Need to increase the buffer size + if ( hts_resize(rb_node_t, rbuf->n + 1, &rbuf->m, &rbuf->buf, HTS_RESIZE_CLEAR)<0 ) + { + hts_log_error("Failed to allocate memory"); + return -1; + } + rbuf->empty = rbuf->n; // the first newly allocated block is the new empty + } + // claim an empty node and populate it + rb_node_t *node = &rbuf->buf[rbuf->empty]; + *rec_ptr = node->bam; + node->bam = rec; + node->cstate.ref_pos = rec->core.pos; + node->next = rbuf->empty; // the tail's next points to itself + if ( rbuf->n ) + rbuf->buf[rbuf->tail].next = rbuf->empty; + else + rbuf->head = rbuf->empty; + rbuf->tail = rbuf->empty; + + // Assign the head of empty nodes. Note that the latter can go beyond the allocated buffer or nonsense value, + // but that will be set above to a correct value later when the buffer size is increased + rbuf->empty = node->is_set ? node->next : rbuf->empty + 1; + node->is_set = 1; + rbuf->n++; + + if ( pos==0 && rbuf->n==1 ) + { + rbuf->tid = rbuf->buf[rbuf->head].bam->core.tid; + rbuf->pos = rbuf->buf[rbuf->head].bam->core.pos; + } + + return 0; +} + +/* + * Read from a bam and fill a read buffer (separate for each sample) to allow read sub-sampling. + * + * Returns 0 on success (buffers full), 1 when done (i.e. no new reads), or a negative value on error + */ +static int mplp_read_bam(mpileup_t *mplp, bam_aux_t *baux) +{ + // if new mplp position is requested (different tid or mplp pos outside of the last region) + // update the sam iterator baux->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); + + if ( baux->cached_smpl >= 0 ) + { + read_buffer_t *rbuf = &mplp->read_buf[baux->cached_smpl]; + int ret = read_buffer_push(rbuf, mplp->pos, &baux->cached_rec); + if ( ret<0 ) return ret; // an error (malloc) + if ( ret==1 ) return 0; // cannot push, full buffer, not ready for the current position + baux->cached_smpl = -1; + } + while (1) + { + if ( !baux->cached_rec ) baux->cached_rec = bam_init1(); + int ret = baux->iter ? sam_itr_next(baux->fp, baux->iter, baux->cached_rec) : sam_read1(baux->fp, baux->hdr, baux->cached_rec); + if ( ret<0 ) + { + if ( ret==-1 ) return 1; // no more data + return -1; // error + } + + // Check if the records have tids in ascending order. This is not required by the SAM specification but + // mpileup2 relies on it (for now) + bam1_t *rec = baux->cached_rec; + if ( baux->tid==-1 ) baux->tid = rec->core.tid, baux->pos = rec->core.pos; + else if ( baux->tid > rec->core.tid ) + { + hts_log_error("Failed assumption, sequence tids out of order in %s",baux->fname); + return -1; + } + else if ( baux->pos > rec->core.pos ) + { + hts_log_error("Unsorted alignments in %s",baux->fname); + return -1; + } + baux->pos = rec->core.pos; + + // exclude unmapped reads and any filters provided by the user + if ( rec->core.tid < 0 || (rec->core.flag&BAM_FUNMAP) ) continue; + if ( mplp->skip_any_unset && (mplp->skip_any_unset & rec->core.flag)!=mplp->skip_any_unset ) continue; + if ( mplp->skip_all_set && (mplp->skip_all_set & rec->core.flag)==mplp->skip_all_set ) continue; + if ( mplp->skip_all_unset && !(mplp->skip_all_unset & rec->core.flag) ) continue; + if ( mplp->skip_any_set && (mplp->skip_any_set & rec->core.flag) ) continue; + + // exclude if sample is not wanted + baux->cached_smpl = bam_smpl_get_sample_id(mplp->bsmpl, baux->bam_id, rec); + if ( baux->cached_smpl<0 ) continue; + + read_buffer_t *rbuf = &mplp->read_buf[baux->cached_smpl]; + ret = read_buffer_push(rbuf, mplp->pos, &baux->cached_rec); + if ( ret==1 ) return 0; // cannot push, full buffer, not ready for the current position + if ( ret<0 ) return ret; // an error (malloc) + baux->cached_smpl = -1; + } + return -1; // can never happen +} + +/* + * Returns + * BAM_CMATCH when read overlaps pos and has a base match or mismatch. Sets ipos + * BAM_CINS when the next base is an insertion. Sets ipos to the current base + * BAM_CDEL when the next or overlapping base is a deletion. Sets ipos to the current + * base when the next base is a deletion and to -1 when inside the deletion + * -1 when there is no base to show (e.g. soft clip) + * -2 when the read ends before pos + */ +static int cigar_get_pos(cigar_state_t *cs, bam1_t *bam, hts_pos_t pos, int32_t *ipos) +{ + int ncig = bam->core.n_cigar; + uint32_t *cigar = bam_get_cigar(bam); + while ( cs->ref_pos <= pos ) + { + if ( cs->icig >= ncig ) return -2; // the read ends before pos + int op = cigar[cs->icig] & BAM_CIGAR_MASK; + int len = cigar[cs->icig] >> BAM_CIGAR_SHIFT; + if ( op==BAM_CMATCH || op==BAM_CEQUAL || op==BAM_CDIFF ) + { + hts_pos_t end_pos = cs->ref_pos + len - 1; + if ( end_pos < pos ) // pos not covered by this cigar block + { + cs->ref_pos += len; + cs->iseq += len; + cs->icig++; + continue; + } + *ipos = pos - cs->ref_pos + cs->iseq; + + // if the following base is an insertion, return BAM_CINS + if ( end_pos==pos && cs->icig+1 < ncig ) + { + int next_op = cigar[cs->icig+1] & BAM_CIGAR_MASK; + if ( next_op==BAM_CINS ) return BAM_CINS; + if ( next_op==BAM_CDEL ) return BAM_CDEL; + } + return BAM_CMATCH; + } + if ( op==BAM_CINS || op==BAM_CSOFT_CLIP ) + { + cs->iseq += len; + cs->icig++; + continue; + } + if ( op==BAM_CDEL ) + { + hts_pos_t end_pos = cs->ref_pos + len - 1; + if ( end_pos < pos ) // pos not covered by this cigar block + { + cs->ref_pos += len; + cs->icig++; + continue; + } + *ipos = -1; + return BAM_CDEL; + } + if ( op==BAM_CREF_SKIP ) + { + hts_pos_t end_pos = cs->ref_pos + len - 1; + if ( end_pos < pos ) // pos not covered by this cigar block + { + cs->ref_pos += len; + cs->icig++; + continue; + } + return -1; + } + hts_log_error("todo: CIGAR operator %d", op); + assert(0); + } + return -1; +} + +// Go through all reads in the buffer and make the pileup structures ready for output +static int mplp_set_pileup(mpileup_t *mplp, read_buffer_t *rbuf) +{ + if ( !rbuf->n ) return 0; + rbuf->ndel = rbuf->nins = rbuf->nbase = 0; + int iprev = -1, i = rbuf->head; + while ( 1 ) + { + rb_node_t *node = &rbuf->buf[i]; // rb_node_t keeps info about a single read + int32_t ipos; + int ret = cigar_get_pos(&node->cstate, node->bam, mplp->pos, &ipos); + if ( ret==BAM_CMATCH ) // there is a base, match or mismatch + { + uint8_t *seq = bam_get_seq(node->bam); + fprintf(stderr," %c","ACGTN"[seq_nt16_int[bam_seqi(seq,ipos)]]); + rbuf->nbase++; + } + else if ( ret==BAM_CINS ) rbuf->nins++; + else if ( ret==BAM_CDEL && ipos!=-1 ) rbuf->ndel++; + else if ( ret==-2 ) // this read is done, remove the node + { + rbuf->n--; + node->is_set = 0; + int next = node->next; + node->next = rbuf->empty; + rbuf->empty = i; + if ( next==i ) // node that points to itself is the last node in the chain + { + assert( rbuf->tail==i ); + rbuf->tail = iprev; + if ( iprev==-1 ) rbuf->head = iprev; + else rbuf->buf[iprev].next = iprev; + break; + } + else if ( iprev==-1 ) // removing the first node + rbuf->head = next; + else // removing a middle node + rbuf->buf[iprev].next = next; + i = next; + continue; + } + + if ( rbuf->buf[i].next == i ) break; + iprev = i; + i = rbuf->buf[i].next; + } + fprintf(stderr,"\n"); + return 0; +} + +static int mplp_region_next(mpileup_t *mplp) +{ + int i; + + mplp->pos++; + if ( mplp->nregions && mplp->pos > mplp->regions_itr->end ) return 0; + + // fill buffers + for (i=0; inbam; i++) + { + int ret = mplp_read_bam(mplp, &mplp->bam[i]); + if ( ret<0 ) return ret; // an error occurred + } + + // this would be a good place for realignment + + // find the minimum position if first time here + if ( mplp->tid==-1 ) + { + mplp->tid = INT32_MAX; + for (i=0; insmpl; i++) + { + if ( mplp->tid > mplp->read_buf[i].tid ) mplp->tid = mplp->read_buf[i].tid, mplp->pos = HTS_POS_MAX; + if ( mplp->pos > mplp->read_buf[i].pos ) mplp->pos = mplp->read_buf[i].pos; + } + if ( mplp->nregions && mplp->pos < mplp->regions_itr->beg ) mplp->pos = mplp->regions_itr->beg; + } + + // prepare the pileup + int nreads = 0; + for (i=0; insmpl; i++) + { + int ret = mplp_set_pileup(mplp, &mplp->read_buf[i]); + if ( ret!=0 ) return -1; + nreads += mplp->read_buf[i].n; + } + return nreads>0 /* || nregions */ ? 1 : 0; +} + +int mpileup_next(mpileup_t *mplp) +{ + // if new region is needed + // o advance regitr_loop + // o reset read_buffers + // o reset mplp tid and pos + + if ( !mplp->nregions ) + return mplp_region_next(mplp); + + int i; + while (1) + { + int ret = mplp_region_next(mplp); + if ( ret>0 ) + { + if ( mplp->pos < mplp->regions_itr->beg ) continue; + if ( mplp->pos <= mplp->regions_itr->end ) return ret; + } + if ( !regitr_loop(mplp->regions_itr) ) return 0; + + for (i=0; insmpl; i++) read_buffer_reset(&mplp->read_buf[i]); + + mplp->tmp_str.l = 0; + //ksprintf(&mplp->tmp_str,"%s:%"PRIhts_pos"-%"PRIhts_pos,mplp->regions_itr->seq,mplp->regions_itr->beg+1,mplp->regions_itr->end+1); + ksprintf(&mplp->tmp_str,"%s:%u-%u",mplp->regions_itr->seq,mplp->regions_itr->beg+1,mplp->regions_itr->end+1); + for (i=0; inbam; i++) + { + bam_aux_t *baux = &mplp->bam[i]; + hts_itr_destroy(baux->iter); + baux->iter = sam_itr_querys(baux->idx, baux->hdr, mplp->tmp_str.s); + if ( !baux->iter ) + { + baux->iter = sam_itr_querys(baux->idx, baux->hdr, mplp->regions_itr->seq); + if ( baux->iter ) + { + hts_log_error("[Failed to parse the region: %s",mplp->tmp_str.s); + return 1; + } + hts_log_error("The sequence \"%s\" not found: %s",mplp->regions_itr->seq,baux->fname); + return -1; + } + baux->tid = baux->pos = -1; + } + mplp->tid = mplp->pos = -1; + } +} + diff --git a/mpileup2/mpileup.h b/mpileup2/mpileup.h new file mode 100644 index 000000000..7b6d66c25 --- /dev/null +++ b/mpileup2/mpileup.h @@ -0,0 +1,121 @@ +/* mileup2.h -- mpileup v2.0 API + + Copyright (C) 2022-2023 Genome Research Ltd. + + Author: Petr Danecek + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + + */ + +#ifndef __MPILEUP20_H__ +#define __MPILEUP20_H__ + +typedef struct mpileup_t_ mpileup_t; + +typedef enum +{ + // required + FASTA_REF, // char*, fasta reference + BAM, // char*, add another BAM/CRAM + BAM_FNAME, // char*, read list of BAMs/CRAMs from a file + + // optional + REGIONS, // char*, comma-separated regions + REGIONS_FNAME, // char*, file with a list of regions + TARGETS, + TARGETS_FNAME, + SAMPLES, // char*, comma-separated samples to include + SAMPLES_FNAME, // char*, file of samples to include + READ_GROUPS_FNAME, // char*, read groups to include or exclude, see `bcftools mpileup -G` documentation + + // bit flags + SMART_OVERLAPS, // int {0,1}, 1:disable read-pair overlap detection [0] + SMPL_IGNORE_RG, // int {0,1}, 1:ignore read groups, one bam = one sample [0] + SKIP_ANY_UNSET, // skip a read if any of the bits is not set [0] + SKIP_ALL_UNSET, // skip a read if all these bits are not set [0] + SKIP_ANY_SET, // skip a read if any of the bits is set [BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP] + SKIP_ALL_SET, // skip a read if all these bits are set [0] + + // numeric parameters + MAX_DP_PER_SAMPLE, // int, maximum depth per sample [250] + ADJUST_MQ, // int, --adjust-MQ value for downgrading reads with excessive mismatches, 0 to disable [0] + MIN_MQ, // int, skip alignments with mapQ smaller than MIN_MQ [0] + MIN_BQ, // int, skip bases with baseQ smaller than MIN_BQ [1] + MAX_BQ, // int, cap BQ for overly optimistics platforms [60] + DELTA_BQ, // int, --delta-BQ value for downgrading high BQ if neighbor_BQ is < BQ - 30 [30] + MIN_REALN_FRAC, // float, minimum fraction of reads with evidence for an indel to consider the site needing realignment [0.05] + MIN_REALN_DP, // int, minimum number of reads with evidence for an indel to consider the site needing realignment [2] + MAX_REALN_DP, // int, subsample to this many reads for realignment [250] + MAX_REALN_LEN, // int, realign reads this long max [500] + + // pileup related values + N_SAMPLES, // int, number of samples in the pileup + N_READS, // int*, number of reads per sample at the current position + PILEUP, // +} +mpileup_opt_t; + +/** + * mpileup_alloc() - allocate the mpileup structure + * + * Note that any default values set by the function can change without a warning and + * therefore should be set explicitly with `mpileup_set()` by the caller. + */ +mpileup_t *mpileup_alloc(void); + +/** + * mpileup_destroy - destroy the mpileup structure + */ +void mpileup_destroy(mpileup_t *mplp); + +/** + * mpileup_set() - set various options, see the mpileup_opt_t keys for the complete list + * + * Returns 0 if the call succeeded, or negative number on error. + */ +int mpileup_set(mpileup_t *mplp, mpileup_opt_t key, ...); // returns 0 on success + +/** + * mpileup_get() - get various options, see the mpileup_opt_t keys + * mpileup_get_val() - wrapper for `mpileup_get()` to return typed value + * + * The former returns pointer to the memory area populated by the requested setting, + * its type can be inferred from the mpileup_opt_t documentation. + */ +void *mpileup_get(mpileup_t *mplp, mpileup_opt_t key); +#define mpileup_get_val(mplp,type,key) (*(type*)mpileup_get(mplp, key)) + +/** + * mpileup_init() - inits regions, bams, iterators, etc. + * + * Should be called after fully set up, i.e. after all `mpileup_set()` calls. + * + * Returns 0 on success or negative number on error. + */ +int mpileup_init(mpileup_t *mplp); + +/** + * mpileup_next() - positions mpileup to the next genomic position + * + * Returns 1 when next position is available, 0 when all regions are done, negative value on error + */ +int mpileup_next(mpileup_t *mplp); + +#endif