Skip to content

Commit 1150b9c

Browse files
committed
Experimental: added 64-bit pos, mate pos and insert size to CRAM V4.0.
The BAM internal layout is 64-bit too, and SAM can fill this out with 64-bit fields. This permits SAM to CRAM to SAM round trips with long references, although BAM is still unable to handle this (and it doesn't report this failure yet).
1 parent 8a1ef43 commit 1150b9c

File tree

14 files changed

+414
-123
lines changed

14 files changed

+414
-123
lines changed

io_lib/bam.c

Lines changed: 97 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,12 +1172,13 @@ static int64_t inline STRTOL64(const char *v, const char **rv, int b) {
11721172
* -1 on error
11731173
*/
11741174
static int sam_next_seq(bam_file_t *b, bam_seq_t **bsp) {
1175-
int used_l, n, sign;
1175+
int used_l, sign;
1176+
int64_t n;
11761177
unsigned char *cpf, *cpt, *cp;
11771178
int cigar_len;
11781179
bam_seq_t *bs;
11791180
HashItem *hi;
1180-
int start, end;
1181+
int64_t start, end;
11811182
SAM_hdr *sh = b->header;
11821183

11831184
static const char lookup[256] = {
@@ -1619,12 +1620,12 @@ int bam_get_seq(bam_file_t *b, bam_seq_t **bsp) {
16191620
return -1;
16201621
}
16211622

1622-
if (!*bsp || blk_size+20 > (*bsp)->alloc) {
1623-
/* 20 extra is for bs->alloc to bs->cigar_len plus next_len */
1624-
if (!(bs = realloc(*bsp, blk_size+20)))
1623+
if (!*bsp || blk_size+44 > (*bsp)->alloc) {
1624+
/* 44 extra is for bs->alloc to bs->cigar_len plus next_len */
1625+
if (!(bs = realloc(*bsp, blk_size+44)))
16251626
return -1;
16261627
*bsp = bs;
1627-
(*bsp)->alloc = blk_size+20;
1628+
(*bsp)->alloc = blk_size+44;
16281629
(*bsp)->blk_size = blk_size;
16291630
}
16301631
bs = *bsp;
@@ -1647,7 +1648,7 @@ int bam_get_seq(bam_file_t *b, bam_seq_t **bsp) {
16471648

16481649
bs->blk_size = blk_size;
16491650
bs->ref = le_int4(bs->ref);
1650-
bs->pos = le_int4(bs->pos);
1651+
bs->pos = le_int4(bs->pos_32);
16511652

16521653
// order of bit-fields in struct is platform specific, so manually decode
16531654
i32 = le_int4(bs->bin_packed);
@@ -1661,8 +1662,8 @@ int bam_get_seq(bam_file_t *b, bam_seq_t **bsp) {
16611662

16621663
bs->len = le_int4(bs->len);
16631664
bs->mate_ref = le_int4(bs->mate_ref);
1664-
bs->mate_pos = le_int4(bs->mate_pos);
1665-
bs->ins_size = le_int4(bs->ins_size);
1665+
bs->mate_pos = le_int4(bs->mate_pos_32);
1666+
bs->ins_size = le_int4(bs->ins_size_32);
16661667

16671668
if (10 == be_int4(10)) {
16681669
int i, cigar_len = bam_cigar_len(bs);
@@ -1715,7 +1716,7 @@ int bam_get_seq(bam_file_t *b, bam_seq_t **bsp) {
17151716

17161717
bs->blk_size = blk_size;
17171718
bs->ref = le_int4(bs->ref);
1718-
bs->pos = le_int4(bs->pos);
1719+
bs->pos = le_int4(bs->pos_32);
17191720

17201721
// order of bit-fields in struct is platform specific, so manually decode
17211722
i32 = le_int4(bs->bin_packed);
@@ -1729,8 +1730,8 @@ int bam_get_seq(bam_file_t *b, bam_seq_t **bsp) {
17291730

17301731
bs->len = le_int4(bs->len);
17311732
bs->mate_ref = le_int4(bs->mate_ref);
1732-
bs->mate_pos = le_int4(bs->mate_pos);
1733-
bs->ins_size = le_int4(bs->ins_size);
1733+
bs->mate_pos = le_int4(bs->mate_pos_32);
1734+
bs->ins_size = le_int4(bs->ins_size_32);
17341735

17351736
/* Name */
17361737
if (bam_read(b, &bs->data, bam_name_len(bs)) != bam_name_len(bs))
@@ -2099,13 +2100,13 @@ int bam_construct_seq(bam_seq_t **b, size_t extra_len,
20992100
const char *qname, size_t qname_len,
21002101
int flag,
21012102
int rname, // Ref ID
2102-
int pos, // first aligned base (1-based)
2103-
int end, // last aligned base (to calculate bin)
2103+
int64_t pos, // first aligned base (1-based)
2104+
int64_t end, // last aligned base (to calculate bin)
21042105
int mapq,
21052106
uint32_t ncigar, const uint32_t *cigar,
21062107
int mrnm, // Mate Ref ID
2107-
int mpos,
2108-
int isize,
2108+
int64_t mpos,
2109+
int64_t isize,
21092110
int len,
21102111
const char *seq,
21112112
const char *qual) {
@@ -2826,6 +2827,76 @@ unsigned char *append_uint(unsigned char *cp, uint32_t i) {
28262827
return cp;
28272828
}
28282829

2830+
unsigned char *append_int64(unsigned char *cp, int64_t i) {
2831+
int64_t j;
2832+
2833+
if (i < 0) {
2834+
*cp++ = '-';
2835+
if (i == INT_MIN) {
2836+
*cp++ = '2'; *cp++ = '1'; *cp++ = '4'; *cp++ = '7';
2837+
*cp++ = '4'; *cp++ = '8'; *cp++ = '3'; *cp++ = '6';
2838+
*cp++ = '4'; *cp++ = '8';
2839+
return cp;
2840+
}
2841+
2842+
i = -i;
2843+
} else if (i == 0) {
2844+
*cp++ = '0';
2845+
return cp;
2846+
}
2847+
2848+
//if (i < 10) goto b0;
2849+
if (i < 100) goto b1;
2850+
//if (i < 1000) goto b2;
2851+
if (i < 10000) goto b3;
2852+
//if (i < 100000) goto b4;
2853+
if (i < 1000000) goto b5;
2854+
//if (i < 10000000) goto b6;
2855+
if (i < 100000000) goto b7;
2856+
2857+
if ((j = i / 1000000000000000000)) {*cp++ = j + '0'; i -= j*1000000000000000000; goto xh;}
2858+
if ((j = i / 100000000000000000)) {*cp++ = j + '0'; i -= j*100000000000000000; goto xg;}
2859+
if ((j = i / 10000000000000000)) {*cp++ = j + '0'; i -= j*10000000000000000; goto xf;}
2860+
if ((j = i / 1000000000000000)) {*cp++ = j + '0'; i -= j*1000000000000000; goto xe;}
2861+
if ((j = i / 100000000000000)) {*cp++ = j + '0'; i -= j*100000000000000; goto xd;}
2862+
if ((j = i / 10000000000000)) {*cp++ = j + '0'; i -= j*10000000000000; goto xc;}
2863+
if ((j = i / 1000000000000)) {*cp++ = j + '0'; i -= j*1000000000000; goto xb;}
2864+
if ((j = i / 100000000000)) {*cp++ = j + '0'; i -= j*100000000000; goto xa;}
2865+
if ((j = i / 10000000000)) {*cp++ = j + '0'; i -= j*10000000000; goto x9;}
2866+
if ((j = i / 1000000000)) {*cp++ = j + '0'; i -= j*1000000000; goto x8;}
2867+
if ((j = i / 100000000)) {*cp++ = j + '0'; i -= j*100000000; goto x7;}
2868+
b7: if ((j = i / 10000000)) {*cp++ = j + '0'; i -= j*10000000; goto x6;}
2869+
if ((j = i / 1000000)) {*cp++ = j + '0', i -= j*1000000; goto x5;}
2870+
b5: if ((j = i / 100000)) {*cp++ = j + '0', i -= j*100000; goto x4;}
2871+
if ((j = i / 10000)) {*cp++ = j + '0', i -= j*10000; goto x3;}
2872+
b3: if ((j = i / 1000)) {*cp++ = j + '0', i -= j*1000; goto x2;}
2873+
if ((j = i / 100)) {*cp++ = j + '0', i -= j*100; goto x1;}
2874+
b1: if ((j = i / 10)) {*cp++ = j + '0', i -= j*10; goto x0;}
2875+
if (i) *cp++ = i + '0';
2876+
return cp;
2877+
2878+
xh: *cp++ = i / 100000000000000000 + '0', i %= 100000000000000000;
2879+
xg: *cp++ = i / 10000000000000000 + '0', i %= 10000000000000000;
2880+
xf: *cp++ = i / 1000000000000000 + '0', i %= 1000000000000000;
2881+
xe: *cp++ = i / 100000000000000 + '0', i %= 100000000000000;
2882+
xd: *cp++ = i / 10000000000000 + '0', i %= 10000000000000;
2883+
xc: *cp++ = i / 1000000000000 + '0', i %= 1000000000000;
2884+
xb: *cp++ = i / 100000000000 + '0', i %= 100000000000;
2885+
xa: *cp++ = i / 10000000000 + '0', i %= 10000000000;
2886+
x9: *cp++ = i / 1000000000 + '0', i %= 1000000000;
2887+
x8: *cp++ = i / 100000000 + '0', i %= 100000000;
2888+
x7: *cp++ = i / 10000000 + '0', i %= 10000000;
2889+
x6: *cp++ = i / 1000000 + '0', i %= 1000000;
2890+
x5: *cp++ = i / 100000 + '0', i %= 100000;
2891+
x4: *cp++ = i / 10000 + '0', i %= 10000;
2892+
x3: *cp++ = i / 1000 + '0', i %= 1000;
2893+
x2: *cp++ = i / 100 + '0', i %= 100;
2894+
x1: *cp++ = i / 10 + '0', i %= 10;
2895+
x0: *cp++ = i + '0';
2896+
2897+
return cp;
2898+
}
2899+
28292900
/*
28302901
* This is set up so that count should never be more than BGZF_BUFF_SIZE.
28312902
* This has been chosen to deliberately be small enough such that the
@@ -3158,7 +3229,7 @@ int bam_put_seq(bam_file_t *fp, bam_seq_t *b) {
31583229
/* POS */
31593230
if (b->pos < -1) return -1;
31603231
if (end-fp->uncomp_p < 12) BF_FLUSH();
3161-
fp->uncomp_p = append_int(fp->uncomp_p, b->pos+1); *fp->uncomp_p++ = '\t';
3232+
fp->uncomp_p = append_int64(fp->uncomp_p, b->pos+1); *fp->uncomp_p++ = '\t';
31623233

31633234
/* MAPQ */
31643235
if (end-fp->uncomp_p < 5) BF_FLUSH();
@@ -3203,11 +3274,11 @@ int bam_put_seq(bam_file_t *fp, bam_seq_t *b) {
32033274

32043275
/* MPOS */
32053276
if (end-fp->uncomp_p < 12) BF_FLUSH();
3206-
fp->uncomp_p = append_int(fp->uncomp_p, b->mate_pos+1); *fp->uncomp_p++ = '\t';
3277+
fp->uncomp_p = append_int64(fp->uncomp_p, b->mate_pos+1); *fp->uncomp_p++ = '\t';
32073278

32083279
/* ISIZE */
32093280
if (end-fp->uncomp_p < 12) BF_FLUSH();
3210-
fp->uncomp_p = append_int(fp->uncomp_p, b->ins_size); *fp->uncomp_p++ = '\t';
3281+
fp->uncomp_p = append_int64(fp->uncomp_p, b->ins_size); *fp->uncomp_p++ = '\t';
32113282

32123283
/* SEQ */
32133284
n = (b->len+1)/2;
@@ -3520,17 +3591,21 @@ int bam_put_seq(bam_file_t *fp, bam_seq_t *b) {
35203591

35213592
#ifdef SP_BIG_ENDIAN
35223593
b->ref = le_int4(b->ref);
3523-
b->pos = le_int4(b->pos);
3594+
b->pos_32 = le_int4(b->pos);
35243595
b->bin_packed = le_int4(b->bin_packed);
35253596
b->flag_packed = le_int4(b->flag_packed);
35263597
b->len = le_int4(b->len);
35273598
b->mate_ref = le_int4(b->mate_ref);
3528-
b->mate_pos = le_int4(b->mate_pos);
3529-
b->ins_size = le_int4(b->ins_size);
3599+
b->mate_pos_32 = le_int4(b->mate_pos);
3600+
b->ins_size_32 = le_int4(b->ins_size);
35303601

35313602
for (i = 0; i < n; i++) {
35323603
cigar[i] = le_int4(cigar[i]);
35333604
}
3605+
#else
3606+
b->pos_32 = b->pos;
3607+
b->mate_pos_32 = b->mate_pos;
3608+
b->ins_size_32 = b->ins_size;
35343609
#endif
35353610

35363611
#ifdef ALLOW_UAC

io_lib/bam.h

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,15 @@ typedef struct bam_seq_s {
7070
uint32_t alloc; /* total size of this struct + 'data' onwards */
7171
uint32_t blk_size;
7272

73+
// 64-bit variant of our position, for general use
74+
int64_t pos;
75+
int64_t mate_pos;
76+
int64_t ins_size;
77+
7378
/* The raw bam block follows, in same order as on the disk */
7479
/* This is the analogue of a bam1_core_t in samtools */
7580
int32_t ref;
76-
int32_t pos;
81+
int32_t pos_32;
7782

7883
union {
7984
struct {
@@ -90,8 +95,8 @@ typedef struct bam_seq_s {
9095

9196
int32_t len;
9297
int32_t mate_ref;
93-
int32_t mate_pos;
94-
int32_t ins_size;
98+
int32_t mate_pos_32;
99+
int32_t ins_size_32;
95100

96101
/* Followed by arbitrary bytes of packed data */
97102
unsigned char data; /* unknown size */
@@ -709,13 +714,13 @@ int bam_construct_seq(bam_seq_t **b, size_t extra_len,
709714
const char *qname, size_t qname_len,
710715
int flag,
711716
int rname, // Ref ID
712-
int pos,
713-
int end, // aligned start/end coords
717+
int64_t pos,
718+
int64_t end, // aligned start/end coords
714719
int mapq,
715720
uint32_t ncigar, const uint32_t *cigar,
716721
int mrnm, // Mate Ref ID
717-
int mpos,
718-
int isize,
722+
int64_t mpos,
723+
int64_t isize,
719724
int len,
720725
const char *seq,
721726
const char *qual);

0 commit comments

Comments
 (0)