Skip to content

Commit c0ddabb

Browse files
committed
Make headers 64-bit compliant
Make sam_hdr_tid2len() return hts_pos_t. Change length stored in sam_hrec_sq_t to hts_pos_t. Make sam header parser use strtoll() instead of atoi(). Unfortunately changing the size of the header target_len array is difficult as some external software attempts to resize it as a multiple of sizeof(uint32_t). Work around this by storing large values as UINT32_MAX and repurpose the sdict pointer (unused since 7a853e8) as a way of passing the real size through. Code that supports long references will need to use sam_hdr_tid2len() to get the length. Adds tests for reading and writing SAM files.
1 parent 1f47675 commit c0ddabb

File tree

6 files changed

+257
-18
lines changed

6 files changed

+257
-18
lines changed

cram/cram_io.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1820,7 +1820,7 @@ static void sanitise_SQ_lines(cram_fd *fd) {
18201820

18211821
// Should we also check MD5sums here to ensure the correct
18221822
// reference was given?
1823-
hts_log_warning("Header @SQ length mismatch for ref %s, %d vs %d",
1823+
hts_log_warning("Header @SQ length mismatch for ref %s, %"PRIhts_pos" vs %d",
18241824
r->name, fd->header->hrecs->ref[i].len, (int)r->length);
18251825

18261826
// Fixing the parsed @SQ header will make MD:Z: strings work

header.c

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3838

3939
// Hash table for removing multiple lines from the header
4040
KHASH_SET_INIT_STR(rm)
41+
// Used for long refs in SAM files
42+
KHASH_DECLARE(s2i, kh_cstr_t, int64_t)
43+
4144
typedef khash_t(rm) rmhash_t;
4245

4346
static int sam_hdr_link_pg(sam_hdr_t *bh);
@@ -91,7 +94,8 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
9194
sam_hrec_tag_t *tag = h_type->tag;
9295
int nref = hrecs->nref;
9396
const char *name = NULL;
94-
int len = -1, r;
97+
hts_pos_t len = -1;
98+
int r;
9599
khint_t k;
96100

97101
while (tag) {
@@ -100,7 +104,7 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
100104
name = tag->str+3;
101105
} else if (tag->str[0] == 'L' && tag->str[1] == 'N') {
102106
assert(tag->len >= 3);
103-
len = atoi(tag->str+3);
107+
len = strtoll(tag->str+3, NULL, 10);
104108
}
105109
tag = tag->next;
106110
}
@@ -134,7 +138,8 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs,
134138
// Check lengths match; correct if not.
135139
if (len != hrecs->ref[nref].len) {
136140
char tmp[32];
137-
snprintf(tmp, sizeof(tmp), "%u", hrecs->ref[nref].len);
141+
snprintf(tmp, sizeof(tmp), "%" PRIhts_pos,
142+
hrecs->ref[nref].len);
138143
if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0)
139144
return -1;
140145
}
@@ -831,7 +836,11 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
831836
if (!bh->target_name[i])
832837
return -1;
833838
}
834-
bh->target_len[i] = hrecs->ref[i].len;
839+
if (hrecs->ref[i].len < UINT32_MAX) {
840+
bh->target_len[i] = hrecs->ref[i].len;
841+
} else {
842+
bh->target_len[i] = UINT32_MAX;
843+
}
835844
}
836845

837846
// Free up any names that have been removed
@@ -901,7 +910,17 @@ static int sam_hrecs_refs_from_targets_array(sam_hrecs_t *hrecs,
901910
int r;
902911
hrecs->ref[tid].name = string_dup(hrecs->str_pool, bh->target_name[tid]);
903912
if (!hrecs->ref[tid].name) goto fail;
904-
hrecs->ref[tid].len = bh->target_len[tid];
913+
if (bh->target_len[tid] < UINT32_MAX || !bh->sdict) {
914+
hrecs->ref[tid].len = bh->target_len[tid];
915+
} else {
916+
khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict;
917+
k = kh_get(s2i, long_refs, hrecs->ref[tid].name);
918+
if (k < kh_end(long_refs)) {
919+
hrecs->ref[tid].len = kh_val(long_refs, k);
920+
} else {
921+
hrecs->ref[tid].len = UINT32_MAX;
922+
}
923+
}
905924
hrecs->ref[tid].ty = NULL;
906925
k = kh_put(m_s2i, hrecs->ref_hash, hrecs->ref[tid].name, &r);
907926
if (r < 0) goto fail;
@@ -948,7 +967,7 @@ static int add_stub_ref_sq_lines(sam_hrecs_t *hrecs) {
948967

949968
for (tid = 0; tid < hrecs->nref; tid++) {
950969
if (hrecs->ref[tid].ty == NULL) {
951-
snprintf(len, sizeof(len), "%d", hrecs->ref[tid].len);
970+
snprintf(len, sizeof(len), "%"PRIhts_pos, hrecs->ref[tid].len);
952971
if (sam_hrecs_add(hrecs, "SQ",
953972
"SN", hrecs->ref[tid].name,
954973
"LN", len, NULL) != 0)
@@ -1841,7 +1860,7 @@ const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid) {
18411860
return NULL;
18421861
}
18431862

1844-
uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
1863+
hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
18451864
sam_hrecs_t *hrecs;
18461865

18471866
if (!h)
@@ -1850,8 +1869,19 @@ uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid) {
18501869
if ((hrecs = h->hrecs) != NULL && tid < hrecs->nref) {
18511870
return hrecs->ref[tid].len;
18521871
} else {
1853-
if (tid < h->n_targets)
1854-
return h->target_len[tid];
1872+
if (tid < h->n_targets) {
1873+
if (h->target_len[tid] < UINT32_MAX || !h->sdict) {
1874+
return h->target_len[tid];
1875+
} else {
1876+
khash_t(s2i) *long_refs = (khash_t(s2i) *) h->sdict;
1877+
khint_t k = kh_get(s2i, long_refs, h->target_name[tid]);
1878+
if (k < kh_end(long_refs)) {
1879+
return kh_val(long_refs, k);
1880+
} else {
1881+
return UINT32_MAX;
1882+
}
1883+
}
1884+
}
18551885
}
18561886

18571887
return 0;

header.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ typedef struct sam_hrec_type_s {
133133
/*! Parsed \@SQ lines */
134134
typedef struct {
135135
const char *name;
136-
uint32_t len;
136+
hts_pos_t len;
137137
sam_hrec_type_t *ty;
138138
} sam_hrec_sq_t;
139139

htslib/sam.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ typedef struct sam_hdr_t {
7474
const int8_t *cigar_tab HTS_DEPRECATED("Use bam_cigar_table[] instead");
7575
char **target_name;
7676
char *text;
77-
void *sdict HTS_DEPRECATED("Unused since 1.10");
77+
void *sdict;
7878
sam_hrecs_t *hrecs;
7979
uint32_t ref_count;
8080
} sam_hdr_t;
@@ -698,7 +698,7 @@ const char *sam_hdr_tid2name(const sam_hdr_t *h, int tid);
698698
* Fetch the reference sequence length from the target length array,
699699
* using the numerical target id.
700700
*/
701-
uint32_t sam_hdr_tid2len(const sam_hdr_t *h, int tid);
701+
hts_pos_t sam_hdr_tid2len(const sam_hdr_t *h, int tid);
702702

703703
/// Alias of sam_hdr_name2tid(), for backwards compatibility.
704704
/*!

sam.c

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ void sam_hdr_destroy(sam_hdr_t *bh)
117117
free(bh->text);
118118
if (bh->hrecs)
119119
sam_hrecs_free(bh->hrecs);
120+
if (bh->sdict)
121+
kh_destroy(s2i, (khash_t(s2i) *) bh->sdict);
120122
free(bh);
121123
}
122124

@@ -1191,6 +1193,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) {
11911193
const char *q, *r;
11921194
char* sn = NULL;
11931195
khash_t(s2i) *d = kh_init(s2i);
1196+
khash_t(s2i) *long_refs = NULL;
11941197
if (!h || !d)
11951198
goto error;
11961199

@@ -1202,7 +1205,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) {
12021205

12031206
if (fp->line.l > 3 && strncmp(fp->line.s, "@SQ", 3) == 0) {
12041207
has_SQ = 1;
1205-
int ln = -1;
1208+
hts_pos_t ln = -1;
12061209
for (q = fp->line.s + 4;; ++q) {
12071210
if (strncmp(q, "SN:", 3) == 0) {
12081211
q += 3;
@@ -1220,7 +1223,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) {
12201223
q = r;
12211224
} else {
12221225
if (strncmp(q, "LN:", 3) == 0)
1223-
ln = strtol(q + 3, (char**)&q, 10);
1226+
ln = strtoll(q + 3, (char**)&q, 10);
12241227
}
12251228

12261229
while (*q != '\t' && *q != '\n' && *q != '\0')
@@ -1239,7 +1242,24 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) {
12391242
hts_log_warning("Duplicated sequence '%s'", sn);
12401243
free(sn);
12411244
} else {
1242-
kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1245+
if (ln >= UINT32_MAX) {
1246+
// Stash away ref length that
1247+
// doesn't fit in target_len array
1248+
int k2;
1249+
if (!long_refs) {
1250+
long_refs = kh_init(s2i);
1251+
if (!long_refs)
1252+
goto error;
1253+
}
1254+
k2 = kh_put(s2i, long_refs, sn, &absent);
1255+
if (absent < 0)
1256+
goto error;
1257+
kh_val(long_refs, k2) = ln;
1258+
kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1259+
| UINT32_MAX);
1260+
} else {
1261+
kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1262+
}
12431263
}
12441264
} else {
12451265
hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn);
@@ -1281,6 +1301,8 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) {
12811301

12821302
while (line.l = 0, kgetline(&line, (kgets_func*) hgets, f) >= 0) {
12831303
char* tab = strchr(line.s, '\t');
1304+
hts_pos_t ln;
1305+
12841306
if (tab == NULL)
12851307
continue;
12861308

@@ -1292,18 +1314,38 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) {
12921314
if (absent < 0)
12931315
break;
12941316

1317+
ln = strtoll(tab, NULL, 10);
1318+
12951319
if (!absent) {
12961320
hts_log_warning("Duplicated sequence '%s'", sn);
12971321
free(sn);
12981322
} else {
1299-
kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | atol(tab);
1323+
if (ln >= UINT32_MAX) {
1324+
// Stash away ref length that
1325+
// doesn't fit in target_len array
1326+
khint_t k2;
1327+
int absent = -1;
1328+
if (!long_refs) {
1329+
long_refs = kh_init(s2i);
1330+
if (!long_refs)
1331+
goto error;
1332+
}
1333+
k2 = kh_put(s2i, long_refs, sn, &absent);
1334+
if (absent < 0)
1335+
goto error;
1336+
kh_val(long_refs, k2) = ln;
1337+
kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32
1338+
| UINT32_MAX);
1339+
} else {
1340+
kh_val(d, k) = (int64_t) (kh_size(d) - 1) << 32 | ln;
1341+
}
13001342
has_SQ = 1;
13011343
}
13021344

13031345
e |= kputs("@SQ\tSN:", &str) < 0;
13041346
e |= kputsn(line.s, tab - line.s, &str) < 0;
13051347
e |= kputs("\tLN:", &str) < 0;
1306-
e |= kputl(atol(tab), &str) < 0;
1348+
e |= kputll(ln, &str) < 0;
13071349
e |= kputc('\n', &str) < 0;
13081350
if (e)
13091351
break;
@@ -1340,6 +1382,9 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) {
13401382
}
13411383
}
13421384

1385+
// Repurpose sdict to hold any references longer than UINT32_MAX
1386+
h->sdict = long_refs;
1387+
13431388
kh_destroy(s2i, d);
13441389

13451390
if (str.l == 0)
@@ -1355,6 +1400,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) {
13551400
sam_hdr_destroy(h);
13561401
KS_FREE(&str);
13571402
kh_destroy(s2i, d);
1403+
kh_destroy(s2i, long_refs);
13581404
if (sn) free(sn);
13591405
return NULL;
13601406
}

0 commit comments

Comments
 (0)