Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 25 additions & 10 deletions sam_mods.c
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
char *cp = (char *)mm+1;
int mod_num = 0;
int implicit = 1;
int failed = 0;
while (*cp) {
for (; *cp; cp++) {
// cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*;
Expand All @@ -295,7 +296,12 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
char *cp_end = NULL;
int chebi = 0;
if (isdigit_c(*cp)) {
chebi = strtol(cp, &cp_end, 10);
chebi = (int)hts_str2uint(cp, &cp_end, 31, &failed);
if (cp_end == cp || failed) {
hts_log_error("%s: malformed MM tag (invalid ChEBI code)",
bam_get_qname(b));
return -1;
}
cp = cp_end;
ms = cp-1;
} else {
Expand Down Expand Up @@ -341,8 +347,8 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
if (*cp == 0 || *cp == ';')
break;

delta = strtol(cp, &cp_end, 10);
if (cp_end == cp) {
delta = (long)hts_str2uint(cp, &cp_end, 31, &failed);
if (cp_end == cp || failed) {
hts_log_error("%s: Hit end of MM tag. Missing "
"semicolon?", bam_get_qname(b));
return -1;
Expand All @@ -354,11 +360,14 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state,
}
delta = freq[seqi_rc[btype]] - total_seq; // remainder
} else {
delta = *cp == ','
? strtol(cp+1, &cp_end, 10)
: 0;
if (!cp_end) {
// empty list
if (*cp == ',') {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jkbonfield This site made me think for a minute; the interplay of the ternary and the if(!cp_end) worried me. I believe I have the logic correct, it reads a little more direct now as a consequence.

Copy link
Contributor

@jkbonfield jkbonfield Oct 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This highlights a bug in the old code infact.

Earlier on we have:

            char *ms = cp, *me; // mod code start and end                       
            char *cp_end = NULL;                                                
            int chebi = 0;                                                      
            if (isdigit_c(*cp)) {                                               
                chebi = strtol(cp, &cp_end, 10);                                
                cp = cp_end;                                                    
                ms = cp-1;                                                      
            } else {                                                            
                while (*cp && isalpha_c(*cp))                                   
                    cp++;                                                       
                if (*cp == '\0')                                                
                    return -1;                                                  
            }                                                                   

However cp_end is non-NULL if we have a CHEBI code, and NULL otherwise.

Moving on to the code you commented on above: if we have no comma, and hence an empty list, then we're now affected by the CHEBI vs otherwise logic. Indeed this test file shows a problem:

$ cat /tmp/MM-chebi.sam
*	0	*	0	0	*	*	0	0	ACGCT	*	Mm:Z:C+m;C+76792;N+n;

$ ./test/test_mod /tmp/MM-chebi.sam  2>/dev/null
0	A
1	C	C+(76792).
2	G
3	C
4	T
---
Present: m. #-76792. n.
1	C	C+(76792).

If I put a cp_end = NULL after the above code block so it's always NULL regardless of code-vs-CHEBI then the rogue mod vanishes.

The revised code side-steps the issue and I agree it has easier to understand logic. A good spot.

delta = (long)hts_str2uint(cp+1, &cp_end, 31, &failed);
if (cp_end == cp+1 || failed) {
hts_log_error("%s: Failed to parse integer from MM tag",
bam_get_qname(b));
return -1;
}
} else {
delta = INT_MAX;
cp_end = cp;
}
Expand Down Expand Up @@ -494,9 +503,11 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
? -state->MLstride[i]
: +state->MLstride[i];

int failed = 0;
if (b->core.flag & BAM_FREVERSE) {
// process MM list backwards
char *cp;
char *tmp;

if (state->MMend[i]-1 < state->MM[i]) {
// Should be impossible to hit if coding is correct
Expand All @@ -508,15 +519,19 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state,
break;
state->MMend[i] = cp;
if (cp != state->MM[i])
state->MMcount[i] = strtol(cp+1, NULL, 10);
state->MMcount[i] = (int)hts_str2uint(cp + 1, &tmp, 31, &failed);
else
state->MMcount[i] = INT_MAX;
} else {
if (*state->MM[i] == ',')
state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10);
state->MMcount[i] = (int)hts_str2uint(state->MM[i] + 1, &state->MM[i], 31, &failed);
else
state->MMcount[i] = INT_MAX;
}
if (failed) {
hts_log_error("%s: Error parsing unsigned integer from MM tag", bam_get_qname(b));
return -1;
}

// Multiple mods at the same coords.
for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) {
Expand Down