@@ -2490,6 +2490,12 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
24902490        if  (esign  ==  '-' ) e  =  - e ;
24912491    }
24922492
2493+     switch  (* s ) {
2494+     case  'k' : case  'K' : e  +=  3 ; s ++ ; break ;
2495+     case  'm' : case  'M' : e  +=  6 ; s ++ ; break ;
2496+     case  'g' : case  'G' : e  +=  9 ; s ++ ; break ;
2497+     }
2498+ 
24932499    e  -=  decimals ;
24942500    while  (e  >  0 ) n  *= 10 , e -- ;
24952501    while  (e  <  0 ) lost  +=  n  % 10 , n  /= 10 , e ++ ;
@@ -2501,45 +2507,90 @@ long long hts_parse_decimal(const char *str, char **strend, int flags)
25012507    if  (strend ) {
25022508        * strend  =  (char  * )s ;
25032509    } else  if  (* s ) {
2504-         hts_log_warning ("Ignoring unknown characters after %.*s[%s]" , (int )(s  -  str ), str , s );
2510+         if  ((flags  &  HTS_PARSE_THOUSANDS_SEP ) ||  (!(flags  &  HTS_PARSE_THOUSANDS_SEP ) &&  * s  !=  ',' ))
2511+             hts_log_warning ("Ignoring unknown characters after %.*s[%s]" , (int )(s  -  str ), str , s );
25052512    }
25062513
25072514    return  (sign  ==  '+' )? n  : - n ;
25082515}
25092516
/*
 * Reverse memchr: find the last occurrence of byte c within the first
 * n bytes of s.
 *
 * As with memchr(), c is converted to unsigned char before comparing, so
 * a value taken from a (possibly signed) char, or any int outside 0..255,
 * still matches the byte it represents.
 *
 * Returns a pointer to the matching byte, or NULL if c does not occur
 * (always NULL when n is 0).
 */
static void *hts_memrchr(const void *s, int c, size_t n) {
    const unsigned char *u = (const unsigned char *)s;
    const unsigned char target = (unsigned char)c;
    size_t i;

    // i is one past the index being tested, so the unsigned counter
    // never needs to drop below zero.
    for (i = n; i > 0; i--) {
        if (u[i-1] == target)
            return (void *)(u + i - 1);
    }

    return NULL;
}
2527+ 
25102528/* 
25112529 * A variant of hts_parse_reg which is reference-id aware.  It uses 
25122530 * the iterator name2id callbacks to validate the region tokenisation works. 
25132531 * 
25142532 * This is necessary due to GRCh38 HLA additions which have reference names 
25152533 * like "HLA-DRB1*12:17". 
25162534 * 
2517-  * getid is optional and may be passed in as NULL.  If given it is used to 
2518-  * validate the reference name exists and is unambiguously parseable.  If not 
2519-  * given the best guess will be made but no has guarantees in validity. 
2535+  * All parameters are mandatory. 
2536+  * 
2537+  * To work around ambiguous parsing issues, eg both "chr1" and "chr1:100-200" 
2538+  * are reference names, we may quote using curly braces. 
2539+  * Thus "{chr1}:100-200" and "{chr1:100-200}" disambiguate the above example. 
2540+  * 
2541+  * Flags are used to control how parsing works, and can be one of the below. 
2542+  * 
2543+  * HTS_PARSE_LIST: 
2544+  *     If present, the region is assumed to be a comma separated list and 
2545+  *     position parsing will not contain commas (this implicitly 
2546+  *     clears HTS_PARSE_THOUSANDS_SEP in the call to hts_parse_decimal). 
2547+  *     On success the return pointer will be the start of the next region, ie 
2548+  *     the character after the comma.  (If *ret != '\0' then the caller can 
2549+  *     assume another region is present in the list.) 
2550+  * 
2551+  *     If not set then positions may contain commas.  In this case the return 
2552+  *     value should point to the end of the string, or NULL on failure. 
25202553 * 
2521-  * To work around these issues quoting is also permitted via {ref}:start-end. 
2522-  * In this case, the return value will point to '}' and not the end of the 
2523-  * reference (but this is a useful indication that it started with '{'). 
2554+  * HTS_PARSE_ONE_COORD: 
2555+  *     If present, X:100 is treated as the single base pair region X:100-100. 
2556+  *     In this case X:-100 is shorthand for X:1-100 and X:100- is X:100-<end>. 
2557+  *     (This is the standard bcftools region convention.) 
2558+  * 
2559+  *     When not set X:100 is considered to be X:100-<end> where <end> is 
2560+  *     the end of chromosome X (set to INT_MAX here).  X:100- and X:-100 are 
2561+  *     invalid. 
2562+  *     (This is the standard samtools region convention.) 
2563+  * 
2564+  * Note the supplied string expects 1 based inclusive coordinates, but the 
2565+  * returned coordinates start from 0 and are half open, so pos0 is valid 
2566+  * for use in e.g. "for (pos0 = beg; pos0 < end; pos0++) {...}" 
25242567 * 
25252568 * On success the end of the parsed region is returned (end of string, or the 
25262569 *            character after the comma with HTS_PARSE_LIST); tid/beg/end are set. 
25272570 * On failure NULL is returned. 
25282571 */ 
2529- const  char  * hts_parse_reg2 (const  char  * s , int  * tid , int  * beg , int  * end ,
2530-                            hts_name2id_f  getid , void  * hdr )
2572+ const  char  * hts_parse_region (const  char  * s , int  * tid , int  * beg , int  * end ,
2573+                               hts_name2id_f  getid , void  * hdr ,  int   flags )
25312574{
2532-     // FIXME: do we need to permit tid=-1 for reference "*" to indicate unmapped 
2533-     // reads, and strictly have NULL as failure test? 
2534-     int  tid_ , s_len  =  strlen (s ); // int is sufficient given beg/end types 
2535-     if  (!tid ) tid  =  & tid_ ;       // simplifies code below 
2575+     if  (!s  ||  !tid  ||  !beg  ||  !end  ||  !getid )
2576+         return  NULL ;
25362577
2537-     const  char  * colon  =  NULL ;
2578+     int  s_len  =  strlen (s ); // int is sufficient given beg/end types 
2579+     kstring_t  ks  =  { 0 , 0 , NULL  };
2580+ 
2581+     const  char  * colon  =  NULL , * comma  =  NULL ;
25382582    int  quoted  =  0 ;
25392583
2584+     if  (flags  &  HTS_PARSE_LIST )
2585+         flags  &= ~HTS_PARSE_THOUSANDS_SEP ;
2586+     else 
2587+         flags  |= HTS_PARSE_THOUSANDS_SEP ;
2588+ 
2589+     const  char  * s_end  =  s  +  s_len ;
2590+ 
25402591    // Braced quoting of references is permitted to resolve ambiguities. 
25412592    if  (* s  ==  '{' ) {
2542-         const  char  * close  =  strrchr (s , '}' );
2593+         const  char  * close  =  memchr (s , '}' ,  s_len );
25432594        if  (!close ) {
25442595            hts_log_error ("Mismatching braces in \"%s\"" , s );
25452596            return  NULL ;
@@ -2549,36 +2600,56 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25492600        if  (close [1 ] ==  ':' )
25502601            colon  =  close + 1 ;
25512602        quoted  =  1 ; // number of trailing characters to trim 
2603+ 
2604+         // Truncate to this item only, if appropriate. 
2605+         if  (flags  &  HTS_PARSE_LIST ) {
2606+             comma  =  strchr (close , ',' );
2607+             if  (comma ) {
2608+                 s_len  =  comma - s ;
2609+                 s_end  =  comma + 1 ;
2610+             }
2611+         }
25522612    } else  {
2553-         colon  =  strrchr (s , ':' );
2613+         // Truncate to this item only, if appropriate. 
2614+         if  (flags  &  HTS_PARSE_LIST ) {
2615+             comma  =  strchr (s , ',' );
2616+             if  (comma ) {
2617+                 s_len  =  comma - s ;
2618+                 s_end  =  comma + 1 ;
2619+             }
2620+         }
2621+ 
2622+         colon  =  hts_memrchr (s , ':' , s_len );
25542623    }
25552624
2625+     // No colon is simplest case; just check and return. 
25562626    if  (colon  ==  NULL ) {
25572627        * beg  =  0 ; * end  =  INT_MAX ;
2558-         if  (getid ) {
2559-             kstring_t  ks  =  { 0 , 0 , NULL  };
2560-             kputsn (s , s_len - quoted , & ks ); // convert to nul terminated string 
2561-             if  (!ks .s ) {
2562-                 * tid  =  -1 ;
2563-                 return  NULL ;
2564-             }
2565- 
2566-             * tid  =  getid (hdr , ks .s );
2567-             free (ks .s );
2568-         } else  {
2569-             * tid  =  0 ;
2628+         kputsn (s , s_len - quoted , & ks ); // convert to nul terminated string 
2629+         if  (!ks .s ) {
2630+             * tid  =  -1 ;
2631+             return  NULL ;
25702632        }
2571-         return  * tid  >= 0  ? s  +  s_len  : NULL ;
2633+ 
2634+         * tid  =  getid (hdr , ks .s );
2635+         free (ks .s );
2636+ 
2637+         return  * tid  >= 0  ? s_end  : NULL ;
25722638    }
25732639
25742640    // Has a colon, but check whole name first. 
2575-     if  (!quoted   &&   getid ) {
2641+     if  (!quoted ) {
25762642        * beg  =  0 ; * end  =  INT_MAX ;
2577-         if  ((* tid  =  getid (hdr , s )) >= 0 ) {
2643+         kputsn (s , s_len , & ks ); // convert to nul terminated string 
2644+         if  (!ks .s ) {
2645+             * tid  =  -1 ;
2646+             return  NULL ;
2647+         }
2648+         if  ((* tid  =  getid (hdr , ks .s )) >= 0 ) {
25782649            // Entire name matches, but also check this isn't 
25792650            // ambiguous.  eg we have ref chr1 and ref chr1:100-200 
25802651            // both present. 
2581-             kstring_t   ks  =  {  0 ,  0 ,  NULL  } ;
2652+             ks . l  =  0 ;
25822653            kputsn (s , colon - s , & ks ); // convert to nul terminated string 
25832654            if  (!ks .s ) {
25842655                * tid  =  -1 ;
@@ -2594,39 +2665,81 @@ const char *hts_parse_reg2(const char *s, int *tid, int *beg, int *end,
25942665            }
25952666            free (ks .s );
25962667
2597-             return  s   +   s_len ;
2668+             return  s_end ;
25982669        }
25992670    }
26002671
2601-     char  * hyphen ;
2602-     * beg  =  hts_parse_decimal (colon + 1 , & hyphen , HTS_PARSE_THOUSANDS_SEP ) -  1 ;
2603-     if  (* beg  <  0 ) * beg  =  0 ;
2604- 
2605-     if  (* hyphen  ==  '\0' ) * end  =  INT_MAX ;
2606-     else  if  (* hyphen  ==  '-' ) * end  =  hts_parse_decimal (hyphen + 1 , NULL , HTS_PARSE_THOUSANDS_SEP );
2607-     else  return  NULL ;
2672+     // Quoted, or unquoted and whole string isn't a name. 
2673+     // Check the pre-colon part is valid. 
2674+     ks .l  =  0 ;
2675+     kputsn (s , colon - s - quoted , & ks ); // convert to nul terminated string 
2676+     if  (!ks .s ) {
2677+         * tid  =  -1 ;
2678+         return  NULL ;
2679+     }
2680+     * tid  =  getid (hdr , ks .s );
2681+     free (ks .s );
2682+     if  (* tid  <  0 )
2683+         return  NULL ;
26082684
2609-     if  (* beg  >= * end ) return  NULL ;
2610-     if  (getid ) {
2611-         kstring_t  ks  =  { 0 , 0 , NULL  };
2612-         kputsn (s , colon - s - quoted , & ks ); // convert to nul terminated string 
2613-         if  (!ks .s ) {
2614-             * tid  =  -1 ;
2685+     // Finally parse the post-colon coordinates 
2686+     char  * hyphen ;
2687+     * beg  =  hts_parse_decimal (colon + 1 , & hyphen , flags ) -  1 ;
2688+     if  (* beg  <  0 ) {
2689+         if  (isdigit (* hyphen ) ||  * hyphen  ==  '\0'  ||  * hyphen  ==  ',' ) {
2690+             // interpret chr:-100 as chr:1-100 
2691+             * end  =  * beg == -1  ? INT_MAX  : - (* beg + 1 );
2692+             * beg  =  0 ;
2693+             return  s_end ;
2694+         } else  if  (* hyphen  ==  '-' ) {
2695+             * beg  =  0 ;
2696+         } else  {
2697+             hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
26152698            return  NULL ;
26162699        }
2617-         * tid  =  getid (hdr , ks .s );
2618-         free (ks .s );
2619-         if  (* tid  <  0 )
2700+     }
2701+ 
2702+     if  (* hyphen  ==  '\0'  ||  ((flags  &  HTS_PARSE_LIST ) &&  * hyphen  ==  ',' )) {
2703+         * end  =  flags  &  HTS_PARSE_ONE_COORD  ? * beg + 1  : INT_MAX ;
2704+     } else  if  (* hyphen  ==  '-' ) {
2705+         * end  =  hts_parse_decimal (hyphen + 1 , & hyphen , flags );
2706+         if  (* hyphen  !=  '\0'  &&  * hyphen  !=  ',' ) {
2707+             hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
26202708            return  NULL ;
2709+         }
26212710    } else  {
2622-         * tid  =  0 ;
2711+         hts_log_error ("Unexpected string \"%s\" after region" , hyphen );
2712+         return  NULL ;
26232713    }
2624-     return  colon ;
2714+ 
2715+     if  (* end  ==  0 )
2716+         * end  =  INT_MAX ; // interpret chr:100- as chr:100-<end> 
2717+ 
2718+     if  (* beg  >= * end ) return  NULL ;
2719+ 
2720+     return  s_end ;
26252721}
26262722
2723+ // Next release we should mark this as deprecated? 
2724+ // Use hts_parse_region above instead. 
26272725const  char  * hts_parse_reg (const  char  * s , int  * beg , int  * end )
26282726{
2629-     return  hts_parse_reg2 (s , NULL , beg , end , NULL , NULL );
2727+     char  * hyphen ;
2728+     const  char  * colon  =  strrchr (s , ':' );
2729+     if  (colon  ==  NULL ) {
2730+         * beg  =  0 ; * end  =  INT_MAX ;
2731+         return  s  +  strlen (s );
2732+     }
2733+ 
2734+     * beg  =  hts_parse_decimal (colon + 1 , & hyphen , HTS_PARSE_THOUSANDS_SEP ) -  1 ;
2735+     if  (* beg  <  0 ) * beg  =  0 ;
2736+ 
2737+     if  (* hyphen  ==  '\0' ) * end  =  INT_MAX ;
2738+     else  if  (* hyphen  ==  '-' ) * end  =  hts_parse_decimal (hyphen + 1 , NULL , HTS_PARSE_THOUSANDS_SEP );
2739+     else  return  NULL ;
2740+ 
2741+     if  (* beg  >= * end ) return  NULL ;
2742+     return  colon ;
26302743}
26312744
26322745hts_itr_t  * hts_itr_querys (const  hts_idx_t  * idx , const  char  * reg , hts_name2id_f  getid , void  * hdr , hts_itr_query_func  * itr_query , hts_readrec_func  * readrec )
@@ -2638,7 +2751,7 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g
26382751    else  if  (strcmp (reg , "*" ) ==  0 )
26392752        return  itr_query (idx , HTS_IDX_NOCOOR , 0 , 0 , readrec );
26402753
2641-     if  (!hts_parse_reg2 (reg , & tid , & beg , & end , getid , hdr ))
2754+     if  (!hts_parse_region (reg , & tid , & beg , & end , getid , hdr ,  HTS_PARSE_THOUSANDS_SEP ))
26422755        return  NULL ;
26432756
26442757    return  itr_query (idx , tid , beg , end , readrec );
0 commit comments