@@ -4549,19 +4549,47 @@ S_intuit_more(pTHX_ char *s, char *e)
45494549    /* Here is '[': maybe we have a character class.  Examine the guts */ 
45504550    s ++ ;
45514551
4552-     /* '^' implies a character class; An empty '[]' isn't legal, but  it does  
4553-      * mean  there isn't more to come */ 
4552+     /* '^' implies a character class; An empty '[]' isn't legal, and  it means  
4553+      * there isn't more to come */ 
45544554    if  (s [0 ] ==  ']'  ||  s [0 ] ==  '^' )
45554555        return  FALSE;
45564556
4557-     /* Find matching ']'.  khw: This means any s[1] below is guaranteed to 
4558-      * exist */ 
4557+     /* khw: If the context of this call is $foo[...], we may be able to avoid 
4558+      * the heuristics below.  The possibilities are: 
4559+      *  strict    @foo    $foo 
4560+      *   vars?   exists   exists 
4561+      *    y         n       n      This is an error; return false now 
4562+      *    y         n       y      must be a a charclass 
4563+      *    y         y       n      must be a a subscript 
4564+      *    y         y       y      ambiguous; do heuristics below 
4565+      *    n         n       n      ambiguous; do heuristics below 
4566+      *    n         n       y      ambiguous; do heuristics below, but I 
4567+      *                             wonder if the initial bias should be a 
4568+      *                             little towards charclass 
4569+      *    n         y       n      ambiguous; do heuristics below, but I 
4570+      *                             wonder if the initial bias should be a 
4571+      *                             little towards subscript 
4572+      *    n         y       y      ambiguous; do heuristics below 
4573+      */ 
4574+ 
4575+     /* Find matching ']'.  khw: Actually it finds the next ']' and assumes it 
4576+      * matches the '['.  In order to account for the possibility of the ']' 
4577+      * being inside the scope of \Q or preceded by an even number of 
4578+      * backslashes, this should be rewritten */ 
45594579    const  char  *  const  send  =  (char  * ) memchr (s , ']' , e  -  s );
45604580    if  (! send )		/* has to be an expression */ 
45614581        return  TRUE;
45624582
4583+     /* Below here, the heuristics start.  One idea from alh is, given 'use 
4584+      * 5.43.x', that for all digits, that if we have to resort to heuristics, 
4585+      * we instead raise an error with an explanation of how to make it 
4586+      * unambiguous: ${foo}[123] */ 
4587+ 
45634588    /* If the construct consists entirely of one or two digits, call it a 
4564-      * subscript. */ 
4589+      * subscript. 
4590+      * 
4591+      * khw: No one writes 03 to mean 3.  Any string of digits beginning with 
4592+      * '0' is likely to be a charclass, including length 2 ones. */ 
45654593    if  (isDIGIT (s [0 ]) &&  send  -  s  <= 2  &&  (send  -  s  ==  1  ||  (isDIGIT (s [1 ])))) {
45664594        return  TRUE;
45674595    }
@@ -4596,6 +4624,13 @@ S_intuit_more(pTHX_ char *s, char *e)
45964624    Zero (seen , 256 , char );
45974625
45984626    /* Examine each character in the construct */ 
4627+     /* That this knows nothing of UTF-8 can lead to opposite results if the 
4628+      * text is encoded in UTF-8 or not; another relic of the Unicode Bug. 
4629+      * Suppose a string consistis of various un-repeated code points between 
4630+      * 0x128 and 0x255.  When encoded in UTF-8 their start bytes will all be 
4631+      * \xC2 or \xC3.  The heuristics below will count those as repeated bytes, 
4632+      * and thus lean more towards this being a character class than when not 
4633+      * in UTF-8. */ 
45994634    bool  first_time  =  true;
46004635    for  (; s  <  send ; s ++ , first_time  =  false) {
46014636        unsigned char   prev_un_char  =  un_char ;
@@ -4617,15 +4652,42 @@ S_intuit_more(pTHX_ char *s, char *e)
46174652                char  tmpbuf [sizeof  PL_tokenbuf  *  4 ];
46184653                scan_ident (s , tmpbuf , sizeof  tmpbuf , FALSE);
46194654                len  =  (int )strlen (tmpbuf );
4655+ 
4656+                 /* khw: This only looks at global variables; lexicals came 
4657+                  * later, and this hasn't been updated.  Ouch!! */ 
46204658                if  (   len  >  1 
46214659                    &&  gv_fetchpvn_flags (tmpbuf ,
46224660                                         len ,
46234661                                         UTF  ? SVf_UTF8  : 0 ,
46244662                                         SVt_PV ))
4663+                 {
46254664                    weight  -=  100 ;
4626-                 else     /* Not a multi-char identifier already known in the 
4627-                            program; is somewhat likely to be a subscript */ 
4665+ 
4666+                     /* khw: Below we keep track of repeated characters;  People 
4667+                      * rarely say qr/[aba]/, as the second a is pointless. 
4668+                      * (Some do it though as a mnemonic that is meaningful to 
4669+                      * them.)  But generally, repeated characters makes things 
4670+                      * more likely to be a charclass.  But we have here that 
4671+                      * this an identifier so likely a subscript.  Its spelling 
4672+                      * should be irrelevant to the repeated characters test. 
4673+                      * So, we should advance past it.  Suppose it is a hash 
4674+                      * element, like $subscripts{$which}.  We should advance 
4675+                      * past the braces and key */ 
4676+                 }
4677+                 else  {
4678+                     /* Not a multi-char identifier already known in the 
4679+                      * program; is somewhat likely to be a subscript. 
4680+                      * 
4681+                      * khw: Our test suite contains several constructs like 
4682+                      * [$A-Z].  Excluding length 1 identifiers in the 
4683+                      * conditional above means such are much less likely to be 
4684+                      * mistaken for subscripts.  I would argue that if the next 
4685+                      * character is a '-' followed by an alpha, that would make 
4686+                      * it much more likely to be a charclass.  It would only 
4687+                      * make sense to be an expression if that alpha string is a 
4688+                      * bareword with meaning; something like [$A-ord] */ 
46284689                    weight  -=  10 ;
4690+                 }
46294691            }
46304692            else  if  (   s [0 ] ==  '$' 
46314693                     &&  s [1 ]
@@ -4642,13 +4704,51 @@ S_intuit_more(pTHX_ char *s, char *e)
46424704            }
46434705            break ;
46444706
4707+           /* khw:  [:blank:] strongly indicates a charclass */ 
4708+           /* khw: Z-A definitely subscript 
4709+            *      Z-Z likely subscript 
4710+            *      "x - z" with blanks very likely subscript 
4711+            *      \N without { must be subscript 
4712+            *      \R must be subscript 
4713+            *      \? must be subscript for things like \d, but not \a. 
4714+            */ 
4715+ 
4716+ 
46454717          case  '\\' :
46464718            if  (s [1 ]) {
4647-                 if  (memCHRs ("wds]" , s [1 ]))
4719+                 if  (memCHRs ("wds]" , s [1 ])) { 
46484720                    weight  +=  100 ;  /* \w \d \s => strongly charclass */ 
4649-                     /* khw: Why not \W \D \S \h \v, etc as well? */ 
4650-                 else  if  (seen [(U8 )'\'' ] ||  seen [(U8 )'"' ])
4651-                     weight  +=  1 ;    /* \' => mildly charclass */ 
4721+                     /* khw: \] can't happen, as any ']' is beyond our search. 
4722+                      * Why not \W \D \S \h \v, etc as well?  Should they have 
4723+                      * the same weights as \w \d \s or should all or some be 
4724+                      * in the 'abcfnrtvx' below? */ 
4725+                 } else  if  (seen [(U8 )'\'' ] ||  seen [(U8 )'"' ]) {
4726+                     weight  +=  1 ;
4727+                     /* khw: This is problematic.  Enough so, that I misread 
4728+                      * it, and added a wrong comment about what it does in 
4729+                      * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee.  Note that it 
4730+                      * doesn't look at the current character.  What it 
4731+                      * actually does is: if any quote has been seen in the 
4732+                      * parse, don't do the rest of the else's below, but for 
4733+                      * every subsequent backslashed character encountered 
4734+                      * (except \0 \w \s \d), increment the weight to lean a 
4735+                      * bit more towards being a charclass.  That means that 
4736+                      * every backslash sequence following the first occurrence 
4737+                      * of a quote increments the weight regardless of what the 
4738+                      * sequence is.  Again, \0 \w \d and \s are not controlled 
4739+                      * by this else, so they change the weight by a lot more. 
4740+                      * But what makes them so special that they aren't subject 
4741+                      * to this.  Any why does having a quote change the 
4742+                      * behavior from then on.  And why only backslashed 
4743+                      * sequences get this treatment?  This code has been 
4744+                      * unchanged since this function was added in 1993.  I 
4745+                      * don't get it.  Instead, it does seem to me that it is 
4746+                      * especially unlikely to repeat a quote in a charclass, 
4747+                      * but that having just a single quote is indicative of a 
4748+                      * charclass, and having pairs of quotes is indicative of 
4749+                      * a subscript.  Similarly for things that could indicate 
4750+                      * nesting of braces or parens. */ 
4751+                 }
46524752                else  if  (memCHRs ("abcfnrtvx" , s [1 ]))
46534753                    weight  +=  40 ;   /* \n, etc => charclass */ 
46544754                    /* khw: Why not \e etc as well? */ 
@@ -4657,6 +4757,19 @@ S_intuit_more(pTHX_ char *s, char *e)
46574757                    while  (s [1 ] &&  isDIGIT (s [1 ]))
46584758                        s ++ ;
46594759                }
4760+ 
4761+                 /* khw: There are lots more possible escape sequences.  Some, 
4762+                  * like \A,\z have no special meaning to charclasses, so might 
4763+                  * indicate a subscript, but I don't know what they would be 
4764+                  * doing there either.  Some have been added to the language 
4765+                  * after this code was written, but no one thought to, or 
4766+                  * could wade through this function, to add them.  Things like 
4767+                  * \p{} for properties, \N and \N{}, for example. 
4768+                  * 
4769+                  * It's problematic that \a is treated as plain 'a' for 
4770+                  * purposes of the 'seen' array.  Whatever is matched by these 
4771+                  * backslashed sequences should not be added to 'seen'.  That 
4772+                  * includes the backslash. */ 
46604773            }
46614774            else  /* \ followed by NUL strongly indicates character class */ 
46624775                weight  +=  100 ;
@@ -4702,7 +4815,25 @@ S_intuit_more(pTHX_ char *s, char *e)
47024815                &&  isALPHA (s [1 ]))
47034816            {
47044817                /* Here it's \W (that isn't [$@&] ) followed immediately by two 
4705-                  * alphas in a row.  Accumulate all the consecutive alphas */ 
4818+                  * alphas in a row.  Accumulate all the consecutive alphas. 
4819+                  * 
4820+                  * khw: The code below was changed in 2015 by 
4821+                  * 56f81afc0f2d331537f38e6f12b86a850187cb8a to solve a 
4822+                  * buffer overrun.  Prior to that commit, the code copied all 
4823+                  * the consecutive alphas to a temporary.  The problem was 
4824+                  * that temporary's size could be exceeded, and the temporary 
4825+                  * wasn't even needed (at least by 2015).  The called 
4826+                  * keyword() function doesn't need a copy.  It takes a pointer 
4827+                  * to the first character and a length, hence it can operate 
4828+                  * on the original source text.  It is intended to catch cases 
4829+                  * like $a[ord].  If it does match a keyword, we don't want 
4830+                  * the spelling of that keyword to affect the seen[] array. 
4831+                  * But if it isn't a keyword we do want to fall back to the 
4832+                  * normal behavior.  And the 2015 commit removed that.  It 
4833+                  * absorbs every bareword regardless, defeating the intent of 
4834+                  * the algorithm implementing the heuristics.  That not many 
4835+                  * bugs have surfaced since indicates this whole thing doesn't 
4836+                  * get applied very much */ 
47064837                char  * d  =  s ;
47074838                while  (isALPHA (s [0 ]))
47084839                    s ++ ;
@@ -4712,7 +4843,12 @@ S_intuit_more(pTHX_ char *s, char *e)
47124843                if  (keyword (d , s  -  d , 0 ))
47134844                    weight  -=  150 ;
47144845
4715-                 /* khw: Should those alphas be marked as seen? */ 
4846+                 /* khw: Barewords could also be subroutine calls, and these 
4847+                  * would also indicate a subscript.  Like [green] where 
4848+                  * 'green' has been declared, for example, in 'use constant' 
4849+                  * Or maybe it should just call intuit_method() which checks 
4850+                  * for keyword, subs, and methods. 
4851+                  * */ 
47164852            }
47174853
47184854            /* Consecutive chars like [...12...] and [...ab...] are presumed 
@@ -4727,23 +4863,36 @@ S_intuit_more(pTHX_ char *s, char *e)
47274863            /* But repeating a character inside a character class does nothing, 
47284864             * like [aba], so less likely that someone makes such a class, more 
47294865             * likely that it is a subscript; the more repeats, the less 
4730-              * likely. */ 
4866+              * likely. 
4867+              * 
4868+              * khw: I think this changes the weight too rapidly.  Each time 
4869+              * through the loop compounds the previous times.  Instead, it 
4870+              * would be better to have a separate loop after all the rest that 
4871+              * changes the weight once based on how many times each character 
4872+              * gets repeated */ 
47314873            weight  -=  seen [un_char ];
47324874            break ;
47334875        }   /* End of switch */ 
47344876
47354877        /* khw: 'seen' is declared as a char.  This ++ can cause it to wrap. 
47364878         * This gives different results with compilers for which a plain 'char' 
4737-          * is actually unsigned, versus those where it is signed.  I believe it 
4738-          * is undefined behavior to wrap a 'signed'.  I think it should be 
4739-          * instead declared an unsigned int to make the chances of wrapping 
4740-          * essentially zero. 
4879+          * is actually unsigned, versus those where it is signed.  The C99 
4880+          * standard allows a compiler to raise a signal when a 'signed' char 
4881+          * is incremented outside its permissible range.  I think 'seen' 
4882+          * should be instead declared an unsigned, and a conditional added 
4883+          * to prevent wrapping. 
47414884         * 
47424885         * And I believe that extra backslashes are different from other 
4743-          * repeated characters. */ 
4886+          * repeated characters.  There may be others, like I have mentioned 
4887+          * quotes and paired delimiters  */ 
47444888        seen [un_char ]++ ;
47454889    }   /* End of loop through each character of the construct */ 
47464890
4891+     /* khw: People on #irc have suggested things that I think boil down to: 
4892+      * under 'use 5.43.x', output a warning like existing warnings for 
4893+      * similar situations "Ambiguous use of [], resolved as ..."  Perhaps 
4894+      * suppress the message if all (or maybe almost all) the evidence points 
4895+      * to the same outcome.  This would involve two weight variables */ 
47474896    if  (weight  >= 0 )	/* probably a character class */ 
47484897        return  FALSE;
47494898
0 commit comments