@@ -4495,7 +4495,14 @@ S_intuit_more(pTHX_ char *s, char *e)
44954495 * 'scariness', and lack of comments. khw has gone through and done some
44964496 * cleanup, while finding various instances of problematic behavior.
44974497 * Rather than change this base-level function immediately, khw has added
4498- * commentary to those areas. */
4498+ * commentary to those areas.
4499+ *
4500+ * khw: $0 in square brackets is never going to mean the expansion of $0.
4501+ * How could that help in calculating a subscript? And one would never
4502+ * want to match any one of the characters in this call to the program.
4503+ * No, $0 is going to want to mean this a charclass that matches dollar
4504+ * and digit0. But, code outside this function chooses the value of the
4505+ * variable $0. I think this should be special cased. */
44994506
45004507 /* If recursed within brackets, there is more to the expression */
45014508 if (PL_lex_brackets )
@@ -4546,19 +4553,47 @@ S_intuit_more(pTHX_ char *s, char *e)
45464553 /* Here is '[': maybe we have a character class. Examine the guts */
45474554 s ++ ;
45484555
4549- /* '^' implies a character class; An empty '[]' isn't legal, but it does
4550- * mean there isn't more to come */
4556+ /* '^' implies a character class; An empty '[]' isn't legal, and it means
4557+ * there isn't more to come */
45514558 if (s [0 ] == ']' || s [0 ] == '^' )
45524559 return FALSE;
45534560
4554- /* Find matching ']'. khw: This means any s[1] below is guaranteed to
4555- * exist */
4561+ /* khw: If the context of this call is $foo[...], we may be able to avoid
4562+ * the heuristics below. The possibilities are:
4563+ * strict @foo $foo
4564+ * vars? exists exists
4565+ * y n n This is an error; return false now
4566+ * y n y must be a a charclass
4567+ * y y n must be a a subscript
4568+ * y y y ambiguous; do heuristics below
4569+ * n n n ambiguous; do heuristics below
4570+ * n n y ambiguous; do heuristics below, but I
4571+ * wonder if the initial bias should be a
4572+ * little towards charclass
4573+ * n y n ambiguous; do heuristics below, but I
4574+ * wonder if the initial bias should be a
4575+ * little towards subscript
4576+ * n y y ambiguous; do heuristics below
4577+ */
4578+
4579+ /* Find matching ']'. khw: Actually it finds the next ']' and assumes it
4580+ * matches the '['. In order to account for the possibility of the ']'
4581+ * being inside the scope of \Q or preceded by an even number of
4582+ * backslashes, this should be rewritten */
45564583 const char * const send = (char * ) memchr (s , ']' , e - s );
45574584 if (! send ) /* has to be an expression */
45584585 return TRUE;
45594586
4587+ /* Below here, the heuristics start. One idea from alh is, given 'use
4588+ * 5.43.x', that for all digits, that if we have to resort to heuristics,
4589+ * we instead raise an error with an explanation of how to make it
4590+ * unambiguous: ${foo}[123] */
4591+
45604592 /* If the construct consists entirely of one or two digits, call it a
4561- * subscript. */
4593+ * subscript.
4594+ *
4595+ * khw: No one writes 03 to mean 3. Any string of digits beginning with
4596+ * '0' is likely to be a charclass, including length 2 ones. */
45624597 if (isDIGIT (s [0 ]) && send - s <= 2 && (send - s == 1 || (isDIGIT (s [1 ])))) {
45634598 return TRUE;
45644599 }
@@ -4593,6 +4628,13 @@ S_intuit_more(pTHX_ char *s, char *e)
45934628 Zero (seen , 256 , char );
45944629
45954630 /* Examine each character in the construct */
4631+ /* That this knows nothing of UTF-8 can lead to opposite results if the
4632+ * text is encoded in UTF-8 or not; another relic of the Unicode Bug.
4633+ * Suppose a string consistis of various un-repeated code points between
4634+ * 0x128 and 0x255. When encoded in UTF-8 their start bytes will all be
4635+ * \xC2 or \xC3. The heuristics below will count those as repeated bytes,
4636+ * and thus lean more towards this being a character class than when not
4637+ * in UTF-8. */
45964638 bool first_time = true;
45974639 for (; s < send ; s ++ , first_time = false) {
45984640 unsigned char prev_un_char = un_char ;
@@ -4608,21 +4650,73 @@ S_intuit_more(pTHX_ char *s, char *e)
46084650
46094651 /* Following one of these characters, we look to see if there is an
46104652 * identifier already found in the program by that name. If so,
4611- * strongly suspect this isn't a character class */
4653+ * strongly suspect this isn't a character class
4654+ *
4655+ * khw: But under 'strict' if the identifier doesn't exist, it
4656+ * has to be a charclass
4657+ *
4658+ * khw: [...$0...] is never going to mean the name of the program;
4659+ * it's always going to be a charclass. $1 could mean either, but
4660+ * as the number increases, the more likely to be a charclass, as
4661+ * the chance of there being a pattern with that many capture
4662+ * groups goes rapidly down.
4663+ *
4664+ * khw: Using \w here misses the possibility of lots of other
4665+ * syntaxes of variables, like $::foo or ${foo}, that scan_ident
4666+ * looks for.
4667+ *
4668+ */
46124669 if (isWORDCHAR_lazy_if_safe (s + 1 , PL_bufend , UTF )) {
46134670 Size_t len ;
4671+
4672+ /* khw: where did the magic number 4 come from?. This buffer
4673+ * was 4 times as large as tokenbuf in 1997, and had not
4674+ * changed since the code was first added */
46144675 char tmpbuf [ C_ARRAY_LENGTH (PL_tokenbuf ) * 4 ];
4676+
4677+ /* khw: scan_ident shouldn't be used as-is. It has side
4678+ * effects and can end up calling this function recursively.
4679+ *
4680+ * khw: If what follows can't be an identifier, say it is too
4681+ * long or is $001, then it must be a charclass */
46154682 scan_ident (s , tmpbuf , C_ARRAY_END (tmpbuf ), FALSE);
46164683 len = strlen (tmpbuf );
4684+
4685+ /* khw: This only looks at global variables; lexicals came
4686+ * later, and this hasn't been updated. Ouch!! */
46174687 if ( len > 1
46184688 && gv_fetchpvn_flags (tmpbuf ,
46194689 len ,
46204690 UTF ? SVf_UTF8 : 0 ,
46214691 SVt_PV ))
4692+ {
46224693 weight -= 100 ;
4623- else /* Not a multi-char identifier already known in the
4624- program; is somewhat likely to be a subscript */
4694+
4695+ /* khw: Below we keep track of repeated characters; People
4696+ * rarely say qr/[aba]/, as the second a is pointless.
4697+ * (Some do it though as a mnemonic that is meaningful to
4698+ * them.) But generally, repeated characters makes things
4699+ * more likely to be a charclass. But we have here that
4700+ * this an identifier so likely a subscript. Its spelling
4701+ * should be irrelevant to the repeated characters test.
4702+ * So, we should advance past it. Suppose it is a hash
4703+ * element, like $subscripts{$which}. We should advance
4704+ * past the braces and key */
4705+ }
4706+ else {
4707+ /* Not a multi-char identifier already known in the
4708+ * program; is somewhat likely to be a subscript.
4709+ *
4710+ * khw: Our test suite contains several constructs like
4711+ * [$A-Z]. Excluding length 1 identifiers in the
4712+ * conditional above means such are much less likely to be
4713+ * mistaken for subscripts. I would argue that if the next
4714+ * character is a '-' followed by an alpha, that would make
4715+ * it much more likely to be a charclass. It would only
4716+ * make sense to be an expression if that alpha string is a
4717+ * bareword with meaning; something like [$A-ord] */
46254718 weight -= 10 ;
4719+ }
46264720 }
46274721 else if ( s [0 ] == '$'
46284722 && s [1 ]
@@ -4639,13 +4733,51 @@ S_intuit_more(pTHX_ char *s, char *e)
46394733 }
46404734 break ;
46414735
4736+ /* khw: [:blank:] strongly indicates a charclass */
4737+ /* khw: Z-A definitely subscript
4738+ * Z-Z likely subscript
4739+ * "x - z" with blanks very likely subscript
4740+ * \N without { must be subscript
4741+ * \R must be subscript
4742+ * \? must be subscript for things like \d, but not \a.
4743+ */
4744+
4745+
46424746 case '\\' :
46434747 if (s [1 ]) {
4644- if (memCHRs ("wds]" , s [1 ]))
4748+ if (memCHRs ("wds]" , s [1 ])) {
46454749 weight += 100 ; /* \w \d \s => strongly charclass */
4646- /* khw: Why not \W \D \S \h \v, etc as well? */
4647- else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ])
4648- weight += 1 ; /* \' => mildly charclass */
4750+ /* khw: \] can't happen, as any ']' is beyond our search.
4751+ * Why not \W \D \S \h \v, etc as well? Should they have
4752+ * the same weights as \w \d \s or should all or some be
4753+ * in the 'abcfnrtvx' below? */
4754+ } else if (seen [(U8 )'\'' ] || seen [(U8 )'"' ]) {
4755+ weight += 1 ;
4756+ /* khw: This is problematic. Enough so, that I misread
4757+ * it, and added a wrong comment about what it does in
4758+ * 57ae1f3a8e669082e3d5ec6a8cdffbdc39d87bee. Note that it
4759+ * doesn't look at the current character. What it
4760+ * actually does is: if any quote has been seen in the
4761+ * parse, don't do the rest of the else's below, but for
4762+ * every subsequent backslashed character encountered
4763+ * (except \0 \w \s \d), increment the weight to lean a
4764+ * bit more towards being a charclass. That means that
4765+ * every backslash sequence following the first occurrence
4766+ * of a quote increments the weight regardless of what the
4767+ * sequence is. Again, \0 \w \d and \s are not controlled
4768+ * by this else, so they change the weight by a lot more.
4769+ * But what makes them so special that they aren't subject
4770+ * to this. Any why does having a quote change the
4771+ * behavior from then on. And why only backslashed
4772+ * sequences get this treatment? This code has been
4773+ * unchanged since this function was added in 1993. I
4774+ * don't get it. Instead, it does seem to me that it is
4775+ * especially unlikely to repeat a quote in a charclass,
4776+ * but that having just a single quote is indicative of a
4777+ * charclass, and having pairs of quotes is indicative of
4778+ * a subscript. Similarly for things that could indicate
4779+ * nesting of braces or parens. */
4780+ }
46494781 else if (memCHRs ("abcfnrtvx" , s [1 ]))
46504782 weight += 40 ; /* \n, etc => charclass */
46514783 /* khw: Why not \e etc as well? */
@@ -4654,6 +4786,19 @@ S_intuit_more(pTHX_ char *s, char *e)
46544786 while (s [1 ] && isDIGIT (s [1 ]))
46554787 s ++ ;
46564788 }
4789+
4790+ /* khw: There are lots more possible escape sequences. Some,
4791+ * like \A,\z have no special meaning to charclasses, so might
4792+ * indicate a subscript, but I don't know what they would be
4793+ * doing there either. Some have been added to the language
4794+ * after this code was written, but no one thought to, or
4795+ * could wade through this function, to add them. Things like
4796+ * \p{} for properties, \N and \N{}, for example.
4797+ *
4798+ * It's problematic that \a is treated as plain 'a' for
4799+ * purposes of the 'seen' array. Whatever is matched by these
4800+ * backslashed sequences should not be added to 'seen'. That
4801+ * includes the backslash. */
46574802 }
46584803 else /* \ followed by NUL strongly indicates character class */
46594804 weight += 100 ;
@@ -4699,7 +4844,25 @@ S_intuit_more(pTHX_ char *s, char *e)
46994844 && isALPHA (s [1 ]))
47004845 {
47014846 /* Here it's \W (that isn't [$@&] ) followed immediately by two
4702- * alphas in a row. Accumulate all the consecutive alphas */
4847+ * alphas in a row. Accumulate all the consecutive alphas.
4848+ *
4849+ * khw: The code below was changed in 2015 by
4850+ * 56f81afc0f2d331537f38e6f12b86a850187cb8a to solve a
4851+ * buffer overrun. Prior to that commit, the code copied all
4852+ * the consecutive alphas to a temporary. The problem was
4853+ * that temporary's size could be exceeded, and the temporary
4854+ * wasn't even needed (at least by 2015). The called
4855+ * keyword() function doesn't need a copy. It takes a pointer
4856+ * to the first character and a length, hence it can operate
4857+ * on the original source text. It is intended to catch cases
4858+ * like $a[ord]. If it does match a keyword, we don't want
4859+ * the spelling of that keyword to affect the seen[] array.
4860+ * But if it isn't a keyword we do want to fall back to the
4861+ * normal behavior. And the 2015 commit removed that. It
4862+ * absorbs every bareword regardless, defeating the intent of
4863+ * the algorithm implementing the heuristics. That not many
4864+ * bugs have surfaced since indicates this whole thing doesn't
4865+ * get applied very much */
47034866 char * d = s ;
47044867 while (isALPHA (s [0 ]))
47054868 s ++ ;
@@ -4709,7 +4872,12 @@ S_intuit_more(pTHX_ char *s, char *e)
47094872 if (keyword (d , s - d , 0 ))
47104873 weight -= 150 ;
47114874
4712- /* khw: Should those alphas be marked as seen? */
4875+ /* khw: Barewords could also be subroutine calls, and these
4876+ * would also indicate a subscript. Like [green] where
4877+ * 'green' has been declared, for example, in 'use constant'
4878+ * Or maybe it should just call intuit_method() which checks
4879+ * for keyword, subs, and methods.
4880+ * */
47134881 }
47144882
47154883 /* Consecutive chars like [...12...] and [...ab...] are presumed
@@ -4724,23 +4892,36 @@ S_intuit_more(pTHX_ char *s, char *e)
47244892 /* But repeating a character inside a character class does nothing,
47254893 * like [aba], so less likely that someone makes such a class, more
47264894 * likely that it is a subscript; the more repeats, the less
4727- * likely. */
4895+ * likely.
4896+ *
4897+ * khw: I think this changes the weight too rapidly. Each time
4898+ * through the loop compounds the previous times. Instead, it
4899+ * would be better to have a separate loop after all the rest that
4900+ * changes the weight once based on how many times each character
4901+ * gets repeated */
47284902 weight -= seen [un_char ];
47294903 break ;
47304904 } /* End of switch */
47314905
47324906 /* khw: 'seen' is declared as a char. This ++ can cause it to wrap.
47334907 * This gives different results with compilers for which a plain 'char'
4734- * is actually unsigned, versus those where it is signed. I believe it
4735- * is undefined behavior to wrap a 'signed'. I think it should be
4736- * instead declared an unsigned int to make the chances of wrapping
4737- * essentially zero.
4908+ * is actually unsigned, versus those where it is signed. The C99
4909+ * standard allows a compiler to raise a signal when a 'signed' char
4910+ * is incremented outside its permissible range. I think 'seen'
4911+ * should be instead declared an unsigned, and a conditional added
4912+ * to prevent wrapping.
47384913 *
47394914 * And I believe that extra backslashes are different from other
4740- * repeated characters. */
4915+ * repeated characters. There may be others, like I have mentioned
4916+ * quotes and paired delimiters */
47414917 seen [un_char ]++ ;
47424918 } /* End of loop through each character of the construct */
47434919
4920+ /* khw: People on #irc have suggested things that I think boil down to:
4921+ * under 'use 5.43.x', output a warning like existing warnings for
4922+ * similar situations "Ambiguous use of [], resolved as ..." Perhaps
4923+ * suppress the message if all (or maybe almost all) the evidence points
4924+ * to the same outcome. This would involve two weight variables */
47444925 if (weight >= 0 ) /* probably a character class */
47454926 return FALSE;
47464927
0 commit comments