Skip to content

Commit 237899f

Browse files
authored
Support a new experimental feature called scan substring (#445)
Co-authored-by: Zoltan Herczeg <[email protected]>
1 parent 5bff405 commit 237899f

File tree

12 files changed

+461
-88
lines changed

12 files changed

+461
-88
lines changed

ChangeLog

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ with JIT was correct.
7575
12. Add a new error code (PCRE2_ERROR_JIT_UNSUPPORTED) which is yielded
7676
for unsupported jit features.
7777

78+
13. Add a new experimental feature called scan substring. This feature
79+
is a new type of assertion which matches the content of a capture group
80+
against a subpattern.
81+
7882

7983
Version 10.44 07-June-2024
8084
--------------------------

src/pcre2_compile.c

Lines changed: 137 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -231,63 +231,65 @@ code (meta_extra_lengths, just below) must be updated to remain in step. */
231231
#define META_COND_RNAME 0x80130000u /* (?(R&name)... */
232232
#define META_COND_RNUMBER 0x80140000u /* (?(Rdigits)... */
233233
#define META_COND_VERSION 0x80150000u /* (?(VERSION<op>x.y)... */
234-
#define META_DOLLAR 0x80160000u /* $ metacharacter */
235-
#define META_DOT 0x80170000u /* . metacharacter */
236-
#define META_ESCAPE 0x80180000u /* \d and friends */
237-
#define META_KET 0x80190000u /* closing parenthesis */
238-
#define META_NOCAPTURE 0x801a0000u /* no capture parens */
239-
#define META_OPTIONS 0x801b0000u /* (?i) and friends */
240-
#define META_POSIX 0x801c0000u /* POSIX class item */
241-
#define META_POSIX_NEG 0x801d0000u /* negative POSIX class item */
242-
#define META_RANGE_ESCAPED 0x801e0000u /* range with at least one escape */
243-
#define META_RANGE_LITERAL 0x801f0000u /* range defined literally */
244-
#define META_RECURSE 0x80200000u /* Recursion */
245-
#define META_RECURSE_BYNAME 0x80210000u /* (?&name) */
246-
#define META_SCRIPT_RUN 0x80220000u /* (*script_run:...) */
234+
#define META_SCS_NAME 0x80160000u /* (*scan_substring:(<name>)... */
235+
#define META_SCS_NUMBER 0x80170000u /* (*scan_substring:(digits)... */
236+
#define META_DOLLAR 0x80180000u /* $ metacharacter */
237+
#define META_DOT 0x80190000u /* . metacharacter */
238+
#define META_ESCAPE 0x801a0000u /* \d and friends */
239+
#define META_KET 0x801b0000u /* closing parenthesis */
240+
#define META_NOCAPTURE 0x801c0000u /* no capture parens */
241+
#define META_OPTIONS 0x801d0000u /* (?i) and friends */
242+
#define META_POSIX 0x801e0000u /* POSIX class item */
243+
#define META_POSIX_NEG 0x801f0000u /* negative POSIX class item */
244+
#define META_RANGE_ESCAPED 0x80200000u /* range with at least one escape */
245+
#define META_RANGE_LITERAL 0x80210000u /* range defined literally */
246+
#define META_RECURSE 0x80220000u /* Recursion */
247+
#define META_RECURSE_BYNAME 0x80230000u /* (?&name) */
248+
#define META_SCRIPT_RUN 0x80240000u /* (*script_run:...) */
247249

248250
/* These must be kept together to make it easy to check that an assertion
249251
is present where expected in a conditional group. */
250252

251-
#define META_LOOKAHEAD 0x80230000u /* (?= */
252-
#define META_LOOKAHEADNOT 0x80240000u /* (?! */
253-
#define META_LOOKBEHIND 0x80250000u /* (?<= */
254-
#define META_LOOKBEHINDNOT 0x80260000u /* (?<! */
253+
#define META_LOOKAHEAD 0x80250000u /* (?= */
254+
#define META_LOOKAHEADNOT 0x80260000u /* (?! */
255+
#define META_LOOKBEHIND 0x80270000u /* (?<= */
256+
#define META_LOOKBEHINDNOT 0x80280000u /* (?<! */
255257

256258
/* These cannot be conditions */
257259

258-
#define META_LOOKAHEAD_NA 0x80270000u /* (*napla: */
259-
#define META_LOOKBEHIND_NA 0x80280000u /* (*naplb: */
260+
#define META_LOOKAHEAD_NA 0x80290000u /* (*napla: */
261+
#define META_LOOKBEHIND_NA 0x802a0000u /* (*naplb: */
260262

261263
/* These must be kept in this order, with consecutive values, and the _ARG
262264
versions of COMMIT, PRUNE, SKIP, and THEN immediately after their non-argument
263265
versions. */
264266

265-
#define META_MARK 0x80290000u /* (*MARK) */
266-
#define META_ACCEPT 0x802a0000u /* (*ACCEPT) */
267-
#define META_FAIL 0x802b0000u /* (*FAIL) */
268-
#define META_COMMIT 0x802c0000u /* These */
269-
#define META_COMMIT_ARG 0x802d0000u /* pairs */
270-
#define META_PRUNE 0x802e0000u /* must */
271-
#define META_PRUNE_ARG 0x802f0000u /* be */
272-
#define META_SKIP 0x80300000u /* kept */
273-
#define META_SKIP_ARG 0x80310000u /* in */
274-
#define META_THEN 0x80320000u /* this */
275-
#define META_THEN_ARG 0x80330000u /* order */
267+
#define META_MARK 0x802b0000u /* (*MARK) */
268+
#define META_ACCEPT 0x802c0000u /* (*ACCEPT) */
269+
#define META_FAIL 0x802d0000u /* (*FAIL) */
270+
#define META_COMMIT 0x802e0000u /* These */
271+
#define META_COMMIT_ARG 0x802f0000u /* pairs */
272+
#define META_PRUNE 0x80300000u /* must */
273+
#define META_PRUNE_ARG 0x80310000u /* be */
274+
#define META_SKIP 0x80320000u /* kept */
275+
#define META_SKIP_ARG 0x80330000u /* in */
276+
#define META_THEN 0x80340000u /* this */
277+
#define META_THEN_ARG 0x80350000u /* order */
276278

277279
/* These must be kept in groups of adjacent 3 values, and all together. */
278280

279-
#define META_ASTERISK 0x80340000u /* * */
280-
#define META_ASTERISK_PLUS 0x80350000u /* *+ */
281-
#define META_ASTERISK_QUERY 0x80360000u /* *? */
282-
#define META_PLUS 0x80370000u /* + */
283-
#define META_PLUS_PLUS 0x80380000u /* ++ */
284-
#define META_PLUS_QUERY 0x80390000u /* +? */
285-
#define META_QUERY 0x803a0000u /* ? */
286-
#define META_QUERY_PLUS 0x803b0000u /* ?+ */
287-
#define META_QUERY_QUERY 0x803c0000u /* ?? */
288-
#define META_MINMAX 0x803d0000u /* {n,m} repeat */
289-
#define META_MINMAX_PLUS 0x803e0000u /* {n,m}+ repeat */
290-
#define META_MINMAX_QUERY 0x803f0000u /* {n,m}? repeat */
281+
#define META_ASTERISK 0x80360000u /* * */
282+
#define META_ASTERISK_PLUS 0x80370000u /* *+ */
283+
#define META_ASTERISK_QUERY 0x80380000u /* *? */
284+
#define META_PLUS 0x80390000u /* + */
285+
#define META_PLUS_PLUS 0x803a0000u /* ++ */
286+
#define META_PLUS_QUERY 0x803b0000u /* +? */
287+
#define META_QUERY 0x803c0000u /* ? */
288+
#define META_QUERY_PLUS 0x803d0000u /* ?+ */
289+
#define META_QUERY_QUERY 0x803e0000u /* ?? */
290+
#define META_MINMAX 0x803f0000u /* {n,m} repeat */
291+
#define META_MINMAX_PLUS 0x80400000u /* {n,m}+ repeat */
292+
#define META_MINMAX_QUERY 0x80410000u /* {n,m}? repeat */
291293

292294
#define META_FIRST_QUANTIFIER META_ASTERISK
293295
#define META_LAST_QUANTIFIER META_MINMAX_QUERY
@@ -326,6 +328,8 @@ static unsigned char meta_extra_lengths[] = {
326328
1+SIZEOFFSET, /* META_COND_RNAME */
327329
1+SIZEOFFSET, /* META_COND_RNUMBER */
328330
3, /* META_COND_VERSION */
331+
1+SIZEOFFSET, /* META_SCS_NAME */
332+
1+SIZEOFFSET, /* META_SCS_NUMBER */
329333
0, /* META_DOLLAR */
330334
0, /* META_DOT */
331335
0, /* META_ESCAPE - one more for ESC_P and ESC_p */
@@ -656,6 +660,8 @@ static const char alasnames[] =
656660
STRING_non_atomic_positive_lookbehind0
657661
STRING_negative_lookahead0
658662
STRING_negative_lookbehind0
663+
STRING_scs0
664+
STRING_scan_substring0
659665
STRING_atomic0
660666
STRING_sr0
661667
STRING_asr0
@@ -675,6 +681,8 @@ static const alasitem alasmeta[] = {
675681
{ 30, META_LOOKBEHIND_NA },
676682
{ 18, META_LOOKAHEADNOT },
677683
{ 19, META_LOOKBEHINDNOT },
684+
{ 3, META_SCS_NUMBER }, /* placeholder, updated later */
685+
{ 14, META_SCS_NUMBER }, /* placeholder, updated later */
678686
{ 6, META_ATOMIC },
679687
{ 2, META_SCRIPT_RUN }, /* sr = script run */
680688
{ 3, META_ATOMIC_SCRIPT_RUN }, /* asr = atomic script run */
@@ -1152,6 +1160,19 @@ for (;;)
11521160
fprintf(stderr, "%zd", offset);
11531161
break;
11541162

1163+
case META_SCS_NAME:
1164+
fprintf(stderr, "META (*scan_substring:(<name>) length=%d offset=", *pptr++);
1165+
GETOFFSET(offset, pptr);
1166+
fprintf(stderr, "%zd", offset);
1167+
break;
1168+
1169+
case META_SCS_NUMBER:
1170+
fprintf(stderr, "META_SCS_NUMBER %d offset=", pptr[SIZEOFFSET]);
1171+
GETOFFSET(offset, pptr);
1172+
fprintf(stderr, "%zd", offset);
1173+
pptr++;
1174+
break;
1175+
11551176
case META_MARK:
11561177
fprintf(stderr, "META (*MARK:");
11571178
goto SHOWARG;
@@ -4053,6 +4074,67 @@ while (ptr < ptrend)
40534074
case META_LOOKAHEADNOT:
40544075
goto NEGATIVE_LOOK_AHEAD;
40554076

4077+
case META_SCS_NUMBER:
4078+
nest_depth++;
4079+
4080+
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4081+
4082+
if (*ptr != CHAR_LEFT_PARENTHESIS)
4083+
{
4084+
errorcode = ERR15;
4085+
goto FAILED;
4086+
}
4087+
4088+
ptr++;
4089+
4090+
/* Handle (*scan_substring:([+-]number)... */
4091+
if (read_number(&ptr, ptrend, cb->bracount, MAX_GROUP_NUMBER, ERR61,
4092+
&i, &errorcode))
4093+
{
4094+
if (i <= 0)
4095+
{
4096+
errorcode = ERR15;
4097+
goto FAILED;
4098+
}
4099+
*parsed_pattern++ = META_SCS_NUMBER;
4100+
offset = (PCRE2_SIZE)(ptr - cb->start_pattern - 2);
4101+
PUTOFFSET(offset, parsed_pattern);
4102+
*parsed_pattern++ = i;
4103+
}
4104+
else if (errorcode != 0) goto FAILED; /* Number too big */
4105+
else
4106+
{
4107+
if (ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4108+
4109+
/* Handle (*scan_substring:('name') or (*scan_substring:(<name>) */
4110+
if (*ptr == CHAR_LESS_THAN_SIGN)
4111+
terminator = CHAR_GREATER_THAN_SIGN;
4112+
else if (*ptr == CHAR_APOSTROPHE)
4113+
terminator = CHAR_APOSTROPHE;
4114+
else
4115+
{
4116+
errorcode = ERR15;
4117+
goto FAILED;
4118+
}
4119+
4120+
if (!read_name(&ptr, ptrend, utf, terminator, &offset, &name,
4121+
&namelen, &errorcode, cb)) goto FAILED;
4122+
4123+
*parsed_pattern++ = META_SCS_NAME;
4124+
*parsed_pattern++ = namelen;
4125+
PUTOFFSET(offset, parsed_pattern);
4126+
}
4127+
4128+
if (++ptr >= ptrend) goto UNCLOSED_PARENTHESIS;
4129+
4130+
if (*ptr != CHAR_RIGHT_PARENTHESIS)
4131+
{
4132+
errorcode = ERR24;
4133+
break;
4134+
}
4135+
ptr++;
4136+
break;
4137+
40564138
case META_LOOKBEHIND:
40574139
case META_LOOKBEHINDNOT:
40584140
case META_LOOKBEHIND_NA:
@@ -6642,7 +6724,8 @@ for (;; pptr++)
66426724
case META_COND_RNUMBER: /* (?(Rdigits) */
66436725
case META_COND_NAME: /* (?(name) or (?('name') or (?(<name>) */
66446726
case META_COND_RNAME: /* (?(R&name) - test for recursion */
6645-
bravalue = OP_COND;
6727+
case META_SCS_NAME: /* (*scan_substring:('name') or (*scan_substring:(<name>) */
6728+
bravalue = meta == META_SCS_NAME ? OP_ASSERT_SCS : OP_COND;
66466729
{
66476730
int count, index;
66486731
unsigned int i;
@@ -6736,7 +6819,9 @@ for (;; pptr++)
67366819
PUT2(code, 2+LINK_SIZE, index);
67376820
PUT2(code, 2+LINK_SIZE+IMM2_SIZE, count);
67386821
}
6739-
goto GROUP_PROCESS_NOTE_EMPTY;
6822+
if (meta != META_SCS_NAME) goto GROUP_PROCESS_NOTE_EMPTY;
6823+
cb->assert_depth += 1;
6824+
goto GROUP_PROCESS;
67406825

67416826
/* The DEFINE condition is always false. Its internal groups may never
67426827
be called, so matched_char must remain false, hence the jump to
@@ -6752,7 +6837,8 @@ for (;; pptr++)
67526837
/* Conditional test of a group's being set. */
67536838

67546839
case META_COND_NUMBER:
6755-
bravalue = OP_COND;
6840+
case META_SCS_NUMBER:
6841+
bravalue = meta == META_SCS_NUMBER ? OP_ASSERT_SCS : OP_COND;
67566842
GETPLUSOFFSET(offset, pptr);
67576843
groupnumber = *(++pptr);
67586844
if (groupnumber > cb->bracount)
@@ -6762,11 +6848,14 @@ for (;; pptr++)
67626848
return 0;
67636849
}
67646850
if (groupnumber > cb->top_backref) cb->top_backref = groupnumber;
6765-
offset -= 2; /* Point at initial ( for too many branches error */
6851+
/* Point at initial ( for too many branches error */
6852+
if (meta != META_SCS_NUMBER) offset -= 2;
67666853
code[1+LINK_SIZE] = OP_CREF;
67676854
skipunits = 1+IMM2_SIZE;
67686855
PUT2(code, 2+LINK_SIZE, groupnumber);
6769-
goto GROUP_PROCESS_NOTE_EMPTY;
6856+
if (meta != META_SCS_NUMBER) goto GROUP_PROCESS_NOTE_EMPTY;
6857+
cb->assert_depth += 1;
6858+
goto GROUP_PROCESS;
67706859

67716860
/* Test for the PCRE2 version. */
67726861

@@ -6900,7 +6989,7 @@ for (;; pptr++)
69006989

69016990
/* If we've just compiled an assertion, pop the assert depth. */
69026991

6903-
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NA)
6992+
if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERT_SCS)
69046993
cb->assert_depth -= 1;
69056994

69066995
/* At the end of compiling, code is still pointing to the start of the

src/pcre2_dfa_match.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ static const uint8_t coptable[] = {
175175
0, /* Assert behind not */
176176
0, /* NA assert */
177177
0, /* NA assert behind */
178+
0, /* Assert scan substring */
178179
0, /* ONCE */
179180
0, /* SCRIPT_RUN */
180181
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
@@ -253,6 +254,7 @@ static const uint8_t poptable[] = {
253254
0, /* Assert behind not */
254255
0, /* NA assert */
255256
0, /* NA assert behind */
257+
0, /* Assert scan substring */
256258
0, /* ONCE */
257259
0, /* SCRIPT_RUN */
258260
0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */

0 commit comments

Comments
 (0)