@@ -142,6 +142,8 @@ struct RExC_state_t {
142
142
U32 seen;
143
143
SSize_t size; /* Number of regnode equivalents in
144
144
pattern */
145
+ Size_t sets_depth; /* Counts recursion depth of already-
146
+ compiled regex set patterns */
145
147
146
148
/* position beyond 'precomp' of the warning message furthest away from
147
149
* 'precomp'. During the parse, no warnings are raised for any problems
@@ -266,6 +268,7 @@ struct RExC_state_t {
266
268
#define RExC_paren_names (pRExC_state->paren_names)
267
269
#define RExC_recurse (pRExC_state->recurse)
268
270
#define RExC_recurse_count (pRExC_state->recurse_count)
271
+ #define RExC_sets_depth (pRExC_state->sets_depth)
269
272
#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
270
273
#define RExC_study_chunk_recursed_bytes \
271
274
(pRExC_state->study_chunk_recursed_bytes)
@@ -6421,6 +6424,11 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
6421
6424
if (trie->jump) /* no more substrings -- for now /grr*/
6422
6425
flags &= ~SCF_DO_SUBSTR;
6423
6426
}
6427
+ else if (OP(scan) == REGEX_SET) {
6428
+ Perl_croak(aTHX_ "panic: %s regnode should be resolved"
6429
+ " before optimization", reg_name[REGEX_SET]);
6430
+ }
6431
+
6424
6432
#endif /* old or new */
6425
6433
#endif /* TRIE_STUDY_OPT */
6426
6434
@@ -7670,6 +7678,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
7670
7678
RExC_study_chunk_recursed = NULL;
7671
7679
RExC_study_chunk_recursed_bytes= 0;
7672
7680
RExC_recurse_count = 0;
7681
+ RExC_sets_depth = 0;
7673
7682
pRExC_state->code_index = 0;
7674
7683
7675
7684
/* Initialize the string in the compiled pattern. This is so that there is
@@ -16229,6 +16238,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16229
16238
&& UCHARAT(RExC_parse + 1) == '?'
16230
16239
&& UCHARAT(RExC_parse + 2) == '^')
16231
16240
{
16241
+ const regnode_offset orig_emit = RExC_emit;
16242
+ SV * resultant_invlist;
16243
+
16232
16244
/* If is a '(?^', could be an embedded '(?^flags:(?[...])'.
16233
16245
* This happens when we have some thing like
16234
16246
*
@@ -16238,62 +16250,33 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16238
16250
*
16239
16251
* Here we would be handling the interpolated
16240
16252
* '$thai_or_lao'. We handle this by a recursive call to
16241
- * ourselves which returns the inversion list the
16242
- * interpolated expression evaluates to. We use the flags
16243
- * from the interpolated pattern. */
16244
- U32 save_flags = RExC_flags;
16245
- const char * save_parse;
16246
-
16247
- RExC_parse += 2; /* Skip past the '(?' */
16248
- save_parse = RExC_parse;
16249
-
16250
- /* Parse the flags for the '(?'. We already know the first
16251
- * flag to parse is a '^' */
16252
- parse_lparen_question_flags(pRExC_state);
16253
-
16254
- if ( RExC_parse >= RExC_end - 4
16255
- || UCHARAT(RExC_parse) != ':'
16256
- || UCHARAT(++RExC_parse) != '('
16257
- || UCHARAT(++RExC_parse) != '?'
16258
- || UCHARAT(++RExC_parse) != '[')
16259
- {
16253
+ * reg which returns the inversion list the
16254
+ * interpolated expression evaluates to. Actually, the
16255
+ * return is a special regnode containing a pointer to that
16256
+ * inversion list. If the return isn't that regnode alone,
16257
+ * we know that this wasn't such an interpolation, which is
16258
+ * an error: we need to get a single inversion list back
16259
+ * from the recursion */
16260
16260
16261
- /* In combination with the above, this moves the
16262
- * pointer to the point just after the first erroneous
16263
- * character. */
16264
- if (RExC_parse >= RExC_end - 4) {
16265
- RExC_parse = RExC_end;
16266
- }
16267
- else if (RExC_parse != save_parse) {
16268
- RExC_parse += (UTF)
16269
- ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
16270
- : 1;
16271
- }
16272
- vFAIL("Expecting '(?flags:(?[...'");
16273
- }
16274
-
16275
- /* Recurse, with the meat of the embedded expression */
16276
16261
RExC_parse++;
16277
- if (! handle_regex_sets(pRExC_state, &current, flagp,
16278
- depth+1, oregcomp_parse))
16279
- {
16280
- RETURN_FAIL_ON_RESTART(*flagp, flagp);
16281
- }
16262
+ RExC_sets_depth++;
16282
16263
16283
- /* Here, 'current' contains the embedded expression's
16284
- * inversion list, and RExC_parse points to the trailing
16285
- * ']'; the next character should be the ')' */
16286
- RExC_parse++;
16287
- if (UCHARAT(RExC_parse) != ')')
16288
- vFAIL("Expecting close paren for nested extended charclass");
16264
+ node = reg(pRExC_state, 2, flagp, depth+1);
16265
+ RETURN_FAIL_ON_RESTART(*flagp, flagp);
16289
16266
16290
- /* Then the ')' matching the original '(' handled by this
16291
- * case: statement */
16292
- RExC_parse++;
16293
- if (UCHARAT(RExC_parse) != ')')
16294
- vFAIL("Expecting close paren for wrapper for nested extended charclass");
16267
+ if ( OP(REGNODE_p(node)) != REGEX_SET
16268
+ /* If more than a single node returned, the nested
16269
+ * parens evaluated to more than just a (?[...]),
16270
+ * which isn't legal */
16271
+ || node != 1) {
16272
+ vFAIL("Expecting interpolated extended charclass");
16273
+ }
16274
+ resultant_invlist = (SV *) ARGp(REGNODE_p(node));
16275
+ current = invlist_clone(resultant_invlist, NULL);
16276
+ SvREFCNT_dec(resultant_invlist);
16295
16277
16296
- RExC_flags = save_flags;
16278
+ RExC_sets_depth--;
16279
+ RExC_emit = orig_emit;
16297
16280
goto handle_operand;
16298
16281
}
16299
16282
@@ -16681,6 +16664,13 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16681
16664
return END;
16682
16665
}
16683
16666
16667
+ if (RExC_sets_depth) { /* If within a recursive call, return in a special
16668
+ regnode */
16669
+ RExC_parse++;
16670
+ node = regpnode(pRExC_state, REGEX_SET, (void *) final);
16671
+ }
16672
+ else {
16673
+
16684
16674
/* Otherwise generate a resultant node, based on 'final'. regclass() is
16685
16675
* expecting a string of ranges and individual code points */
16686
16676
invlist_iterinit(final);
@@ -16764,6 +16754,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
16764
16754
ANYOF_FLAGS(REGNODE_p(node))
16765
16755
|= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
16766
16756
}
16757
+ }
16767
16758
16768
16759
nextchar(pRExC_state);
16769
16760
Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
@@ -20216,6 +20207,22 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
20216
20207
return(ret);
20217
20208
}
20218
20209
20210
+ /*
20211
+ - regpnode - emit a temporary node with a void* argument
20212
+ */
20213
+ STATIC regnode_offset /* Location. */
20214
+ S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, void * arg)
20215
+ {
20216
+ const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regvnode");
20217
+ regnode_offset ptr = ret;
20218
+
20219
+ PERL_ARGS_ASSERT_REGPNODE;
20220
+
20221
+ FILL_ADVANCE_NODE_ARGp(ptr, op, arg);
20222
+ RExC_emit = ptr;
20223
+ return(ret);
20224
+ }
20225
+
20219
20226
STATIC regnode_offset
20220
20227
S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
20221
20228
{
0 commit comments