Skip to content

Commit 0fa70a0

Browse files
committed
simplify and speed up /.*.../ handling
See RT #123743.

A pattern that starts /.*/ has a fake MBOL or SBOL flag added, along with PREGf_IMPLICIT.

The idea is that, with /.*.../s, if the NFA doesn't match when started at pos 0, then it's not going to match if started at any other position either; while /.*.../ won't match at any other start position up until the next \n.

However, the branch in regexec() that implemented this was a bit of a mess (like much in the perl core, it had gradually accreted), and caused intuit-enabled /.*.../ and /.*.../s patterns to go quadratic.

The branch looked roughly like:

    if (anchored) {
        if (regtry(s)) goto success;
        if (can_intuit) {
            while (s < end) {
                s = intuit(s+1);
                if (!s) goto fail;
                if (regtry(s)) goto success;
            }
        }
        else {
            while (s < end) {
                s = skip_to_next_newline(s);
                if (regtry(s)) goto success;
            }
        }
    }

The problem is that in the presence of a .* at the start of the pattern, intuit() will always return either NULL on failure, or the start position, rather than any later position. So the can_intuit branch above calls regtry() on every character position.

This commit fixes this by changing the structure of the code to be like this, where it only tries things on newline boundaries:

    if (anchored) {
        if (regtry(s)) goto success;
        while (1) {
            s = skip_to_next_newline(s);
            if (can_intuit) {
                s = intuit(s+1);
                if (!s) goto fail;
            }
            if (regtry(s)) goto success;
        }
    }

This makes the code a lot simpler, and mostly avoids quadratic behaviour (you can still get it with a string consisting mainly of newlines).
1 parent 5904c5c commit 0fa70a0

File tree

2 files changed

+58
-80
lines changed

2 files changed

+58
-80
lines changed

regexec.c

Lines changed: 45 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@ Perl_re_intuit_start(pTHX_
754754

755755
/* ml_anch: check after \n?
756756
*
757-
* A note about IMPLICIT: on an un-anchored pattern beginning
757+
* A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
758758
* with /.*.../, these flags will have been added by the
759759
* compiler:
760760
* /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
@@ -2755,86 +2755,52 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
27552755
));
27562756
}
27572757

2758-
/* Simplest case: anchored match need be tried only once. */
2759-
/* [unless only anchor is MBOL - implying multiline is set] */
2758+
/* Simplest case: anchored match need be tried only once, or with
2759+
* MBOL, only at the beginning of each line.
2760+
*
2761+
* Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
2762+
* PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
2763+
* match at the start of the string then it won't match anywhere else
2764+
* either; while with /.*.../, if it doesn't match at the beginning,
2765+
* the earliest it could match is at the start of the next line */
2766+
27602767
if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
2761-
if (s == startpos && regtry(reginfo, &s))
2768+
char *end;
2769+
2770+
if (regtry(reginfo, &s))
27622771
goto got_it;
2763-
else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2764-
{
2765-
char *end;
2766-
2767-
if (minlen)
2768-
dontbother = minlen - 1;
2769-
end = HOP3c(strend, -dontbother, strbeg) - 1;
2770-
/* for multiline we only have to try after newlines */
2771-
if (prog->check_substr || prog->check_utf8) {
2772-
/* because of the goto we can not easily reuse the macros for bifurcating the
2773-
unicode/non-unicode match modes here like we do elsewhere - demerphq */
2774-
if (utf8_target) {
2775-
if (s == startpos)
2776-
goto after_try_utf8;
2777-
while (1) {
2778-
if (regtry(reginfo, &s)) {
2779-
goto got_it;
2780-
}
2781-
after_try_utf8:
2782-
if (s > end) {
2783-
goto phooey;
2784-
}
2785-
if (prog->extflags & RXf_USE_INTUIT) {
2786-
s = re_intuit_start(rx, sv, strbeg,
2787-
s + UTF8SKIP(s), strend, flags, NULL);
2788-
if (!s) {
2789-
goto phooey;
2790-
}
2791-
}
2792-
else {
2793-
s += UTF8SKIP(s);
2794-
}
2795-
}
2796-
} /* end search for check string in unicode */
2797-
else {
2798-
if (s == startpos) {
2799-
goto after_try_latin;
2800-
}
2801-
while (1) {
2802-
if (regtry(reginfo, &s)) {
2803-
goto got_it;
2804-
}
2805-
after_try_latin:
2806-
if (s > end) {
2807-
goto phooey;
2808-
}
2809-
if (prog->extflags & RXf_USE_INTUIT) {
2810-
s = re_intuit_start(rx, sv, strbeg,
2811-
s + 1, strend, flags, NULL);
2812-
if (!s) {
2813-
goto phooey;
2814-
}
2815-
}
2816-
else {
2817-
s++;
2818-
}
2819-
}
2820-
} /* end search for check string in latin*/
2821-
} /* end search for check string */
2822-
else { /* search for newline */
2823-
if (s > startpos) {
2824-
/*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2825-
s--;
2826-
}
2827-
/* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2828-
while (s <= end) { /* note it could be possible to match at the end of the string */
2829-
if (*s++ == '\n') { /* don't need PL_utf8skip here */
2830-
if (regtry(reginfo, &s))
2831-
goto got_it;
2832-
}
2833-
}
2834-
} /* end search for newline */
2835-
} /* end anchored/multiline check string search */
2836-
goto phooey;
2837-
} else if (prog->intflags & PREGf_ANCH_GPOS)
2772+
2773+
if (!(prog->intflags & PREGf_ANCH_MBOL))
2774+
goto phooey;
2775+
2776+
/* didn't match at start, try at other newline positions */
2777+
2778+
if (minlen)
2779+
dontbother = minlen - 1;
2780+
end = HOP3c(strend, -dontbother, strbeg) - 1;
2781+
2782+
/* skip to next newline */
2783+
2784+
while (s <= end) { /* note it could be possible to match at the end of the string */
2785+
/* NB: newlines are the same in unicode as they are in latin */
2786+
if (*s++ != '\n')
2787+
continue;
2788+
if (prog->check_substr || prog->check_utf8) {
2789+
/* note that with PREGf_IMPLICIT, intuit can only fail
2790+
* or return the start position, so it's of limited utility.
2791+
* Nevertheless, I made the decision that the potential for
2792+
* quick fail was still worth it - DAPM */
2793+
s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
2794+
if (!s)
2795+
goto phooey;
2796+
}
2797+
if (regtry(reginfo, &s))
2798+
goto got_it;
2799+
}
2800+
goto phooey;
2801+
} /* end anchored search */
2802+
2803+
if (prog->intflags & PREGf_ANCH_GPOS)
28382804
{
28392805
/* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
28402806
assert(prog->intflags & PREGf_GPOS_SEEN);

t/re/speed.t

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ BEGIN {
2323
skip_all_without_unicode_tables();
2424
}
2525

26-
plan tests => 9; # Update this when adding/deleting tests.
26+
plan tests => 17; # Update this when adding/deleting tests.
2727

2828
use strict;
2929
use warnings;
@@ -98,6 +98,18 @@ sub run_tests {
9898
$s =~ /^XX\d{1,10}cde/ for 1..100;
9999
pass("abs anchored float string should fail quickly");
100100

101+
# if /.*.../ fails to be optimised well (PREGf_IMPLICIT),
102+
# things tend to go quadratic (RT #123743)
103+
104+
$s = ('0' x 200_000) . '::: 0c';
105+
ok ($s !~ /.*:::\s*ab/, 'PREGf_IMPLICIT');
106+
ok ($s !~ /.*:::\s*ab/i, 'PREGf_IMPLICIT/i');
107+
ok ($s !~ /.*:::\s*ab/m, 'PREGf_IMPLICIT/m');
108+
ok ($s !~ /.*:::\s*ab/mi, 'PREGf_IMPLICIT/mi');
109+
ok ($s !~ /.*:::\s*ab/s, 'PREGf_IMPLICIT/s');
110+
ok ($s !~ /.*:::\s*ab/si, 'PREGf_IMPLICIT/si');
111+
ok ($s !~ /.*:::\s*ab/ms, 'PREGf_IMPLICIT/ms');
112+
ok ($s !~ /.*:::\s*ab/msi,'PREGf_IMPLICIT/msi');
101113
}
102114

103115
} # End of sub run_tests

0 commit comments

Comments
 (0)