Skip to content

Commit 0abd0d7

Browse files
committed
disable non-unicode case insensitive trie matching
Also revert 8902bb0 as it merely masked one symptom of the deeper problems. Also fixes RT #69973, which was a segfault which was exposed by 8902bb0; see the ticket for further details. http://rt.perl.org/rt3//Public/Bug/Display.html?id=69973 At the core of this is the problem that in unicode matching a bunch of code points have case folding rules beyond just A-Z/a-z. Since the case folding rules are decided at runtime by the string, we can't use the same TRIE tables for both unicode/non-unicode matching. Until this is reconciled or some other solution is found, case insensitive matching only gets the TRIE optimisation when the pattern is unicode. From CaseFolding.txt: 00B5; C; 03BC; # MICRO SIGN 00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE 00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE 00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX 00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE 00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS 00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE 00C6; C; 00E6; # LATIN CAPITAL LETTER AE 00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA 00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE 00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE 00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX 00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS 00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE 00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE 00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX 00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS 00D0; C; 00F0; # LATIN CAPITAL LETTER ETH 00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE 00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE 00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE 00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX 00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE 00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS 00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE 00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE 00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE 00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX 00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS 00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE 00DE; C; 00FE; # LATIN CAPITAL LETTER THORN 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
1 parent fe88edf commit 0abd0d7

File tree

3 files changed

+19
-19
lines changed

3 files changed

+19
-19
lines changed

ext/re/t/regop.t

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -231,12 +231,12 @@ anchored "ABC" at 0
231231
#Freeing REx: "(\\.COM|\\.EXE|\\.BAT|\\.CMD|\\.VBS|\\.VBE|\\.JS|\\.JSE|\\."......
232232
%MATCHED%
233233
floating ""$ at 3..4 (checking floating)
234-
1:1[1] 3:2[1] 5:2[64] 45:83[1] 47:84[1] 48:85[0]
235-
stclass EXACTF <.> minlen 3
236-
Found floating substr ""$ at offset 30...
237-
Does not contradict STCLASS...
238-
Guessed: match at offset 26
239-
Matching stclass EXACTF <.> against ".exe"
234+
#1:1[1] 3:2[1] 5:2[64] 45:83[1] 47:84[1] 48:85[0]
235+
#stclass EXACTF <.> minlen 3
236+
#Found floating substr ""$ at offset 30...
237+
#Does not contradict STCLASS...
238+
#Guessed: match at offset 26
239+
#Matching stclass EXACTF <.> against ".exe"
240240
---
241241
#Compiling REx "[q]"
242242
#size 12 nodes Got 100 bytes for offset annotations.

regcomp.c

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2833,13 +2833,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
28332833
}
28342834
} else {
28352835
/*
2836-
Currently we assume that the trie can handle unicode and ascii
2837-
matches fold cased matches. If this proves true then the following
2838-
define will prevent tries in this situation.
2839-
2840-
#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2841-
*/
2836+
Currently we do not believe that the trie logic can
2837+
handle case insensitive matching properly when the
2838+
pattern is not unicode (thus forcing unicode semantics).
2839+
2840+
If/when this is fixed the following define can be swapped
2841+
in below to fully enable trie logic.
2842+
28422843
#define TRIE_TYPE_IS_SAFE 1
2844+
2845+
*/
2846+
#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
2847+
28432848
if ( last && TRIE_TYPE_IS_SAFE ) {
28442849
make_trie( pRExC_state,
28452850
startbranch, first, cur, tail, count,

regexec.c

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1105,16 +1105,15 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
11051105

11061106
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
11071107
uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1108-
UV uvc_unfolded = 0; \
11091108
switch (trie_type) { \
11101109
case trie_utf8_fold: \
11111110
if ( foldlen>0 ) { \
1112-
uvc_unfolded = uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1111+
uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
11131112
foldlen -= len; \
11141113
uscan += len; \
11151114
len=0; \
11161115
} else { \
1117-
uvc_unfolded = uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1116+
uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
11181117
uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
11191118
foldlen -= UNISKIP( uvc ); \
11201119
uscan = foldbuf + UNISKIP( uvc ); \
@@ -1140,7 +1139,6 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
11401139
uvc = (UV)*uc; \
11411140
len = 1; \
11421141
} \
1143-
\
11441142
if (uvc < 256) { \
11451143
charid = trie->charmap[ uvc ]; \
11461144
} \
@@ -1153,9 +1151,6 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
11531151
charid = (U16)SvIV(*svpp); \
11541152
} \
11551153
} \
1156-
if (!charid && trie_type == trie_utf8_fold && !UTF) { \
1157-
charid = trie->charmap[uvc_unfolded]; \
1158-
} \
11591154
} STMT_END
11601155

11611156
#define REXEC_FBC_EXACTISH_CHECK(CoNd) \

0 commit comments

Comments
 (0)