Skip to content

Commit c7a638b

Browse files
committed
Remove "offsets" debugging code from regcomp.c
This code was added by Mark Jason Dominus to aid a regex debugger he wrote. The basic premise is that every opcode in a regex can be attributed back to parts of the pattern. This assumption has not been true ever since the TRIE optimizations were added, and I believe that the debugger is no longer in use anyway. The regex compiler is complicated enough without having to maintain this logic. There are essentially no tests for it, and the few tests that do cover it do so as a byproduct of testing other things. Despite the offsets logic only being used in debug supporting it does have a cost to non-debug logic as various internal routines include parameters related to it that are otherwise unused. I spoke to him many years ago about whether it was ok to remove it from the regex engine and he said yes. As part of this patch I also changed the name of the "parse_start" and "oregcomp_parse" variables in certain contexts so that the code is a bit more clear, this was partly because the offsets logic used its own parse_start variable in certain contexts and changing the names of the others made it easier to clean up. This patch also includes updates to the perlreguts documentation for the C<struct regexp_internal> which had gotten out of date with the current state of the code, and I needed to remove the docs for the "offsets" member anyway.
1 parent afded0c commit c7a638b

File tree

7 files changed

+110
-354
lines changed

7 files changed

+110
-354
lines changed

embed.fnc

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2029,9 +2029,7 @@ ERS |SV* |make_exactf_invlist |NN RExC_state_t *pRExC_state \
20292029
ES |regnode_offset|reg |NN RExC_state_t *pRExC_state \
20302030
|I32 paren|NN I32 *flagp|U32 depth
20312031
ES |regnode_offset|regnode_guts|NN RExC_state_t *pRExC_state \
2032-
|const U8 op \
2033-
|const STRLEN extra_len \
2034-
|NN const char* const name
2032+
|const STRLEN extra_len
20352033
ES |void |change_engine_size|NN RExC_state_t *pRExC_state|const Ptrdiff_t size
20362034
ES |regnode_offset|reganode|NN RExC_state_t *pRExC_state|U8 op \
20372035
|U32 arg
@@ -2108,13 +2106,12 @@ ES |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \
21082106
|NN SV** invlist
21092107
ES |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state \
21102108
|NN I32 *flagp \
2111-
|NN char * parse_start \
2109+
|NN char * backref_parse_start \
21122110
|char ch
21132111
ESTR |unsigned int|regex_set_precedence|const U8 my_operator
21142112
ES |regnode_offset|handle_regex_sets|NN RExC_state_t *pRExC_state \
21152113
|NULLOK SV ** return_invlist \
2116-
|NN I32 *flagp|U32 depth \
2117-
|NN char * const oregcomp_parse
2114+
|NN I32 *flagp|U32 depth
21182115
ES |void |set_regex_pv |NN RExC_state_t *pRExC_state|NN REGEXP *Rx
21192116
# if defined(DEBUGGING) && defined(ENABLE_REGEX_SETS_DEBUGGING)
21202117
ES |void |dump_regex_sets_structures \

embed.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1077,7 +1077,7 @@
10771077
#define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d)
10781078
#define handle_names_wildcard(a,b,c,d) S_handle_names_wildcard(aTHX_ a,b,c,d)
10791079
#define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e)
1080-
#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e)
1080+
#define handle_regex_sets(a,b,c,d) S_handle_regex_sets(aTHX_ a,b,c,d)
10811081
#define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j)
10821082
#define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b)
10831083
#define invlist_is_iterating S_invlist_is_iterating
@@ -1104,7 +1104,7 @@
11041104
#define regclass(a,b,c,d,e,f,g,h,i) S_regclass(aTHX_ a,b,c,d,e,f,g,h,i)
11051105
#define regex_set_precedence S_regex_set_precedence
11061106
#define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d)
1107-
#define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d)
1107+
#define regnode_guts(a,b) S_regnode_guts(aTHX_ a,b)
11081108
#define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c)
11091109
#define regpnode(a,b,c) S_regpnode(aTHX_ a,b,c)
11101110
#define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d)

ext/re/t/regop.t

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,6 @@ Freeing REx: "[f][o][o][b][a][r]"
140140
minlen 3
141141
---
142142
# Compiling REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)"
143-
# Got 164 bytes for offset annotations.
144143
# TRIE(NATIVE): W:6 C:24 Uq:7 Min:4 Max:4
145144
# Char : Match Base Ofs A B C P G E D
146145
# State|---------------------------------------------------
@@ -166,8 +165,6 @@ minlen 3
166165
# <D>
167166
# 20: END (0)
168167
# anchored "ABC" at 0 (checking anchored) minlen 4
169-
# Offsets: [20]
170-
# 1:4[3] 3:4[15] 19:32[0] 20:34[0]
171168
# Guessing start of match in sv for REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" against "ABCD"
172169
# Found anchored substr "ABC" at offset 0...
173170
# Guessed: match at offset 0
@@ -210,8 +207,6 @@ anchored "ABC" at 0
210207
# 47: EOL(48)
211208
# 48: END(0)
212209
#floating ""$ at 3..4 (checking floating) stclass "EXACTF <.>" minlen 3
213-
#Offsets: [48]
214-
# 1:1[1] 3:2[1] 5:2[81] 45:83[1] 47:84[1] 48:85[0]
215210
#Guessing start of match, REx "(\.COM|\.EXE|\.BAT|\.CMD|\.VBS|\.VBE|\.JS|\.JSE|\.WSF|\.WSH|..." against "D:dev/perl/ver/28321_/perl.exe"...
216211
#Found floating substr ""$ at offset 30...
217212
#Starting position does not contradict /^/m...
@@ -233,30 +228,23 @@ anchored "ABC" at 0
233228
#Freeing REx: "(\\.COM|\\.EXE|\\.BAT|\\.CMD|\\.VBS|\\.VBE|\\.JS|\\.JSE|\\."......
234229
%MATCHED%
235230
floating ""$ at 3..4 (checking floating)
236-
#1:1[1] 3:2[1] 5:2[64] 45:83[1] 47:84[1] 48:85[0]
237231
#stclass EXACTF <.> minlen 3
238232
#Found floating substr ""$ at offset 30...
239233
#Does not contradict STCLASS...
240234
#Guessed: match at offset 26
241235
#Matching stclass EXACTF <.> against ".exe"
242236
---
243237
#Compiling REx "[q]"
244-
#size 3 nodes Got 7 bytes for offset annotations.
245238
#first at 1
246239
#Final program:
247240
# 1: EXACT <q>(3)
248241
# 3: END(0)
249242
#anchored "q" at 0 (checking anchored isall) minlen 1
250-
#Offsets: [3]
251-
# 1:1[3] 3:4[0]
252243
#Guessing start of match, REx "[q]" against "q"...
253244
#Found anchored substr "q" at offset 0...
254245
#Guessed: match at offset 0
255246
#%MATCHED%
256247
#Freeing REx: "[q]"
257-
Got 7 bytes for offset annotations.
258-
Offsets: [3]
259-
1:1[3] 3:4[0]
260248
%MATCHED%
261249
Freeing REx: "[q]"
262250
---
@@ -281,7 +269,6 @@ Freeing REx: "[q]"
281269
Freeing REx: "^(\S{1,9}):\s*(\d+)$"
282270
---
283271
#Compiling REx "(?(DEFINE)(?<foo>foo))(?(DEFINE)(?<bar>(?&foo)bar))(?(DEFINE"...
284-
#Got 532 bytes for offset annotations.
285272
study_chunk_recursed_count: 5
286273
#Final program:
287274
# 1: DEFINEP (3)
@@ -317,8 +304,6 @@ study_chunk_recursed_count: 5
317304
# 61: TAIL (62)
318305
# 62: END (0)
319306
minlen 0
320-
#Offsets: [66]
321-
# 1:3[0] 3:10[0] 5:17[1] 7:18[3] 9:21[1] 11:21[0] 13:22[0] 14:25[0] 16:32[0] 18:39[1] 20:41[3] 23:47[3] 25:50[1] 27:50[0] 29:51[0] 30:54[0] 32:61[0] 34:68[1] 36:70[3] 39:76[3] 41:79[1] 43:79[0] 45:80[0] 46:83[0] 48:90[0] 50:97[1] 52:99[3] 55:105[3] 57:108[1] 59:108[0] 61:109[0] 62:110[0]
322307
#Matching REx "(?(DEFINE)(?<foo>foo))(?(DEFINE)(?<bar>(?&foo)bar))(?(DEFINE"... against ""
323308
# 0 <> <> | 1:DEFINEP(3)
324309
# 0 <> <> | 3:IFTHEN(14)

pod/perlreguts.pod

Lines changed: 46 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -827,29 +827,18 @@ The following structure is used as the C<pprivate> struct by perl's
827827
regex engine. Since it is specific to perl it is only of curiosity
828828
value to other engine implementations.
829829

830-
typedef struct regexp_internal {
831-
U32 *offsets; /* offset annotations 20001228 MJD
832-
* data about mapping the program to
833-
* the string*/
834-
regnode *regstclass; /* Optional startclass as identified or
835-
* constructed by the optimiser */
836-
struct reg_data *data; /* Additional miscellaneous data used
837-
* by the program. Used to make it
838-
* easier to clone and free arbitrary
839-
* data that the regops need. Often the
840-
* ARG field of a regop is an index
841-
* into this structure */
842-
regnode program[1]; /* Unwarranted chumminess with
843-
* compiler. */
844-
} regexp_internal;
830+
typedef struct regexp_internal {
831+
regnode *regstclass;
832+
struct reg_data *data;
833+
struct reg_code_blocks *code_blocks;
834+
U32 proglen;
835+
int name_list_idx;
836+
regnode program[1];
837+
} regexp_internal;
845838

846-
=over 5
847-
848-
=item C<offsets>
839+
Descriptions of the fields are as follows:
849840

850-
Offsets holds a mapping of offset in the C<program>
851-
to offset in the C<precomp> string. This is only used by ActiveState's
852-
visual regex debugger.
841+
=over 5
853842

854843
=item C<regstclass>
855844

@@ -878,6 +867,42 @@ what array. During compilation regops that need special structures stored
878867
will add an element to each array using the add_data() routine and then store
879868
the index in the regop.
880869

870+
In modern perls the 0th element of this structure is reserved and is NEVER
871+
used to store anything of use. This is to allow things that need to index
872+
into this array to represent "no value".
873+
874+
=item C<code_blocks>
875+
876+
This optional structure is used to manage C<(?{})> constructs in the
877+
pattern. It is made up of the following structures.
878+
879+
/* record the position of a (?{...}) within a pattern */
880+
struct reg_code_block {
881+
STRLEN start;
882+
STRLEN end;
883+
OP *block;
884+
REGEXP *src_regex;
885+
};
886+
887+
/* array of reg_code_block's plus header info */
888+
struct reg_code_blocks {
889+
int refcnt; /* we may be pointed to from a regex
890+
and from the savestack */
891+
int count; /* how many code blocks */
892+
struct reg_code_block *cb; /* array of reg_code_block's */
893+
};
894+
895+
=item C<proglen>
896+
897+
Stores the length of the compiled program in units of regops.
898+
899+
=item C<name_list_idx>
900+
901+
This is the index into the data array where an AV is stored that contains
902+
the names of any named capture buffers in the pattern, should there be
903+
any. This is only used in the debugging version of the regex engine. It
904+
will be 0 if there is no such data.
905+
881906
=item C<program>
882907

883908
Compiled program. Inlined into the structure so the entire struct can be

proto.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5877,18 +5877,18 @@ STATIC U32 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state, const char *
58775877
STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth);
58785878
#define PERL_ARGS_ASSERT_GROK_BSLASH_N \
58795879
assert(pRExC_state); assert(flagp)
5880-
STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * parse_start, char ch);
5880+
STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * backref_parse_start, char ch);
58815881
#define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF \
5882-
assert(pRExC_state); assert(flagp); assert(parse_start)
5882+
assert(pRExC_state); assert(flagp); assert(backref_parse_start)
58835883
STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition, AV ** strings);
58845884
#define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \
58855885
assert(wname); assert(prop_definition); assert(strings)
58865886
STATIC int S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only);
58875887
#define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \
58885888
assert(pRExC_state); assert(s)
5889-
STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth, char * const oregcomp_parse);
5889+
STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth);
58905890
#define PERL_ARGS_ASSERT_HANDLE_REGEX_SETS \
5891-
assert(pRExC_state); assert(flagp); assert(oregcomp_parse)
5891+
assert(pRExC_state); assert(flagp)
58925892
STATIC SV * S_handle_user_defined_property(pTHX_ const char * name, const STRLEN name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, SV* contents, bool *user_defined_ptr, SV * msg, const STRLEN level);
58935893
#define PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY \
58945894
assert(name); assert(contents); assert(user_defined_ptr); assert(msg)
@@ -5990,9 +5990,9 @@ STATIC unsigned int S_regex_set_precedence(const U8 my_operator)
59905990
STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, const regnode_offset operand, const U32 depth);
59915991
#define PERL_ARGS_ASSERT_REGINSERT \
59925992
assert(pRExC_state)
5993-
STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_len, const char* const name);
5993+
STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_len);
59945994
#define PERL_ARGS_ASSERT_REGNODE_GUTS \
5995-
assert(pRExC_state); assert(name)
5995+
assert(pRExC_state)
59965996
STATIC regnode_offset S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth);
59975997
#define PERL_ARGS_ASSERT_REGPIECE \
59985998
assert(pRExC_state); assert(flagp)

0 commit comments

Comments
 (0)