From 8f1467df575b437a188039801af58aa00721b0dc Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Wed, 16 Feb 2022 03:53:38 +0100 Subject: [PATCH 1/4] perlreguts.pod: synchronize regexp_internal docs with code Various changes have been made to struct regexp_internal over time which have not been documented. This updates the docs to match the code as it is now in preparation of changing the docs in subsequent commits. --- pod/perlreguts.pod | 64 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 15 deletions(-) diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index b0a8d8f92234..e58aa425355e 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -827,21 +827,19 @@ The following structure is used as the C struct by perl's regex engine. Since it is specific to perl it is only of curiosity value to other engine implementations. - typedef struct regexp_internal { - U32 *offsets; /* offset annotations 20001228 MJD - * data about mapping the program to - * the string*/ - regnode *regstclass; /* Optional startclass as identified or - * constructed by the optimiser */ - struct reg_data *data; /* Additional miscellaneous data used - * by the program. Used to make it - * easier to clone and free arbitrary - * data that the regops need. Often the - * ARG field of a regop is an index - * into this structure */ - regnode program[1]; /* Unwarranted chumminess with - * compiler. */ - } regexp_internal; + typedef struct regexp_internal { + union { + U32 *offsets; + U32 proglen; + } u; + regnode *regstclass; + struct reg_data *data; + struct reg_code_blocks *code_blocks; + int name_list_idx; + regnode program[1]; + } regexp_internal; + +Description of the attributes is as follows: =over 5 @@ -851,6 +849,10 @@ Offsets holds a mapping of offset in the C to offset in the C string. This is only used by ActiveState's visual regex debugger. +=item C + +Stores the length of the compiled program in units of regops. 
+ =item C Special regop that is used by C to check if a pattern @@ -878,6 +880,38 @@ what array. During compilation regops that need special structures stored will add an element to each array using the add_data() routine and then store the index in the regop. +In modern perls the 0th element of this structure is reserved and is NEVER +used to store anything of use. This is to allow things that need to index +into this array to represent "no value". + +=item C + +This optional structure is used to manage C<(?{})> constructs in the +pattern. It is made up of the following structures. + + /* record the position of a (?{...}) within a pattern */ + struct reg_code_block { + STRLEN start; + STRLEN end; + OP *block; + REGEXP *src_regex; + }; + + /* array of reg_code_block's plus header info */ + struct reg_code_blocks { + int refcnt; /* we may be pointed to from a regex + and from the savestack */ + int count; /* how many code blocks */ + struct reg_code_block *cb; /* array of reg_code_block's */ + }; + +=item C + +This is the index into the data array where an AV is stored that contains +the names of any named capture buffers in the pattern, should there be +any. This is only used in the debugging version of the regex engine. It +will be 0 if there is no such data. + =item C Compiled program. Inlined into the structure so the entire struct can be From 52becc4567e6e156c5e11ee5d55bd35978240993 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Fri, 11 Feb 2022 06:30:45 +0100 Subject: [PATCH 2/4] regcomp.c,re.pm: Remove "offsets" debugging code This code was added by Mark Jason Dominus to aid a regex debugger he wrote for ActiveState. The basic premise is that every opcode in a regex can be attributed back to a contiguous sequence of characters that make up the pattern. This assumption has not been true ever since the "jump" TRIE optimizations were added to the engine. 
I spoke to MJD many years ago about whether it was ok to remove this from the regex engine and he said he had no objections. An example of a pattern that cannot be handled correctly by this logic is /(?: a x+ | b y+ | c z+ )/x where the (?:a ... | b ... | c ...) parts will now be handled by the TRIE logic and not by the BRANCH/EXACT opcodes that would have handled them in the past. The offset debug output cannot handle this type of transformation, and produces nonsense output that mentions opcodes that have been optimized away from the final program. The regex compiler is complicated enough without having to maintain this logic. There are essentially no tests for it, and the few tests that do cover it do so as a byproduct of testing other things. Despite the offsets logic only being used for debugging, supporting it does have a cost to non-debug logic, as various internal routines include parameters related to it that are otherwise unused. Note this output is only usable or visible by enabling special flags in re.pm; there is no formal API to access it short of parsing the output of the debug mode of the regex engine, which has changed multiple times over the past years. 
--- embed.fnc | 12 +- embed.h | 5 +- ext/re/re.pm | 31 +---- ext/re/t/regop.pl | 2 +- ext/re/t/regop.t | 15 --- pod/perlreguts.pod | 19 +-- proto.h | 11 +- regcomp.c | 307 ++++++--------------------------------------- regcomp.h | 32 +---- 9 files changed, 71 insertions(+), 363 deletions(-) diff --git a/embed.fnc b/embed.fnc index 45c6fd2b2648..c54294cf7e69 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2029,9 +2029,12 @@ ERS |SV* |make_exactf_invlist |NN RExC_state_t *pRExC_state \ ES |regnode_offset|reg |NN RExC_state_t *pRExC_state \ |I32 paren|NN I32 *flagp|U32 depth ES |regnode_offset|regnode_guts|NN RExC_state_t *pRExC_state \ - |const U8 op \ - |const STRLEN extra_len \ - |NN const char* const name + |const STRLEN extra_len +#ifdef DEBUGGING +ES |regnode_offset|regnode_guts_debug|NN RExC_state_t *pRExC_state \ + |const U8 op \ + |const STRLEN extra_len +#endif ES |void |change_engine_size|NN RExC_state_t *pRExC_state|const Ptrdiff_t size ES |regnode_offset|reganode|NN RExC_state_t *pRExC_state|U8 op \ |U32 arg @@ -2113,8 +2116,7 @@ ES |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state \ ESTR |unsigned int|regex_set_precedence|const U8 my_operator ES |regnode_offset|handle_regex_sets|NN RExC_state_t *pRExC_state \ |NULLOK SV ** return_invlist \ - |NN I32 *flagp|U32 depth \ - |NN char * const oregcomp_parse + |NN I32 *flagp|U32 depth ES |void |set_regex_pv |NN RExC_state_t *pRExC_state|NN REGEXP *Rx # if defined(DEBUGGING) && defined(ENABLE_REGEX_SETS_DEBUGGING) ES |void |dump_regex_sets_structures \ diff --git a/embed.h b/embed.h index c08806128c94..691228bbe646 100644 --- a/embed.h +++ b/embed.h @@ -1030,6 +1030,7 @@ #endif #define regdump_extflags(a,b) S_regdump_extflags(aTHX_ a,b) #define regdump_intflags(a,b) S_regdump_intflags(aTHX_ a,b) +#define regnode_guts_debug(a,b,c) S_regnode_guts_debug(aTHX_ a,b,c) #define regtail_study(a,b,c,d) S_regtail_study(aTHX_ a,b,c,d) # endif # if defined(PERL_IN_REGEXEC_C) @@ -1077,7 +1078,7 @@ #define 
handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d) #define handle_names_wildcard(a,b,c,d) S_handle_names_wildcard(aTHX_ a,b,c,d) #define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e) -#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e) +#define handle_regex_sets(a,b,c,d) S_handle_regex_sets(aTHX_ a,b,c,d) #define handle_user_defined_property(a,b,c,d,e,f,g,h,i,j) S_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i,j) #define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b) #define invlist_is_iterating S_invlist_is_iterating @@ -1104,7 +1105,7 @@ #define regclass(a,b,c,d,e,f,g,h,i) S_regclass(aTHX_ a,b,c,d,e,f,g,h,i) #define regex_set_precedence S_regex_set_precedence #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) -#define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d) +#define regnode_guts(a,b) S_regnode_guts(aTHX_ a,b) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) #define regpnode(a,b,c) S_regpnode(aTHX_ a,b,c) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) diff --git a/ext/re/re.pm b/ext/re/re.pm index d1db4625c006..791d680d5771 100644 --- a/ext/re/re.pm +++ b/ext/re/re.pm @@ -4,7 +4,7 @@ package re; use strict; use warnings; -our $VERSION = "0.41"; +our $VERSION = "0.42"; our @ISA = qw(Exporter); our @EXPORT_OK = qw{ is_regexp regexp_pattern @@ -71,8 +71,6 @@ my %flags = ( EXTRA => 0x3FF0000, TRIEM => 0x0010000, - OFFSETS => 0x0020000, - OFFSETSDBG => 0x0040000, STATE => 0x0080000, OPTIMISEM => 0x0100000, STACK => 0x0280000, @@ -81,9 +79,7 @@ my %flags = ( DUMP_PRE_OPTIMIZE => 0x1000000, WILDCARD => 0x2000000, ); -$flags{ALL} = -1 & ~($flags{OFFSETS} - |$flags{OFFSETSDBG} - |$flags{BUFFERS} +$flags{ALL} = -1 & ~($flags{BUFFERS} |$flags{DUMP_PRE_OPTIMIZE} |$flags{WILDCARD} ); @@ -626,26 +622,6 @@ Enable debugging of the \G modifier. Enable enhanced optimisation debugging and start-point optimisations. Probably not useful except when debugging the regexp engine itself. 
-=item OFFSETS - -Dump offset information. This can be used to see how regops correlate -to the pattern. Output format is - - NODENUM:POSITION[LENGTH] - -Where 1 is the position of the first char in the string. Note that position -can be 0, or larger than the actual length of the pattern, likewise length -can be zero. - -=item OFFSETSDBG - -Enable debugging of offsets information. This emits copious -amounts of trace information and doesn't mesh well with other -debug options. - -Almost definitely only useful to people hacking -on the offsets part of the debug engine. - =item DUMP_PRE_OPTIMIZE Enable the dumping of the compiled pattern before the optimization phase. @@ -687,8 +663,7 @@ These are useful shortcuts to save on the typing. =item ALL -Enable all options at once except OFFSETS, OFFSETSDBG, BUFFERS, WILDCARD, and -DUMP_PRE_OPTIMIZE. +Enable all options at once except BUFFERS, WILDCARD, and DUMP_PRE_OPTIMIZE. (To get every single option without exception, use both ALL and EXTRA, or starting in 5.30 on a C<-DDEBUGGING>-enabled perl interpreter, use the B<-Drv> command-line switches.) diff --git a/ext/re/t/regop.pl b/ext/re/t/regop.pl index 86976ee0da38..c725b73a9e6a 100644 --- a/ext/re/t/regop.pl +++ b/ext/re/t/regop.pl @@ -1,4 +1,4 @@ -use re Debug=>qw(DUMP EXECUTE OFFSETS TRIEC TEST); +use re Debug=>qw(DUMP EXECUTE TRIEC TEST); my @tests=( XY => 'X(A|[B]Q||C|D)Y' , foobar => '[f][o][o][b][a][r]', diff --git a/ext/re/t/regop.t b/ext/re/t/regop.t index cf35d71fb090..20e9586c3329 100644 --- a/ext/re/t/regop.t +++ b/ext/re/t/regop.t @@ -140,7 +140,6 @@ Freeing REx: "[f][o][o][b][a][r]" minlen 3 --- # Compiling REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" -# Got 164 bytes for offset annotations. 
# TRIE(NATIVE): W:6 C:24 Uq:7 Min:4 Max:4 # Char : Match Base Ofs A B C P G E D # State|--------------------------------------------------- @@ -166,8 +165,6 @@ minlen 3 # # 20: END (0) # anchored "ABC" at 0 (checking anchored) minlen 4 -# Offsets: [20] -# 1:4[3] 3:4[15] 19:32[0] 20:34[0] # Guessing start of match in sv for REx "(?:ABCP|ABCG|ABCE|ABCB|ABCA|ABCD)" against "ABCD" # Found anchored substr "ABC" at offset 0... # Guessed: match at offset 0 @@ -210,8 +207,6 @@ anchored "ABC" at 0 # 47: EOL(48) # 48: END(0) #floating ""$ at 3..4 (checking floating) stclass "EXACTF <.>" minlen 3 -#Offsets: [48] -# 1:1[1] 3:2[1] 5:2[81] 45:83[1] 47:84[1] 48:85[0] #Guessing start of match, REx "(\.COM|\.EXE|\.BAT|\.CMD|\.VBS|\.VBE|\.JS|\.JSE|\.WSF|\.WSH|..." against "D:dev/perl/ver/28321_/perl.exe"... #Found floating substr ""$ at offset 30... #Starting position does not contradict /^/m... @@ -233,7 +228,6 @@ anchored "ABC" at 0 #Freeing REx: "(\\.COM|\\.EXE|\\.BAT|\\.CMD|\\.VBS|\\.VBE|\\.JS|\\.JSE|\\."...... %MATCHED% floating ""$ at 3..4 (checking floating) -#1:1[1] 3:2[1] 5:2[64] 45:83[1] 47:84[1] 48:85[0] #stclass EXACTF <.> minlen 3 #Found floating substr ""$ at offset 30... #Does not contradict STCLASS... @@ -241,22 +235,16 @@ floating ""$ at 3..4 (checking floating) #Matching stclass EXACTF <.> against ".exe" --- #Compiling REx "[q]" -#size 3 nodes Got 7 bytes for offset annotations. #first at 1 #Final program: # 1: EXACT (3) # 3: END(0) #anchored "q" at 0 (checking anchored isall) minlen 1 -#Offsets: [3] -# 1:1[3] 3:4[0] #Guessing start of match, REx "[q]" against "q"... #Found anchored substr "q" at offset 0... #Guessed: match at offset 0 #%MATCHED% #Freeing REx: "[q]" -Got 7 bytes for offset annotations. -Offsets: [3] -1:1[3] 3:4[0] %MATCHED% Freeing REx: "[q]" --- @@ -281,7 +269,6 @@ Freeing REx: "[q]" Freeing REx: "^(\S{1,9}):\s*(\d+)$" --- #Compiling REx "(?(DEFINE)(?foo))(?(DEFINE)(?(?&foo)bar))(?(DEFINE"... -#Got 532 bytes for offset annotations. 
study_chunk_recursed_count: 5 #Final program: # 1: DEFINEP (3) @@ -317,8 +304,6 @@ study_chunk_recursed_count: 5 # 61: TAIL (62) # 62: END (0) minlen 0 -#Offsets: [66] -# 1:3[0] 3:10[0] 5:17[1] 7:18[3] 9:21[1] 11:21[0] 13:22[0] 14:25[0] 16:32[0] 18:39[1] 20:41[3] 23:47[3] 25:50[1] 27:50[0] 29:51[0] 30:54[0] 32:61[0] 34:68[1] 36:70[3] 39:76[3] 41:79[1] 43:79[0] 45:80[0] 46:83[0] 48:90[0] 50:97[1] 52:99[3] 55:105[3] 57:108[1] 59:108[0] 61:109[0] 62:110[0] #Matching REx "(?(DEFINE)(?foo))(?(DEFINE)(?(?&foo)bar))(?(DEFINE"... against "" # 0 <> <> | 1:DEFINEP(3) # 0 <> <> | 3:IFTHEN(14) diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index e58aa425355e..2aae739d9b86 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -828,13 +828,10 @@ regex engine. Since it is specific to perl it is only of curiosity value to other engine implementations. typedef struct regexp_internal { - union { - U32 *offsets; - U32 proglen; - } u; regnode *regstclass; struct reg_data *data; struct reg_code_blocks *code_blocks; + U32 proglen; int name_list_idx; regnode program[1]; } regexp_internal; @@ -843,16 +840,6 @@ Description of the attributes is as follows: =over 5 -=item C - -Offsets holds a mapping of offset in the C -to offset in the C string. This is only used by ActiveState's -visual regex debugger. - -=item C - -Stores the length of the compiled program in units of regops. - =item C Special regop that is used by C to check if a pattern @@ -905,6 +892,10 @@ pattern. It is made up of the following structures. struct reg_code_block *cb; /* array of reg_code_block's */ }; +=item C + +Stores the length of the compiled program in units of regops. 
+ =item C This is the index into the data array where an AV is stored that contains diff --git a/proto.h b/proto.h index 2df7f0a30fc5..6f7619d3b475 100644 --- a/proto.h +++ b/proto.h @@ -4720,6 +4720,9 @@ STATIC void S_regdump_extflags(pTHX_ const char *lead, const U32 flags); #define PERL_ARGS_ASSERT_REGDUMP_EXTFLAGS STATIC void S_regdump_intflags(pTHX_ const char *lead, const U32 flags); #define PERL_ARGS_ASSERT_REGDUMP_INTFLAGS +STATIC regnode_offset S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_len); +#define PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG \ + assert(pRExC_state) STATIC bool S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, const regnode_offset val, U32 depth) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_REGTAIL_STUDY \ @@ -5886,9 +5889,9 @@ STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname STATIC int S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char* const s, char ** updated_parse_ptr, AV** posix_warnings, const bool check_only); #define PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX \ assert(pRExC_state); assert(s) -STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth, char * const oregcomp_parse); +STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth); #define PERL_ARGS_ASSERT_HANDLE_REGEX_SETS \ - assert(pRExC_state); assert(flagp); assert(oregcomp_parse) + assert(pRExC_state); assert(flagp) STATIC SV * S_handle_user_defined_property(pTHX_ const char * name, const STRLEN name_len, const bool is_utf8, const bool to_fold, const bool runtime, const bool deferrable, SV* contents, bool *user_defined_ptr, SV * msg, const STRLEN level); #define PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY \ assert(name); assert(contents); assert(user_defined_ptr); assert(msg) @@ -5990,9 +5993,9 @@ STATIC unsigned int 
S_regex_set_precedence(const U8 my_operator) STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, const regnode_offset operand, const U32 depth); #define PERL_ARGS_ASSERT_REGINSERT \ assert(pRExC_state) -STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_len, const char* const name); +STATIC regnode_offset S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_len); #define PERL_ARGS_ASSERT_REGNODE_GUTS \ - assert(pRExC_state); assert(name) + assert(pRExC_state) STATIC regnode_offset S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth); #define PERL_ARGS_ASSERT_REGPIECE \ assert(pRExC_state); assert(flagp) diff --git a/regcomp.c b/regcomp.c index cec3194efb7b..d735e39587b7 100644 --- a/regcomp.c +++ b/regcomp.c @@ -294,10 +294,6 @@ struct RExC_state_t { #define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs under /d from /u ? */ -#ifdef RE_TRACK_PATTERN_OFFSETS -# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the - others */ -#endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_start (pRExC_state->emit_start) #define RExC_sawback (pRExC_state->sawback) @@ -1058,70 +1054,8 @@ static const scan_data_t zero_scan_data = { #define REGNODE_p(offset) (RExC_emit_start + (offset)) #define REGNODE_OFFSET(node) ((node) - RExC_emit_start) -/* Macros for recording node offsets. 20001227 mjd@plover.com - * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in - * element 2*n-1 of the array. Element #2n holds the byte length node #n. - * Element 0 holds the number n. - * Position is 1 indexed. 
- */ -#ifndef RE_TRACK_PATTERN_OFFSETS -#define Set_Node_Offset_To_R(offset,byte) -#define Set_Node_Offset(node,byte) -#define Set_Cur_Node_Offset -#define Set_Node_Length_To_R(node,len) -#define Set_Node_Length(node,len) -#define Set_Node_Cur_Length(node,start) -#define Node_Offset(n) -#define Node_Length(n) -#define Set_Node_Offset_Length(node,offset,len) -#define ProgLen(ri) ri->u.proglen -#define SetProgLen(ri,x) ri->u.proglen = x -#define Track_Code(code) -#else -#define ProgLen(ri) ri->u.offsets[0] -#define SetProgLen(ri,x) ri->u.offsets[0] = x -#define Set_Node_Offset_To_R(offset,byte) STMT_START { \ - MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ - __LINE__, (int)(offset), (int)(byte))); \ - if((offset) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Offset macro", \ - (int)(offset)); \ - } else { \ - RExC_offsets[2*(offset)-1] = (byte); \ - } \ -} STMT_END - -#define Set_Node_Offset(node,byte) \ - Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start) -#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse) - -#define Set_Node_Length_To_R(node,len) STMT_START { \ - MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \ - __LINE__, (int)(node), (int)(len))); \ - if((node) < 0) { \ - Perl_croak(aTHX_ "value of node is %d in Length macro", \ - (int)(node)); \ - } else { \ - RExC_offsets[2*(node)] = (len); \ - } \ -} STMT_END - -#define Set_Node_Length(node,len) \ - Set_Node_Length_To_R(REGNODE_OFFSET(node), len) -#define Set_Node_Cur_Length(node, start) \ - Set_Node_Length(node, RExC_parse - start) - -/* Get offsets and lengths */ -#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1]) -#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))]) - -#define Set_Node_Offset_Length(node,offset,len) STMT_START { \ - Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \ - Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \ -} STMT_END - -#define Track_Code(code) STMT_START { code } STMT_END -#endif +#define 
ProgLen(ri) ri->proglen +#define SetProgLen(ri,x) ri->proglen = x #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS #define EXPERIMENTAL_INPLACESCAN @@ -3516,11 +3450,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, #ifdef DEBUGGING regnode *optimize = NULL; -#ifdef RE_TRACK_PATTERN_OFFSETS - - U32 mjd_offset = 0; - U32 mjd_nodelen = 0; -#endif /* RE_TRACK_PATTERN_OFFSETS */ #endif /* DEBUGGING */ /* This means we convert either the first branch or the first Exact, @@ -3534,28 +3463,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( first != startbranch || OP( last ) == BRANCH ) { /* branch sub-chain */ NEXT_OFF( first ) = (U16)(last - first); -#ifdef RE_TRACK_PATTERN_OFFSETS - DEBUG_r({ - mjd_offset= Node_Offset((convert)); - mjd_nodelen= Node_Length((convert)); - }); -#endif /* whole branch chain */ } -#ifdef RE_TRACK_PATTERN_OFFSETS - else { - DEBUG_r({ - const regnode *nop = NEXTOPER( convert ); - mjd_offset= Node_Offset((nop)); - mjd_nodelen= Node_Length((nop)); - }); - } - DEBUG_OPTIMISE_r( - Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n", - depth+1, - (UV)mjd_offset, (UV)mjd_nodelen) - ); -#endif /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. 
*/ trie->startstate= 1; @@ -3673,15 +3582,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, DEBUG_r_TEST #endif ) { - regnode *fix = convert; U32 word = trie->wordcount; -#ifdef RE_TRACK_PATTERN_OFFSETS - mjd_nodelen++; -#endif - Set_Node_Offset_Length(convert, mjd_offset, state - 1); - while( ++fix < n ) { - Set_Node_Offset_Length(fix, 0, 0); - } while (word--) { SV ** const tmp = av_fetch( trie_words, word, 0 ); if (tmp) { @@ -3741,22 +3642,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } /* needed for dumping*/ DEBUG_r(if (optimize) { - regnode *opt = convert; - - while ( ++opt < optimize) { - Set_Node_Offset_Length(opt, 0, 0); - } /* Try to clean up some of the debris left after the optimisation. */ while( optimize < jumper ) { - Track_Code( mjd_nodelen += Node_Length((optimize)); ); OP( optimize ) = OPTIMIZED; - Set_Node_Offset_Length(optimize, 0, 0); optimize++; } - Set_Node_Offset_Length(convert, mjd_offset, mjd_nodelen); }); } /* end node insert */ @@ -8012,28 +7905,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_lastparse=NULL; }); -#ifdef RE_TRACK_PATTERN_OFFSETS - DEBUG_OFFSETS_r(Perl_re_printf( aTHX_ - "%s %" UVuf " bytes for offset annotations.\n", - RExC_offsets ? 
"Got" : "Couldn't get", - (UV)((RExC_offsets[0] * 2 + 1)))); - DEBUG_OFFSETS_r(if (RExC_offsets) { - const STRLEN len = RExC_offsets[0]; - STRLEN i; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - Perl_re_printf( aTHX_ - "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]); - for (i = 1; i <= len; i++) { - if (RExC_offsets[i*2-1] || RExC_offsets[i*2]) - Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ", - (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]); - } - Perl_re_printf( aTHX_ "\n"); - }); - -#else SetProgLen(RExC_rxi,RExC_size); -#endif DEBUG_DUMP_PRE_OPTIMIZE_r({ SV * const sv = sv_newmortal(); @@ -11163,9 +11035,6 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, num); *flagp |= HASWIDTH; - Set_Node_Offset(REGNODE_p(ret), parse_start+1); - Set_Node_Cur_Length(REGNODE_p(ret), parse_start); - nextchar(pRExC_state); return ret; } @@ -11913,10 +11782,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) (IV)ARG2L(REGNODE_p(ret)))); RExC_seen |= REG_RECURSE_SEEN; - Set_Node_Length(REGNODE_p(ret), - 1 + regarglen[OP(REGNODE_p(ret))]); /* MJD */ - Set_Node_Offset(REGNODE_p(ret), parse_start); /* MJD */ - *flagp |= POSTPONED; assert(*RExC_parse == ')'); nextchar(pRExC_state); @@ -11990,12 +11855,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (! REGTAIL(pRExC_state, ret, eval)) { REQUIRE_BRANCHJ(flagp, 0); } - /* deal with the length of this later - MJD */ return ret; } ret = reg2Lanode(pRExC_state, EVAL, n, 0); - Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); - Set_Node_Offset(REGNODE_p(ret), parse_start); return ret; } case '(': /* (?(?{...})...) and (?(?=...)...) */ @@ -12218,8 +12080,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) vFAIL("Unknown switch condition (?(...))"); } case '[': /* (?[ ... 
]) */ - return handle_regex_sets(pRExC_state, NULL, flagp, depth+1, - oregcomp_parse); + return handle_regex_sets(pRExC_state, NULL, flagp, depth+1); case 0: /* A NUL */ RExC_parse--; /* for vFAIL to print correctly */ vFAIL("Sequence (? incomplete"); @@ -12308,8 +12169,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RExC_open_parens[parno]= ret; } - Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ - Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */ is_open = 1; } else { /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ @@ -12322,7 +12181,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) parse_rest: /* Pick up the branches, linking them together. */ - parse_start = RExC_parse; /* MJD */ + parse_start = RExC_parse; br = regbranch(pRExC_state, &flags, 1, depth+1); /* branch_len = (paren != 0); */ @@ -12335,10 +12194,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (RExC_use_BRANCHJ) { reginsert(pRExC_state, BRANCHJ, br, depth+1); } - else { /* MJD */ + else { reginsert(pRExC_state, BRANCH, br, depth+1); - Set_Node_Length(REGNODE_p(br), paren != 0); - Set_Node_Offset_To_R(br, parse_start-RExC_start); } have_branch = 1; } @@ -12404,8 +12261,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) if (RExC_nestroot == parno) RExC_nestroot = 0; } - Set_Node_Offset(REGNODE_p(ender), RExC_parse+1); /* MJD */ - Set_Node_Length(REGNODE_p(ender), 1); /* MJD */ break; case 's': ender = reg_node(pRExC_state, SRCLOSE); @@ -12534,8 +12389,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } reginsert(pRExC_state, node, ret, depth+1); - Set_Node_Cur_Length(REGNODE_p(ret), parse_start); - Set_Node_Offset(REGNODE_p(ret), parse_start + 1); FLAGS(REGNODE_p(ret)) = flag; if (! 
REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL))) { @@ -12608,7 +12461,6 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) ret = reganode(pRExC_state, BRANCHJ, 0); else { ret = reg_node(pRExC_state, BRANCH); - Set_Node_Length(REGNODE_p(ret), 1); } } @@ -12843,9 +12695,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) const char * const origparse = RExC_parse; I32 min; I32 max = REG_INFTY; -#ifdef RE_TRACK_PATTERN_OFFSETS - char *parse_start; -#endif /* Save the original in case we change the emitted regop to a FAIL. */ const regnode_offset orig_emit = RExC_emit; @@ -12862,10 +12711,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) FAIL2("panic: regatom returned failure, flags=%#" UVxf, (UV) flags); } -#ifdef RE_TRACK_PATTERN_OFFSETS - parse_start = RExC_parse; -#endif - op = *RExC_parse; switch (op) { const char * regcurly_return[5]; @@ -13007,8 +12852,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) MARK_NAUGHTY_EXP(2, 2); reginsert(pRExC_state, CURLY, ret, depth+1); - Set_Node_Offset(REGNODE_p(ret), parse_start+1); /* MJD */ - Set_Node_Cur_Length(REGNODE_p(ret), parse_start); } else { /* not SIMPLE */ const regnode_offset w = reg_node(pRExC_state, WHILEM); @@ -13023,10 +12866,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */ } reginsert(pRExC_state, CURLYX, ret, depth+1); - /* MJD hk */ - Set_Node_Offset(REGNODE_p(ret), parse_start+1); - Set_Node_Length(REGNODE_p(ret), - op == '{' ? 
(RExC_parse - parse_start) : 1); if (RExC_use_BRANCHJ) NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to @@ -13230,7 +13069,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; MARK_NAUGHTY(1); - Set_Node_Length(REGNODE_p(*(node_p)), 1); /* MJD */ return TRUE; } @@ -13588,6 +13426,14 @@ S_backref_value(char *p, char *e) return I32_MAX; } +#ifdef DEBUGGING +#define REGNODE_GUTS(state,op,extra_size) \ + regnode_guts_debug(state,op,extra_size) +#else +#define REGNODE_GUTS(state,op,extra_size) \ + regnode_guts(state,extra_size) +#endif + /* - regatom - the lowest level @@ -13686,7 +13532,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, MBOL); else ret = reg_node(pRExC_state, SBOL); - Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ break; case '$': nextchar(pRExC_state); @@ -13696,7 +13541,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, MEOL); else ret = reg_node(pRExC_state, SEOL); - Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ break; case '.': nextchar(pRExC_state); @@ -13706,7 +13550,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; MARK_NAUGHTY(1); - Set_Node_Length(REGNODE_p(ret), 1); /* MJD */ break; case '[': { @@ -13728,7 +13571,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL("Unmatched ["); } nextchar(pRExC_state); - Set_Node_Length(REGNODE_p(ret), RExC_parse - oregcomp_parse + 1); /* MJD */ break; } case '(': @@ -14019,8 +13861,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse += 2; vFAIL("Unescaped left brace in regex is illegal here"); } - Set_Node_Offset(REGNODE_p(ret), parse_start); - Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */ nextchar(pRExC_state); break; case 'N': @@ -14238,9 +14078,6 @@ S_regatom(pTHX_ 
RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } *flagp |= HASWIDTH; - /* override incorrect value set in reganode MJD */ - Set_Node_Offset(REGNODE_p(ret), parse_start); - Set_Node_Cur_Length(REGNODE_p(ret), parse_start-1); skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force to /x */ ); } @@ -14353,8 +14190,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* Allocate an EXACT node. The node_type may change below to * another EXACTish node, but since the size of the node doesn't * change, it works */ - ret = regnode_guts(pRExC_state, node_type, current_string_nodes, - "exact"); + ret = REGNODE_GUTS(pRExC_state, node_type, current_string_nodes); FILL_NODE(ret, node_type); RExC_emit++; @@ -15599,7 +15435,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp |= HASWIDTH | maybe_SIMPLE; } - Set_Node_Length(REGNODE_p(ret), p - parse_start - 1); RExC_parse = p; { @@ -16551,8 +16386,7 @@ S_regex_set_precedence(const U8 my_operator) { STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, - I32 *flagp, U32 depth, - char * const oregcomp_parse) + I32 *flagp, U32 depth) { /* Handle the (?[...]) construct to do set operations */ @@ -16581,7 +16415,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; - PERL_UNUSED_ARG(oregcomp_parse); /* Only for Set_Node_Length */ DEBUG_PARSE("xcls"); @@ -17236,7 +17069,6 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } nextchar(pRExC_state); - Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */ return node; regclass_failed: @@ -19290,8 +19122,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* If optimized to something else and emitted, clean up and return */ if (ret >= 0) { - Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start, - RExC_parse - 
orig_parse);; SvREFCNT_dec(cp_list);; SvREFCNT_dec(only_utf8_locale_list); SvREFCNT_dec(upper_latin1_only_utf8_matches); @@ -19318,7 +19148,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } - ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof"); + ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); FILL_NODE(ret, op); /* We set the argument later */ RExC_emit += 1 + regarglen[op]; ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags; @@ -19855,7 +19685,7 @@ S_optimize_regclass(pTHX_ len = (UTF) ? UVCHR_SKIP(value) : 1; - *ret = regnode_guts(pRExC_state, op, len, "exact"); + *ret = REGNODE_GUTS(pRExC_state, op, len); FILL_NODE(*ret, op); RExC_emit += 1 + STR_SZ(len); setSTR_LEN(REGNODE_p(*ret), len); @@ -20202,9 +20032,8 @@ S_optimize_regclass(pTHX_ } else { op = ANYOFHs; - *ret = regnode_guts(pRExC_state, op, - regarglen[op] + STR_SZ(len), - "anyofhs"); + *ret = REGNODE_GUTS(pRExC_state, op, + regarglen[op] + STR_SZ(len)); FILL_NODE(*ret, op); ((struct regnode_anyofhs *) REGNODE_p(*ret))->str_len = len; @@ -20712,54 +20541,39 @@ S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) if (size > 0) { Zero(REGNODE_p(RExC_emit), size, regnode); } - -#ifdef RE_TRACK_PATTERN_OFFSETS - Renew(RExC_offsets, 2*RExC_size+1, U32); - if (size > 0) { - Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32); - } - RExC_offsets[0] = RExC_size; -#endif } STATIC regnode_offset -S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name) +S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const STRLEN extra_size) { - /* Allocate a regnode for 'op', with 'extra_size' extra (smallest) regnode - * equivalents space. It aligns and increments RExC_size + /* Allocate a regnode that is (1 + extra_size) times as big as the + * smallest regnode worth of space, and also aligns and increments + * RExC_size appropriately. 
* * It returns the regnode's offset into the regex engine program */ const regnode_offset ret = RExC_emit; - DECLARE_AND_GET_RE_DEBUG_FLAGS; - PERL_ARGS_ASSERT_REGNODE_GUTS; SIZE_ALIGN(RExC_size); change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size); NODE_ALIGN_FILL(REGNODE_p(ret)); -#ifndef RE_TRACK_PATTERN_OFFSETS - PERL_UNUSED_ARG(name); - PERL_UNUSED_ARG(op); -#else + return(ret); +} + +#ifdef DEBUGGING + +STATIC regnode_offset +S_regnode_guts_debug(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size) { + PERL_ARGS_ASSERT_REGNODE_GUTS_DEBUG; assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF); + return S_regnode_guts(aTHX_ pRExC_state, extra_size); +} - if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG( - ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n", - name, __LINE__, - PL_reg_name[op], - (UV)(RExC_emit) > RExC_offsets[0] - ? "Overwriting end of array!\n" : "OK", - (UV)(RExC_emit), - (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); - Set_Node_Offset(REGNODE_p(RExC_emit), RExC_parse + (op == END)); - } #endif - return(ret); -} + + /* - reg_node - emit a node @@ -20767,7 +20581,7 @@ S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_ STATIC regnode_offset /* Location. */ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node"); + const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REG_NODE; @@ -20785,7 +20599,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) STATIC regnode_offset /* Location. 
*/ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode"); + const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REGANODE; @@ -20804,7 +20618,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) STATIC regnode_offset /* Location. */ S_regpnode(pTHX_ RExC_state_t *pRExC_state, U8 op, SV * arg) { - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "regpnode"); + const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REGPNODE; @@ -20819,7 +20633,7 @@ S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const { /* emit a node with U32 and I32 arguments */ - const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode"); + const regnode_offset ret = REGNODE_GUTS(pRExC_state, op, regarglen[op]); regnode_offset ptr = ret; PERL_ARGS_ASSERT_REG2LANODE; @@ -20901,41 +20715,9 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, while (src > REGNODE_p(operand)) { StructCopy(--src, --dst, regnode); -#ifdef RE_TRACK_PATTERN_OFFSETS - if (RExC_offsets) { /* MJD 20010112 */ - MJD_OFFSET_DEBUG( - ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n", - "reginsert", - __LINE__, - PL_reg_name[op], - (UV)(REGNODE_OFFSET(dst)) > RExC_offsets[0] - ? "Overwriting end of array!\n" : "OK", - (UV)REGNODE_OFFSET(src), - (UV)REGNODE_OFFSET(dst), - (UV)RExC_offsets[0])); - Set_Node_Offset_To_R(REGNODE_OFFSET(dst), Node_Offset(src)); - Set_Node_Length_To_R(REGNODE_OFFSET(dst), Node_Length(src)); - } -#endif } place = REGNODE_p(operand); /* Op node, where operand used to be. 
*/ -#ifdef RE_TRACK_PATTERN_OFFSETS - if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG( - ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n", - "reginsert", - __LINE__, - PL_reg_name[op], - (UV)REGNODE_OFFSET(place) > RExC_offsets[0] - ? "Overwriting end of array!\n" : "OK", - (UV)REGNODE_OFFSET(place), - (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); - Set_Node_Offset(place, RExC_parse); - Set_Node_Length(place, 1); - } -#endif src = NEXTOPER(place); FLAGS(place) = 0; FILL_NODE(operand, op); @@ -22065,10 +21847,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) } }); -#ifdef RE_TRACK_PATTERN_OFFSETS - if (ri->u.offsets) - Safefree(ri->u.offsets); /* 20010421 MJD */ -#endif if (ri->code_blocks) S_free_codeblocks(aTHX_ ri->code_blocks); @@ -22393,14 +22171,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) reti->name_list_idx = ri->name_list_idx; -#ifdef RE_TRACK_PATTERN_OFFSETS - if (ri->u.offsets) { - Newx(reti->u.offsets, 2*len+1, U32); - Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); - } -#else SetProgLen(reti, len); -#endif return (void*)reti; } diff --git a/regcomp.h b/regcomp.h index 552cd6ed65f4..6b8dc27642f2 100644 --- a/regcomp.h +++ b/regcomp.h @@ -26,11 +26,6 @@ /* Not for production use: */ #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0 -/* Activate offsets code - set to if 1 to enable */ -#ifdef DEBUGGING -#define RE_TRACK_PATTERN_OFFSETS -#endif - /* * Structure for regexp "program". This is essentially a linear encoding * of a nondeterministic finite-state machine (aka syntax charts or @@ -65,24 +60,18 @@ /* This is the stuff that used to live in regexp.h that was truly private to the engine itself. It now lives here. 
*/ - typedef struct regexp_internal { - union { - U32 *offsets; /* offset annotations 20001228 MJD - data about mapping the program to the - string - - offsets[0] is proglen when this is used - */ - U32 proglen; - } u; - +typedef struct regexp_internal { regnode *regstclass; /* Optional startclass as identified or constructed by the optimiser */ struct reg_data *data; /* Additional miscellaneous data used by the program. Used to make it easier to clone and free arbitrary data that the regops need. Often the ARG field of - a regop is an index into this structure */ + a regop is an index into this structure. NOTE the + 0th element of this structure is NEVER used and is + strictly reserved for internal purposes. */ struct reg_code_blocks *code_blocks;/* positions of literal (?{}) */ - int name_list_idx; /* Optional data index of an array of paren names */ + U32 proglen; /* size of the compiled program in regnodes */ + int name_list_idx; /* Optional data index of an array of paren names */ regnode program[1]; /* Unwarranted chumminess with compiler. */ } regexp_internal; @@ -995,7 +984,6 @@ further group, as currently only the low three bytes are used. PEEP TRIE PROGRAM - OFFSETS Execute Options: @@ -1006,7 +994,6 @@ further group, as currently only the low three bytes are used. Extra Options TRIE - OFFSETS If you modify any of these make sure you make corresponding changes to re.pm, especially to the documentation. @@ -1032,8 +1019,6 @@ re.pm, especially to the documentation. /* Extra */ #define RE_DEBUG_EXTRA_MASK 0x3FF0000 #define RE_DEBUG_EXTRA_TRIE 0x0010000 -#define RE_DEBUG_EXTRA_OFFSETS 0x0020000 -#define RE_DEBUG_EXTRA_OFFDEBUG 0x0040000 #define RE_DEBUG_EXTRA_STATE 0x0080000 #define RE_DEBUG_EXTRA_OPTIMISE 0x0100000 #define RE_DEBUG_EXTRA_BUFFERS 0x0400000 @@ -1072,8 +1057,6 @@ re.pm, especially to the documentation. 
/* Extra */ #define DEBUG_EXTRA_r(x) DEBUG_r( \ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_MASK)) x ) -#define DEBUG_OFFSETS_r(x) DEBUG_r( \ - if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OFFSETS)) x ) #define DEBUG_STATE_r(x) DEBUG_r( \ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STATE)) x ) #define DEBUG_STACK_r(x) DEBUG_r( \ @@ -1084,9 +1067,6 @@ re.pm, especially to the documentation. #define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \ if (DEBUG_v_TEST || ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \ RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE))) x ) -#define MJD_OFFSET_DEBUG(x) DEBUG_r( \ - if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OFFDEBUG)) \ - Perl_warn_nocontext x ) #define DEBUG_TRIE_COMPILE_MORE_r(x) DEBUG_TRIE_COMPILE_r( \ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_TRIE)) x ) #define DEBUG_TRIE_EXECUTE_MORE_r(x) DEBUG_TRIE_EXECUTE_r( \ From 109abe1fa197c87964aa8f30f12bf5fae119e8d6 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Wed, 16 Feb 2022 03:49:52 +0100 Subject: [PATCH 3/4] regcomp.c: disambiguate "parse_start" and related var names This was originally done to make the cleanup of the offsets debug logic easier to follow and understand. 'parse_start' was heavily used in multiple functions, and given the size of the functions in regcomp.c it was often not clear which parse_start was which. 'oregcomp_parse' was also used in a similar way. This patch disambiguates them all so they are all uniquely named and relevant to the code they operate on and of the form "thing_parse_start", (or "thing_parse_start_const" where both were in use). 
--- embed.fnc | 2 +- proto.h | 4 ++-- regcomp.c | 50 +++++++++++++++++++++++++++++--------------------- 3 files changed, 32 insertions(+), 24 deletions(-) diff --git a/embed.fnc b/embed.fnc index c54294cf7e69..fc09075bdaa1 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2111,7 +2111,7 @@ ES |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \ |NN SV** invlist ES |regnode_offset|handle_named_backref|NN RExC_state_t *pRExC_state \ |NN I32 *flagp \ - |NN char * parse_start \ + |NN char * backref_parse_start \ |char ch ESTR |unsigned int|regex_set_precedence|const U8 my_operator ES |regnode_offset|handle_regex_sets|NN RExC_state_t *pRExC_state \ diff --git a/proto.h b/proto.h index 6f7619d3b475..111901d56667 100644 --- a/proto.h +++ b/proto.h @@ -5880,9 +5880,9 @@ STATIC U32 S_get_quantifier_value(pTHX_ RExC_state_t *pRExC_state, const char * STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode_offset* nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth); #define PERL_ARGS_ASSERT_GROK_BSLASH_N \ assert(pRExC_state); assert(flagp) -STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * parse_start, char ch); +STATIC regnode_offset S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * backref_parse_start, char ch); #define PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF \ - assert(pRExC_state); assert(flagp); assert(parse_start) + assert(pRExC_state); assert(flagp); assert(backref_parse_start) STATIC bool S_handle_names_wildcard(pTHX_ const char * wname, const STRLEN wname_len, SV ** prop_definition, AV ** strings); #define PERL_ARGS_ASSERT_HANDLE_NAMES_WILDCARD \ assert(wname); assert(prop_definition); assert(strings) diff --git a/regcomp.c b/regcomp.c index d735e39587b7..462fe2b08fdf 100644 --- a/regcomp.c +++ b/regcomp.c @@ -10994,7 +10994,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) STATIC regnode_offset 
S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, - char * parse_start, + char * backref_parse_start, char ch ) { @@ -11013,7 +11013,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, } if (RExC_parse == name_start || *RExC_parse != ch) { /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ - vFAIL2("Sequence %.3s... not terminated", parse_start); + vFAIL2("Sequence %.3s... not terminated", backref_parse_start); } if (sv_dat) { @@ -11115,8 +11115,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) SV * max_open; /* Max number of unclosed parens */ I32 was_in_lookaround = RExC_in_lookaround; - char * parse_start = RExC_parse; /* MJD */ - char * const oregcomp_parse = RExC_parse; + /* The difference between the following variables can be seen with * + * the broken pattern /(?:foo/ where segment_parse_start will point * + * at the 'f', and reg_parse_start will point at the '(' */ + + /* the following is used for unmatched '(' errors */ + char * const reg_parse_start = RExC_parse; + + /* the following is used to track where various segments of + * the pattern that we parse out started. */ + char * segment_parse_start = RExC_parse; DECLARE_AND_GET_RE_DEBUG_FLAGS; @@ -11501,7 +11509,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) else if (paren == '=') { /* (?P=...) named backref */ RExC_parse++; return handle_named_backref(pRExC_state, flagp, - parse_start, ')'); + segment_parse_start, ')'); } RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); /* diag_listed_as: Sequence (?%s...) 
not recognized in regex; marked by <-- HERE in m/%s/ */ @@ -11652,7 +11660,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /*notreached*/ /* named and numeric backreferences */ case '&': /* (?&NAME) */ - parse_start = RExC_parse - 1; + segment_parse_start = RExC_parse - 1; named_recursion: { SV *sv_dat = reg_scan_name(pRExC_state, @@ -11683,7 +11691,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) { bool is_neg = FALSE; UV unum; - parse_start = RExC_parse - 1; /* MJD */ + segment_parse_start = RExC_parse - 1; if (*RExC_parse == '-') { RExC_parse++; is_neg = TRUE; @@ -12181,7 +12189,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) parse_rest: /* Pick up the branches, linking them together. */ - parse_start = RExC_parse; + segment_parse_start = RExC_parse; br = regbranch(pRExC_state, &flags, 1, depth+1); /* branch_len = (paren != 0); */ @@ -12406,7 +12414,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { - RExC_parse = oregcomp_parse; + RExC_parse = reg_parse_start; vFAIL("Unmatched ("); } nextchar(pRExC_state); @@ -13509,7 +13517,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { regnode_offset ret = 0; I32 flags = 0; - char *parse_start; + char *atom_parse_start; U8 op; int invert = 0; @@ -13522,7 +13530,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) PERL_ARGS_ASSERT_REGATOM; tryagain: - parse_start = RExC_parse; + atom_parse_start = RExC_parse; assert(RExC_parse < RExC_end); switch ((U8)*RExC_parse) { case '^': @@ -13553,7 +13561,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) break; case '[': { - char * const oregcomp_parse = ++RExC_parse; + char * const cc_parse_start = ++RExC_parse; ret = regclass(pRExC_state, flagp, depth+1, FALSE, /* means parse the whole char 
class */ TRUE, /* allow multi-char folds */ @@ -13567,7 +13575,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) (UV) *flagp); } if (*RExC_parse != ']') { - RExC_parse = oregcomp_parse; + RExC_parse = cc_parse_start; vFAIL("Unmatched ["); } nextchar(pRExC_state); @@ -13854,7 +13862,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* The escapes above that don't take a parameter can't be * followed by a '{'. But 'pX', 'p{foo}' and * correspondingly 'P' can be */ - if ( RExC_parse - parse_start == 1 + if ( RExC_parse - atom_parse_start == 1 && UCHARAT(RExC_parse + 1) == '{' && UNLIKELY(! regcurly(RExC_parse + 1, RExC_end, NULL))) { @@ -13892,7 +13900,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RETURN_FAIL_ON_RESTART_FLAGP(flagp); /* Here, evaluates to a single code point. Go get that */ - RExC_parse = parse_start; + RExC_parse = atom_parse_start; goto defchar; case 'k': /* Handle \k<NAME> and \k'NAME' and \k{NAME} */ @@ -13906,7 +13914,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { RExC_parse++; /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ - vFAIL2("Sequence %.2s... not terminated", parse_start); + vFAIL2("Sequence %.2s... not terminated", atom_parse_start); } else { RExC_parse += 2; if (ch == '{') { @@ -13916,7 +13924,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } ret = handle_named_backref(pRExC_state, flagp, - parse_start, + atom_parse_start, (ch == '<') ? '>' : (ch == '{') @@ -14027,7 +14035,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * to be an octal character escape, e.g. \35 or \777. * The above logic should make it obvious why using * octal escapes in patterns is problematic.
- Yves */ - RExC_parse = parse_start; + RExC_parse = atom_parse_start; goto defchar; } } @@ -14089,7 +14097,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) default: /* Do not generate "unrecognized" warnings here, we fall back into the quick-grab loop below */ - RExC_parse = parse_start; + RExC_parse = atom_parse_start; goto defchar; } /* end of switch on a \foo sequence */ break; @@ -14328,7 +14336,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto loopdone; } p = RExC_parse; - RExC_parse = parse_start; + RExC_parse = atom_parse_start; /* The \N{} means the pattern, if previously /d, * becomes /u. That means it can't be an EXACTF node, @@ -14518,7 +14526,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * string of characters instead of a meta construct */ if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) { if ( RExC_strict - || ( p > parse_start + 1 + || ( p > atom_parse_start + 1 && isALPHA_A(*(p - 1)) && *(p - 2) == '\\')) { From f2545787c8ccf3f0c966aff71d12f16eaaded5c8 Mon Sep 17 00:00:00 2001 From: Yves Orton Date: Sat, 12 Feb 2022 05:36:36 +0100 Subject: [PATCH 4/4] regcomp.h: change regexp_internal attribute from I32 to U32 This changes the name_list_idx attribute from I32 to a U32 as it will never be negative, and as of a963d6d5acabdd8c7 a 0 can be safely used to represent "no value" for items in the 'data' array. I noticed this while cleaning up the offsets debug logic and updating the perlreguts documentation, so I figured I might as well clean it up at the same time. --- pod/perlreguts.pod | 6 +++--- regcomp.h | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pod/perlreguts.pod b/pod/perlreguts.pod index 2aae739d9b86..e19f157058ba 100644 --- a/pod/perlreguts.pod +++ b/pod/perlreguts.pod @@ -832,7 +832,7 @@ value to other engine implementations. 
struct reg_data *data; struct reg_code_blocks *code_blocks; U32 proglen; - int name_list_idx; + U32 name_list_idx; regnode program[1]; } regexp_internal; @@ -900,8 +900,8 @@ Stores the length of the compiled program in units of regops. This is the index into the data array where an AV is stored that contains the names of any named capture buffers in the pattern, should there be -any. This is only used in the debugging version of the regex engine. It -will be 0 if there is no such data. +any. This is only used in the debugging version of the regex engine and +when RXp_PAREN_NAMES(prog) is true. It will be 0 if there is no such data. =item C<program> diff --git a/regcomp.h b/regcomp.h index 6b8dc27642f2..2705463474e7 100644 --- a/regcomp.h +++ b/regcomp.h @@ -71,7 +71,10 @@ typedef struct regexp_internal { strictly reserved for internal purposes. */ struct reg_code_blocks *code_blocks;/* positions of literal (?{}) */ U32 proglen; /* size of the compiled program in regnodes */ - int name_list_idx; /* Optional data index of an array of paren names */ + U32 name_list_idx; /* Optional data index of an array of paren names, + only valid when RXp_PAREN_NAMES(prog) is true, + 0 means "no value" like any other index into the + data array.*/ regnode program[1]; /* Unwarranted chumminess with compiler. */ } regexp_internal;