Skip to content

Commit ffffbf0

Browse files
committed
Move data for PL_InBitmap to charclass_invlists.h
This makes it consistent with the other inversion lists for this sort of thing, and finishes the fix for GH #17154
1 parent 4fcf1e9 commit ffffbf0

File tree

6 files changed

+71
-40
lines changed

6 files changed

+71
-40
lines changed

charclass_invlists.h

+26-1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,22 @@
66

77
/* See the generating file for comments */
88

9+
/* This gives the number of code points that can be in the bitmap of an ANYOF
10+
* node. The shift number must currently be one of: 8..12. It can't be less
11+
* than 8 (256) because some code relies on it being at least that. Above 12
12+
* (4096), and you start running into warnings that some data structure widths
13+
* have been exceeded, though the test suite as of this writing still passes
14+
* for up through 16, which is as high as anyone would ever want to go,
15+
* encompassing all of the Unicode BMP, and thus including all the economically
16+
* important world scripts. At 12 most of them are: including Arabic,
17+
* Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
18+
* Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
19+
* the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
20+
* above 12.) Be sure to benchmark before changing, as larger sizes do
21+
* significantly slow down the test suite */
22+
23+
#define NUM_ANYOF_CODE_POINTS (1 << 8)
24+
925

1026
#if (defined(PERL_IN_REGCOMP_C) && ! defined(PERL_IN_XSUB_RE))
1127

@@ -29368,6 +29384,15 @@ static const GCB_enum _Perl_GCB_invmap[] = { /* for EBCDIC 037 */
2936829384

2936929385
#if (defined(PERL_IN_REGCOMP_C) && ! defined(PERL_IN_XSUB_RE))
2937029386

29387+
static const UV _Perl_InBitmap_invlist[] = { /* for all charsets */
29388+
2, /* Number of elements */
29389+
148565664, /* Version and data structure type */
29390+
0, /* 0 if the list starts at 0;
29391+
1 if it starts at the element beyond 0 */
29392+
0x0,
29393+
0x100
29394+
};
29395+
2937129396
# if 'A' == 65 /* ASCII/Latin1 */
2937229397

2937329398
static const UV _Perl_IVCF_invlist[] = { /* for ASCII/Latin1 */
@@ -395305,5 +395330,5 @@ static const U8 WB_table[23][23] = {
395305395330
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
395306395331
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
395307395332
* 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl
395308-
* 61ea8132bb9ea5c637609e2d026b0b85ce17d6bec544c2f08ce411e6f65e8386 regen/mk_invlists.pl
395333+
* eeb419293bc4fed3c653e3f41c9b7a889019e2134dc892b7bf669ccdcbeb869b regen/mk_invlists.pl
395309395334
* ex: set ro: */

lib/unicore/uni_keywords.pl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1265,5 +1265,5 @@
12651265
# a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
12661266
# 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
12671267
# 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl
1268-
# 61ea8132bb9ea5c637609e2d026b0b85ce17d6bec544c2f08ce411e6f65e8386 regen/mk_invlists.pl
1268+
# eeb419293bc4fed3c653e3f41c9b7a889019e2134dc892b7bf669ccdcbeb869b regen/mk_invlists.pl
12691269
# ex: set ro:

regcomp.c

+12-22
Original file line numberDiff line numberDiff line change
@@ -7368,28 +7368,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
73687368

73697369
DEBUG_r(if (!PL_colorset) reginitcolors());
73707370

7371-
/* Initialize these here instead of as-needed, as is quick and avoids
7372-
* having to test them each time otherwise */
7373-
if (! PL_InBitmap) {
7374-
#ifdef DEBUGGING
7375-
char * dump_len_string;
7376-
#endif
7377-
7378-
/* This is calculated here, because the Perl program that generates the
7379-
* static global ones doesn't currently have access to
7380-
* NUM_ANYOF_CODE_POINTS */
7381-
PL_InBitmap = _new_invlist(2);
7382-
PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
7383-
NUM_ANYOF_CODE_POINTS - 1);
7384-
#ifdef DEBUGGING
7385-
dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
7386-
if ( ! dump_len_string
7387-
|| ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
7388-
{
7389-
PL_dump_re_max_len = 60; /* A reasonable default */
7390-
}
7391-
#endif
7392-
}
73937371

73947372
pRExC_state->warn_text = NULL;
73957373
pRExC_state->unlexed_names = NULL;
@@ -22020,6 +21998,17 @@ Perl_init_uniprops(pTHX)
2202021998
{
2202121999
dVAR;
2202222000

22001+
#ifdef DEBUGGING
22002+
char * dump_len_string;
22003+
22004+
dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN");
22005+
if ( ! dump_len_string
22006+
|| ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
22007+
{
22008+
PL_dump_re_max_len = 60; /* A reasonable default */
22009+
}
22010+
#endif
22011+
2202322012
PL_user_def_props = newHV();
2202422013

2202522014
#ifdef USE_ITHREADS
@@ -22071,6 +22060,7 @@ Perl_init_uniprops(pTHX)
2207122060
PL_LB_invlist = _new_invlist_C_array(_Perl_LB_invlist);
2207222061
PL_SCX_invlist = _new_invlist_C_array(_Perl_SCX_invlist);
2207322062

22063+
PL_InBitmap = _new_invlist_C_array(_Perl_InBitmap_invlist);
2207422064
PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
2207522065
PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
2207622066
PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);

regcomp.h

-15
Original file line numberDiff line numberDiff line change
@@ -183,21 +183,6 @@ struct regnode_2 {
183183
U16 arg2;
184184
};
185185

186-
/* This give the number of code points that can be in the bitmap of an ANYOF
187-
* node. The shift number must currently be one of: 8..12. It can't be less
188-
* than 8 (256) because some code relies on it being at least that. Above 12
189-
* (4096), and you start running into warnings that some data structure widths
190-
* have been exceeded, though the test suite as of this writing still passes
191-
* for up through 16, which is as high as anyone would ever want to go,
192-
* encompassing all of the Unicode BMP, and thus including all the economically
193-
* important world scripts. At 12 most of them are: including Arabic,
194-
* Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
195-
* Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
196-
* the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
197-
* above 12.) Be sure to benchmark before changing, as larger sizes do
198-
* significantly slow down the test suite */
199-
#define NUM_ANYOF_CODE_POINTS (1 << 8)
200-
201186
#define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / 8) /* 8 bits/Byte */
202187

203188
/* Note that these form structs which are supersets of the next smaller one, by

regen/mk_invlists.pl

+31
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,29 @@
6161

6262
print $out_fh "/* See the generating file for comments */\n\n";
6363

64+
print $out_fh <<'EOF';
65+
/* This gives the number of code points that can be in the bitmap of an ANYOF
66+
* node. The shift number must currently be one of: 8..12. It can't be less
67+
* than 8 (256) because some code relies on it being at least that. Above 12
68+
* (4096), and you start running into warnings that some data structure widths
69+
* have been exceeded, though the test suite as of this writing still passes
70+
* for up through 16, which is as high as anyone would ever want to go,
71+
* encompassing all of the Unicode BMP, and thus including all the economically
72+
* important world scripts. At 12 most of them are: including Arabic,
73+
* Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
74+
* Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
75+
* the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
76+
* above 12.) Be sure to benchmark before changing, as larger sizes do
77+
* significantly slow down the test suite */
78+
79+
EOF
80+
81+
# The bitmap of an ANYOF node holds 2**$anyof_bitmap_shift code points.  The
# shift count is kept as a plain number so that the text of the C macro
# written to the header and the numeric value used later in this script (by
# _Perl_InBitmap) are both derived from the same place.  This avoids a string
# eval, whose failure would otherwise go unnoticed and silently generate a
# wrong inversion list.
my $anyof_bitmap_shift = 8;

print $out_fh "#define NUM_ANYOF_CODE_POINTS (1 << $anyof_bitmap_shift)\n\n";

# Numeric count of code points in the bitmap, for use by this script.
my $num_anyof_code_points = 1 << $anyof_bitmap_shift;
86+
6487
# enums that should be made public
6588
my %public_enums = (
6689
_Perl_SCX => 1
@@ -1125,6 +1148,13 @@ sub _Perl_CCC_non0_non230 {
11251148
return \@return;
11261149
}
11271150

1151+
sub _Perl_InBitmap {

    # Returns a reference to an array holding the list built by
    # mk_invlist_from_sorted_cp_list() for the code points
    # 0 .. $num_anyof_code_points - 1, i.e., those that can be in the bitmap
    # of an ANYOF node.

    my @bitmap_code_points = (0 .. $num_anyof_code_points - 1);
    my @invlist = mk_invlist_from_sorted_cp_list(\@bitmap_code_points);

    return \@invlist;
}
1157+
11281158
sub output_table_common {
11291159

11301160
# Common subroutine to actually output the generated rules table.
@@ -2353,6 +2383,7 @@ ($)
23532383
Case_Folding
23542384
&_Perl_IVCF
23552385
&_Perl_CCC_non0_non230
2386+
&_Perl_InBitmap
23562387
);
23572388
# NOTE that the convention is that extra enum values come
23582389
# after the property name, separated by commas, with the enums

uni_keywords.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -7288,6 +7288,6 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
72887288
* a712c758275b460d18fa77a26ed3589689bb3f69dcc1ea99b913e32db92a5cd2 lib/unicore/version
72897289
* 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl
72907290
* 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl
7291-
* 61ea8132bb9ea5c637609e2d026b0b85ce17d6bec544c2f08ce411e6f65e8386 regen/mk_invlists.pl
7291+
* eeb419293bc4fed3c653e3f41c9b7a889019e2134dc892b7bf669ccdcbeb869b regen/mk_invlists.pl
72927292
* c56b78df81e0f96632246052d71580b212546ca02ba4075158965e11d892f21e regen/mph.pl
72937293
* ex: set ro: */

0 commit comments

Comments
 (0)