Skip to content

Commit 4e058f4

Browse files
committed
t/re/uniprops.t: Add more description for \b{} tests
mktables generates a file of tests that is used by t/re/uniprops.t. The tests furnished by Unicode for boundaries such as \b{gcb} carry comments indicating which rules each test exercises; these are useful in debugging. This commit makes the generated file, which includes these Unicode-supplied tests, retain the corresponding comments, which are then output as part of the test descriptions.
1 parent c92b26e commit 4e058f4

File tree

3 files changed

+44
-6
lines changed

3 files changed

+44
-6
lines changed

charclass_invlists.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87887,7 +87887,7 @@ static const U8 WB_table[19][19] = {
8788787887
* 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
8788887888
* 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
8788987889
* a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
87890-
* 718d6ea8b96ee3d12c9c3a48ceb0f5cebe023634002ac8b2ede12b306273aa52 lib/unicore/mktables
87890+
* 2e9c8c898fd78231c21ff0da9facb8d231bf419bde94dc63075dff904be4f5f7 lib/unicore/mktables
8789187891
* 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
8789287892
* 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
8789387893
* 12bd58cb9d5a99f631ca95e269f7f9c90dacaf81020efa5d95a995f3cdc19200 regen/mk_invlists.pl

lib/unicore/mktables

Lines changed: 42 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2198,6 +2198,15 @@ sub trace { return main::trace(@_); }
21982198
# 'handler'
21992199
main::set_access('each_line_handler', \%each_line_handler, 'c');
22002200

2201+
my %retain_trailing_comments;
2202+
# This is used to not discard the comments that end data lines. This
2203+
# would be used only for files with non-typical syntax, and most code here
2204+
# assumes that comments have been stripped, so special handlers would have
2205+
# to be written. It is assumed that the code will use these in
2206+
# single-quoted contexts, and so any "'" marks in the comment will be
2207+
# prefixed by a backslash.
2208+
main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2209+
22012210
my %properties; # Optional ordered list of the properties that occur in each
22022211
# meaningful line of the input file. If present, an appropriate
22032212
# each_line_handler() is automatically generated and pushed onto the stack
@@ -2355,6 +2364,7 @@ sub trace { return main::trace(@_); }
23552364

23562365
# Set defaults
23572366
$handler{$addr} = \&main::process_generic_property_file;
2367+
$retain_trailing_comments{$addr} = 0;
23582368
$non_skip{$addr} = 0;
23592369
$skip{$addr} = undef;
23602370
$has_missings_defaults{$addr} = $NO_DEFAULTS;
@@ -3020,9 +3030,21 @@ END
30203030
next;
30213031
}
30223032

3023-
# Remove comments and trailing space, and skip this line if the
3024-
# result is empty
3025-
s/#.*//;
3033+
# Unless to keep, remove comments. If to keep, ignore
3034+
# comment-only lines
3035+
if ($retain_trailing_comments{$addr}) {
3036+
next if / ^ \s* \# /x;
3037+
3038+
# But escape any single quotes (done in both the comment and
3039+
# non-comment portion; this could be a bug someday, but not
3040+
# likely)
3041+
s/'/\\'/g;
3042+
}
3043+
else {
3044+
s/#.*//;
3045+
}
3046+
3047+
# Remove trailing space, and skip this line if the result is empty
30263048
s/\s+$//;
30273049
next if /^$/;
30283050

@@ -19188,18 +19210,21 @@ my @input_file_objects = (
1918819210
),
1918919211
Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
1919019212
Handler => \&process_GCB_test,
19213+
retain_trailing_comments => 1,
1919119214
),
1919219215
Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
1919319216
Skip => $Validation_Documentation,
1919419217
),
1919519218
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
1919619219
Handler => \&process_SB_test,
19220+
retain_trailing_comments => 1,
1919719221
),
1919819222
Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
1919919223
Skip => $Validation_Documentation,
1920019224
),
1920119225
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
1920219226
Handler => \&process_WB_test,
19227+
retain_trailing_comments => 1,
1920319228
),
1920419229
Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
1920519230
Skip => $Validation_Documentation,
@@ -19250,6 +19275,7 @@ my @input_file_objects = (
1925019275
),
1925119276
Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
1925219277
Handler => \&process_LB_test,
19278+
retain_trailing_comments => 1,
1925319279
),
1925419280
Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
1925519281
Skip => $Validation_Documentation,
@@ -19842,6 +19868,15 @@ sub _test_break($$) {
1984219868
my $break_type = shift;
1984319869

1984419870
my $line = (caller 1)[2]; # Line number
19871+
my $comment = "";
19872+
19873+
if ($template =~ / ( .*? ) \s* \# (.*) /x) {
19874+
$template = $1;
19875+
$comment = $2;
19876+
19877+
# Replace leading spaces with a single one.
19878+
$comment =~ s/ ^ \s* / # /x;
19879+
}
1984519880

1984619881
# The line contains characters above the ASCII range, but in Latin1. It
1984719882
# may or may not be in utf8, and if it is, it may or may not know it. So,
@@ -19985,7 +20020,10 @@ sub _test_break($$) {
1998520020

1998620021
# Fancy display of test results
1998720022
$matched = ($matched) ? "matched" : "failed to match";
19988-
print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
20023+
print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20024+
20025+
# Only print the comment on the first use of this line
20026+
$comment = "";
1998920027

1999020028
# Repeat with the first \B{} in the pattern. This makes sure the
1999120029
# code in regexec.c:find_byclass() for \B gets executed

regcharclass.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1895,7 +1895,7 @@
18951895
* 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
18961896
* 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
18971897
* a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
1898-
* 718d6ea8b96ee3d12c9c3a48ceb0f5cebe023634002ac8b2ede12b306273aa52 lib/unicore/mktables
1898+
* 2e9c8c898fd78231c21ff0da9facb8d231bf419bde94dc63075dff904be4f5f7 lib/unicore/mktables
18991899
* 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
19001900
* 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
19011901
* d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl

0 commit comments

Comments
 (0)