t/re/uniprops.t: Add more description for \b{} tests

khwilliamson · khwilliamson · commit 4e058f448f14 · 2016-06-21T18:10:38.000-06:00
mktables generates a file of tests used in t/re/uniprops.t.
The tests furnished by Unicode for the boundaries like \b{gcb} have
comments that indicate the rules each test is testing.  These are useful
in debugging.  This commit changes things so that the generated file that
includes these Unicode-supplied tests also carries the corresponding
comments, which are output as part of the test descriptions.
diff --git a/charclass_invlists.h b/charclass_invlists.h
@@ -87887,7 +87887,7 @@ static const U8 WB_table[19][19] = {
  * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
  * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
  * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * 718d6ea8b96ee3d12c9c3a48ceb0f5cebe023634002ac8b2ede12b306273aa52 lib/unicore/mktables
+ * 2e9c8c898fd78231c21ff0da9facb8d231bf419bde94dc63075dff904be4f5f7 lib/unicore/mktables
  * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
  * 12bd58cb9d5a99f631ca95e269f7f9c90dacaf81020efa5d95a995f3cdc19200 regen/mk_invlists.pl
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
@@ -2198,6 +2198,15 @@ sub trace { return main::trace(@_); }
     # 'handler'
     main::set_access('each_line_handler', \%each_line_handler, 'c');
 
+    my %retain_trailing_comments;
+    # If true, the comments that end data lines are retained instead of being
+    # discarded.  This is meant only for files with non-typical syntax; most
+    # code here assumes that comments have been stripped, so special handlers
+    # have to be written for such files.  It is assumed that the retained text
+    # will be used in single-quoted contexts, so every "'" mark in the line
+    # (comment or not) is prefixed by a backslash.
+    main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
+
     my %properties; # Optional ordered list of the properties that occur in each
     # meaningful line of the input file.  If present, an appropriate
     # each_line_handler() is automatically generated and pushed onto the stack
@@ -2355,6 +2364,7 @@ sub trace { return main::trace(@_); }
 
         # Set defaults
         $handler{$addr} = \&main::process_generic_property_file;
+        $retain_trailing_comments{$addr} = 0;
         $non_skip{$addr} = 0;
         $skip{$addr} = undef;
         $has_missings_defaults{$addr} = $NO_DEFAULTS;
@@ -3020,9 +3030,21 @@ END
                 next;
             }
 
-            # Remove comments and trailing space, and skip this line if the
-            # result is empty
-            s/#.*//;
+            # Remove comments unless they are to be retained.  If they are
+            # retained, still skip comment-only lines
+            if ($retain_trailing_comments{$addr}) {
+                next if / ^ \s* \# /x;
+
+                # But escape any single quotes (done in both the comment and
+                # non-comment portion; this could be a bug someday, but not
+                # likely)
+                s/'/\\'/g;
+            }
+            else {
+                s/#.*//;
+            }
+
+            # Remove trailing space, and skip this line if the result is empty
             s/\s+$//;
             next if /^$/;
 
@@ -19188,18 +19210,21 @@ my @input_file_objects = (
                    ),
     Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
                     Handler => \&process_GCB_test,
+                    retain_trailing_comments => 1,
                    ),
     Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
                     Skip => $Validation_Documentation,
                    ),
     Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
                     Handler => \&process_SB_test,
+                    retain_trailing_comments => 1,
                    ),
     Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
                     Skip => $Validation_Documentation,
                    ),
     Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
                     Handler => \&process_WB_test,
+                    retain_trailing_comments => 1,
                    ),
     Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
                     Skip => $Validation_Documentation,
@@ -19250,6 +19275,7 @@ my @input_file_objects = (
                    ),
     Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
                     Handler => \&process_LB_test,
+                    retain_trailing_comments => 1,
                    ),
     Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
                     Skip => $Validation_Documentation,
@@ -19842,6 +19868,15 @@ sub _test_break($$) {
     my $break_type = shift;
 
     my $line   = (caller 1)[2];   # Line number
+    my $comment = "";
+
+    if ($template =~ / ( .*? ) \s* \# (.*) /x) {
+        $template = $1;
+        $comment = $2;
+
+        # Replace leading spaces with ' # ', restoring the stripped delimiter.
+        $comment =~ s/ ^ \s* / # /x;
+    }
 
     # The line contains characters above the ASCII range, but in Latin1.  It
     # may or may not be in utf8, and if it is, it may or may not know it.  So,
@@ -19985,7 +20020,10 @@ sub _test_break($$) {
 
             # Fancy display of test results
             $matched = ($matched) ? "matched" : "failed to match";
-            print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+            print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale$comment\n";
+
+            # Only print the comment on the first use of this line
+            $comment = "";
 
             # Repeat with the first \B{} in the pattern.  This makes sure the
             # code in regexec.c:find_byclass() for \B gets executed
diff --git a/regcharclass.h b/regcharclass.h
@@ -1895,7 +1895,7 @@
  * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd lib/unicore/extracted/DLineBreak.txt
  * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 lib/unicore/extracted/DNumType.txt
  * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed lib/unicore/extracted/DNumValues.txt
- * 718d6ea8b96ee3d12c9c3a48ceb0f5cebe023634002ac8b2ede12b306273aa52 lib/unicore/mktables
+ * 2e9c8c898fd78231c21ff0da9facb8d231bf419bde94dc63075dff904be4f5f7 lib/unicore/mktables
  * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 lib/unicore/version
  * 913d2f93f3cb6cdf1664db888bf840bc4eb074eef824e082fceda24a9445e60c regen/charset_translations.pl
  * d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 regen/regcharclass.pl