Perl · May 24, 2016
diff --git a/embed.fnc b/embed.fnc
Lines changed: 6 additions & 0 deletions
diff --git a/embed.h b/embed.h
Lines changed: 5 additions & 0 deletions
diff --git a/embedvar.h b/embedvar.h
Lines changed: 1 addition & 0 deletions
diff --git a/intrpvar.h b/intrpvar.h
Lines changed: 1 addition & 0 deletions
diff --git a/lib/locale.t b/lib/locale.t
Lines changed: 52 additions & 2 deletions
@@ -910,6 +910,12 @@ Ap	|I32 *	|markstack_grow
 p	|int	|magic_setcollxfrm|NN SV* sv|NN MAGIC* mg
 : Defined in locale.c, used only in sv.c
 p	|char*	|mem_collxfrm	|NN const char* input_string|STRLEN len|NN STRLEN* xlen
+#   if defined(PERL_IN_LOCALE_C) || defined(PERL_IN_SV_C)
+pM	|char*	|_mem_collxfrm	|NN const char* input_string	\
+				|STRLEN len			\
+				|NN STRLEN* xlen		\
+				|bool utf8
+#   endif
 #endif
 Afpd	|SV*	|mess		|NN const char* pat|...
 Apd	|SV*	|mess_sv	|NN SV* basemsg|bool consume
 
@@ -1559,6 +1559,11 @@
 #define share_hek_flags(a,b,c,d)	S_share_hek_flags(aTHX_ a,b,c,d)
 #define unshare_hek_or_pvn(a,b,c,d)	S_unshare_hek_or_pvn(aTHX_ a,b,c,d)
 #  endif
+#  if defined(PERL_IN_LOCALE_C) || defined(PERL_IN_SV_C)
+#    if defined(USE_LOCALE_COLLATE)
+#define _mem_collxfrm(a,b,c,d)	Perl__mem_collxfrm(aTHX_ a,b,c,d)
+#    endif
+#  endif
 #  if defined(PERL_IN_MALLOC_C)
 #define adjust_size_and_find_bucket	S_adjust_size_and_find_bucket
 #  endif
 
@@ -310,6 +310,7 @@
 #define PL_stdingv		(vTHX->Istdingv)
 #define PL_strtab		(vTHX->Istrtab)
 #define PL_strxfrm_is_behaved	(vTHX->Istrxfrm_is_behaved)
+#define PL_strxfrm_max_cp	(vTHX->Istrxfrm_max_cp)
 #define PL_strxfrm_min_char	(vTHX->Istrxfrm_min_char)
 #define PL_sub_generation	(vTHX->Isub_generation)
 #define PL_subline		(vTHX->Isubline)
 
@@ -567,6 +567,7 @@ PERLVARI(I, collation_ix, U32,	0)	/* Collation generation index */
 PERLVARA(I, strxfrm_min_char, 3, char)
 PERLVARI(I, strxfrm_is_behaved, bool, TRUE)
                             /* Assume until proven otherwise that it works */
+PERLVARI(I, strxfrm_max_cp, U8, 0)      /* Highest collating cp in locale */
 PERLVARI(I, collation_standard, bool, TRUE)
 					/* Assume simple collation */
 #endif /* USE_LOCALE_COLLATE */
 
@@ -1752,16 +1752,66 @@ foreach my $Locale (@Locale) {
 
         ++$locales_test_number;
         $test_names{$locales_test_number}
-                            = 'TODO Verify that strings with embedded NUL collate';
+                            = 'Verify that strings with embedded NUL collate';
         my $ok = "a\0a\0a" lt "a\001a\001a";
         report_result($Locale, $locales_test_number, $ok);
 
         ++$locales_test_number;
         $test_names{$locales_test_number}
-                            = 'TODO Verify that strings with embedded NUL and '
+                            = 'Verify that strings with embedded NUL and '
                             . 'extra trailing NUL collate';
         $ok = "a\0a\0" lt "a\001a\001";
         report_result($Locale, $locales_test_number, $ok);
+
+        ++$locales_test_number;
+        $test_names{$locales_test_number}
+            = "Skip in non-UTF-8 locales; otherwise verify that UTF8ness "
+            . "doesn't matter with collation";
+        if (! $is_utf8_locale) {
+            report_result($Locale, $locales_test_number, 1);
+        }
+        else {
+
+            # khw can't think of anything better.  Start with a string that sorts
+            # bytewise higher than its UTF-8 representation in EBCDIC and ASCII
+            my $string = chr utf8::unicode_to_native(0xff);
+            my $utf8_string = $string;
+            utf8::upgrade($utf8_string);
+
+            # 8 should be lt 9 in all locales (except ones that aren't
+            # ASCII-based, which might fail this)
+            $ok = ("a${string}8") lt ("a${utf8_string}9");
+            report_result($Locale, $locales_test_number, $ok);
+        }
+
+        ++$locales_test_number;
+        $test_names{$locales_test_number}
+            = "Skip in UTF-8 locales; otherwise verify that single byte "
+            . "collates before 0x100 and above";
+        if ($is_utf8_locale) {
+            report_result($Locale, $locales_test_number, 1);
+        }
+        else {
+            my $max_collating = chr 0;  # Find byte that collates highest
+            for my $i (0 .. 255) {
+                my $char = chr $i;
+                $max_collating = $char if $char gt $max_collating;
+            }
+            $ok = $max_collating lt chr 0x100;
+            report_result($Locale, $locales_test_number, $ok);
+        }
+
+        ++$locales_test_number;
+        $test_names{$locales_test_number}
+            = "Skip in UTF-8 locales; otherwise verify that 0x100 and "
+            . "above collate in code point order";
+        if ($is_utf8_locale) {
+            report_result($Locale, $locales_test_number, 1);
+        }
+        else {
+            $ok = chr 0x100 lt chr 0x101;
+            report_result($Locale, $locales_test_number, $ok);
+        }
     }
 
     my $ok1;