Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit a4a439f

Browse files
committedMay 24, 2016
Do better locale collation in UTF-8 locales
On some platforms, the libc strxfrm() works reasonably well on UTF-8 locales, giving a default collation ordering. It will assume that every string passed to it is in UTF-8. This commit changes Perl to make sure that strxfrm's expectations are met. Likewise under a non-UTF-8 locale, strxfrm is expecting a non-UTF-8 string. And this commit makes sure of that as well. So, simply meeting strxfrm's expectations allows Perl to start supporting default collation in UTF-8 locales, and fixes it to work on single-byte locales with UTF-8 input. (Unicode::Collate provides tailorable functionality and is portable to platforms where strxfrm isn't as intelligent, but is a much more heavy-weight solution that may not be needed for particular applications.) There is a problem in non-UTF-8 locales if the passed string contains code points representable only in UTF-8. This commit causes them to be changed, before being passed to strxfrm, into the highest collating character in the locale that doesn't require UTF-8. They then will sort the same as that character, which means after all other characters in the locale but that one. In strings that don't have that character, this will generally provide exactly correct operation. There still is a problem, if that character, in the given locale, combines with adjacent characters to form a specially weighted sequence. Then, the change of these above-255 code points into that character can skew the results. See the commit message for 6696cfa for more on this. But it is really an illegal situation to have above-255 code points in a single-byte locale, so this behavior is a reasonable degradation when given illegal input. If two transformed strings compare exactly equal, Perl already uses the un-transformed versions to break ties, and there, these faked-up strings will collate so the above-255 code points sort after everything else, and in code point order amongst themselves.
1 parent ff52fcf commit a4a439f

File tree

10 files changed

+307
-41
lines changed

10 files changed

+307
-41
lines changed
 

‎embed.fnc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,12 @@ Ap |I32 * |markstack_grow
910910
p |int |magic_setcollxfrm|NN SV* sv|NN MAGIC* mg
911911
: Defined in locale.c, used only in sv.c
912912
p |char* |mem_collxfrm |NN const char* input_string|STRLEN len|NN STRLEN* xlen
913+
# if defined(PERL_IN_LOCALE_C) || defined(PERL_IN_SV_C)
914+
pM |char* |_mem_collxfrm |NN const char* input_string \
915+
|STRLEN len \
916+
|NN STRLEN* xlen \
917+
|bool utf8
918+
# endif
913919
#endif
914920
Afpd |SV* |mess |NN const char* pat|...
915921
Apd |SV* |mess_sv |NN SV* basemsg|bool consume

‎embed.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,6 +1559,11 @@
15591559
#define share_hek_flags(a,b,c,d) S_share_hek_flags(aTHX_ a,b,c,d)
15601560
#define unshare_hek_or_pvn(a,b,c,d) S_unshare_hek_or_pvn(aTHX_ a,b,c,d)
15611561
# endif
1562+
# if defined(PERL_IN_LOCALE_C) || defined(PERL_IN_SV_C)
1563+
# if defined(USE_LOCALE_COLLATE)
1564+
#define _mem_collxfrm(a,b,c,d) Perl__mem_collxfrm(aTHX_ a,b,c,d)
1565+
# endif
1566+
# endif
15621567
# if defined(PERL_IN_MALLOC_C)
15631568
#define adjust_size_and_find_bucket S_adjust_size_and_find_bucket
15641569
# endif

‎embedvar.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@
310310
#define PL_stdingv (vTHX->Istdingv)
311311
#define PL_strtab (vTHX->Istrtab)
312312
#define PL_strxfrm_is_behaved (vTHX->Istrxfrm_is_behaved)
313+
#define PL_strxfrm_max_cp (vTHX->Istrxfrm_max_cp)
313314
#define PL_strxfrm_min_char (vTHX->Istrxfrm_min_char)
314315
#define PL_sub_generation (vTHX->Isub_generation)
315316
#define PL_subline (vTHX->Isubline)

‎intrpvar.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,7 @@ PERLVARI(I, collation_ix, U32, 0) /* Collation generation index */
567567
PERLVARA(I, strxfrm_min_char, 3, char)
568568
PERLVARI(I, strxfrm_is_behaved, bool, TRUE)
569569
/* Assume until proven otherwise that it works */
570+
PERLVARI(I, strxfrm_max_cp, U8, 0) /* Highest collating cp in locale */
570571
PERLVARI(I, collation_standard, bool, TRUE)
571572
/* Assume simple collation */
572573
#endif /* USE_LOCALE_COLLATE */

‎lib/locale.t

Lines changed: 52 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1752,16 +1752,66 @@ foreach my $Locale (@Locale) {
17521752

17531753
++$locales_test_number;
17541754
$test_names{$locales_test_number}
1755-
= 'TODO Verify that strings with embedded NUL collate';
1755+
= 'Verify that strings with embedded NUL collate';
17561756
my $ok = "a\0a\0a" lt "a\001a\001a";
17571757
report_result($Locale, $locales_test_number, $ok);
17581758

17591759
++$locales_test_number;
17601760
$test_names{$locales_test_number}
1761-
= 'TODO Verify that strings with embedded NUL and '
1761+
= 'Verify that strings with embedded NUL and '
17621762
. 'extra trailing NUL collate';
17631763
$ok = "a\0a\0" lt "a\001a\001";
17641764
report_result($Locale, $locales_test_number, $ok);
1765+
1766+
++$locales_test_number;
1767+
$test_names{$locales_test_number}
1768+
= "Skip in non-UTF-8 locales; otherwise verify that UTF8ness "
1769+
. "doesn't matter with collation";
1770+
if (! $is_utf8_locale) {
1771+
report_result($Locale, $locales_test_number, 1);
1772+
}
1773+
else {
1774+
1775+
# khw can't think of anything better. Start with a string that is
1776+
# higher than its UTF-8 representation in both EBCDIC and ASCII
1777+
my $string = chr utf8::unicode_to_native(0xff);
1778+
my $utf8_string = $string;
1779+
utf8::upgrade($utf8_string);
1780+
1781+
# 8 should be lt 9 in all locales (except ones that aren't
1782+
# ASCII-based, which might fail this)
1783+
$ok = ("a${string}8") lt ("a${utf8_string}9");
1784+
report_result($Locale, $locales_test_number, $ok);
1785+
}
1786+
1787+
++$locales_test_number;
1788+
$test_names{$locales_test_number}
1789+
= "Skip in UTF-8 locales; otherwise verify that single byte "
1790+
. "collates before 0x100 and above";
1791+
if ($is_utf8_locale) {
1792+
report_result($Locale, $locales_test_number, 1);
1793+
}
1794+
else {
1795+
my $max_collating = chr 0; # Find byte that collates highest
1796+
for my $i (0 .. 255) {
1797+
my $char = chr $i;
1798+
$max_collating = $char if $char gt $max_collating;
1799+
}
1800+
$ok = $max_collating lt chr 0x100;
1801+
report_result($Locale, $locales_test_number, $ok);
1802+
}
1803+
1804+
++$locales_test_number;
1805+
$test_names{$locales_test_number}
1806+
= "Skip in UTF-8 locales; otherwise verify that 0x100 and "
1807+
. "above collate in code point order";
1808+
if ($is_utf8_locale) {
1809+
report_result($Locale, $locales_test_number, 1);
1810+
}
1811+
else {
1812+
$ok = chr 0x100 lt chr 0x101;
1813+
report_result($Locale, $locales_test_number, $ok);
1814+
}
17651815
}
17661816

17671817
my $ok1;

0 commit comments

Comments
 (0)
Please sign in to comment.