@@ -545,18 +545,96 @@ Perl_new_collate(pTHX_ const char *newcoll)
545
545
* transformations. */
546
546
547
547
{
548
- /* 2: at most so many chars ('a', 'b'). */
549
- /* 50: surely no system expands a char more. */
550
- #define XFRMBUFSIZE (2 * 50)
551
- char xbuf [XFRMBUFSIZE ];
552
- const Size_t fa = strxfrm (xbuf , "a" , XFRMBUFSIZE );
553
- const Size_t fb = strxfrm (xbuf , "ab" , XFRMBUFSIZE );
554
- const SSize_t mult = fb - fa ;
555
- if (mult < 1 && !(fa == 0 && fb == 0 ))
556
- Perl_croak (aTHX_ "panic: strxfrm() gets absurd - a => %" UVuf ", ab => %" UVuf ,
557
- (UV ) fa , (UV ) fb );
558
- PL_collxfrm_base = (fa > (Size_t )mult ) ? (fa - mult ) : 0 ;
559
- PL_collxfrm_mult = mult ;
548
+ /* We use the string below to find how long the tranformation of it
549
+ * is. Almost all locales are supersets of ASCII, or at least the
550
+ * ASCII letters. We use all of them, half upper half lower,
551
+ * because if we used fewer, we might hit just the ones that are
552
+ * outliers in a particular locale. Most of the strings being
553
+ * collated will contain a preponderance of letters, and even if
554
+ * they are above-ASCII, they are likely to have the same number of
555
+ * weight levels as the ASCII ones. It turns out that digits tend
556
+ * to have fewer levels, and some punctuation has more, but those
557
+ * are relatively sparse in text, and khw believes this gives a
558
+ * reasonable result, but it could be changed if experience so
559
+ * dictates. */
560
+ const char longer [] = "ABCDEFGHIJKLMnopqrstuvwxyz" ;
561
+ char * x_longer ; /* Transformed 'longer' */
562
+ Size_t x_len_longer ; /* Length of 'x_longer' */
563
+
564
+ char * x_shorter ; /* We also transform a substring of 'longer' */
565
+ Size_t x_len_shorter ;
566
+
567
+ /* mem_collxfrm() is used get the transformation (though here we
568
+ * are interested only in its length). It is used because it has
569
+ * the intelligence to handle all cases, but to work, it needs some
570
+ * values of 'm' and 'b' to get it started. For the purposes of
571
+ * this calculation we use a very conservative estimate of 'm' and
572
+ * 'b'. This assumes a weight can be multiple bytes, enough to
573
+ * hold any UV on the platform, and there are 5 levels, 4 weight
574
+ * bytes, and a trailing NUL. */
575
+ PL_collxfrm_base = 5 ;
576
+ PL_collxfrm_mult = 5 * sizeof (UV );
577
+
578
+ /* Find out how long the transformation really is */
579
+ x_longer = mem_collxfrm (longer ,
580
+ sizeof (longer ) - 1 ,
581
+ & x_len_longer );
582
+ Safefree (x_longer );
583
+
584
+ /* Find out how long the transformation of a substring of 'longer'
585
+ * is. Together the lengths of these transformations are
586
+ * sufficient to calculate 'm' and 'b'. The substring is all of
587
+ * 'longer' except the first character. This minimizes the chances
588
+ * of being swayed by outliers */
589
+ x_shorter = mem_collxfrm (longer + 1 ,
590
+ sizeof (longer ) - 2 ,
591
+ & x_len_shorter );
592
+ Safefree (x_shorter );
593
+
594
+ /* If the results are nonsensical for this simple test, the whole
595
+ * locale definition is suspect. Mark it so that locale collation
596
+ * is not active at all for it. XXX Should we warn? */
597
+ if ( x_len_shorter == 0
598
+ || x_len_longer == 0
599
+ || x_len_shorter >= x_len_longer )
600
+ {
601
+ PL_collxfrm_mult = 0 ;
602
+ PL_collxfrm_base = 0 ;
603
+ }
604
+ else {
605
+ SSize_t base ; /* Temporary */
606
+
607
+ /* We have both: m * strlen(longer) + b = x_len_longer
608
+ * m * strlen(shorter) + b = x_len_shorter;
609
+ * subtracting yields:
610
+ * m * (strlen(longer) - strlen(shorter))
611
+ * = x_len_longer - x_len_shorter
612
+ * But we have set things up so that 'shorter' is 1 byte smaller
613
+ * than 'longer'. Hence:
614
+ * m = x_len_longer - x_len_shorter
615
+ *
616
+ * But if something went wrong, make sure the multiplier is at
617
+ * least 1.
618
+ */
619
+ if (x_len_longer > x_len_shorter ) {
620
+ PL_collxfrm_mult = (STRLEN ) x_len_longer - x_len_shorter ;
621
+ }
622
+ else {
623
+ PL_collxfrm_mult = 1 ;
624
+ }
625
+
626
+ /* mx + b = len
627
+ * so: b = len - mx
628
+ * but in case something has gone wrong, make sure it is
629
+ * non-negative */
630
+ base = x_len_longer - PL_collxfrm_mult * (sizeof (longer ) - 1 );
631
+ if (base < 0 ) {
632
+ base = 0 ;
633
+ }
634
+
635
+ /* Add 1 for the trailing NUL */
636
+ PL_collxfrm_base = base + 1 ;
637
+ }
560
638
}
561
639
}
562
640
@@ -1304,12 +1382,17 @@ Perl_mem_collxfrm(pTHX_ const char *input_string,
1304
1382
{
1305
1383
char * s = (char * ) input_string ;
1306
1384
STRLEN s_strlen = strlen (input_string );
1307
- char * xbuf ;
1385
+ char * xbuf = NULL ;
1308
1386
STRLEN xAlloc , xout ; /* xalloc is a reserved word in VC */
1309
1387
bool first_time = TRUE; /* Cleared after first loop iteration */
1310
1388
1311
1389
PERL_ARGS_ASSERT_MEM_COLLXFRM ;
1312
1390
1391
+ /* If this locale has defective collation, skip */
1392
+ if (PL_collxfrm_base == 0 && PL_collxfrm_mult == 0 ) {
1393
+ goto bad ;
1394
+ }
1395
+
1313
1396
/* Replace any embedded NULs with the control that sorts before any others.
1314
1397
* This will give as good as possible results on strings that don't
1315
1398
* otherwise contain that character, but otherwise there may be
@@ -1506,7 +1589,6 @@ Perl_mem_collxfrm(pTHX_ const char *input_string,
1506
1589
}
1507
1590
1508
1591
#endif /* USE_LOCALE_COLLATE */
1509
-
1510
1592
#ifdef USE_LOCALE
1511
1593
1512
1594
bool
0 commit comments