@@ -7061,12 +7061,13 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7061
7061
* these up into smaller chunks, but doesn't merge any together. This
7062
7062
* makes it easy to find the instances it's looking for. A second pass is
7063
7063
* done after this has been determined which merges things together to
7064
- * shrink the table for runtime. For ASCII platforms, the table is
7065
- * trivial, given below, and uses the fundamental characteristics of UTF-8
7066
- * to construct the values. For EBCDIC, it isn't so, and we rely on a
7067
- * table constructed by the perl script that generates these kinds of
7068
- * things */
7069
- #ifndef EBCDIC
7064
+ * shrink the table for runtime. The table below is used for both ASCII
7065
+ * and EBCDIC platforms. On EBCDIC, the byte length is not monotonically
7066
+ * increasing for code points below 256. To correct for that, the macro
7067
+ * CP_ADJUST defined below converts those code points to ASCII in the first
7068
+ * pass, and we use the ASCII partition values. That works because the
7069
+ * growth factor will be unaffected, which is all that is calculated during
7070
+ * the first pass. */
7070
7071
UV PL_partition_by_byte_length[] = {
7071
7072
0,
7072
7073
0x80, /* Below this is 1 byte representations */
@@ -7083,8 +7084,6 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7083
7084
7084
7085
};
7085
7086
7086
- #endif
7087
-
7088
7087
PERL_ARGS_ASSERT_PMTRANS;
7089
7088
7090
7089
PL_hints |= HINT_BLOCK_SCOPE;
@@ -7212,6 +7211,21 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7212
7211
t_array = invlist_array(t_invlist);
7213
7212
}
7214
7213
7214
+ /* As noted earlier, we convert EBCDIC code points to Unicode in the first pass
7215
+ * so as to get the well-behaved length 1 vs length 2 boundary. Only code
7216
+ * points below 256 differ between the two character sets in this regard. For
7217
+ * these, we also can't have any ranges, as they have to be individually
7218
+ * converted. */
7219
+ #ifdef EBCDIC
7220
+ # define CP_ADJUST(x) ((pass2) ? (x) : NATIVE_TO_UNI(x))
7221
+ # define FORCE_RANGE_LEN_1(x) ((pass2) ? 0 : ((x) < 256))
7222
+ # define CP_SKIP(x) ((pass2) ? UVCHR_SKIP(x) : OFFUNISKIP(x))
7223
+ #else
7224
+ # define CP_ADJUST(x) (x)
7225
+ # define FORCE_RANGE_LEN_1(x) 0
7226
+ # define CP_SKIP(x) UVCHR_SKIP(x)
7227
+ #endif
7228
+
7215
7229
/* And the mapping of each of the ranges is initialized. Initially,
7216
7230
* everything is TR_UNLISTED. */
7217
7231
for (i = 0; i < len; i++) {
@@ -7345,7 +7359,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7345
7359
7346
7360
/* Here, not in the middle of a range, and not UTF-8. The
7347
7361
* next code point is the single byte where we're at */
7348
- t_cp = *t ;
7362
+ t_cp = CP_ADJUST(*t) ;
7349
7363
t_range_count = 1;
7350
7364
t++;
7351
7365
}
@@ -7356,15 +7370,17 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7356
7370
* next code point is the next UTF-8 char in the input. We
7357
7371
* know the input is valid, because the toker constructed
7358
7372
* it */
7359
- t_cp = valid_utf8_to_uvchr(t, &t_char_len);
7373
+ t_cp = CP_ADJUST( valid_utf8_to_uvchr(t, &t_char_len) );
7360
7374
t += t_char_len;
7361
7375
7362
7376
/* UTF-8 strings (only) have been parsed in toke.c to have
7363
7377
* ranges. See if the next byte indicates that this was
7364
7378
* the first element of a range. If so, get the final
7365
7379
* element and calculate the range size. If not, the range
7366
7380
* size is 1 */
7367
- if (t < tend && *t == RANGE_INDICATOR) {
7381
+ if ( t < tend && *t == RANGE_INDICATOR
7382
+ && ! FORCE_RANGE_LEN_1(t_cp))
7383
+ {
7368
7384
t++;
7369
7385
t_range_count = valid_utf8_to_uvchr(t, &t_char_len)
7370
7386
- t_cp + 1;
@@ -7396,16 +7412,18 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7396
7412
}
7397
7413
else {
7398
7414
if (! rstr_utf8) {
7399
- r_cp = *r ;
7415
+ r_cp = CP_ADJUST(*r) ;
7400
7416
r_range_count = 1;
7401
7417
r++;
7402
7418
}
7403
7419
else {
7404
7420
Size_t r_char_len;
7405
7421
7406
- r_cp = valid_utf8_to_uvchr(r, &r_char_len);
7422
+ r_cp = CP_ADJUST( valid_utf8_to_uvchr(r, &r_char_len) );
7407
7423
r += r_char_len;
7408
- if (r < rend && *r == RANGE_INDICATOR) {
7424
+ if ( r < rend && *r == RANGE_INDICATOR
7425
+ && ! FORCE_RANGE_LEN_1(r_cp))
7426
+ {
7409
7427
r++;
7410
7428
r_range_count = valid_utf8_to_uvchr(r,
7411
7429
&r_char_len) - r_cp + 1;
@@ -7537,7 +7555,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7537
7555
* code point in the rhs against any code point in the lhs. */
7538
7556
if ( ! pass2
7539
7557
&& r_cp_end != TR_SPECIAL_HANDLING
7540
- && UVCHR_SKIP (t_cp_end) < UVCHR_SKIP (r_cp_end))
7558
+ && CP_SKIP (t_cp_end) < CP_SKIP (r_cp_end))
7541
7559
{
7542
7560
/* Here, we will need to make a copy of the input string
7543
7561
* before doing the transliteration. The worst possible
@@ -7560,8 +7578,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7560
7578
* string not being UTF-8 */
7561
7579
NV t_size = (can_force_utf8 && t_cp < 256)
7562
7580
? 1
7563
- : UVCHR_SKIP (t_cp_end);
7564
- NV ratio = UVCHR_SKIP (r_cp_end) / t_size;
7581
+ : CP_SKIP (t_cp_end);
7582
+ NV ratio = CP_SKIP (r_cp_end) / t_size;
7565
7583
7566
7584
o->op_private |= OPpTRANS_GROWS;
7567
7585
@@ -7594,8 +7612,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7594
7612
* is if it 'grows'. But in the 2nd pass, there's no
7595
7613
* reason to not merge */
7596
7614
if ( (i > 0 && ( pass2
7597
- || UVCHR_SKIP (t_array[i-1])
7598
- == UVCHR_SKIP (t_cp)))
7615
+ || CP_SKIP (t_array[i-1])
7616
+ == CP_SKIP (t_cp)))
7599
7617
&& ( ( r_cp == TR_SPECIAL_HANDLING
7600
7618
&& r_map[i-1] == TR_SPECIAL_HANDLING)
7601
7619
|| ( r_cp != TR_SPECIAL_HANDLING
@@ -7615,7 +7633,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
7615
7633
adjacent_to_range_above = TRUE;
7616
7634
if (i + 1 < len)
7617
7635
if ( ( pass2
7618
- || UVCHR_SKIP (t_cp) == UVCHR_SKIP (t_array[i+1]))
7636
+ || CP_SKIP (t_cp) == CP_SKIP (t_array[i+1]))
7619
7637
&& ( ( r_cp == TR_SPECIAL_HANDLING
7620
7638
&& r_map[i+1] == (UV) TR_SPECIAL_HANDLING)
7621
7639
|| ( r_cp != TR_SPECIAL_HANDLING
0 commit comments