Skip to content

Commit 65a97d4

Browse files
khwilliamson authored and xsawyerx committed
Fix tr/// determination of inplace editing for EBCDIC
I realized, as a result of fixing GH #17654, that the code didn't properly decide whether a tr/// can be done in-place on EBCDIC platforms. Since we didn't have an EBCDIC smoker at the time, I couldn't be sure that the fix actually worked. Now that we do have a smoker, I have successfully tested it. This patch is constructed so that the code generated on non-EBCDIC platforms should not be changed by it.
1 parent e2d0e9a commit 65a97d4

File tree

3 files changed

+38
-180
lines changed

3 files changed

+38
-180
lines changed

ebcdic_tables.h

Lines changed: 0 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -413,60 +413,6 @@ SOFTWARE.
413413
};
414414
# endif
415415

416-
/* This table partitions all the code points of the platform into ranges which
417-
* have the property that all the code points in each range have the same
418-
* number of bytes in their UTF-EBCDIC representations, and the adjacent
419-
* ranges have a different number of bytes.
420-
*
421-
* Each number in the table begins such a range, which extends up to just
422-
* before the following table entry, except the final entry is understood to
423-
* extend to the platform's infinity
424-
*/
425-
# ifndef DOINIT
426-
EXTCONST UV PL_partition_by_byte_length[38];
427-
# else
428-
EXTCONST UV PL_partition_by_byte_length[38] = {
429-
0x00,
430-
0x41,
431-
0x4b,
432-
0x51,
433-
0x5a,
434-
0x62,
435-
0x6b,
436-
0x70,
437-
0x79,
438-
0x80,
439-
0x81,
440-
0x8a,
441-
0x91,
442-
0x9a,
443-
0xa1,
444-
0xaa,
445-
0xad,
446-
0xae,
447-
0xbd,
448-
0xbe,
449-
0xc0,
450-
0xca,
451-
0xd0,
452-
0xda,
453-
0xe0,
454-
0xe1,
455-
0xe2,
456-
0xea,
457-
0xf0,
458-
0xfa,
459-
0xff,
460-
0x100,
461-
0x400,
462-
0x4000,
463-
0x40000,
464-
0x400000,
465-
0x4000000,
466-
0x40000000
467-
};
468-
# endif
469-
470416
#endif /* EBCDIC 1047 */
471417

472418
#if 'A' == 193 /* EBCDIC 037 */ \
@@ -845,62 +791,6 @@ SOFTWARE.
845791
};
846792
# endif
847793

848-
/* This table partitions all the code points of the platform into ranges which
849-
* have the property that all the code points in each range have the same
850-
* number of bytes in their UTF-EBCDIC representations, and the adjacent
851-
* ranges have a different number of bytes.
852-
*
853-
* Each number in the table begins such a range, which extends up to just
854-
* before the following table entry, except the final entry is understood to
855-
* extend to the platform's infinity
856-
*/
857-
# ifndef DOINIT
858-
EXTCONST UV PL_partition_by_byte_length[40];
859-
# else
860-
EXTCONST UV PL_partition_by_byte_length[40] = {
861-
0x00,
862-
0x41,
863-
0x4b,
864-
0x51,
865-
0x5a,
866-
0x5f,
867-
0x60,
868-
0x62,
869-
0x6b,
870-
0x70,
871-
0x79,
872-
0x80,
873-
0x81,
874-
0x8a,
875-
0x91,
876-
0x9a,
877-
0xa1,
878-
0xaa,
879-
0xb0,
880-
0xb1,
881-
0xba,
882-
0xbc,
883-
0xc0,
884-
0xca,
885-
0xd0,
886-
0xda,
887-
0xe0,
888-
0xe1,
889-
0xe2,
890-
0xea,
891-
0xf0,
892-
0xfa,
893-
0xff,
894-
0x100,
895-
0x400,
896-
0x4000,
897-
0x40000,
898-
0x400000,
899-
0x4000000,
900-
0x40000000
901-
};
902-
# endif
903-
904794
#endif /* EBCDIC 037 */
905795

906796
#endif /* PERL_EBCDIC_TABLES_H_ */

op.c

Lines changed: 38 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -7061,12 +7061,13 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
70617061
* these up into smaller chunks, but doesn't merge any together. This
70627062
* makes it easy to find the instances it's looking for. A second pass is
70637063
* done after this has been determined which merges things together to
7064-
* shrink the table for runtime. For ASCII platforms, the table is
7065-
* trivial, given below, and uses the fundamental characteristics of UTF-8
7066-
* to construct the values. For EBCDIC, it isn't so, and we rely on a
7067-
* table constructed by the perl script that generates these kinds of
7068-
* things */
7069-
#ifndef EBCDIC
7064+
* shrink the table for runtime. The table below is used for both ASCII
7065+
* and EBCDIC platforms. On EBCDIC, the byte length is not monotonically
7066+
* increasing for code points below 256. To correct for that, the macro
7067+
* CP_ADJUST defined below converts those code points to ASCII in the first
7068+
* pass, and we use the ASCII partition values. That works because the
7069+
* growth factor will be unaffected, which is all that is calculated during
7070+
* the first pass. */
70707071
UV PL_partition_by_byte_length[] = {
70717072
0,
70727073
0x80, /* Below this is 1 byte representations */
@@ -7083,8 +7084,6 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
70837084

70847085
};
70857086

7086-
#endif
7087-
70887087
PERL_ARGS_ASSERT_PMTRANS;
70897088

70907089
PL_hints |= HINT_BLOCK_SCOPE;
@@ -7212,6 +7211,21 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
72127211
t_array = invlist_array(t_invlist);
72137212
}
72147213

7214+
/* As noted earlier, we convert EBCDIC code points to Unicode in the first pass
7215+
* so as to get the well-behaved length 1 vs length 2 boundary. Only code
7216+
* points below 256 differ between the two character sets in this regard. For
7217+
* these, we also can't have any ranges, as they have to be individually
7218+
* converted. */
7219+
#ifdef EBCDIC
7220+
# define CP_ADJUST(x) ((pass2) ? (x) : NATIVE_TO_UNI(x))
7221+
# define FORCE_RANGE_LEN_1(x) ((pass2) ? 0 : ((x) < 256))
7222+
# define CP_SKIP(x) ((pass2) ? UVCHR_SKIP(x) : OFFUNISKIP(x))
7223+
#else
7224+
# define CP_ADJUST(x) (x)
7225+
# define FORCE_RANGE_LEN_1(x) 0
7226+
# define CP_SKIP(x) UVCHR_SKIP(x)
7227+
#endif
7228+
72157229
/* And the mapping of each of the ranges is initialized. Initially,
72167230
* everything is TR_UNLISTED. */
72177231
for (i = 0; i < len; i++) {
@@ -7345,7 +7359,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
73457359

73467360
/* Here, not in the middle of a range, and not UTF-8. The
73477361
* next code point is the single byte where we're at */
7348-
t_cp = *t;
7362+
t_cp = CP_ADJUST(*t);
73497363
t_range_count = 1;
73507364
t++;
73517365
}
@@ -7356,15 +7370,17 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
73567370
* next code point is the next UTF-8 char in the input. We
73577371
* know the input is valid, because the toker constructed
73587372
* it */
7359-
t_cp = valid_utf8_to_uvchr(t, &t_char_len);
7373+
t_cp = CP_ADJUST(valid_utf8_to_uvchr(t, &t_char_len));
73607374
t += t_char_len;
73617375

73627376
/* UTF-8 strings (only) have been parsed in toke.c to have
73637377
* ranges. See if the next byte indicates that this was
73647378
* the first element of a range. If so, get the final
73657379
* element and calculate the range size. If not, the range
73667380
* size is 1 */
7367-
if (t < tend && *t == RANGE_INDICATOR) {
7381+
if ( t < tend && *t == RANGE_INDICATOR
7382+
&& ! FORCE_RANGE_LEN_1(t_cp))
7383+
{
73687384
t++;
73697385
t_range_count = valid_utf8_to_uvchr(t, &t_char_len)
73707386
- t_cp + 1;
@@ -7396,16 +7412,18 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
73967412
}
73977413
else {
73987414
if (! rstr_utf8) {
7399-
r_cp = *r;
7415+
r_cp = CP_ADJUST(*r);
74007416
r_range_count = 1;
74017417
r++;
74027418
}
74037419
else {
74047420
Size_t r_char_len;
74057421

7406-
r_cp = valid_utf8_to_uvchr(r, &r_char_len);
7422+
r_cp = CP_ADJUST(valid_utf8_to_uvchr(r, &r_char_len));
74077423
r += r_char_len;
7408-
if (r < rend && *r == RANGE_INDICATOR) {
7424+
if ( r < rend && *r == RANGE_INDICATOR
7425+
&& ! FORCE_RANGE_LEN_1(r_cp))
7426+
{
74097427
r++;
74107428
r_range_count = valid_utf8_to_uvchr(r,
74117429
&r_char_len) - r_cp + 1;
@@ -7537,7 +7555,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
75377555
* code point in the rhs against any code point in the lhs. */
75387556
if ( ! pass2
75397557
&& r_cp_end != TR_SPECIAL_HANDLING
7540-
&& UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end))
7558+
&& CP_SKIP(t_cp_end) < CP_SKIP(r_cp_end))
75417559
{
75427560
/* Here, we will need to make a copy of the input string
75437561
* before doing the transliteration. The worst possible
@@ -7560,8 +7578,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
75607578
* string not being UTF-8 */
75617579
NV t_size = (can_force_utf8 && t_cp < 256)
75627580
? 1
7563-
: UVCHR_SKIP(t_cp_end);
7564-
NV ratio = UVCHR_SKIP(r_cp_end) / t_size;
7581+
: CP_SKIP(t_cp_end);
7582+
NV ratio = CP_SKIP(r_cp_end) / t_size;
75657583

75667584
o->op_private |= OPpTRANS_GROWS;
75677585

@@ -7594,8 +7612,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
75947612
* is if it 'grows'. But in the 2nd pass, there's no
75957613
* reason to not merge */
75967614
if ( (i > 0 && ( pass2
7597-
|| UVCHR_SKIP(t_array[i-1])
7598-
== UVCHR_SKIP(t_cp)))
7615+
|| CP_SKIP(t_array[i-1])
7616+
== CP_SKIP(t_cp)))
75997617
&& ( ( r_cp == TR_SPECIAL_HANDLING
76007618
&& r_map[i-1] == TR_SPECIAL_HANDLING)
76017619
|| ( r_cp != TR_SPECIAL_HANDLING
@@ -7615,7 +7633,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
76157633
adjacent_to_range_above = TRUE;
76167634
if (i + 1 < len)
76177635
if ( ( pass2
7618-
|| UVCHR_SKIP(t_cp) == UVCHR_SKIP(t_array[i+1]))
7636+
|| CP_SKIP(t_cp) == CP_SKIP(t_array[i+1]))
76197637
&& ( ( r_cp == TR_SPECIAL_HANDLING
76207638
&& r_map[i+1] == (UV) TR_SPECIAL_HANDLING)
76217639
|| ( r_cp != TR_SPECIAL_HANDLING

regen/ebcdic.pl

Lines changed: 0 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -779,56 +779,6 @@ END
779779
output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
780780
}
781781

782-
{
783-
print $out_fh <<EOF;
784-
/* This table partitions all the code points of the platform into ranges which
785-
* have the property that all the code points in each range have the same
786-
* number of bytes in their UTF-EBCDIC representations, and the adjacent
787-
* ranges have a different number of bytes.
788-
*
789-
* Each number in the table begins such a range, which extends up to just
790-
* before the following table entry, except the final entry is understood to
791-
* extend to the platform's infinity
792-
*/
793-
EOF
794-
# The lengths of the characters between 0 and 255 are either 1 or 2,
795-
# with those whose ASCII platform equivalents below 160 being 1, and
796-
# the rest being 2.
797-
my @list;
798-
push @list, 0;
799-
my $pushed_range_is_length_1 = 1;
800-
801-
for my $i (1 .. 0xFF) {
802-
my $this_code_point_is_length_1 = ($e2a[$i] < 160);
803-
if ($pushed_range_is_length_1 != $this_code_point_is_length_1) {
804-
push @list, $i;
805-
$pushed_range_is_length_1 = $this_code_point_is_length_1;
806-
}
807-
}
808-
809-
# Starting at 256, the length is 2.
810-
push @list, 0x100 if $pushed_range_is_length_1;
811-
812-
# These are based on the fundamental properties of UTF-EBCDIC. Each
813-
# continuation byte has 5 bits of information. Comments in utf8.h
814-
# explain the rest.
815-
my $UTF_ACCUMULATION_SHIFT = 5;
816-
push @list, (32 * (1 << ( $UTF_ACCUMULATION_SHIFT)));
817-
push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT)));
818-
push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT)));
819-
push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT)));
820-
push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT)));
821-
push @list, ( (1 << (6 * $UTF_ACCUMULATION_SHIFT)));
822-
823-
output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list);
824-
print $out_fh "\t";
825-
826-
print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list;
827-
print $out_fh "\n";
828-
829-
output_table_end($out_fh);
830-
}
831-
832782
print $out_fh get_conditional_compile_line_end();
833783
}
834784

0 commit comments

Comments
 (0)