Fix tr/// determination of inplace editing for EBCDIC

khwilliamson · xsawyerx · commit 65a97d491a9a · 2020-05-27T11:09:32.000+03:00
I realized as a result of fixing GH #17654, that the code didn't properly decide if a tr/// can be done in-place on EBCDIC platforms. Since we didn't have an EBCDIC smoker at the time, I couldn't be sure that the fix actually worked. Now that we do have a smoker, I have successfully tested it. This patch is constructed so that the code generated on non-EBCDIC platforms should not be changed by it.
diff --git a/ebcdic_tables.h b/ebcdic_tables.h
@@ -413,60 +413,6 @@ SOFTWARE.
 };
 #  endif
 
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-#  ifndef DOINIT
-    EXTCONST UV PL_partition_by_byte_length[38];
-#  else
-    EXTCONST UV PL_partition_by_byte_length[38] = {
-	0x00,
-	0x41,
-	0x4b,
-	0x51,
-	0x5a,
-	0x62,
-	0x6b,
-	0x70,
-	0x79,
-	0x80,
-	0x81,
-	0x8a,
-	0x91,
-	0x9a,
-	0xa1,
-	0xaa,
-	0xad,
-	0xae,
-	0xbd,
-	0xbe,
-	0xc0,
-	0xca,
-	0xd0,
-	0xda,
-	0xe0,
-	0xe1,
-	0xe2,
-	0xea,
-	0xf0,
-	0xfa,
-	0xff,
-	0x100,
-	0x400,
-	0x4000,
-	0x40000,
-	0x400000,
-	0x4000000,
-	0x40000000
-};
-#  endif
-
 #endif	/* EBCDIC 1047 */
 
 #if 'A' == 193 /* EBCDIC 037 */ \
@@ -845,62 +791,6 @@ SOFTWARE.
 };
 #  endif
 
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-#  ifndef DOINIT
-    EXTCONST UV PL_partition_by_byte_length[40];
-#  else
-    EXTCONST UV PL_partition_by_byte_length[40] = {
-	0x00,
-	0x41,
-	0x4b,
-	0x51,
-	0x5a,
-	0x5f,
-	0x60,
-	0x62,
-	0x6b,
-	0x70,
-	0x79,
-	0x80,
-	0x81,
-	0x8a,
-	0x91,
-	0x9a,
-	0xa1,
-	0xaa,
-	0xb0,
-	0xb1,
-	0xba,
-	0xbc,
-	0xc0,
-	0xca,
-	0xd0,
-	0xda,
-	0xe0,
-	0xe1,
-	0xe2,
-	0xea,
-	0xf0,
-	0xfa,
-	0xff,
-	0x100,
-	0x400,
-	0x4000,
-	0x40000,
-	0x400000,
-	0x4000000,
-	0x40000000
-};
-#  endif
-
 #endif	/* EBCDIC 037 */
 
 #endif /* PERL_EBCDIC_TABLES_H_ */
diff --git a/op.c b/op.c
@@ -7061,12 +7061,13 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
      * these up into smaller chunks, but doesn't merge any together.  This
      * makes it easy to find the instances it's looking for.  A second pass is
      * done after this has been determined which merges things together to
-     * shrink the table for runtime.  For ASCII platforms, the table is
-     * trivial, given below, and uses the fundamental characteristics of UTF-8
-     * to construct the values.  For EBCDIC, it isn't so, and we rely on a
-     * table constructed by the perl script that generates these kinds of
-     * things */
-#ifndef EBCDIC
+     * shrink the table for runtime.  The table below is used for both ASCII
+     * and EBCDIC platforms.  On EBCDIC, the byte length is not monotonically
+     * increasing for code points below 256.  To correct for that, the macro
+     * CP_ADJUST defined below converts those code points to ASCII in the first
+     * pass, and we use the ASCII partition values.  That works because the
+     * growth factor will be unaffected, which is all that is calculated during
+     * the first pass. */
     UV PL_partition_by_byte_length[] = {
         0,
         0x80,   /* Below this is 1 byte representations */
@@ -7083,8 +7084,6 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 
     };
 
-#endif
-
     PERL_ARGS_ASSERT_PMTRANS;
 
     PL_hints |= HINT_BLOCK_SCOPE;
@@ -7212,6 +7211,21 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
             t_array = invlist_array(t_invlist);
         }
 
+/* As noted earlier, we convert EBCDIC code points to Unicode in the first pass
+ * so as to get the well-behaved length 1 vs length 2 boundary.  Only code
+ * points below 256 differ between the two character sets in this regard.  For
+ * these, we also can't have any ranges, as they have to be individually
+ * converted. */
+#ifdef EBCDIC
+#  define CP_ADJUST(x)          ((pass2) ? (x) : NATIVE_TO_UNI(x))
+#  define FORCE_RANGE_LEN_1(x)  ((pass2) ? 0 : ((x) < 256))
+#  define CP_SKIP(x)            ((pass2) ? UVCHR_SKIP(x) : OFFUNISKIP(x))
+#else
+#  define CP_ADJUST(x)          (x)
+#  define FORCE_RANGE_LEN_1(x)  0
+#  define CP_SKIP(x)            UVCHR_SKIP(x)
+#endif
+
         /* And the mapping of each of the ranges is initialized.  Initially,
          * everything is TR_UNLISTED. */
         for (i = 0; i < len; i++) {
@@ -7345,7 +7359,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
 
                     /* Here, not in the middle of a range, and not UTF-8.  The
                      * next code point is the single byte where we're at */
-                    t_cp = *t;
+                    t_cp = CP_ADJUST(*t);
                     t_range_count = 1;
                     t++;
                 }
@@ -7356,15 +7370,17 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                      * next code point is the next UTF-8 char in the input.  We
                      * know the input is valid, because the toker constructed
                      * it */
-                    t_cp = valid_utf8_to_uvchr(t, &t_char_len);
+                    t_cp = CP_ADJUST(valid_utf8_to_uvchr(t, &t_char_len));
                     t += t_char_len;
 
                     /* UTF-8 strings (only) have been parsed in toke.c to have
                      * ranges.  See if the next byte indicates that this was
                      * the first element of a range.  If so, get the final
                      * element and calculate the range size.  If not, the range
                      * size is 1 */
-                    if (t < tend && *t == RANGE_INDICATOR) {
+                    if (   t < tend && *t == RANGE_INDICATOR
+                        && ! FORCE_RANGE_LEN_1(t_cp))
+                    {
                         t++;
                         t_range_count = valid_utf8_to_uvchr(t, &t_char_len)
                                       - t_cp + 1;
@@ -7396,16 +7412,18 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                 }
                 else {
                     if (! rstr_utf8) {
-                        r_cp = *r;
+                        r_cp = CP_ADJUST(*r);
                         r_range_count = 1;
                         r++;
                     }
                     else {
                         Size_t r_char_len;
 
-                        r_cp = valid_utf8_to_uvchr(r, &r_char_len);
+                        r_cp = CP_ADJUST(valid_utf8_to_uvchr(r, &r_char_len));
                         r += r_char_len;
-                        if (r < rend && *r == RANGE_INDICATOR) {
+                        if (   r < rend && *r == RANGE_INDICATOR
+                            && ! FORCE_RANGE_LEN_1(r_cp))
+                        {
                             r++;
                             r_range_count = valid_utf8_to_uvchr(r,
                                                     &r_char_len) - r_cp + 1;
@@ -7537,7 +7555,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                  * code point in the rhs against any code point in the lhs. */
                 if ( ! pass2
                     && r_cp_end != TR_SPECIAL_HANDLING
-                    && UVCHR_SKIP(t_cp_end) < UVCHR_SKIP(r_cp_end))
+                    && CP_SKIP(t_cp_end) < CP_SKIP(r_cp_end))
                 {
                     /* Here, we will need to make a copy of the input string
                      * before doing the transliteration.  The worst possible
@@ -7560,8 +7578,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                      * string not being UTF-8 */
                     NV t_size = (can_force_utf8 && t_cp < 256)
                                 ? 1
-                                : UVCHR_SKIP(t_cp_end);
-                    NV ratio = UVCHR_SKIP(r_cp_end) / t_size;
+                                : CP_SKIP(t_cp_end);
+                    NV ratio = CP_SKIP(r_cp_end) / t_size;
 
                     o->op_private |= OPpTRANS_GROWS;
 
@@ -7594,8 +7612,8 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                      * is if it 'grows'.  But in the 2nd pass, there's no
                      * reason to not merge */
                     if (   (i > 0 && (   pass2
-                                      || UVCHR_SKIP(t_array[i-1])
-                                                        == UVCHR_SKIP(t_cp)))
+                                      || CP_SKIP(t_array[i-1])
+                                                            == CP_SKIP(t_cp)))
                         && (   (   r_cp == TR_SPECIAL_HANDLING
                                 && r_map[i-1] == TR_SPECIAL_HANDLING)
                             || (   r_cp != TR_SPECIAL_HANDLING
@@ -7615,7 +7633,7 @@ S_pmtrans(pTHX_ OP *o, OP *expr, OP *repl)
                     adjacent_to_range_above = TRUE;
                     if (i + 1 < len)
                     if (    (   pass2
-                             || UVCHR_SKIP(t_cp) == UVCHR_SKIP(t_array[i+1]))
+                             || CP_SKIP(t_cp) == CP_SKIP(t_array[i+1]))
                         && (   (   r_cp == TR_SPECIAL_HANDLING
                                 && r_map[i+1] == (UV) TR_SPECIAL_HANDLING)
                             || (   r_cp != TR_SPECIAL_HANDLING
diff --git a/regen/ebcdic.pl b/regen/ebcdic.pl
@@ -779,56 +779,6 @@ END
         output_table(\@C9_utf8_dfa, "PL_c9_utf8_dfa_tab", $NUM_CLASSES);
     }
 
-    {
-        print $out_fh <<EOF;
-/* This table partitions all the code points of the platform into ranges which
- * have the property that all the code points in each range have the same
- * number of bytes in their UTF-EBCDIC representations, and the adjacent
- * ranges have a different number of bytes.
- *
- * Each number in the table begins such a range, which extends up to just
- * before the following table entry, except the final entry is understood to
- * extend to the platform's infinity
- */
-EOF
-        # The lengths of the characters between 0 and 255 are either 1 or 2,
-        # with those whose ASCII platform equivalents below 160 being 1, and
-        # the rest being 2.
-        my @list;
-        push @list, 0;
-        my $pushed_range_is_length_1 = 1;
-
-        for my $i (1 .. 0xFF) {
-            my $this_code_point_is_length_1 = ($e2a[$i] < 160);
-            if ($pushed_range_is_length_1 != $this_code_point_is_length_1) {
-                push @list, $i;
-                $pushed_range_is_length_1 = $this_code_point_is_length_1;
-            }
-        }
-
-        # Starting at 256, the length is 2.
-        push @list, 0x100 if $pushed_range_is_length_1;
-
-        # These are based on the fundamental properties of UTF-EBCDIC.  Each
-        # continuation byte has 5 bits of information.  Comments in utf8.h
-        # explain the rest.
-        my $UTF_ACCUMULATION_SHIFT = 5;
-        push @list, (32 * (1 << (    $UTF_ACCUMULATION_SHIFT)));
-        push @list, (16 * (1 << (2 * $UTF_ACCUMULATION_SHIFT)));
-        push @list, ( 8 * (1 << (3 * $UTF_ACCUMULATION_SHIFT)));
-        push @list, ( 4 * (1 << (4 * $UTF_ACCUMULATION_SHIFT)));
-        push @list, ( 2 * (1 << (5 * $UTF_ACCUMULATION_SHIFT)));
-        push @list, (     (1 << (6 * $UTF_ACCUMULATION_SHIFT)));
-
-        output_table_start($out_fh, "UV", "PL_partition_by_byte_length", scalar @list);
-        print $out_fh "\t";
-
-        print $out_fh join ",\n\t", map { sprintf "0x%02x", $_ } @list;
-        print $out_fh "\n";
-
-        output_table_end($out_fh);
-    }
-
     print $out_fh get_conditional_compile_line_end();
 }