Update 'East Asian Width' table to comply with Unicode 13.0

alexdowad · alexdowad · commit d8c785b894e1 · 2021-01-19T20:38:44.000+02:00
The data in eaw_table.h is no longer maintained by hand; it is now generated
automatically by ucgendat/ucgendat.php from the EastAsianWidth.txt file
published by the Unicode Consortium.

Something must be said about the deleted test case. Back in 2004, someone
noticed that `mb_strwidth` didn't comply with Unicode 4.0. A test case was
added to expose the problem. Well, time keeps moving on, and with the changing
years, new Unicodes are born and old Unicodes die. Some characters which were
counted as double-width in Unicode 4.0 are no longer such in Unicode 13.0,
which renders the test case obsolete.

At the same time, make a couple of spelling/grammar fixes in ucgendat.php.
diff --git a/ext/mbstring/libmbfl/mbfl/eaw_table.h b/ext/mbstring/libmbfl/mbfl/eaw_table.h
@@ -1,4 +1,8 @@
-/* East Asian Width table
+/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
+ *
+ *                     DO NOT EDIT THIS FILE!
+ *
+ * East Asian Width table
  *
  * Some characters in East Asian languages are intended to be displayed in a space
  * which is roughly square. (This contrasts with others such as the Latin alphabet,
@@ -7,49 +11,127 @@
  * when doing things like wrapping text to a specific width.
  *
  * Each pair of numbers in the below table is a range of Unicode codepoints
- * which should be displayed as double-width. */
+ * which should be displayed as double-width.
+ */
 
 static const struct {
 	int begin;
 	int end;
 } mbfl_eaw_table[] = {
 	{ 0x1100, 0x115f },
-	{ 0x11a3, 0x11a7 },
-	{ 0x11fa, 0x11ff },
+	{ 0x231a, 0x231b },
 	{ 0x2329, 0x232a },
+	{ 0x23e9, 0x23ec },
+	{ 0x23f0, 0x23f0 },
+	{ 0x23f3, 0x23f3 },
+	{ 0x25fd, 0x25fe },
+	{ 0x2614, 0x2615 },
+	{ 0x2648, 0x2653 },
+	{ 0x267f, 0x267f },
+	{ 0x2693, 0x2693 },
+	{ 0x26a1, 0x26a1 },
+	{ 0x26aa, 0x26ab },
+	{ 0x26bd, 0x26be },
+	{ 0x26c4, 0x26c5 },
+	{ 0x26ce, 0x26ce },
+	{ 0x26d4, 0x26d4 },
+	{ 0x26ea, 0x26ea },
+	{ 0x26f2, 0x26f3 },
+	{ 0x26f5, 0x26f5 },
+	{ 0x26fa, 0x26fa },
+	{ 0x26fd, 0x26fd },
+	{ 0x2705, 0x2705 },
+	{ 0x270a, 0x270b },
+	{ 0x2728, 0x2728 },
+	{ 0x274c, 0x274c },
+	{ 0x274e, 0x274e },
+	{ 0x2753, 0x2755 },
+	{ 0x2757, 0x2757 },
+	{ 0x2795, 0x2797 },
+	{ 0x27b0, 0x27b0 },
+	{ 0x27bf, 0x27bf },
+	{ 0x2b1b, 0x2b1c },
+	{ 0x2b50, 0x2b50 },
+	{ 0x2b55, 0x2b55 },
 	{ 0x2e80, 0x2e99 },
 	{ 0x2e9b, 0x2ef3 },
 	{ 0x2f00, 0x2fd5 },
 	{ 0x2ff0, 0x2ffb },
 	{ 0x3000, 0x303e },
 	{ 0x3041, 0x3096 },
 	{ 0x3099, 0x30ff },
-	{ 0x3105, 0x312d },
+	{ 0x3105, 0x312f },
 	{ 0x3131, 0x318e },
-	{ 0x3190, 0x31ba },
-	{ 0x31c0, 0x31e3 },
+	{ 0x3190, 0x31e3 },
 	{ 0x31f0, 0x321e },
 	{ 0x3220, 0x3247 },
-	{ 0x3250, 0x32fe },
-	{ 0x3300, 0x4dbf },
+	{ 0x3250, 0x4dbf },
 	{ 0x4e00, 0xa48c },
 	{ 0xa490, 0xa4c6 },
 	{ 0xa960, 0xa97c },
 	{ 0xac00, 0xd7a3 },
-	{ 0xd7b0, 0xd7c6 },
-	{ 0xd7cb, 0xd7fb },
 	{ 0xf900, 0xfaff },
 	{ 0xfe10, 0xfe19 },
 	{ 0xfe30, 0xfe52 },
 	{ 0xfe54, 0xfe66 },
 	{ 0xfe68, 0xfe6b },
 	{ 0xff01, 0xff60 },
 	{ 0xffe0, 0xffe6 },
-	{ 0x1b000, 0x1b001 },
+	{ 0x16fe0, 0x16fe4 },
+	{ 0x16ff0, 0x16ff1 },
+	{ 0x17000, 0x187f7 },
+	{ 0x18800, 0x18cd5 },
+	{ 0x18d00, 0x18d08 },
+	{ 0x1b000, 0x1b11e },
+	{ 0x1b150, 0x1b152 },
+	{ 0x1b164, 0x1b167 },
+	{ 0x1b170, 0x1b2fb },
+	{ 0x1f004, 0x1f004 },
+	{ 0x1f0cf, 0x1f0cf },
+	{ 0x1f18e, 0x1f18e },
+	{ 0x1f191, 0x1f19a },
 	{ 0x1f200, 0x1f202 },
-	{ 0x1f210, 0x1f23a },
+	{ 0x1f210, 0x1f23b },
 	{ 0x1f240, 0x1f248 },
 	{ 0x1f250, 0x1f251 },
+	{ 0x1f260, 0x1f265 },
+	{ 0x1f300, 0x1f320 },
+	{ 0x1f32d, 0x1f335 },
+	{ 0x1f337, 0x1f37c },
+	{ 0x1f37e, 0x1f393 },
+	{ 0x1f3a0, 0x1f3ca },
+	{ 0x1f3cf, 0x1f3d3 },
+	{ 0x1f3e0, 0x1f3f0 },
+	{ 0x1f3f4, 0x1f3f4 },
+	{ 0x1f3f8, 0x1f43e },
+	{ 0x1f440, 0x1f440 },
+	{ 0x1f442, 0x1f4fc },
+	{ 0x1f4ff, 0x1f53d },
+	{ 0x1f54b, 0x1f54e },
+	{ 0x1f550, 0x1f567 },
+	{ 0x1f57a, 0x1f57a },
+	{ 0x1f595, 0x1f596 },
+	{ 0x1f5a4, 0x1f5a4 },
+	{ 0x1f5fb, 0x1f64f },
+	{ 0x1f680, 0x1f6c5 },
+	{ 0x1f6cc, 0x1f6cc },
+	{ 0x1f6d0, 0x1f6d2 },
+	{ 0x1f6d5, 0x1f6d7 },
+	{ 0x1f6eb, 0x1f6ec },
+	{ 0x1f6f4, 0x1f6fc },
+	{ 0x1f7e0, 0x1f7eb },
+	{ 0x1f90c, 0x1f93a },
+	{ 0x1f93c, 0x1f945 },
+	{ 0x1f947, 0x1f978 },
+	{ 0x1f97a, 0x1f9cb },
+	{ 0x1f9cd, 0x1f9ff },
+	{ 0x1fa70, 0x1fa74 },
+	{ 0x1fa78, 0x1fa7a },
+	{ 0x1fa80, 0x1fa86 },
+	{ 0x1fa90, 0x1faa8 },
+	{ 0x1fab0, 0x1fab6 },
+	{ 0x1fac0, 0x1fac2 },
+	{ 0x1fad0, 0x1fad6 },
 	{ 0x20000, 0x2fffd },
-	{ 0x30000, 0x3fffd }
+	{ 0x30000, 0x3fffd },
 };
diff --git a/ext/mbstring/tests/bug28220.phpt b/ext/mbstring/tests/bug28220.phpt
diff --git a/ext/mbstring/ucgendat/ucgendat.php b/ext/mbstring/ucgendat/ucgendat.php
@@ -4,10 +4,10 @@
 /**
  * This is based on the ucgendat.c file from the OpenLDAP project, licensed as
  * follows. This file is not necessary to build PHP. It's only necessary to
- * rebuild unicode_data.h from Unicode ucd files.
+ * rebuild unicode_data.h and eaw_table.h from Unicode ucd files.
  *
  * Example usage:
- * php ucgendat.php UnicodeData.txt
+ * php ucgendat.php path/to/Unicode/data/files
  */
 
 /* Copyright 1998-2007 The OpenLDAP Foundation.
@@ -45,7 +45,7 @@
 if ($argc < 2) {
     echo "Usage: php ucgendata.php ./datadir\n";
     echo "./datadir must contain:\n";
-    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n";
+    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
     return;
 }
 
@@ -54,8 +54,9 @@
 $caseFoldingFile = $dir . '/CaseFolding.txt';
 $specialCasingFile = $dir . '/SpecialCasing.txt';
 $derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
+$eastAsianWidthFile = $dir . '/EastAsianWidth.txt';
 
-$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile];
+$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile];
 foreach ($files as $file) {
     if (!file_exists($file)) {
         echo "File $file does not exist.\n";
@@ -72,6 +73,11 @@
 parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
 file_put_contents($outputFile, generateData($data));
 
+$eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
+
+$eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));
+file_put_contents($eawFile, generateEastAsianWidthData($eawData));
+
 class Range {
     public $start;
     public $end;
@@ -372,6 +378,43 @@ function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
     }
 }
 
+function parseEastAsianWidth(string $input) : array {
+    $wideRanges = [];
+
+    foreach (parseDataFile($input) as $fields) {
+        if ($fields[1] == 'W' || $fields[1] == 'F') {
+            if ($dotsPos = strpos($fields[0], '..')) {
+                $startCode = intval(substr($fields[0], 0, $dotsPos), 16);
+                $endCode = intval(substr($fields[0], $dotsPos + 2), 16);
+
+                if (!empty($wideRanges)) {
+                    $lastRange = $wideRanges[count($wideRanges) - 1];
+                    if ($startCode == $lastRange->end + 1) {
+                        $lastRange->end = $endCode;
+                        continue;
+                    }
+                }
+
+                $wideRanges[] = new Range($startCode, $endCode);
+            } else {
+                $code = intval($fields[0], 16);
+
+                if (!empty($wideRanges)) {
+                    $lastRange = $wideRanges[count($wideRanges) - 1];
+                    if ($code == $lastRange->end + 1) {
+                        $lastRange->end++;
+                        continue;
+                    }
+                }
+
+                $wideRanges[] = new Range($code, $code);
+            }
+        }
+    }
+
+    return $wideRanges;
+}
+
 function formatArray(array $values, int $width, string $format) : string {
     $result = '';
     $i = 0;
@@ -412,7 +455,7 @@ function generatePropData(UnicodeData $data) {
     $propOffsets[] = $idx;
 
     // TODO ucgendat.c pads the prop offsets to the next multiple of 4
-    // for rather debious reasons of alignment. This should probably be
+    // for rather dubious reasons of alignment. This should probably be
     // dropped
     while (count($propOffsets) % 4 != 0) {
         $propOffsets[] = 0;
@@ -509,17 +552,17 @@ function generateCaseData(UnicodeData $data) {
 
 function generateData(UnicodeData $data) {
     $result = <<<'HEADER'
-/* This file was generated from a modified version UCData's ucgendat.
+/* This file was generated from a modified version of UCData's ucgendat.
  *
  *                     DO NOT EDIT THIS FILE!
  *
- * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
- * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
- * files from  http://www.unicode.org/Public/ and run this program.
+ * Instead, download the appropriate UnicodeData-x.x.x.txt and
+ * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
+ * and run ext/mbstring/ucgendat/ucgendat.php.
  *
  * More information can be found in the UCData package. Unfortunately,
  * the project's page doesn't seem to be live anymore, so you can use
- * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
+ * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
 HEADER;
     $result .= "\n\n" . generatePropData($data);
     $result .= generateCaseData($data);
@@ -646,3 +689,38 @@ function generateMPH(array $map, bool $fast) {
 
     return $mph;
 }
+
+function generateEastAsianWidthData(array $wideRanges) {
+      $result = <<<'HEADER'
+/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
+ *
+ *                     DO NOT EDIT THIS FILE!
+ *
+ * East Asian Width table
+ *
+ * Some characters in East Asian languages are intended to be displayed in a space
+ * which is roughly square. (This contrasts with others such as the Latin alphabet,
+ * which are taller than they are wide.) To display these East Asian characters
+ * properly, twice the horizontal space is used. This must be taken into account
+ * when doing things like wrapping text to a specific width.
+ *
+ * Each pair of numbers in the below table is a range of Unicode codepoints
+ * which should be displayed as double-width.
+ */
+
+static const struct {
+	int begin;
+	int end;
+} mbfl_eaw_table[] = {
+
+HEADER;
+
+    foreach ($wideRanges as $range) {
+        $startCode = dechex($range->start);
+        $endCode = dechex($range->end);
+        $result .= "\t{ 0x{$startCode}, 0x{$endCode} },\n";
+    }
+
+    $result .= "};\n";
+    return $result;
+}
diff --git a/ext/mbstring/unicode_data.h b/ext/mbstring/unicode_data.h
@@ -1,14 +1,14 @@
-/* This file was generated from a modified version UCData's ucgendat.
+/* This file was generated from a modified version of UCData's ucgendat.
  *
  *                     DO NOT EDIT THIS FILE!
  *
- * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
- * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
- * files from  http://www.unicode.org/Public/ and run this program.
+ * Instead, download the appropriate UnicodeData-x.x.x.txt and
+ * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
+ * and run ext/mbstring/ucgendat/ucgendat.php.
  *
  * More information can be found in the UCData package. Unfortunately,
  * the project's page doesn't seem to be live anymore, so you can use
- * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
+ * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
 
 static const unsigned short _ucprop_size = 44;
 

Original file line number	Diff line number	Diff line change
`@@ -1,14 +1,14 @@`
`1`		`-/* This file was generated from a modified version UCData's ucgendat.`
	`1`	`+/* This file was generated from a modified version of UCData's ucgendat.`
`2`	`2`	`*`
`3`	`3`	`* DO NOT EDIT THIS FILE!`
`4`	`4`	`*`
`5`		`- * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download`
`6`		`- * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt`
`7`		`- * files from http://www.unicode.org/Public/ and run this program.`
	`5`	`+ * Instead, download the appropriate UnicodeData-x.x.x.txt and`
	`6`	`+ * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/`
	`7`	`+ * and run ext/mbstring/ucgendat/ucgendat.php.`
`8`	`8`	`*`
`9`	`9`	`* More information can be found in the UCData package. Unfortunately,`
`10`	`10`	`* the project's page doesn't seem to be live anymore, so you can use`
`11`		`- * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */`
	`11`	`+ * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */`
`12`	`12`
`13`	`13`	`static const unsigned short _ucprop_size = 44;`
`14`	`14`