Skip to content

Commit d8c785b

Browse files
committed
Update 'East Asian Width' table to comply with Unicode 13.0
Instead of manually maintaining the data in eaw_table.h, it is now automatically generated by ucgendat/ucgendat.php, using the EastAsianWidth.txt file from the Unicode Consortium. Something must be said about the deleted test case. Back in 2004, someone noticed that `mb_strwidth` didn't comply with Unicode 4.0. A test case was added to expose the problem. Well, time keeps moving on, and with the changing years, new Unicodes are born and old Unicodes die. Some characters which were counted as double-width in Unicode 4.0 are no longer such in Unicode 13.0, which renders the test case obsolete. At the same time, make a couple of spelling/grammar fixes in ucgendat.php.
1 parent 28fa0b6 commit d8c785b

File tree

4 files changed

+189
-54
lines changed

4 files changed

+189
-54
lines changed

ext/mbstring/libmbfl/mbfl/eaw_table.h

Lines changed: 96 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
/* East Asian Width table
1+
/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
2+
*
3+
* DO NOT EDIT THIS FILE!
4+
*
5+
* East Asian Width table
26
*
37
* Some characters in East Asian languages are intended to be displayed in a space
48
* which is roughly square. (This contrasts with others such as the Latin alphabet,
@@ -7,49 +11,127 @@
711
* when doing things like wrapping text to a specific width.
812
*
913
* Each pair of numbers in the below table is a range of Unicode codepoints
10-
* which should be displayed as double-width. */
14+
* which should be displayed as double-width.
15+
*/
1116

1217
static const struct {
1318
int begin;
1419
int end;
1520
} mbfl_eaw_table[] = {
1621
{ 0x1100, 0x115f },
17-
{ 0x11a3, 0x11a7 },
18-
{ 0x11fa, 0x11ff },
22+
{ 0x231a, 0x231b },
1923
{ 0x2329, 0x232a },
24+
{ 0x23e9, 0x23ec },
25+
{ 0x23f0, 0x23f0 },
26+
{ 0x23f3, 0x23f3 },
27+
{ 0x25fd, 0x25fe },
28+
{ 0x2614, 0x2615 },
29+
{ 0x2648, 0x2653 },
30+
{ 0x267f, 0x267f },
31+
{ 0x2693, 0x2693 },
32+
{ 0x26a1, 0x26a1 },
33+
{ 0x26aa, 0x26ab },
34+
{ 0x26bd, 0x26be },
35+
{ 0x26c4, 0x26c5 },
36+
{ 0x26ce, 0x26ce },
37+
{ 0x26d4, 0x26d4 },
38+
{ 0x26ea, 0x26ea },
39+
{ 0x26f2, 0x26f3 },
40+
{ 0x26f5, 0x26f5 },
41+
{ 0x26fa, 0x26fa },
42+
{ 0x26fd, 0x26fd },
43+
{ 0x2705, 0x2705 },
44+
{ 0x270a, 0x270b },
45+
{ 0x2728, 0x2728 },
46+
{ 0x274c, 0x274c },
47+
{ 0x274e, 0x274e },
48+
{ 0x2753, 0x2755 },
49+
{ 0x2757, 0x2757 },
50+
{ 0x2795, 0x2797 },
51+
{ 0x27b0, 0x27b0 },
52+
{ 0x27bf, 0x27bf },
53+
{ 0x2b1b, 0x2b1c },
54+
{ 0x2b50, 0x2b50 },
55+
{ 0x2b55, 0x2b55 },
2056
{ 0x2e80, 0x2e99 },
2157
{ 0x2e9b, 0x2ef3 },
2258
{ 0x2f00, 0x2fd5 },
2359
{ 0x2ff0, 0x2ffb },
2460
{ 0x3000, 0x303e },
2561
{ 0x3041, 0x3096 },
2662
{ 0x3099, 0x30ff },
27-
{ 0x3105, 0x312d },
63+
{ 0x3105, 0x312f },
2864
{ 0x3131, 0x318e },
29-
{ 0x3190, 0x31ba },
30-
{ 0x31c0, 0x31e3 },
65+
{ 0x3190, 0x31e3 },
3166
{ 0x31f0, 0x321e },
3267
{ 0x3220, 0x3247 },
33-
{ 0x3250, 0x32fe },
34-
{ 0x3300, 0x4dbf },
68+
{ 0x3250, 0x4dbf },
3569
{ 0x4e00, 0xa48c },
3670
{ 0xa490, 0xa4c6 },
3771
{ 0xa960, 0xa97c },
3872
{ 0xac00, 0xd7a3 },
39-
{ 0xd7b0, 0xd7c6 },
40-
{ 0xd7cb, 0xd7fb },
4173
{ 0xf900, 0xfaff },
4274
{ 0xfe10, 0xfe19 },
4375
{ 0xfe30, 0xfe52 },
4476
{ 0xfe54, 0xfe66 },
4577
{ 0xfe68, 0xfe6b },
4678
{ 0xff01, 0xff60 },
4779
{ 0xffe0, 0xffe6 },
48-
{ 0x1b000, 0x1b001 },
80+
{ 0x16fe0, 0x16fe4 },
81+
{ 0x16ff0, 0x16ff1 },
82+
{ 0x17000, 0x187f7 },
83+
{ 0x18800, 0x18cd5 },
84+
{ 0x18d00, 0x18d08 },
85+
{ 0x1b000, 0x1b11e },
86+
{ 0x1b150, 0x1b152 },
87+
{ 0x1b164, 0x1b167 },
88+
{ 0x1b170, 0x1b2fb },
89+
{ 0x1f004, 0x1f004 },
90+
{ 0x1f0cf, 0x1f0cf },
91+
{ 0x1f18e, 0x1f18e },
92+
{ 0x1f191, 0x1f19a },
4993
{ 0x1f200, 0x1f202 },
50-
{ 0x1f210, 0x1f23a },
94+
{ 0x1f210, 0x1f23b },
5195
{ 0x1f240, 0x1f248 },
5296
{ 0x1f250, 0x1f251 },
97+
{ 0x1f260, 0x1f265 },
98+
{ 0x1f300, 0x1f320 },
99+
{ 0x1f32d, 0x1f335 },
100+
{ 0x1f337, 0x1f37c },
101+
{ 0x1f37e, 0x1f393 },
102+
{ 0x1f3a0, 0x1f3ca },
103+
{ 0x1f3cf, 0x1f3d3 },
104+
{ 0x1f3e0, 0x1f3f0 },
105+
{ 0x1f3f4, 0x1f3f4 },
106+
{ 0x1f3f8, 0x1f43e },
107+
{ 0x1f440, 0x1f440 },
108+
{ 0x1f442, 0x1f4fc },
109+
{ 0x1f4ff, 0x1f53d },
110+
{ 0x1f54b, 0x1f54e },
111+
{ 0x1f550, 0x1f567 },
112+
{ 0x1f57a, 0x1f57a },
113+
{ 0x1f595, 0x1f596 },
114+
{ 0x1f5a4, 0x1f5a4 },
115+
{ 0x1f5fb, 0x1f64f },
116+
{ 0x1f680, 0x1f6c5 },
117+
{ 0x1f6cc, 0x1f6cc },
118+
{ 0x1f6d0, 0x1f6d2 },
119+
{ 0x1f6d5, 0x1f6d7 },
120+
{ 0x1f6eb, 0x1f6ec },
121+
{ 0x1f6f4, 0x1f6fc },
122+
{ 0x1f7e0, 0x1f7eb },
123+
{ 0x1f90c, 0x1f93a },
124+
{ 0x1f93c, 0x1f945 },
125+
{ 0x1f947, 0x1f978 },
126+
{ 0x1f97a, 0x1f9cb },
127+
{ 0x1f9cd, 0x1f9ff },
128+
{ 0x1fa70, 0x1fa74 },
129+
{ 0x1fa78, 0x1fa7a },
130+
{ 0x1fa80, 0x1fa86 },
131+
{ 0x1fa90, 0x1faa8 },
132+
{ 0x1fab0, 0x1fab6 },
133+
{ 0x1fac0, 0x1fac2 },
134+
{ 0x1fad0, 0x1fad6 },
53135
{ 0x20000, 0x2fffd },
54-
{ 0x30000, 0x3fffd }
136+
{ 0x30000, 0x3fffd },
55137
};

ext/mbstring/tests/bug28220.phpt

Lines changed: 0 additions & 25 deletions
This file was deleted.

ext/mbstring/ucgendat/ucgendat.php

Lines changed: 88 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
/**
55
* This is based on the ucgendat.c file from the OpenLDAP project, licensed as
66
* follows. This file is not necessary to build PHP. It's only necessary to
7-
* rebuild unicode_data.h from Unicode ucd files.
7+
* rebuild unicode_data.h and eaw_table.h from Unicode ucd files.
88
*
99
* Example usage:
10-
* php ucgendat.php UnicodeData.txt
10+
* php ucgendat.php path/to/Unicode/data/files
1111
*/
1212

1313
/* Copyright 1998-2007 The OpenLDAP Foundation.
@@ -45,7 +45,7 @@
4545
if ($argc < 2) {
4646
// Usage text must name this script as it is actually called: ucgendat.php.
echo "Usage: php ucgendat.php ./datadir\n";
4747
echo "./datadir must contain:\n";
48-
echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n";
48+
echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
4949
return;
5050
}
5151

@@ -54,8 +54,9 @@
5454
$caseFoldingFile = $dir . '/CaseFolding.txt';
5555
$specialCasingFile = $dir . '/SpecialCasing.txt';
5656
$derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
57+
$eastAsianWidthFile = $dir . '/EastAsianWidth.txt';
5758

58-
$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile];
59+
$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile];
5960
foreach ($files as $file) {
6061
if (!file_exists($file)) {
6162
echo "File $file does not exist.\n";
@@ -72,6 +73,11 @@
7273
parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
7374
file_put_contents($outputFile, generateData($data));
7475

76+
$eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
77+
78+
$eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));
79+
file_put_contents($eawFile, generateEastAsianWidthData($eawData));
80+
7581
class Range {
7682
public $start;
7783
public $end;
@@ -372,6 +378,43 @@ function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
372378
}
373379
}
374380

381+
/**
 * Collect the codepoint ranges whose East Asian Width class is 'W' or 'F'.
 *
 * Each record yielded by parseDataFile() is read as: field 0 is either a single
 * hexadecimal codepoint or a "START..END" hexadecimal range, and field 1 is the
 * width class. Records with any other class are ignored.
 *
 * A record which begins exactly one codepoint after the most recently collected
 * range ends is folded into that range instead of starting a new one, so runs of
 * adjacent records collapse into a single Range. Records are only compared with
 * the immediately preceding collected range, in the order the input provides them.
 *
 * @param string $input Contents of EastAsianWidth.txt
 * @return Range[] Ranges (inclusive start and end) in input order
 */
function parseEastAsianWidth(string $input) : array {
    $wideRanges = [];

    foreach (parseDataFile($input) as $fields) {
        if ($fields[1] != 'W' && $fields[1] != 'F') {
            continue;
        }

        // strpos() returns false when '..' is absent; compare explicitly rather
        // than relying on truthiness, which would also reject offset 0.
        $dotsPos = strpos($fields[0], '..');
        if ($dotsPos !== false) {
            $startCode = intval(substr($fields[0], 0, $dotsPos), 16);
            $endCode = intval(substr($fields[0], $dotsPos + 2), 16);
        } else {
            // A single codepoint is a range of length one
            $startCode = $endCode = intval($fields[0], 16);
        }

        // end() yields false for an empty array. Range objects are handles, so
        // updating $lastRange updates the entry stored in $wideRanges.
        $lastRange = end($wideRanges);
        if ($lastRange !== false && $startCode == $lastRange->end + 1) {
            $lastRange->end = $endCode;
        } else {
            $wideRanges[] = new Range($startCode, $endCode);
        }
    }

    return $wideRanges;
}
417+
375418
function formatArray(array $values, int $width, string $format) : string {
376419
$result = '';
377420
$i = 0;
@@ -412,7 +455,7 @@ function generatePropData(UnicodeData $data) {
412455
$propOffsets[] = $idx;
413456

414457
// TODO ucgendat.c pads the prop offsets to the next multiple of 4
415-
// for rather debious reasons of alignment. This should probably be
458+
// for rather dubious reasons of alignment. This should probably be
416459
// dropped
417460
while (count($propOffsets) % 4 != 0) {
418461
$propOffsets[] = 0;
@@ -509,17 +552,17 @@ function generateCaseData(UnicodeData $data) {
509552

510553
function generateData(UnicodeData $data) {
511554
$result = <<<'HEADER'
512-
/* This file was generated from a modified version UCData's ucgendat.
555+
/* This file was generated from a modified version of UCData's ucgendat.
513556
*
514557
* DO NOT EDIT THIS FILE!
515558
*
516-
* Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
517-
* the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
518-
* files from http://www.unicode.org/Public/ and run this program.
559+
* Instead, download the appropriate UnicodeData-x.x.x.txt and
560+
* CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
561+
* and run ext/mbstring/ucgendat/ucgendat.php.
519562
*
520563
* More information can be found in the UCData package. Unfortunately,
521564
* the project's page doesn't seem to be live anymore, so you can use
522-
* OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
565+
* OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
523566
HEADER;
524567
$result .= "\n\n" . generatePropData($data);
525568
$result .= generateCaseData($data);
@@ -646,3 +689,38 @@ function generateMPH(array $map, bool $fast) {
646689

647690
return $mph;
648691
}
692+
693+
/**
 * Build the C source text for the East Asian Width table.
 *
 * The output is a fixed header comment, the opening of the `mbfl_eaw_table`
 * array definition, one `{ 0xSTART, 0xEND },` row per range (lowercase hex,
 * in the order given), and the closing `};`.
 *
 * @param array $wideRanges Objects with integer `start` and `end` properties
 * @return string Complete text of the generated header
 */
function generateEastAsianWidthData(array $wideRanges) {
    $header = <<<'HEADER'
/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
*
* DO NOT EDIT THIS FILE!
*
* East Asian Width table
*
* Some characters in East Asian languages are intended to be displayed in a space
* which is roughly square. (This contrasts with others such as the Latin alphabet,
* which are taller than they are wide.) To display these East Asian characters
* properly, twice the horizontal space is used. This must be taken into account
* when doing things like wrapping text to a specific width.
*
* Each pair of numbers in the below table is a range of Unicode codepoints
* which should be displayed as double-width.
*/

static const struct {
int begin;
int end;
} mbfl_eaw_table[] = {

HEADER;

    // One table row per range, each terminated by a trailing comma and newline
    $rows = array_map(function ($wideRange) {
        return "\t{ 0x" . dechex($wideRange->start) . ', 0x' . dechex($wideRange->end) . " },\n";
    }, $wideRanges);

    return $header . implode('', $rows) . "};\n";
}

ext/mbstring/unicode_data.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
1-
/* This file was generated from a modified version UCData's ucgendat.
1+
/* This file was generated from a modified version of UCData's ucgendat.
22
*
33
* DO NOT EDIT THIS FILE!
44
*
5-
* Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
6-
* the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
7-
* files from http://www.unicode.org/Public/ and run this program.
5+
* Instead, download the appropriate UnicodeData-x.x.x.txt and
6+
* CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/
7+
* and run ext/mbstring/ucgendat/ucgendat.php.
88
*
99
* More information can be found in the UCData package. Unfortunately,
1010
* the project's page doesn't seem to be live anymore, so you can use
11-
* OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
11+
* OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */
1212

1313
static const unsigned short _ucprop_size = 44;
1414

0 commit comments

Comments
 (0)