|
4 | 4 | /**
|
5 | 5 | * This is based on the ucgendat.c file from the OpenLDAP project, licensed as
|
6 | 6 | * follows. This file is not necessary to build PHP. It's only necessary to
|
7 |
| - * rebuild unicode_data.h from Unicode ucd files. |
| 7 | + * rebuild unicode_data.h and eaw_table.h from Unicode ucd files. |
8 | 8 | *
|
9 | 9 | * Example usage:
|
10 |
| - * php ucgendat.php UnicodeData.txt |
| 10 | + * php ucgendat.php path/to/Unicode/data/files |
11 | 11 | */
|
12 | 12 |
|
13 | 13 | /* Copyright 1998-2007 The OpenLDAP Foundation.
|
|
// Bail out with a usage message when no data directory was given on the
// command line. The script name printed here matches the real file name
// (ucgendat.php), as used in the header comment's example invocation.
if ($argc < 2) {
    echo "Usage: php ucgendat.php ./datadir\n";
    echo "./datadir must contain:\n";
    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n";
    return;
}
|
51 | 51 |
|
|
54 | 54 | $caseFoldingFile = $dir . '/CaseFolding.txt';
|
55 | 55 | $specialCasingFile = $dir . '/SpecialCasing.txt';
|
56 | 56 | $derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
|
| 57 | +$eastAsianWidthFile = $dir . '/EastAsianWidth.txt'; |
57 | 58 |
|
58 |
| -$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile]; |
| 59 | +$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile]; |
59 | 60 | foreach ($files as $file) {
|
60 | 61 | if (!file_exists($file)) {
|
61 | 62 | echo "File $file does not exist.\n";
|
|
72 | 73 | parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
|
73 | 74 | file_put_contents($outputFile, generateData($data));
|
74 | 75 |
|
// Build the East Asian Width table: parse the wide/fullwidth ranges from
// EastAsianWidth.txt, then write them out as a C header next to libmbfl.
$eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile));

$eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h";
file_put_contents($eawFile, generateEastAsianWidthData($eawData));
| 80 | + |
75 | 81 | class Range {
|
76 | 82 | public $start;
|
77 | 83 | public $end;
|
@@ -372,6 +378,43 @@ function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
|
372 | 378 | }
|
373 | 379 | }
|
374 | 380 |
|
/**
 * Collects the code points that EastAsianWidth.txt classifies as 'W' or 'F'
 * (Wide / Fullwidth in UAX #11) into a list of inclusive ranges.
 *
 * The first field of each record is either a single hexadecimal code point
 * ("XXXX") or an inclusive range ("XXXX..YYYY"). An entry that begins
 * directly after the end of the previously collected range is folded into
 * that range instead of starting a new one.
 *
 * @param string $input Contents of EastAsianWidth.txt
 * @return Range[] Ranges in the order they were encountered in the input
 */
function parseEastAsianWidth(string $input) : array {
    $wideRanges = [];

    foreach (parseDataFile($input) as $fields) {
        if (!in_array($fields[1], ['W', 'F'], true)) {
            continue;
        }

        // Splitting on '..' handles both forms without relying on the
        // truthiness of a string offset: a bare code point yields a single
        // element and is treated as a range whose start and end coincide.
        $bounds = explode('..', $fields[0], 2);
        $startCode = intval($bounds[0], 16);
        $endCode = intval($bounds[1] ?? $bounds[0], 16);

        // Range objects are handles, so extending $lastRange updates the
        // entry already stored in $wideRanges.
        $lastRange = $wideRanges ? $wideRanges[count($wideRanges) - 1] : null;
        if ($lastRange !== null && $startCode == $lastRange->end + 1) {
            $lastRange->end = $endCode;
            continue;
        }

        $wideRanges[] = new Range($startCode, $endCode);
    }

    return $wideRanges;
}
| 417 | + |
375 | 418 | function formatArray(array $values, int $width, string $format) : string {
|
376 | 419 | $result = '';
|
377 | 420 | $i = 0;
|
@@ -412,7 +455,7 @@ function generatePropData(UnicodeData $data) {
|
412 | 455 | $propOffsets[] = $idx;
|
413 | 456 |
|
414 | 457 | // TODO ucgendat.c pads the prop offsets to the next multiple of 4
|
415 |
| - // for rather debious reasons of alignment. This should probably be |
| 458 | + // for rather dubious reasons of alignment. This should probably be |
416 | 459 | // dropped
|
417 | 460 | while (count($propOffsets) % 4 != 0) {
|
418 | 461 | $propOffsets[] = 0;
|
@@ -509,17 +552,17 @@ function generateCaseData(UnicodeData $data) {
|
509 | 552 |
|
510 | 553 | function generateData(UnicodeData $data) {
|
511 | 554 | $result = <<<'HEADER'
|
512 |
| -/* This file was generated from a modified version UCData's ucgendat. |
| 555 | +/* This file was generated from a modified version of UCData's ucgendat. |
513 | 556 | *
|
514 | 557 | * DO NOT EDIT THIS FILE!
|
515 | 558 | *
|
516 |
| - * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download |
517 |
| - * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt |
518 |
| - * files from http://www.unicode.org/Public/ and run this program. |
| 559 | + * Instead, download the appropriate UnicodeData-x.x.x.txt and |
| 560 | + * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/ |
| 561 | + * and run ext/mbstring/ucgendat/ucgendat.php. |
519 | 562 | *
|
520 | 563 | * More information can be found in the UCData package. Unfortunately,
|
521 | 564 | * the project's page doesn't seem to be live anymore, so you can use
|
522 |
| - * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */ |
| 565 | + * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */ |
523 | 566 | HEADER;
|
524 | 567 | $result .= "\n\n" . generatePropData($data);
|
525 | 568 | $result .= generateCaseData($data);
|
@@ -646,3 +689,38 @@ function generateMPH(array $map, bool $fast) {
|
646 | 689 |
|
647 | 690 | return $mph;
|
648 | 691 | }
|
| 692 | + |
/**
 * Renders the East Asian Width table as the text of a C header file.
 *
 * The output is a fixed explanatory comment and struct declaration,
 * followed by one "{ 0xSTART, 0xEND }," initializer per range (lowercase
 * hexadecimal, tab-indented) and the closing "};".
 *
 * @param array $wideRanges Objects exposing integer ->start and ->end
 * @return string Complete contents of the generated header
 */
function generateEastAsianWidthData(array $wideRanges) {
    $preamble = <<<'HEADER'
/* This file was generated by ext/mbstring/ucgendat/ucgendat.php.
 *
 * DO NOT EDIT THIS FILE!
 *
 * East Asian Width table
 *
 * Some characters in East Asian languages are intended to be displayed in a space
 * which is roughly square. (This contrasts with others such as the Latin alphabet,
 * which are taller than they are wide.) To display these East Asian characters
 * properly, twice the horizontal space is used. This must be taken into account
 * when doing things like wrapping text to a specific width.
 *
 * Each pair of numbers in the below table is a range of Unicode codepoints
 * which should be displayed as double-width.
 */

static const struct {
	int begin;
	int end;
} mbfl_eaw_table[] = {

HEADER;

    // One initializer row per range.
    $rows = array_map(function ($range) {
        return sprintf("\t{ 0x%x, 0x%x },\n", $range->start, $range->end);
    }, $wideRanges);

    return $preamble . implode('', $rows) . "};\n";
}
0 commit comments