diff --git a/ext/mbstring/common_codepoints.txt b/ext/mbstring/common_codepoints.txt index 66c49c5a94ca8..3d86ba5fd9b81 100644 --- a/ext/mbstring/common_codepoints.txt +++ b/ext/mbstring/common_codepoints.txt @@ -3,18 +3,23 @@ 0x0020 0x007E # ASCII 0x00A1 0x00AC # Pound sign, Yen sign, copyright sign... 0x00AE 0x00FF # Accented Latin characters +0x0101 0x0101 # a with macron 0x0104 0x0107 # Polish 0x010C 0x010F # Czech +0x0113 0x0113 # e with macron 0x0118 0x011B # Polish, Czech 0x011F 0x011F # Turkish +0x012B 0x012B # i with macron 0x0130 0x0131 # Turkish 0x0141 0x0144 # Polish 0x0147 0x0148 # Czech +0x014D 0x014D # o with macron 0x0150 0x0151 # Hungarian 0x0158 0x015B # Czech, Polish 0x015F 0x015F # Turkish 0x0160 0x0161 # Used in Slavic names 0x0164 0x0165 # Czech +0x016B 0x016B # u with macron 0x016E 0x016F # Czech 0x0170 0x0171 # Hungarian 0x0179 0x017E # Polish, Czech, other Slavic languages diff --git a/ext/mbstring/rare_cp_bitvec.h b/ext/mbstring/rare_cp_bitvec.h index dec97a7d74f50..ac0027efbe397 100644 --- a/ext/mbstring/rare_cp_bitvec.h +++ b/ext/mbstring/rare_cp_bitvec.h @@ -11,7 +11,7 @@ static const uint32_t rare_codepoint_bitvec[] = { 0xffffd9ff, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0x00002001, 0x00000000, 0x00000000, -0x70ff0f0f, 0xfffcffff, 0x70fcfe61, 0x81fc3fcc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, +0x70f70f0d, 0xfffcf7ff, 0x70fcde61, 0x81fc37cc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffff800, 0xffffffff, 0xffffffff, 0x0300ffff, 0x0000280f, 0x00000004, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt index 11d5a1c31364f..b3c457738c1f8 100644 --- a/ext/mbstring/tests/mb_detect_encoding.phpt +++ b/ext/mbstring/tests/mb_detect_encoding.phpt @@ -85,6 +85,18 @@ $css = 'input[type="radio"]:checked + img { }'; echo mb_detect_encoding($css, mb_list_encodings(), true), "\n"; +// Test cases courtesy of Kirill Roskolii and Chris Burgess +echo "-- Māori text --\n"; + +echo mb_detect_encoding("Total Māori,31.5,33.3,31.8,33,36.4,33.2,33.2", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n"; +// Names of native birds from Aotearoa: +echo mb_detect_encoding("Kākā", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n"; +echo mb_detect_encoding("Whēkau", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n"; +echo mb_detect_encoding("Tīwaiwaka", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n"; +echo mb_detect_encoding("Kōtuku", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n"; +echo mb_detect_encoding("Kererū", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n"; +echo mb_detect_encoding("Tūī", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n"; + echo "== DETECT ORDER ==\n"; mb_detect_order('auto'); @@ -408,6 +420,14 @@ UTF-8 UTF-8 SJIS UTF-8 +-- Māori text -- +UTF-8 +UTF-8 +UTF-8 +UTF-8 +UTF-8 +UTF-8 +UTF-8 == DETECT ORDER == JIS: JIS EUC-JP: EUC-JP