Skip to content

Commit 50d6772

Browse files
committed
Improve mb_detect_encoding accuracy for text containing the word Māori (with accent)
1 parent 782ffd7 commit 50d6772

File tree

3 files changed: 6 additions and 1 deletion.

ext/mbstring/common_codepoints.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
 0x0020 0x007E # ASCII
 0x00A1 0x00AC # Pound sign, Yen sign, copyright sign...
 0x00AE 0x00FF # Accented Latin characters
+0x0101 0x0101 # Accented a, used in the word Māori
 0x0104 0x0107 # Polish
 0x010C 0x010F # Czech
 0x0118 0x011B # Polish, Czech

ext/mbstring/rare_cp_bitvec.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
 
 static const uint32_t rare_codepoint_bitvec[] = {
 0xffffd9ff, 0x00000000, 0x00000000, 0x80000000, 0xffffffff, 0x00002001, 0x00000000, 0x00000000,
-0x70ff0f0f, 0xfffcffff, 0x70fcfe61, 0x81fc3fcc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0x70ff0f0d, 0xfffcffff, 0x70fcfe61, 0x81fc3fcc, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
 0xfffff800, 0xffffffff, 0xffffffff, 0x0300ffff, 0x0000280f, 0x00000004, 0x00000000, 0x00000000,
 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,

ext/mbstring/tests/mb_detect_encoding.phpt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ $css = 'input[type="radio"]:checked + img {
 }';
 echo mb_detect_encoding($css, mb_list_encodings(), true), "\n";
 
+// This test case courtesy of Kirill Roskolii
+echo "Māori: ", mb_detect_encoding("Total Māori,31.5,33.3,31.8,33,36.4,33.2,33.2", ['UTF-8', 'ISO-8859-1', 'Windows-1251']), "\n";
+
 echo "== DETECT ORDER ==\n";
 
 mb_detect_order('auto');
@@ -408,6 +411,7 @@ UTF-8
 UTF-8
 SJIS
 UTF-8
+Māori: UTF-8
 == DETECT ORDER ==
 JIS: JIS
 EUC-JP: EUC-JP

0 commit comments

Comments
 (0)