Skip to content

Commit d134809

Browse files
Pusnow authored and zhangyangyu committed
bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (pythonGH-1958)
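The Hangul composition boundary checks were wrong. The second (vowel) character must lie in [0x1161, 0x1176), not [0x1161, 0x1176], and the third (trailing consonant) character must lie in (0x11A7, 0x11C3), not [0x11A7, 0x11C3]. In other words, U+1176, U+11A7 and U+11C3 must not take part in composition.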
1 parent ceeef10 commit d134809

File tree

4 files changed

+22
-3
lines changed

4 files changed

+22
-3
lines changed

Lib/test/test_unicodedata.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,19 @@ def test_issue10254(self):
208208
b = 'C\u0338' * 20 + '\xC7'
209209
self.assertEqual(self.db.normalize('NFC', a), b)
210210

211+
def test_issue29456(self):
212+
# Fix #29456
213+
u1176_str_a = '\u1100\u1176\u11a8'
214+
u1176_str_b = '\u1100\u1176\u11a8'
215+
u11a7_str_a = '\u1100\u1175\u11a7'
216+
u11a7_str_b = '\uae30\u11a7'
217+
u11c3_str_a = '\u1100\u1175\u11c3'
218+
u11c3_str_b = '\uae30\u11c3'
219+
self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
220+
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
221+
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
222+
223+
211224
def test_east_asian_width(self):
212225
eaw = self.db.east_asian_width
213226
self.assertRaises(TypeError, eaw, b'a')

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1800,6 +1800,7 @@ Jason Yeo
18001800
EungJun Yi
18011801
Bob Yodlowski
18021802
Danny Yoo
1803+
Wonsup Yoon
18031804
Rory Yorke
18041805
George Yoshida
18051806
Kazuhiro Yoshida
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix bugs in Hangul normalization: u1176, u11a7 and u11c3.

Modules/unicodedata.c

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
681681
if (LBase <= code && code < (LBase+LCount) &&
682682
i + 1 < len &&
683683
VBase <= PyUnicode_READ(kind, data, i+1) &&
684-
PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
684+
PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
685+
/* check L character is a modern leading consonant (0x1100 ~ 0x1112)
686+
and V character is a modern vowel (0x1161 ~ 0x1175). */
685687
int LIndex, VIndex;
686688
LIndex = code - LBase;
687689
VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
688690
code = SBase + (LIndex*VCount+VIndex)*TCount;
689691
i+=2;
690692
if (i < len &&
691-
TBase <= PyUnicode_READ(kind, data, i) &&
692-
PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
693+
TBase < PyUnicode_READ(kind, data, i) &&
694+
PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
695+
/* check T character is a modern trailing consonant
696+
(0x11A8 ~ 0x11C2). */
693697
code += PyUnicode_READ(kind, data, i)-TBase;
694698
i++;
695699
}

0 commit comments

Comments
 (0)