Skip to content

Commit 73633bf

Browse files
committed
Optimize conversion of SJIS-2004 text to Unicode
Conversion of SJIS-2004 text to UTF-8 using `mb_convert_encoding` is now about 60% faster than before. (Many other mbstring functions will also be faster now on SJIS-2004 text.)
1 parent c717c79 commit 73633bf

File tree

2 files changed

+72
-72
lines changed

2 files changed

+72
-72
lines changed

ext/mbstring/libmbfl/filters/mbfilter_cjk.c

Lines changed: 20 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1651,7 +1651,6 @@ static int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
16511651

16521652
/* conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
16531653
if (w <= 0) {
1654-
w1 = (s1 << 8) | s2;
16551654
k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
16561655
if (k >= 0) {
16571656
w = jisx0213_jis_u5_tbl[k] + 0x20000;
@@ -1718,8 +1717,7 @@ static int mbfl_filt_conv_jis2004_wchar(int c, mbfl_convert_filter *filter)
17181717

17191718
/* check for japanese chars in CJK Unified Ideographs ext.B (U+2XXXX) */
17201719
if (w <= 0) {
1721-
w1 = ((c1 + k + 94) << 8) | c2;
1722-
k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
1720+
k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
17231721
if (k >= 0) {
17241722
w = jisx0213_jis_u5_tbl[k] + 0x20000;
17251723
}
@@ -2165,7 +2163,7 @@ static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint
21652163

21662164
/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
21672165
if (!w) {
2168-
int k = mbfl_bisec_srch2((c << 8) | c2, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
2166+
int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
21692167
if (k >= 0) {
21702168
w = jisx0213_jis_u5_tbl[k] + 0x20000;
21712169
}
@@ -2192,7 +2190,7 @@ static size_t mb_iso2022jp2004_to_wchar(unsigned char **in, size_t *in_len, uint
21922190

21932191
/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
21942192
if (!w) {
2195-
k = mbfl_bisec_srch2(((c + k + 94) << 8) | c2, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
2193+
k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
21962194
if (k >= 0) {
21972195
w = jisx0213_jis_u5_tbl[k] + 0x20000;
21982196
}
@@ -7142,25 +7140,17 @@ static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t
71427140
}
71437141
} else if (c >= 0xA1 && c <= 0xDF) {
71447142
*out++ = 0xFEC0 + c;
7145-
} else if (c > 0x80 && c < 0xFD && c != 0xA0) {
7143+
} else {
71467144
if (p == e) {
71477145
*out++ = MBFL_BAD_INPUT;
71487146
break;
71497147
}
71507148
unsigned char c2 = *p++;
7151-
7152-
if (c2 < 0x40 || c2 > 0xFC || c2 == 0x7F) {
7153-
*out++ = MBFL_BAD_INPUT;
7154-
continue;
7155-
}
7156-
7157-
unsigned int s1, s2;
7158-
SJIS_DECODE(c, c2, s1, s2);
7159-
unsigned int w1 = (s1 << 8) | s2, w = 0;
7149+
uint32_t w1 = sjis_mobile_decode_tbl1[c] + sjis_decode_tbl2[c2];
71607150

71617151
/* Conversion for combining characters */
7162-
if ((w1 >= 0x2477 && w1 <= 0x2479) || (w1 >= 0x2479 && w1 <= 0x247B) || (w1 >= 0x2577 && w1 <= 0x257E) || w1 == 0x2678 || w1 == 0x2B44 || (w1 >= 0x2B48 && w1 <= 0x2B4F) || (w1 >= 0x2B65 && w1 <= 0x2B66)) {
7163-
int k = mbfl_bisec_srch2(w1, jisx0213_u2_key, jisx0213_u2_tbl_len);
7152+
if (w1 >= 0x0170 && w1 <= 0x03F1) {
7153+
int k = mbfl_bisec_srch2(w1, jisx0213_u2_key_b, jisx0213_u2_tbl_len);
71647154
if (k >= 0) {
71657155
*out++ = jisx0213_u2_tbl[2*k];
71667156
*out++ = jisx0213_u2_tbl[2*k+1];
@@ -7169,23 +7159,24 @@ static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t
71697159
}
71707160

71717161
/* Conversion for BMP */
7172-
w1 = (s1 - 0x21)*94 + s2 - 0x21;
71737162
if (w1 < jisx0213_ucs_table_size) {
7174-
w = jisx0213_ucs_table[w1];
7163+
uint32_t w = jisx0213_ucs_table[w1];
7164+
if (w) {
7165+
*out++ = w;
7166+
continue;
7167+
}
71757168
}
71767169

71777170
/* Conversion for CJK Unified Ideographs extension B (U+2XXXX) */
7178-
if (!w) {
7179-
w1 = (s1 << 8) | s2;
7180-
int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
7181-
if (k >= 0) {
7182-
w = jisx0213_jis_u5_tbl[k] + 0x20000;
7171+
int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
7172+
if (k >= 0) {
7173+
*out++ = jisx0213_jis_u5_tbl[k] + 0x20000;
7174+
} else {
7175+
if (c == 0x80 || c == 0xA0 || c >= 0xFD) {
7176+
p--;
71837177
}
7178+
*out++ = MBFL_BAD_INPUT;
71847179
}
7185-
7186-
*out++ = w ? w : MBFL_BAD_INPUT;
7187-
} else {
7188-
*out++ = MBFL_BAD_INPUT;
71897180
}
71907181
}
71917182

@@ -9174,7 +9165,6 @@ static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t
91749165

91759166
/* Conversion for CJK Unified Ideographs ext.B (U+2XXXX) */
91769167
if (!w) {
9177-
w1 = (s1 << 8) | s2;
91789168
int k = mbfl_bisec_srch2(w1, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
91799169
if (k >= 0) {
91809170
w = jisx0213_jis_u5_tbl[k] + 0x20000;
@@ -9218,7 +9208,7 @@ static size_t mb_eucjp2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t
92189208

92199209
/* Check for Japanese chars in CJK Unified Ideographs ext B (U+2XXXX) */
92209210
if (!w) {
9221-
k = mbfl_bisec_srch2(((c2 - 0x80 + k + 94) << 8) | (c3 - 0x80), jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
9211+
k = mbfl_bisec_srch2(s, jisx0213_jis_u5_key, jisx0213_u5_tbl_len);
92229212
if (k >= 0) {
92239213
w = jisx0213_jis_u5_tbl[k] + 0x20000;
92249214
}

ext/mbstring/libmbfl/filters/unicode_table_jis2004.h

Lines changed: 52 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4888,44 +4888,45 @@ static const unsigned short jisx0213_jis_u5_tbl[] = {
48884888
0x2218,0xA38C,0xA437,0xA5F1,0xA602,0xA61A,0xA6B2,};
48894889

48904890
static const unsigned short jisx0213_jis_u5_key[] = {
4891-
0x2E22,0x2F42,0x2F4C,0x2F60,0x2F7B,0x4F54,0x4F63,0x4F6E,
4892-
0x753A,0x7572,0x7629,0x7632,0x7660,0x776C,0x787E,0x7929,
4893-
0x7947,0x7954,0x796E,0x7A5D,0x7B33,0x7B49,0x7B6C,0x7C49,
4894-
0x7C51,0x7E66,0x7F21,0x7F2B,0x7F2E,0x7F36,0x7F46,0x7F70,
4895-
0x7F77,0x7F79,0x8030,0x8037,0x8038,0x803A,0x803B,0x803F,
4896-
0x8040,0x8045,0x8048,0x804A,0x804B,0x805B,0x8066,0x806C,
4897-
0x8122,0x8125,0x8127,0x8131,0x8132,0x8138,0x813F,0x8141,
4898-
0x814A,0x8152,0x8153,0x8159,0x815C,0x8177,0x822A,0x8231,
4899-
0x8232,0x823A,0x823D,0x8259,0x825C,0x825E,0x8263,0x826A,
4900-
0x826B,0x8272,0x8274,0x8275,0x8325,0x8332,0x833E,0x8344,
4901-
0x8347,0x8355,0x8356,0x837E,0x8422,0x842B,0x8430,0x8450,
4902-
0x8465,0x846D,0x8472,0x8524,0x8529,0x852A,0x8532,0x8534,
4903-
0x8535,0x8539,0x8556,0x857D,0x8623,0x8624,0x863A,0x863C,
4904-
0x863D,0x8642,0x8643,0x8644,0x8647,0x8649,0x8655,0x8656,
4905-
0x8657,0x865B,0x8677,0x8678,0x872A,0x873F,0x8740,0x8742,
4906-
0x8743,0x874E,0x8759,0x8761,0x8769,0x876A,0x8770,0x8775,
4907-
0x8823,0x8834,0x8849,0x885C,0x885E,0x885F,0x8860,0x8932,
4908-
0x8947,0x894D,0x8961,0x8964,0x8A22,0x8A33,0x8A39,0x8A53,
4909-
0x8A7B,0x8B2E,0x8B30,0x8B35,0x8B44,0x8B5D,0x8B61,0x8B66,
4910-
0x8B69,0x8B75,0x8B77,0x8B7A,0x8C21,0x8C23,0x8C24,0x8C28,
4911-
0x8C2C,0x8C3D,0x8C48,0x8C5B,0x8C75,0x8C76,0x8D32,0x8D3D,
4912-
0x8D3E,0x8D40,0x8D52,0x8D5D,0x8D5E,0x8D73,0x8D74,0x8D75,
4913-
0x8D77,0x8D7B,0x8D7D,0x8E22,0x8E24,0x8E27,0x8E2E,0x8E2F,
4914-
0x8E34,0x8E35,0x8E3D,0x8E42,0x8E4F,0x8E69,0x8E6B,0x8E72,
4915-
0x8E75,0x8E79,0x8F35,0x8F3A,0x8F46,0x8F56,0x8F58,0x8F5A,
4916-
0x8F5D,0x8F5F,0x8F63,0x8F6A,0x8F70,0x8F73,0x9044,0x904E,
4917-
0x905D,0x9075,0x907E,0x9121,0x9122,0x9133,0x9136,0x9164,
4918-
0x9165,0x916B,0x916E,0x9173,0x9229,0x922A,0x922C,0x9234,
4919-
0x923C,0x923E,0x9242,0x9256,0x9263,0x9277,0x9279,0x927A,
4920-
0x9325,0x932F,0x9332,0x9339,0x9342,0x9348,0x9359,0x935E,
4921-
0x9366,0x936B,0x937A,0x937E,0x9421,0x942C,0x942F,0x944F,
4922-
0x9450,0x9457,0x9465,0x9466,0x9471,0x9472,0x947E,0x9521,
4923-
0x952C,0x952D,0x9536,0x9537,0x953D,0x953E,0x954E,0x954F,
4924-
0x9557,0x955A,0x955C,0x955D,0x9561,0x9565,0x9567,0x9569,
4925-
0x9571,0x9622,0x9623,0x9638,0x9642,0x964C,0x9656,0x9659,
4926-
0x965D,0x9676,0x972C,0x974B,0x974C,0x9759,0x975B,0x975D,
4927-
0x9767,0x976D,0x9770,0x9825,0x9829,0x982B,0x9832,0x9835,
4928-
0x9853,0x9858,0x985A,0x986E,0x9870,0x9872,0x9876,};
4891+
0x04C7,0x0545,0x054F,0x0563,0x057E,0x1117,0x1126,0x1131,
4892+
0x1EF1,0x1F29,0x1F3E,0x1F47,0x1F75,0x1FDF,0x204F,0x2058,
4893+
0x2076,0x2083,0x209D,0x20EA,0x211E,0x2134,0x2157,0x2192,
4894+
0x219A,0x226B,0x2284,0x228E,0x2291,0x2299,0x22A9,0x22D3,
4895+
0x22DA,0x22DC,0x22F1,0x22F8,0x22F9,0x22FB,0x22FC,0x2300,
4896+
0x2301,0x2306,0x2309,0x230B,0x230C,0x231C,0x2327,0x232D,
4897+
0x2341,0x2344,0x2346,0x2350,0x2351,0x2357,0x235E,0x2360,
4898+
0x2369,0x2371,0x2372,0x2378,0x237B,0x2396,0x23A7,0x23AE,
4899+
0x23AF,0x23B7,0x23BA,0x23D6,0x23D9,0x23DB,0x23E0,0x23E7,
4900+
0x23E8,0x23EF,0x23F1,0x23F2,0x2400,0x240D,0x2419,0x241F,
4901+
0x2422,0x2430,0x2431,0x2459,0x245B,0x2464,0x2469,0x2489,
4902+
0x249E,0x24A6,0x24AB,0x24BB,0x24C0,0x24C1,0x24C9,0x24CB,
4903+
0x24CC,0x24D0,0x24ED,0x2514,0x2518,0x2519,0x252F,0x2531,
4904+
0x2532,0x2537,0x2538,0x2539,0x253C,0x253E,0x254A,0x254B,
4905+
0x254C,0x2550,0x256C,0x256D,0x257D,0x2592,0x2593,0x2595,
4906+
0x2596,0x25A1,0x25AC,0x25B4,0x25BC,0x25BD,0x25C3,0x25C8,
4907+
0x25D4,0x25E5,0x25FA,0x260D,0x260F,0x2610,0x2611,0x2641,
4908+
0x2656,0x265C,0x2670,0x2673,0x268F,0x26A0,0x26A6,0x26C0,
4909+
0x26E8,0x26F9,0x26FB,0x2700,0x270F,0x2728,0x272C,0x2731,
4910+
0x2734,0x2740,0x2742,0x2745,0x274A,0x274C,0x274D,0x2751,
4911+
0x2755,0x2766,0x2771,0x2784,0x279E,0x279F,0x27B9,0x27C4,
4912+
0x27C5,0x27C7,0x27D9,0x27E4,0x27E5,0x27FA,0x27FB,0x27FC,
4913+
0x27FE,0x2802,0x2804,0x2807,0x2809,0x280C,0x2813,0x2814,
4914+
0x2819,0x281A,0x2822,0x2827,0x2834,0x284E,0x2850,0x2857,
4915+
0x285A,0x285E,0x2878,0x287D,0x2889,0x2899,0x289B,0x289D,
4916+
0x28A0,0x28A2,0x28A6,0x28AD,0x28B3,0x28B6,0x28E5,0x28EF,
4917+
0x28FE,0x2916,0x291F,0x2920,0x2921,0x2932,0x2935,0x2963,
4918+
0x2964,0x296A,0x296D,0x2972,0x2986,0x2987,0x2989,0x2991,
4919+
0x2999,0x299B,0x299F,0x29B3,0x29C0,0x29D4,0x29D6,0x29D7,
4920+
0x29E0,0x29EA,0x29ED,0x29F4,0x29FD,0x2A03,0x2A14,0x2A19,
4921+
0x2A21,0x2A26,0x2A35,0x2A39,0x2A3A,0x2A45,0x2A48,0x2A68,
4922+
0x2A69,0x2A70,0x2A7E,0x2A7F,0x2A8A,0x2A8B,0x2A97,0x2A98,
4923+
0x2AA3,0x2AA4,0x2AAD,0x2AAE,0x2AB4,0x2AB5,0x2AC5,0x2AC6,
4924+
0x2ACE,0x2AD1,0x2AD3,0x2AD4,0x2AD8,0x2ADC,0x2ADE,0x2AE0,
4925+
0x2AE8,0x2AF7,0x2AF8,0x2B0D,0x2B17,0x2B21,0x2B2B,0x2B2E,
4926+
0x2B32,0x2B4B,0x2B5F,0x2B7E,0x2B7F,0x2B8C,0x2B8E,0x2B90,
4927+
0x2B9A,0x2BA0,0x2BA3,0x2BB6,0x2BBA,0x2BBC,0x2BC3,0x2BC6,
4928+
0x2BE4,0x2BE9,0x2BEB,0x2BFF,0x2C01,0x2C03,0x2C07
4929+
};
49294930

49304931
static const unsigned short jisx0213_u5_jis_tbl[] = {
49314932
0x2E22,0x7F21,0x7F2B,0x7F2E,0x7F36,0x7F46,0x7F70,0x7F79,
@@ -5016,7 +5017,15 @@ static const unsigned short jisx0213_u2_key[] = {
50165017
0x2477,0x2478,0x2479,0x247A,0x247B,0x2577,0x2578,0x2579,
50175018
0x257A,0x257B,0x257C,0x257D,0x257E,0x2678,0x2B44,0x2B48,
50185019
0x2B49,0x2B4A,0x2B4B,0x2B4C,0x2B4D,0x2B4E,0x2B4F,0x2B65,
5019-
0x2B66};
5020+
0x2B66
5021+
};
5022+
5023+
static const unsigned short jisx0213_u2_key_b[] = {
5024+
0x0170,0x0171,0x0172,0x0173,0x0174,0x01CE,0x01CF,0x01D0,
5025+
0x01D1,0x01D2,0x01D3,0x01D4,0x01D5,0x022D,0x03CF,0x03D3,
5026+
0x03D4,0x03D5,0x03D6,0x03D7,0x03D8,0x03D9,0x03DA,0x03F0,
5027+
0x03F1
5028+
};
50205029

50215030
/* combined pairs in Unicode */
50225031
static const unsigned short jisx0213_u2_tbl[] = {
@@ -5026,18 +5035,19 @@ static const unsigned short jisx0213_u2_tbl[] = {
50265035
0x30C8,0x309A,0x31F7,0x309A,0x00E6,0x0300,0x0254,0x0300,
50275036
0x0254,0x0301,0x028C,0x0300,0x028C,0x0301,0x0259,0x0300,
50285037
0x0259,0x0301,0x025A,0x0300,0x025A,0x0301,0x02E9,0x02E5,
5029-
0x02E5,0x02E9};
5038+
0x02E5,0x02E9
5039+
};
50305040

50315041
/* fallback chars for combined chars in Unicode */
50325042
static const unsigned short jisx0213_u2_fb_tbl[] = {
50335043
0x242B,0x242D,0x242F,0x2431,0x2433,0x252B,0x252D,0x252F,
50345044
0x2531,0x2533,0x253B,0x2544,0x2548,0x2675,0x295C,0x2B38,
50355045
0x2B38,0x2B37,0x2B37,0x2B30,0x2B30,0x2B43,0x2B43,0x2B64,
5036-
0x2B60};
5046+
0x2B60
5047+
};
50375048

50385049
static const int jisx0213_u2_tbl_len = sizeof(jisx0213_u2_key)/sizeof(unsigned short);
50395050

5040-
50415051
static const unsigned short jisx0213_p2_ofst[] = {
50425052
0, 7, 2, 3, 4, 11, 12, 13, 14, 77, 78, 79, 80, 81,
50435053
82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93

0 commit comments

Comments
 (0)