Skip to content

Commit 3e7acf9

Browse files
committed
Remove mbstring identify filters
mbstring had an 'identify filter' for almost every supported text encoding which was used when auto-detecting the most likely encoding for a string. It would run over the string and set a 'flag' if it saw anything which did not appear likely to be the encoding in question. One problem with this scheme was that encodings which merely appeared less likely to be the correct one were completely rejected, even if there was no better candidate. Another problem was that the 'identify filters' had a huge amount of code duplication with the 'conversion filters'. Eliminate the identify filters. Instead, when auto-detecting text encoding, use conversion filters to see whether the input string is valid in candidate encodings or not. At the same time, watch the type of codepoints which the string decodes to and mark it as less likely if non-printable characters (ESC, form feed, bell, etc.) or 'private use area' codepoints are seen. Interestingly, one old test case in which JIS text was misidentified as UTF-8 (and this wrong behavior was enshrined in the test) was 'fixed' and the JIS string is now auto-detected as JIS.
1 parent a416f93 commit 3e7acf9

File tree

121 files changed

+120
-2819
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

121 files changed

+120
-2819
lines changed

ext/mbstring/config.m4

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,6 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [
161161
libmbfl/mbfl/mbfl_convert.c
162162
libmbfl/mbfl/mbfl_encoding.c
163163
libmbfl/mbfl/mbfl_filter_output.c
164-
libmbfl/mbfl/mbfl_ident.c
165164
libmbfl/mbfl/mbfl_language.c
166165
libmbfl/mbfl/mbfl_memory_device.c
167166
libmbfl/mbfl/mbfl_string.c
@@ -177,7 +176,7 @@ AC_DEFUN([PHP_MBSTRING_SETUP_LIBMBFL], [
177176
libmbfl/nls/nls_tr.c
178177
libmbfl/nls/nls_ua.c
179178
])
180-
PHP_MBSTRING_ADD_INSTALL_HEADERS([libmbfl/config.h libmbfl/mbfl/eaw_table.h libmbfl/mbfl/mbfilter.h libmbfl/mbfl/mbfilter_8bit.h libmbfl/mbfl/mbfilter_pass.h libmbfl/mbfl/mbfilter_wchar.h libmbfl/mbfl/mbfl_consts.h libmbfl/mbfl/mbfl_convert.h libmbfl/mbfl/mbfl_defs.h libmbfl/mbfl/mbfl_encoding.h libmbfl/mbfl/mbfl_filter_output.h libmbfl/mbfl/mbfl_ident.h libmbfl/mbfl/mbfl_language.h libmbfl/mbfl/mbfl_memory_device.h libmbfl/mbfl/mbfl_string.h])
179+
PHP_MBSTRING_ADD_INSTALL_HEADERS([libmbfl/config.h libmbfl/mbfl/eaw_table.h libmbfl/mbfl/mbfilter.h libmbfl/mbfl/mbfilter_8bit.h libmbfl/mbfl/mbfilter_pass.h libmbfl/mbfl/mbfilter_wchar.h libmbfl/mbfl/mbfl_consts.h libmbfl/mbfl/mbfl_convert.h libmbfl/mbfl/mbfl_defs.h libmbfl/mbfl/mbfl_encoding.h libmbfl/mbfl/mbfl_filter_output.h libmbfl/mbfl/mbfl_language.h libmbfl/mbfl/mbfl_memory_device.h libmbfl/mbfl/mbfl_string.h])
181180
])
182181

183182
dnl

ext/mbstring/config.w32

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,14 @@ if (PHP_MBSTRING != "no") {
4040

4141
ADD_SOURCES("ext/mbstring/libmbfl/mbfl", "mbfilter.c mbfilter_8bit.c \
4242
mbfilter_pass.c mbfilter_wchar.c mbfl_convert.c mbfl_encoding.c \
43-
mbfl_filter_output.c mbfl_ident.c mbfl_language.c mbfl_memory_device.c \
43+
mbfl_filter_output.c mbfl_language.c mbfl_memory_device.c \
4444
mbfl_string.c", "mbstring");
4545

4646
ADD_SOURCES("ext/mbstring/libmbfl/nls", "nls_de.c nls_en.c nls_ja.c \
4747
nls_kr.c nls_neutral.c nls_ru.c nls_uni.c nls_zh.c nls_hy.c \
4848
nls_ua.c nls_tr.c", "mbstring");
4949

50-
PHP_INSTALL_HEADERS("ext/mbstring", "mbstring.h libmbfl/config.h libmbfl/mbfl/eaw_table.h libmbfl/mbfl/mbfilter.h libmbfl/mbfl/mbfilter_8bit.h libmbfl/mbfl/mbfilter_pass.h libmbfl/mbfl/mbfilter_wchar.h libmbfl/mbfl/mbfl_consts.h libmbfl/mbfl/mbfl_convert.h libmbfl/mbfl/mbfl_defs.h libmbfl/mbfl/mbfl_encoding.h libmbfl/mbfl/mbfl_filter_output.h libmbfl/mbfl/mbfl_ident.h libmbfl/mbfl/mbfl_language.h libmbfl/mbfl/mbfl_memory_device.h libmbfl/mbfl/mbfl_string.h");
50+
PHP_INSTALL_HEADERS("ext/mbstring", "mbstring.h libmbfl/config.h libmbfl/mbfl/eaw_table.h libmbfl/mbfl/mbfilter.h libmbfl/mbfl/mbfilter_8bit.h libmbfl/mbfl/mbfilter_pass.h libmbfl/mbfl/mbfilter_wchar.h libmbfl/mbfl/mbfl_consts.h libmbfl/mbfl/mbfl_convert.h libmbfl/mbfl/mbfl_defs.h libmbfl/mbfl/mbfl_encoding.h libmbfl/mbfl/mbfl_filter_output.h libmbfl/mbfl/mbfl_language.h libmbfl/mbfl/mbfl_memory_device.h libmbfl/mbfl/mbfl_string.h");
5151

5252
AC_DEFINE('HAVE_MBSTRING', 1, 'Have mbstring support');
5353

ext/mbstring/libmbfl/filters/mbfilter_7bit.c

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
#include "mbfilter.h"
3232
#include "mbfilter_7bit.h"
3333

34-
static int mbfl_filt_ident_7bit(int c, mbfl_identify_filter *filter);
35-
3634
const mbfl_encoding mbfl_encoding_7bit = {
3735
mbfl_no_encoding_7bit,
3836
"7bit",
@@ -44,12 +42,6 @@ const mbfl_encoding mbfl_encoding_7bit = {
4442
NULL
4543
};
4644

47-
const struct mbfl_identify_vtbl vtbl_identify_7bit = {
48-
mbfl_no_encoding_7bit,
49-
mbfl_filt_ident_common_ctor,
50-
mbfl_filt_ident_7bit
51-
};
52-
5345
const struct mbfl_convert_vtbl vtbl_8bit_7bit = {
5446
mbfl_no_encoding_8bit,
5547
mbfl_no_encoding_7bit,
@@ -88,11 +80,3 @@ int mbfl_filt_conv_any_7bit(int c, mbfl_convert_filter *filter)
8880
}
8981
return c;
9082
}
91-
92-
static int mbfl_filt_ident_7bit(int c, mbfl_identify_filter *filter)
93-
{
94-
if (c >= 0x80) {
95-
filter->flag = 1;
96-
}
97-
return c;
98-
}

ext/mbstring/libmbfl/filters/mbfilter_7bit.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
#include "mbfilter.h"
3535

3636
extern const mbfl_encoding mbfl_encoding_7bit;
37-
extern const struct mbfl_identify_vtbl vtbl_identify_7bit;
3837
extern const struct mbfl_convert_vtbl vtbl_8bit_7bit;
3938
extern const struct mbfl_convert_vtbl vtbl_7bit_8bit;
4039

ext/mbstring/libmbfl/filters/mbfilter_armscii8.c

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@
3030
#include "mbfilter_armscii8.h"
3131
#include "unicode_table_armscii8.h"
3232

33-
static int mbfl_filt_ident_armscii8(int c, mbfl_identify_filter *filter);
34-
3533
static const char *mbfl_encoding_armscii8_aliases[] = {"ArmSCII-8", "ArmSCII8", "ARMSCII-8", "ARMSCII8", NULL};
3634

3735
const mbfl_encoding mbfl_encoding_armscii8 = {
@@ -45,12 +43,6 @@ const mbfl_encoding mbfl_encoding_armscii8 = {
4543
&vtbl_wchar_armscii8
4644
};
4745

48-
const struct mbfl_identify_vtbl vtbl_identify_armscii8 = {
49-
mbfl_no_encoding_armscii8,
50-
mbfl_filt_ident_common_ctor,
51-
mbfl_filt_ident_armscii8
52-
};
53-
5446
const struct mbfl_convert_vtbl vtbl_wchar_armscii8 = {
5547
mbfl_no_encoding_wchar,
5648
mbfl_no_encoding_armscii8,
@@ -108,11 +100,3 @@ int mbfl_filt_conv_wchar_armscii8(int c, mbfl_convert_filter *filter)
108100

109101
return c;
110102
}
111-
112-
static int mbfl_filt_ident_armscii8(int c, mbfl_identify_filter *filter)
113-
{
114-
if (c >= armscii8_ucs_table_min && !armscii8_ucs_table[c - armscii8_ucs_table_min]) {
115-
filter->flag = 1;
116-
}
117-
return c;
118-
}

ext/mbstring/libmbfl/filters/mbfilter_armscii8.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
#include "mbfilter.h"
2929

3030
extern const mbfl_encoding mbfl_encoding_armscii8;
31-
extern const struct mbfl_identify_vtbl vtbl_identify_armscii8;
3231
extern const struct mbfl_convert_vtbl vtbl_wchar_armscii8;
3332
extern const struct mbfl_convert_vtbl vtbl_armscii8_wchar;
3433

ext/mbstring/libmbfl/filters/mbfilter_ascii.c

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
#include "mbfilter.h"
3232
#include "mbfilter_ascii.h"
3333

34-
static int mbfl_filt_ident_ascii(int c, mbfl_identify_filter *filter);
35-
3634
static const char *mbfl_encoding_ascii_aliases[] = {"ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "US-ASCII", "ISO646-US", "us", "IBM367", "IBM-367", "cp367", "csASCII", NULL};
3735

3836
const mbfl_encoding mbfl_encoding_ascii = {
@@ -46,12 +44,6 @@ const mbfl_encoding mbfl_encoding_ascii = {
4644
&vtbl_wchar_ascii
4745
};
4846

49-
const struct mbfl_identify_vtbl vtbl_identify_ascii = {
50-
mbfl_no_encoding_ascii,
51-
mbfl_filt_ident_common_ctor,
52-
mbfl_filt_ident_ascii
53-
};
54-
5547
const struct mbfl_convert_vtbl vtbl_ascii_wchar = {
5648
mbfl_no_encoding_ascii,
5749
mbfl_no_encoding_wchar,
@@ -101,16 +93,3 @@ int mbfl_filt_conv_wchar_ascii(int c, mbfl_convert_filter *filter)
10193

10294
return c;
10395
}
104-
105-
static int mbfl_filt_ident_ascii(int c, mbfl_identify_filter *filter)
106-
{
107-
if (c >= 0x20 && c < 0x80) {
108-
;
109-
} else if (c == 0x0d || c == 0x0a || c == 0x09 || c == 0) { /* CR or LF or HTAB or null */
110-
;
111-
} else {
112-
filter->flag = 1;
113-
}
114-
115-
return c;
116-
}

ext/mbstring/libmbfl/filters/mbfilter_ascii.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
#include "mbfilter.h"
3535

3636
extern const mbfl_encoding mbfl_encoding_ascii;
37-
extern const struct mbfl_identify_vtbl vtbl_identify_ascii;
3837
extern const struct mbfl_convert_vtbl vtbl_ascii_wchar;
3938
extern const struct mbfl_convert_vtbl vtbl_wchar_ascii;
4039

ext/mbstring/libmbfl/filters/mbfilter_big5.c

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@
3232

3333
#include "unicode_table_big5.h"
3434

35-
static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter);
36-
3735
static const unsigned char mblen_table_big5[] = { /* 0x81-0xFE */
3836
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
3937
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -77,18 +75,6 @@ const mbfl_encoding mbfl_encoding_cp950 = {
7775
&vtbl_wchar_cp950
7876
};
7977

80-
const struct mbfl_identify_vtbl vtbl_identify_big5 = {
81-
mbfl_no_encoding_big5,
82-
mbfl_filt_ident_common_ctor,
83-
mbfl_filt_ident_big5
84-
};
85-
86-
const struct mbfl_identify_vtbl vtbl_identify_cp950 = {
87-
mbfl_no_encoding_cp950,
88-
mbfl_filt_ident_common_ctor,
89-
mbfl_filt_ident_big5
90-
};
91-
9278
const struct mbfl_convert_vtbl vtbl_big5_wchar = {
9379
mbfl_no_encoding_big5,
9480
mbfl_no_encoding_wchar,
@@ -322,28 +308,3 @@ mbfl_filt_conv_wchar_big5(int c, mbfl_convert_filter *filter)
322308

323309
return c;
324310
}
325-
326-
static int mbfl_filt_ident_big5(int c, mbfl_identify_filter *filter)
327-
{
328-
int c1;
329-
if (filter->encoding->no_encoding == mbfl_no_encoding_cp950) {
330-
c1 = 0x80;
331-
} else {
332-
c1 = 0xa0;
333-
}
334-
335-
if (filter->status) { /* kanji second char */
336-
if (c < 0x40 || (c > 0x7e && c < 0xa1) ||c > 0xfe) { /* bad */
337-
filter->flag = 1;
338-
}
339-
filter->status = 0;
340-
} else if (c >= 0 && c < 0x80) { /* latin ok */
341-
;
342-
} else if (c > c1 && c < 0xff) { /* DBCS lead byte */
343-
filter->status = 1;
344-
} else { /* bad */
345-
filter->flag = 1;
346-
}
347-
348-
return c;
349-
}

ext/mbstring/libmbfl/filters/mbfilter_big5.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,10 @@
3333
#include "mbfilter.h"
3434

3535
extern const mbfl_encoding mbfl_encoding_big5;
36-
extern const struct mbfl_identify_vtbl vtbl_identify_big5;
3736
extern const struct mbfl_convert_vtbl vtbl_big5_wchar;
3837
extern const struct mbfl_convert_vtbl vtbl_wchar_big5;
3938

4039
extern const mbfl_encoding mbfl_encoding_cp950;
41-
extern const struct mbfl_identify_vtbl vtbl_identify_cp950;
4240
extern const struct mbfl_convert_vtbl vtbl_cp950_wchar;
4341
extern const struct mbfl_convert_vtbl vtbl_wchar_cp950;
4442

ext/mbstring/libmbfl/filters/mbfilter_cp1251.c

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
#include "mbfilter_cp1251.h"
3232
#include "unicode_table_cp1251.h"
3333

34-
static int mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter);
35-
3634
static const char *mbfl_encoding_cp1251_aliases[] = {"CP1251", "CP-1251", "WINDOWS-1251", NULL};
3735

3836
const mbfl_encoding mbfl_encoding_cp1251 = {
@@ -46,12 +44,6 @@ const mbfl_encoding mbfl_encoding_cp1251 = {
4644
&vtbl_wchar_cp1251
4745
};
4846

49-
const struct mbfl_identify_vtbl vtbl_identify_cp1251 = {
50-
mbfl_no_encoding_cp1251,
51-
mbfl_filt_ident_common_ctor,
52-
mbfl_filt_ident_cp1251
53-
};
54-
5547
const struct mbfl_convert_vtbl vtbl_wchar_cp1251 = {
5648
mbfl_no_encoding_wchar,
5749
mbfl_no_encoding_cp1251,
@@ -107,12 +99,3 @@ int mbfl_filt_conv_wchar_cp1251(int c, mbfl_convert_filter *filter)
10799

108100
return c;
109101
}
110-
111-
static int mbfl_filt_ident_cp1251(int c, mbfl_identify_filter *filter)
112-
{
113-
/* Only one byte in this single-byte encoding is not used */
114-
if (c == 0x98) {
115-
filter->flag = 1;
116-
}
117-
return c;
118-
}

ext/mbstring/libmbfl/filters/mbfilter_cp1251.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
#include "mbfilter.h"
3434

3535
extern const mbfl_encoding mbfl_encoding_cp1251;
36-
extern const struct mbfl_identify_vtbl vtbl_identify_cp1251;
3736
extern const struct mbfl_convert_vtbl vtbl_wchar_cp1251;
3837
extern const struct mbfl_convert_vtbl vtbl_cp1251_wchar;
3938

ext/mbstring/libmbfl/filters/mbfilter_cp1252.c

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
#include "mbfilter_cp1252.h"
3232
#include "unicode_table_cp1252.h"
3333

34-
static int mbfl_filt_ident_cp1252(int c, mbfl_identify_filter *filter);
35-
3634
static const char *mbfl_encoding_cp1252_aliases[] = {"cp1252", NULL};
3735

3836
const mbfl_encoding mbfl_encoding_cp1252 = {
@@ -46,12 +44,6 @@ const mbfl_encoding mbfl_encoding_cp1252 = {
4644
&vtbl_wchar_cp1252
4745
};
4846

49-
const struct mbfl_identify_vtbl vtbl_identify_cp1252 = {
50-
mbfl_no_encoding_cp1252,
51-
mbfl_filt_ident_common_ctor,
52-
mbfl_filt_ident_cp1252
53-
};
54-
5547
const struct mbfl_convert_vtbl vtbl_cp1252_wchar = {
5648
mbfl_no_encoding_cp1252,
5749
mbfl_no_encoding_wchar,
@@ -115,11 +107,3 @@ int mbfl_filt_conv_cp1252_wchar(int c, mbfl_convert_filter *filter)
115107

116108
return c;
117109
}
118-
119-
static int mbfl_filt_ident_cp1252(int c, mbfl_identify_filter *filter)
120-
{
121-
if (c >= 0x80 && c < 0xA0 && !cp1252_ucs_table[c - 0x80]) {
122-
filter->flag = 1;
123-
}
124-
return c;
125-
}

ext/mbstring/libmbfl/filters/mbfilter_cp1252.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
#include "mbfilter.h"
3434

3535
extern const mbfl_encoding mbfl_encoding_cp1252;
36-
extern const struct mbfl_identify_vtbl vtbl_identify_cp1252;
3736
extern const struct mbfl_convert_vtbl vtbl_cp1252_wchar;
3837
extern const struct mbfl_convert_vtbl vtbl_wchar_cp1252;
3938

ext/mbstring/libmbfl/filters/mbfilter_cp1254.c

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@
3131
#include "mbfilter_cp1254.h"
3232
#include "unicode_table_cp1254.h"
3333

34-
static int mbfl_filt_ident_cp1254(int c, mbfl_identify_filter *filter);
35-
3634
static const char *mbfl_encoding_cp1254_aliases[] = {"CP1254", "CP-1254", "WINDOWS-1254", NULL};
3735

3836
const mbfl_encoding mbfl_encoding_cp1254 = {
@@ -46,12 +44,6 @@ const mbfl_encoding mbfl_encoding_cp1254 = {
4644
&vtbl_wchar_cp1254
4745
};
4846

49-
const struct mbfl_identify_vtbl vtbl_identify_cp1254 = {
50-
mbfl_no_encoding_cp1254,
51-
mbfl_filt_ident_common_ctor,
52-
mbfl_filt_ident_cp1254
53-
};
54-
5547
const struct mbfl_convert_vtbl vtbl_cp1254_wchar = {
5648
mbfl_no_encoding_cp1254,
5749
mbfl_no_encoding_wchar,
@@ -107,11 +99,3 @@ int mbfl_filt_conv_cp1254_wchar(int c, mbfl_convert_filter *filter)
10799
CK((*filter->output_function)(s, filter->data));
108100
return c;
109101
}
110-
111-
static int mbfl_filt_ident_cp1254(int c, mbfl_identify_filter *filter)
112-
{
113-
if (c >= 0x81 && c <= 0x9E && !cp1254_ucs_table[c - cp1254_ucs_table_min]) {
114-
filter->flag = 1;
115-
}
116-
return c;
117-
}

ext/mbstring/libmbfl/filters/mbfilter_cp1254.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
#include "mbfilter.h"
3434

3535
extern const mbfl_encoding mbfl_encoding_cp1254;
36-
extern const struct mbfl_identify_vtbl vtbl_identify_cp1254;
3736
extern const struct mbfl_convert_vtbl vtbl_cp1254_wchar;
3837
extern const struct mbfl_convert_vtbl vtbl_wchar_cp1254;
3938

0 commit comments

Comments
 (0)