diff --git a/NEWS b/NEWS index 45661b5dffe18..f3e5c25ef9126 100644 --- a/NEWS +++ b/NEWS @@ -48,6 +48,11 @@ PHP NEWS - MBString: . ext/mbstring: fix new_value length check. (Max Kellermann) . Fix bug GH-10627 (mb_convert_encoding crashes PHP on Windows). (nielsdos) + . Fix bug GH-10192 (mb_detect_encoding() results for UTF-7 differ between + PHP 8.0 and 8.1 (if UTF-7 is present in the encodings list and the string + contains '+' character)). (pakutoma) + . Fix bug GH-10648 (mb_check_encoding() returns true for incorrect but + interpretable ISO-2022-JP byte sequences). (pakutoma) - Opcache: . Fix incorrect page_size check. (nielsdos) diff --git a/UPGRADING b/UPGRADING index 744fa57c1c8dc..05b33b231ee34 100644 --- a/UPGRADING +++ b/UPGRADING @@ -218,6 +218,16 @@ PHP 8.2 UPGRADE NOTES dba_fetch(string|array $key, $skip, $dba): string|false is still accepted, but it is recommended to use the new standard variant. +- MBString + . mb_check_encoding() now checks input encoding more strictly. + . mb_detect_encoding() now checks input encoding more strictly + when strict detection is enabled. + . mb_convert_encoding() checks the input encoding more strictly + if multiple encodings are passed to from_encoding + and the mbstring.strict_detection INI directive is set to 1. + This change only affects the encoding selection, + not the result of the conversion. + - Random . random_bytes() and random_int() now throw \Random\RandomException on CSPRNG failure. Previously a plain \Exception was thrown. 
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c index 208792720af46..fd62602ab17b6 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c @@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_7bit = { &vtbl_7bit_wchar, &vtbl_wchar_7bit, mb_7bit_to_wchar, - mb_wchar_to_7bit + mb_wchar_to_7bit, + NULL }; #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_base64.c b/ext/mbstring/libmbfl/filters/mbfilter_base64.c index ede3eef18ce7c..162e9b1bda87d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_base64.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_base64.c @@ -44,7 +44,8 @@ const mbfl_encoding mbfl_encoding_base64 = { NULL, NULL, mb_base64_to_wchar, - mb_wchar_to_base64 + mb_wchar_to_base64, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_b64 = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_big5.c b/ext/mbstring/libmbfl/filters/mbfilter_big5.c index 58f89d1b5759e..7618130aac81e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_big5.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_big5.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_big5 = { &vtbl_big5_wchar, &vtbl_wchar_big5, mb_big5_to_wchar, - mb_wchar_to_big5 + mb_wchar_to_big5, + NULL }; const mbfl_encoding mbfl_encoding_cp950 = { @@ -82,7 +83,8 @@ const mbfl_encoding mbfl_encoding_cp950 = { &vtbl_cp950_wchar, &vtbl_wchar_cp950, mb_cp950_to_wchar, - mb_wchar_to_cp950 + mb_wchar_to_cp950, + NULL }; const struct mbfl_convert_vtbl vtbl_big5_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c index 32a8bdf15f59a..93c33da9543d0 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c @@ -61,7 +61,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = { &vtbl_cp50220_wchar, &vtbl_wchar_cp50220, 
mb_cp5022x_to_wchar, - mb_wchar_to_cp50220 + mb_wchar_to_cp50220, + NULL }; const mbfl_encoding mbfl_encoding_cp50221 = { @@ -74,7 +75,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = { &vtbl_cp50221_wchar, &vtbl_wchar_cp50221, mb_cp5022x_to_wchar, - mb_wchar_to_cp50221 + mb_wchar_to_cp50221, + NULL }; const mbfl_encoding mbfl_encoding_cp50222 = { @@ -87,7 +89,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = { &vtbl_cp50222_wchar, &vtbl_wchar_cp50222, mb_cp5022x_to_wchar, - mb_wchar_to_cp50222 + mb_wchar_to_cp50222, + NULL }; const struct mbfl_convert_vtbl vtbl_cp50220_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c index 6311f9b72139a..d3aae8b10f56e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = { &vtbl_cp51932_wchar, &vtbl_wchar_cp51932, mb_cp51932_to_wchar, - mb_wchar_to_cp51932 + mb_wchar_to_cp51932, + NULL }; const struct mbfl_convert_vtbl vtbl_cp51932_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c index cf8e461e1d9c0..506c24393906d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c @@ -100,7 +100,8 @@ const mbfl_encoding mbfl_encoding_cp932 = { &vtbl_cp932_wchar, &vtbl_wchar_cp932, mb_cp932_to_wchar, - mb_wchar_to_cp932 + mb_wchar_to_cp932, + NULL }; const struct mbfl_convert_vtbl vtbl_cp932_wchar = { @@ -133,7 +134,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = { &vtbl_sjiswin_wchar, &vtbl_wchar_sjiswin, mb_cp932_to_wchar, - mb_wchar_to_sjiswin + mb_wchar_to_sjiswin, + NULL }; const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp936.c b/ext/mbstring/libmbfl/filters/mbfilter_cp936.c index 40ae8c86f9119..02e808ce9282c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp936.c 
+++ b/ext/mbstring/libmbfl/filters/mbfilter_cp936.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_cp936 = { &vtbl_cp936_wchar, &vtbl_wchar_cp936, mb_cp936_to_wchar, - mb_wchar_to_cp936 + mb_wchar_to_cp936, + NULL }; const struct mbfl_convert_vtbl vtbl_cp936_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c index 50a0368a923f4..cec5f5d41d5e6 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c @@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = { &vtbl_euccn_wchar, &vtbl_wchar_euccn, mb_euccn_to_wchar, - mb_wchar_to_euccn + mb_wchar_to_euccn, + NULL }; const struct mbfl_convert_vtbl vtbl_euccn_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c index 2b0ae77534d56..aa5f323db6f0a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = { &vtbl_eucjp_wchar, &vtbl_wchar_eucjp, mb_eucjp_to_wchar, - mb_wchar_to_eucjp + mb_wchar_to_eucjp, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjp_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c index 09287a9d8f634..d35cec9541093 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = { &vtbl_eucjpwin_wchar, &vtbl_wchar_eucjpwin, mb_eucjpwin_to_wchar, - mb_wchar_to_eucjpwin + mb_wchar_to_eucjpwin, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c index 69e6811922e30..b0cb1954739c9 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c +++ 
b/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c @@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = { &vtbl_euckr_wchar, &vtbl_wchar_euckr, mb_euckr_to_wchar, - mb_wchar_to_euckr + mb_wchar_to_euckr, + NULL }; const struct mbfl_convert_vtbl vtbl_euckr_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c index de1deb47705f1..522f5f4a05a5b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = { &vtbl_euctw_wchar, &vtbl_wchar_euctw, mb_euctw_to_wchar, - mb_wchar_to_euctw + mb_wchar_to_euctw, + NULL }; const struct mbfl_convert_vtbl vtbl_euctw_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c b/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c index 492df6046244f..d607aafef49e4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = { &vtbl_gb18030_wchar, &vtbl_wchar_gb18030, mb_gb18030_to_wchar, - mb_wchar_to_gb18030 + mb_wchar_to_gb18030, + NULL }; const struct mbfl_convert_vtbl vtbl_gb18030_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c index afebdfd00811f..a75a9c757cb83 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c @@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_html_ent = { &vtbl_html_wchar, &vtbl_wchar_html, mb_htmlent_to_wchar, - mb_wchar_to_htmlent + mb_wchar_to_htmlent, + NULL }; const struct mbfl_convert_vtbl vtbl_wchar_html = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_hz.c b/ext/mbstring/libmbfl/filters/mbfilter_hz.c index 72e5963acfc18..b047bfc8b7b27 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_hz.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_hz.c @@ -47,7 +47,8 @@ 
const mbfl_encoding mbfl_encoding_hz = { &vtbl_hz_wchar, &vtbl_wchar_hz, mb_hz_to_wchar, - mb_wchar_to_hz + mb_wchar_to_hz, + NULL }; const struct mbfl_convert_vtbl vtbl_hz_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c index 65b6d66d2ec2e..e3676d30e2904 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c @@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = { &vtbl_2022jpms_wchar, &vtbl_wchar_2022jpms, mb_iso2022jpms_to_wchar, - mb_wchar_to_iso2022jpms + mb_wchar_to_iso2022jpms, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c index c4b2bf0b9f1b9..d51fd720e9704 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c @@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_2022kr = { &vtbl_2022kr_wchar, &vtbl_wchar_2022kr, mb_iso2022kr_to_wchar, - mb_wchar_to_iso2022kr + mb_wchar_to_iso2022kr, + NULL }; const struct mbfl_convert_vtbl vtbl_wchar_2022kr = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c index ded492cafc13a..91ed4e1e84a21 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = { &vtbl_2022jp_kddi_wchar, &vtbl_wchar_2022jp_kddi, mb_iso2022jp_kddi_to_wchar, - mb_wchar_to_iso2022jp_kddi + mb_wchar_to_iso2022jp_kddi, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_jis.c b/ext/mbstring/libmbfl/filters/mbfilter_jis.c index fc5f18aeb5d1c..80af0e695644c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_jis.c +++ 
b/ext/mbstring/libmbfl/filters/mbfilter_jis.c @@ -37,6 +37,8 @@ static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter); static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static bool mb_check_iso2022jp(unsigned char *in, size_t in_len); +static bool mb_check_jis(unsigned char *in, size_t in_len); const mbfl_encoding mbfl_encoding_jis = { mbfl_no_encoding_jis, @@ -49,6 +51,7 @@ const mbfl_encoding mbfl_encoding_jis = { &vtbl_wchar_jis, mb_iso2022jp_to_wchar, mb_wchar_to_jis, + mb_check_jis }; const mbfl_encoding mbfl_encoding_2022jp = { @@ -61,7 +64,8 @@ const mbfl_encoding mbfl_encoding_2022jp = { &vtbl_2022jp_wchar, &vtbl_wchar_2022jp, mb_iso2022jp_to_wchar, - mb_wchar_to_iso2022jp + mb_wchar_to_iso2022jp, + mb_check_iso2022jp }; const struct mbfl_convert_vtbl vtbl_jis_wchar = { @@ -780,3 +784,161 @@ static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_STORE(buf, out, limit); } + +#define JISX_0201_KANA_SO 5 + +static bool mb_check_jis(unsigned char *in, size_t in_len) /* Returns true only if the in_len bytes at in pass every check below: each escape sequence is one of those recognised here, each double-byte character indexes a non-zero table entry, and the input finishes in ASCII state. JISX_0201_KANA_SO is the state entered by byte 0x0E and left by byte 0x0F. */ +{ + unsigned char *p = in, *e = p + in_len; + unsigned int state = ASCII; + + while (p < e) { + unsigned char c = *p++; + if (c == 0x1B) { + /* ESC seen; this is an escape sequence */ + if (state == JISX_0201_KANA_SO) { /* no escape sequence is accepted between 0x0E and 0x0F */ + return false; + } + if ((e - p) < 2) { /* at least two bytes must follow ESC, so reading c2 and c3 stays in bounds */ + return false; + } + unsigned char c2 = *p++; + if (c2 == '$') { + unsigned char c3 = *p++; + if (c3 == '@' || c3 == 'B') { + state = JISX_0208; + } else if (c3 == '(') { + if (p == e) { /* the four-byte forms need one more byte */ + return false; + } + unsigned char c4 = *p++; + if (c4 == '@' || c4 == 'B') { + state = JISX_0208; + } else if (c4 == 'D') { + state = JISX_0212; + } else { + return false; + } + } else { + return false; + } + } else if (c2 == '(') { + unsigned char c3 
= *p++; + /* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons. + * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */ + if (c3 == 'B' || c3 == 'H') { + state = ASCII; + } else if (c3 == 'J') { + state = JISX_0201_LATIN; + } else if (c3 == 'I') { + state = JISX_0201_KANA; + } else { + return false; + } + } else { + return false; + } + } else if (c == 0xE) { + /* "Kana In" marker */ + if (state != ASCII) { + return false; + } + state = JISX_0201_KANA_SO; + } else if (c == 0xF) { + /* "Kana Out" marker */ + if (state != JISX_0201_KANA_SO) { + return false; + } + state = ASCII; + } else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) { + if (p == e) { + return false; + } + unsigned char c2 = *p++; + if (c2 > 0x20 && c2 < 0x7F) { + unsigned int s = (c - 0x21)*94 + c2 - 0x21; /* linear table index, 94 entries per lead byte */ + if (state == JISX_0208) { + if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) { + continue; + } + } else { + if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) { + continue; + } + } + return false; + } else { + return false; + } + } else if (c < 0x80) { + continue; + } else if (c >= 0xA1 && c <= 0xDF) { + /* GR-invoked Kana */ + continue; + } else { + return false; + } + } + + return state == ASCII; /* input must finish in ASCII state */ +} + + +static bool mb_check_iso2022jp(unsigned char *in, size_t in_len) /* Same scan as mb_check_jis with a smaller accepted set: only ESC $ @, ESC $ B, ESC ( B and ESC ( J are recognised, and bytes 0x0E, 0x0F and anything >= 0x80 are rejected. */ +{ + unsigned char *p = in, *e = p + in_len; + unsigned int state = ASCII; + + while (p < e) { + unsigned char c = *p++; + if (c == 0x1B) { + /* ESC seen; this is an escape sequence */ + if ((e - p) < 2) { + return false; + } + unsigned char c2 = *p++; + if (c2 == '$') { + unsigned char c3 = *p++; + if (c3 == '@' || c3 == 'B') { + state = JISX_0208; + } else { + return false; + } + } else if (c2 == '(') { + unsigned char c3 = *p++; + if (c3 == 'B') { + state = ASCII; + } else if (c3 == 'J') { + state = JISX_0201_LATIN; + } else { + return false; + } + } else { + return false; + } + } else if (c == 0xE || c == 0xF) { + /* "Kana In" or 
"Kana Out" marker; these are not accepted in ISO-2022-JP. */ + return false; + } else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) { + if (p == e) { + return false; + } + unsigned char c2 = *p++; + if (c2 > 0x20 && c2 < 0x7F) { + unsigned int s = (c - 0x21)*94 + c2 - 0x21; + if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) { + continue; + } + return false; + } else { + return false; + } + } else if (c < 0x80) { + continue; + } else { + return false; + } + } + + return state == ASCII; /* input must finish in ASCII state */ +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c index 5fde30ee80935..2bcddedede337 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c @@ -46,7 +46,8 @@ const mbfl_encoding mbfl_encoding_qprint = { NULL, NULL, mb_qprint_to_wchar, - mb_wchar_to_qprint + mb_wchar_to_qprint, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_qprint = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c index 56c9b2dbc85d9..c5872335a8526 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c @@ -86,7 +86,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int &vtbl_##id##_wchar, \ &vtbl_wchar_##id, \ mb_##id##_to_wchar, \ - mb_wchar_to_##id \ + mb_wchar_to_##id, \ + NULL \ } /* For single-byte encodings which use a conversion table */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index f23a8b08aceab..59399bf7217f0 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_sjis = { &vtbl_sjis_wchar, &vtbl_wchar_sjis, mb_sjis_to_wchar, - mb_wchar_to_sjis + mb_wchar_to_sjis, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_wchar = { diff --git 
a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c index 737871eda8a31..bc4d932187061 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c @@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { &vtbl_sjis2004_wchar, &vtbl_wchar_sjis2004, mb_sjis2004_to_wchar, - mb_wchar_to_sjis2004 + mb_wchar_to_sjis2004, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = { @@ -100,7 +101,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = { &vtbl_eucjp2004_wchar, &vtbl_wchar_eucjp2004, mb_eucjp2004_to_wchar, - mb_wchar_to_eucjp2004 + mb_wchar_to_eucjp2004, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = { @@ -133,7 +135,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = { &vtbl_2022jp_2004_wchar, &vtbl_wchar_2022jp_2004, mb_iso2022jp2004_to_wchar, - mb_wchar_to_iso2022jp2004 + mb_wchar_to_iso2022jp2004, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c index 0ff2a198d36c8..8fb569b36c483 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = { &vtbl_sjis_mac_wchar, &vtbl_wchar_sjis_mac, mb_sjismac_to_wchar, - mb_wchar_to_sjismac + mb_wchar_to_sjismac, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c index 448e0a74ca3aa..7e8f412230b5a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c @@ -78,7 +78,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = { &vtbl_sjis_docomo_wchar, &vtbl_wchar_sjis_docomo, mb_sjis_docomo_to_wchar, - mb_wchar_to_sjis_docomo + 
mb_wchar_to_sjis_docomo, + NULL }; const mbfl_encoding mbfl_encoding_sjis_kddi = { @@ -91,7 +92,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = { &vtbl_sjis_kddi_wchar, &vtbl_wchar_sjis_kddi, mb_sjis_kddi_to_wchar, - mb_wchar_to_sjis_kddi + mb_wchar_to_sjis_kddi, + NULL }; const mbfl_encoding mbfl_encoding_sjis_sb = { @@ -104,7 +106,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { &vtbl_sjis_sb_wchar, &vtbl_wchar_sjis_sb, mb_sjis_sb_to_wchar, - mb_wchar_to_sjis_sb + mb_wchar_to_sjis_sb, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c index 3e0d0828cfa62..e6711d82f8a70 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c @@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = { &vtbl_ucs2_wchar, &vtbl_wchar_ucs2, mb_ucs2_to_wchar, - mb_wchar_to_ucs2be + mb_wchar_to_ucs2be, + NULL }; const mbfl_encoding mbfl_encoding_ucs2be = { @@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = { &vtbl_ucs2be_wchar, &vtbl_wchar_ucs2be, mb_ucs2be_to_wchar, - mb_wchar_to_ucs2be + mb_wchar_to_ucs2be, + NULL }; const mbfl_encoding mbfl_encoding_ucs2le = { @@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = { &vtbl_ucs2le_wchar, &vtbl_wchar_ucs2le, mb_ucs2le_to_wchar, - mb_wchar_to_ucs2le + mb_wchar_to_ucs2le, + NULL }; const struct mbfl_convert_vtbl vtbl_ucs2_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c index 90312b8d501d5..410be0ace74f5 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c @@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = { &vtbl_ucs4_wchar, &vtbl_wchar_ucs4, mb_ucs4_to_wchar, - mb_wchar_to_ucs4be + mb_wchar_to_ucs4be, + NULL }; const mbfl_encoding mbfl_encoding_ucs4be = { @@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = { 
&vtbl_ucs4be_wchar, &vtbl_wchar_ucs4be, mb_ucs4be_to_wchar, - mb_wchar_to_ucs4be + mb_wchar_to_ucs4be, + NULL }; const mbfl_encoding mbfl_encoding_ucs4le = { @@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = { &vtbl_ucs4le_wchar, &vtbl_wchar_ucs4le, mb_ucs4le_to_wchar, - mb_wchar_to_ucs4le + mb_wchar_to_ucs4le, + NULL }; const struct mbfl_convert_vtbl vtbl_ucs4_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uhc.c b/ext/mbstring/libmbfl/filters/mbfilter_uhc.c index 2ac351d644cdb..644e0b063d9b9 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uhc.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uhc.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_uhc = { &vtbl_uhc_wchar, &vtbl_wchar_uhc, mb_uhc_to_wchar, - mb_wchar_to_uhc + mb_wchar_to_uhc, + NULL }; const struct mbfl_convert_vtbl vtbl_uhc_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index eddd56f362756..2a7d98721df79 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf16 = { &vtbl_utf16_wchar, &vtbl_wchar_utf16, mb_utf16_to_wchar, - mb_wchar_to_utf16be + mb_wchar_to_utf16be, + NULL }; const mbfl_encoding mbfl_encoding_utf16be = { @@ -62,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf16be = { &vtbl_utf16be_wchar, &vtbl_wchar_utf16be, mb_utf16be_to_wchar, - mb_wchar_to_utf16be + mb_wchar_to_utf16be, + NULL }; const mbfl_encoding mbfl_encoding_utf16le = { @@ -75,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf16le = { &vtbl_utf16le_wchar, &vtbl_wchar_utf16le, mb_utf16le_to_wchar, - mb_wchar_to_utf16le + mb_wchar_to_utf16le, + NULL }; const struct mbfl_convert_vtbl vtbl_utf16_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c index e8cd4ad454f2e..58551c8b3932d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c +++ 
b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf32 = { &vtbl_utf32_wchar, &vtbl_wchar_utf32, mb_utf32_to_wchar, - mb_wchar_to_utf32be + mb_wchar_to_utf32be, + NULL }; const mbfl_encoding mbfl_encoding_utf32be = { @@ -62,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf32be = { &vtbl_utf32be_wchar, &vtbl_wchar_utf32be, mb_utf32be_to_wchar, - mb_wchar_to_utf32be + mb_wchar_to_utf32be, + NULL }; const mbfl_encoding mbfl_encoding_utf32le = { @@ -75,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf32le = { &vtbl_utf32le_wchar, &vtbl_wchar_utf32le, mb_utf32le_to_wchar, - mb_wchar_to_utf32le + mb_wchar_to_utf32le, + NULL }; const struct mbfl_convert_vtbl vtbl_utf32_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c index f5fe261f69d00..57641a4bbe41d 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c @@ -29,10 +29,12 @@ #include "mbfilter.h" #include "mbfilter_utf7.h" +#include "utf7_helper.h" static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static bool mb_check_utf7(unsigned char *in, size_t in_len); static const unsigned char mbfl_base64_table[] = { /* 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', */ @@ -59,7 +61,8 @@ const mbfl_encoding mbfl_encoding_utf7 = { &vtbl_utf7_wchar, &vtbl_wchar_utf7, mb_utf7_to_wchar, - mb_wchar_to_utf7 + mb_wchar_to_utf7, + mb_check_utf7 }; const struct mbfl_convert_vtbl vtbl_utf7_wchar = { @@ -408,16 +411,24 @@ int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter) return 0; } -/* Ways which a Base64-encoded section can end: */ -#define DASH 0xFD -#define ASCII 0xFE -#define ILLEGAL 0xFF - static inline bool 
is_base64_end(unsigned char c) { return c >= DASH; } +static bool is_optional_direct(unsigned char c) +{ + /* Characters that are allowed to be encoded by Base64 or directly encoded */ + return c == '!' || c == '"' || c == '#' || c == '$' || c == '%' || c == '&' || c == '*' || c == ';' || c == '<' || + c == '=' || c == '>' || c == '@' || c == '[' || c == ']' || c == '^' || c == '_' || c == '`' || c == '{' || + c == '|' || c == '}'; +} + +static bool can_end_base64(uint32_t c) /* Defined ahead of decode_base64, which calls it in the hunk below. */ +{ + return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' || c == ':' || c == '?'; +} + static unsigned char decode_base64(unsigned char c) { if (c >= 'A' && c <= 'Z') { @@ -432,6 +443,8 @@ static unsigned char decode_base64(unsigned char c) return 63; } else if (c == '-') { return DASH; + } else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') { + return DIRECT; /* NOTE(review): DIRECT is not defined in this view; presumably it comes from utf7_helper.h - confirm */ } else if (c <= 0x7F) { return ASCII; } @@ -470,7 +483,7 @@ static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t if (n == ILLEGAL) { *out++ = MBFL_BAD_INPUT; - } else if (n == ASCII) { + } else if (n == DIRECT || n == ASCII) { (*p)--; /* Unconsume byte */ } @@ -596,11 +609,6 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf return out - buf; } -static bool can_end_base64(uint32_t c) -{ - return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' 
|| c == ':' || c == '?'; -} - static bool should_direct_encode(uint32_t c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '\0' || c == '/' || c == '-' || can_end_base64(c); @@ -700,3 +708,129 @@ static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_STORE(buf, out, limit); } + +static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate) /* With is_surrogate set, cp must lie in 0xDC00-0xDFFF; with it clear, cp must lie outside that range. Callers pass the value last returned by has_surrogate, which is not defined in this view. */ +{ + if (is_surrogate) { + return cp >= 0xDC00 && cp <= 0xDFFF; + } else { + /* 2nd part of surrogate pair came unexpectedly */ + return !(cp >= 0xDC00 && cp <= 0xDFFF); + } +} + +static bool can_encode_directly(unsigned char c) +{ + return should_direct_encode(c) || is_optional_direct(c) || c == '\0'; /* the NUL test repeats one already made inside should_direct_encode */ +} + +static bool mb_check_utf7(unsigned char *in, size_t in_len) /* Returns true if the in_len bytes at in pass the checks below. Outside Base64 sections every byte must satisfy can_encode_directly or be a + sign; inside them, bytes are read up to eight at a time and decoded into three 16-bit units, each checked by is_utf16_cp_valid. */ +{ + unsigned char *p = in, *e = p + in_len; + bool base64 = false; + bool is_surrogate = false; + + while (p < e) { + if (base64) { + unsigned char n1 = decode_base64(*p++); + if (is_base64_end(n1)) { + if (!is_base64_end_valid(n1, false, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n2 = decode_base64(*p++); + if (is_base64_end(n2) || p == e) { + return false; + } + unsigned char n3 = decode_base64(*p++); + if (is_base64_end(n3)) { + return false; + } + uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2); /* 6 bits of n1, 6 bits of n2, top 4 bits of n3 */ + if (!is_utf16_cp_valid(cp1, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp1, is_surrogate); + if (p == e) { + /* It is an error if trailing padding bits are not zeroes or if we were + * expecting the 2nd part of a surrogate pair when Base64 section ends */ + return !((n3 & 0x3) || is_surrogate); + } + + unsigned char n4 = decode_base64(*p++); + if (is_base64_end(n4)) { + if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n5 = decode_base64(*p++); + if 
(is_base64_end(n5) || p == e) { + return false; + } + unsigned char n6 = decode_base64(*p++); + if (is_base64_end(n6)) { + return false; + } + uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4); /* storing into uint16_t keeps only the low 2 bits of n3 */ + if (!is_utf16_cp_valid(cp2, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp2, is_surrogate); + if (p == e) { + return !((n6 & 0xF) || is_surrogate); + } + + unsigned char n7 = decode_base64(*p++); + if (is_base64_end(n7)) { + if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n8 = decode_base64(*p++); + if (is_base64_end(n8)) { + return false; + } + uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8; /* storing into uint16_t keeps only the low 4 bits of n6 */ + if (!is_utf16_cp_valid(cp3, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp3, is_surrogate); + } else { + /* ASCII text section */ + unsigned char c = *p++; + + if (c == '+') { + if (p == e) { + base64 = true; /* has no effect: the function returns immediately afterwards */ + return !is_surrogate; + } + unsigned char n = decode_base64(*p); + if (n == DASH) { + p++; + } else if (n > DASH) { + /* If a "+" character followed immediately by any character other than base64 or "-" */ + return false; + } else { + base64 = true; + } + } else if (can_encode_directly(c)) { + continue; + } else { + return false; + } + } + } + return !is_surrogate; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c index 850edfbd63a1e..77b65bbeee8a4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c @@ -77,11 +77,13 @@ #include "mbfilter.h" #include "mbfilter_utf7imap.h" +#include "utf7_helper.h" static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); 
static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static bool mb_check_utf7imap(unsigned char *in, size_t in_len); static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL}; @@ -95,7 +97,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = { &vtbl_utf7imap_wchar, &vtbl_wchar_utf7imap, mb_utf7imap_to_wchar, - mb_wchar_to_utf7imap + mb_wchar_to_utf7imap, + mb_check_utf7imap }; const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = { @@ -444,10 +447,6 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter) return 0; } -/* Ways which a Base64-encoded section can end: */ -#define DASH 0xFE -#define ILLEGAL 0xFF - static inline bool is_base64_end(unsigned char c) { return c >= DASH; @@ -732,3 +731,124 @@ static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, MB_CONVERT_BUF_STORE(buf, out, limit); } + +static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate) +{ + if (is_surrogate) { + return cp >= 0xDC00 && cp <= 0xDFFF; + } else if (cp >= 0xDC00 && cp <= 0xDFFF) { + /* 2nd part of surrogate pair came unexpectedly */ + return false; + } else if (cp >= 0x20 && cp <= 0x7E && cp != '&') { + return false; + } + return true; +} + +static bool mb_check_utf7imap(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + bool base64 = false; + bool is_surrogate = false; + + while (p < e) { + if (base64) { + /* Base64 section */ + unsigned char n1 = decode_base64(*p++); + if (is_base64_end(n1)) { + if (!is_base64_end_valid(n1, false, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n2 = decode_base64(*p++); + if (is_base64_end(n2) || p == e) { + return false; + } + unsigned char n3 = decode_base64(*p++); + if (is_base64_end(n3)) { + return false; + } + uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2); + if (!is_utf16_cp_valid(cp1, is_surrogate)) { + return false; + } + 
is_surrogate = has_surrogate(cp1, is_surrogate); + if (p == e) { + return false; + } + + unsigned char n4 = decode_base64(*p++); + if (is_base64_end(n4)) { + if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n5 = decode_base64(*p++); + if (is_base64_end(n5) || p == e) { + return false; + } + unsigned char n6 = decode_base64(*p++); + if (is_base64_end(n6)) { + return false; + } + uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4); + if (!is_utf16_cp_valid(cp2, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp2, is_surrogate); + if (p == e) { + return false; + } + + unsigned char n7 = decode_base64(*p++); + if (is_base64_end(n7)) { + if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n8 = decode_base64(*p++); + if (is_base64_end(n8)) { + return false; + } + uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8; + if (!is_utf16_cp_valid(cp3, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp3, is_surrogate); + } else { + /* ASCII text section */ + unsigned char c = *p++; + + if (c == '&') { + if (p == e) { + return false; + } + unsigned char n = decode_base64(*p); + if (n == DASH) { + p++; + } else if (n == ILLEGAL) { + return false; + } else { + base64 = true; + } + } else if (c >= 0x20 && c <= 0x7E) { + continue; + } else { + return false; + } + } + } + return !base64; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 6c7bad0e805ac..44df3ab4257cd 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_utf8 = { &vtbl_utf8_wchar, &vtbl_wchar_utf8, mb_utf8_to_wchar, - mb_wchar_to_utf8 + mb_wchar_to_utf8, + NULL }; const struct 
mbfl_convert_vtbl vtbl_utf8_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c index c573ec70f3bc9..59e2676208b90 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c @@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = { &vtbl_utf8_docomo_wchar, &vtbl_wchar_utf8_docomo, mb_utf8_docomo_to_wchar, - mb_wchar_to_utf8_docomo + mb_wchar_to_utf8_docomo, + NULL }; const mbfl_encoding mbfl_encoding_utf8_kddi_a = { @@ -76,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = { &vtbl_utf8_kddi_a_wchar, &vtbl_wchar_utf8_kddi_a, mb_utf8_kddi_a_to_wchar, - mb_wchar_to_utf8_kddi_a + mb_wchar_to_utf8_kddi_a, + NULL }; const mbfl_encoding mbfl_encoding_utf8_kddi_b = { @@ -89,7 +91,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = { &vtbl_utf8_kddi_b_wchar, &vtbl_wchar_utf8_kddi_b, mb_utf8_kddi_b_to_wchar, - mb_wchar_to_utf8_kddi_b + mb_wchar_to_utf8_kddi_b, + NULL }; const mbfl_encoding mbfl_encoding_utf8_sb = { @@ -102,7 +105,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = { &vtbl_utf8_sb_wchar, &vtbl_wchar_utf8_sb, mb_utf8_sb_to_wchar, - mb_wchar_to_utf8_sb + mb_wchar_to_utf8_sb, + NULL }; const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c index cc90997c2fcae..83a56977d3e0e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c @@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_uuencode = { NULL, NULL, mb_uuencode_to_wchar, - mb_wchar_to_uuencode + mb_wchar_to_uuencode, + NULL }; const struct mbfl_convert_vtbl vtbl_uuencode_8bit = { diff --git a/ext/mbstring/libmbfl/filters/utf7_helper.h b/ext/mbstring/libmbfl/filters/utf7_helper.h new file mode 100644 index 0000000000000..0e71a5a449031 --- /dev/null +++ 
b/ext/mbstring/libmbfl/filters/utf7_helper.h @@ -0,0 +1,22 @@ +#ifndef MBFL_UTF7_HELPER_H +#define MBFL_UTF7_HELPER_H + +#include "mbfilter.h" + +/* Ways which a Base64-encoded section can end: */ +#define DASH 0xFC +#define DIRECT 0xFD +#define ASCII 0xFE +#define ILLEGAL 0xFF + +static inline bool is_base64_end_valid(unsigned char n, bool gap, bool is_surrogate) +{ + return !(gap || is_surrogate || n == ASCII || n == ILLEGAL); +} + +static inline bool has_surrogate(uint16_t cp, bool is_surrogate) +{ + return !is_surrogate && cp >= 0xD800 && cp <= 0xDBFF; +} + +#endif /* MBFL_UTF7_HELPER_H */ diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index e0cfa13e0b4f6..5f5ce07ce6f66 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -312,6 +312,16 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str unsigned char *p = string->val; int bad = 0; + if (identd->strict) { + for (int i = 0; i < num; i++) { + mbfl_convert_filter *filter = identd->filter_list[i]; + mbfl_encoding_detector_data *data = &identd->filter_data[i]; + if (filter->from->check != NULL && !(filter->from->check)(p, n)) { + data->num_illegalchars++; + } + } + } + while (n--) { for (int i = 0; i < num; i++) { mbfl_convert_filter *filter = identd->filter_list[i]; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c index 8fe51c9fd4cbb..43db2f7f5b20b 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c @@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_8bit = { &vtbl_8bit_wchar, &vtbl_wchar_8bit, mb_8bit_to_wchar, - mb_wchar_to_8bit + mb_wchar_to_8bit, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_wchar = { diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c index 3fb7e991141cd..b932603e1c5f4 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c +++ 
b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c @@ -44,6 +44,7 @@ const mbfl_encoding mbfl_encoding_pass = { NULL, NULL, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c index 5472d792a83cb..2bd9cca7b5b2a 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c @@ -42,5 +42,6 @@ const mbfl_encoding mbfl_encoding_wchar = { NULL, NULL, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index e5ae285098ea0..f66e85acd8a81 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -143,6 +143,7 @@ typedef struct { typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state); typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end); +typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len); /* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`, * the buffer must be at least this size (to work with all supported text encodings) */ @@ -232,6 +233,7 @@ typedef struct { const struct mbfl_convert_vtbl *output_filter; mb_to_wchar_fn to_wchar; mb_from_wchar_fn from_wchar; + mb_check_fn check; } mbfl_encoding; MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name); diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index a9ffbef13317b..f53031aaf9157 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -4412,6 +4412,10 @@ MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const m unsigned char *in = (unsigned char*)input; unsigned int state = 0; + if (encoding->check != NULL) { + return encoding->check(in, length); + } + /* If the input string is not encoded in the given encoding, there is a significant chance * that this will be seen in 
the first bytes. Therefore, rather than converting an entire * buffer of 128 codepoints, convert and check just a few codepoints first */ diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt new file mode 100644 index 0000000000000..2930942c12c5a --- /dev/null +++ b/ext/mbstring/tests/gh10192_utf7.phpt @@ -0,0 +1,542 @@ +--TEST-- +GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1) +--EXTENSIONS-- +mbstring +--FILE-- + 'A + B', + 'non-base64 character after -' => 'A - B', + 'base64 character before +' => 'A 1+ B', + 'base64 character before -' => 'A 1- B', + 'base64 character after +' => 'A +1 B', + 'base64 character after -' => 'A -1 B', + 'base64 character before and after +' => 'A 1+1 B', + 'base64 character before and after -' => 'A 1-1 B', + 'string ends with +' => 'A +', + 'string ends with -' => 'A -', + '+ and -' => 'A +- B', + '- and +' => 'A -+ B', + 'valid direct encoding character =' => 'A = B', + 'invalid direct encoding character ~' => 'A ~ B', + 'invalid direct encoding character \\' => 'A \\ B', + 'invalid direct encoding character ESC' => "A \x1b B", + 'valid direct encoding character = after +' => 'A += B', + 'invalid direct encoding character ~ after +' => 'A +~ B', + 'invalid direct encoding character \\ after +' => 'A +\\ B', + 'invalid direct encoding character ESC after +' => "A +\x1b B", + 'valid base64 character between + and -' => 'A +ZeVnLIqe- B', // 日本語 in UTF-16BE + 'invalid base64 character between + and -' => 'A +ZeVnLIq- B', // 日本語 in UTF-16BE without the last character + 'valid base64 character between + and non-base64 character' => 'A +ZeVnLIqe B', + 'invalid base64 character between + and non-base64 character' => 'A +ZeVnLIq B', + 'valid base64 character between + and base64 character' => 'A +ZeVnLIqe1 B', + 'invalid base64 character between + and base64 character' => 'A +ZeVnLIq1 B', + 'valid base64 character between + and end of string' => 'A +ZeVnLIqe', + 'invalid 
base64 character between + and end of string' => 'A +ZeVnLIq', + 'valid base64 character consisting only of + between + and -' => 'A +++++++++- B', + 'invalid base64 character consisting only of + between + and -' => 'A +++++++++- B', + 'valid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B', + 'invalid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B', + 'valid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B', + 'invalid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B', + 'valid base64 character consisting only of + between + and end of string' => 'A +++++++++', + 'invalid base64 character consisting only of + between + and end of string' => 'A +++++++++', + 'valid base64 character using surrogate pair between + and -' => 'A +2GfePQ- B', // 𩸽 in UTF-16BE + 'first 16 bits of base64 character using surrogate pair between + and -' => 'A +2Gc- B', // first 16 bits of 𩸽 in UTF-16BE + 'valid base64 character using surrogate pair between + and non-base64 character' => 'A +2GfePQ B', + 'first 16 bits of base64 character using surrogate pair between + and non-base64 character' => 'A +2Gc B', + 'valid base64 character using surrogate pair between + and base64 character' => 'A +2GfePQ1 B', + 'first 16 bits of base64 character using surrogate pair between + and base64 character' => 'A +2Gc1 B', + 'valid base64 character using surrogate pair between + and end of string' => 'A +2GfePQ', + 'first 16 bits of base64 character using surrogate pair between + and end of string' => 'A +2Gc', + 'invalid base64 character using surrogate pair in reverse order between + and -' => 'A +3j3YZw- B', // 𩸽 in reverse order in UTF-16BE + 'last 16 bits of base64 character using surrogate pair in reverse order between + and -' => 'A +3j0- B', // last 16 bits of 𩸽 in UTF-16BE + 'invalid base64 character using surrogate pair in 
reverse order between + and non-base64 character' => 'A +3j3YZw B', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character' => 'A +3j0 B', + 'invalid base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j3YZw1 B', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j01 B', + 'invalid base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j3YZw', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j0' +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', true)); + var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', false)); + var_dump(mb_detect_encoding($case, 'UTF-7', true)); + var_dump(mb_detect_encoding($case, 'UTF-7', false)); + var_dump(mb_check_encoding($case, 'UTF-7')); + var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF-7'), "\0..\37\177")); + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} +?> +--EXPECT-- +non-base64 character after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(4) "A B" +int(0) + +non-base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A - B" +int(0) + +base64 character before + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A 1 B" +int(0) + +base64 character before - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A 1- B" +int(0) + +base64 character after + +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? 
B" +int(1) + +base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A -1 B" +int(1) + +base64 character before and after + +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A 1? B" +int(2) + +base64 character before and after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(7) "A 1-1 B" +int(2) + +string ends with + +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(2) "A " +int(2) + +string ends with - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(3) "A -" +int(2) + ++ and - +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A + B" +int(2) + +- and + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A - B" +int(2) + +valid direct encoding character = +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A = B" +int(2) + +invalid direct encoding character ~ +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ~ B" +int(2) + +invalid direct encoding character \ +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A \ B" +int(2) + +invalid direct encoding character ESC +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(8) "A \033 B" +int(2) + +valid direct encoding character = after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A = B" +int(2) + +invalid direct encoding character ~ after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ~ B" +int(2) + +invalid direct encoding character \ after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) 
+string(5) "UTF-7" +bool(false) +string(5) "A \ B" +int(2) + +invalid direct encoding character ESC after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(8) "A \033 B" +int(2) + +valid base64 character between + and - +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本語 B" +int(2) + +invalid base64 character between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(11) "A 日本? B" +int(3) + +valid base64 character between + and non-base64 character +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本語 B" +int(3) + +invalid base64 character between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(11) "A 日本? B" +int(4) + +valid base64 character between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A 日本語? B" +int(5) + +invalid base64 character between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本誵 B" +int(5) + +valid base64 character between + and end of string +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A 日本語" +int(5) + +invalid base64 character between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(9) "A 日本?" 
+int(6) + +valid base64 character consisting only of + between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +invalid base64 character consisting only of + between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +valid base64 character consisting only of + between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +invalid base64 character consisting only of + between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +valid base64 character consisting only of + between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A ﯯ뻻? B" +int(7) + +invalid base64 character consisting only of + between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A ﯯ뻻? B" +int(8) + +valid base64 character consisting only of + between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A ﯯ뻻" +int(8) + +invalid base64 character consisting only of + between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A ﯯ뻻" +int(8) + +valid base64 character using surrogate pair between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(8) "A 𩸽 B" +int(8) + +first 16 bits of base64 character using surrogate pair between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? 
B" +int(9) + +valid base64 character using surrogate pair between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(8) "A 𩸽 B" +int(9) + +first 16 bits of base64 character using surrogate pair between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(10) + +valid base64 character using surrogate pair between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(9) "A 𩸽? B" +int(11) + +first 16 bits of base64 character using surrogate pair between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(12) + +valid base64 character using surrogate pair between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A 𩸽" +int(12) + +first 16 bits of base64 character using surrogate pair between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(3) "A ?" +int(13) + +invalid base64 character using surrogate pair in reverse order between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(15) + +last 16 bits of base64 character using surrogate pair in reverse order between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(16) + +invalid base64 character using surrogate pair in reverse order between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? 
B" +int(18) + +last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(19) + +invalid base64 character using surrogate pair in reverse order between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(21) + +last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(23) + +invalid base64 character using surrogate pair in reverse order between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(4) "A ??" +int(25) + +last 16 bits of base64 character using surrogate pair in reverse order between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(3) "A ?" 
+int(26) diff --git a/ext/mbstring/tests/gh10192_utf7imap.phpt b/ext/mbstring/tests/gh10192_utf7imap.phpt new file mode 100644 index 0000000000000..c4f50884f6daf --- /dev/null +++ b/ext/mbstring/tests/gh10192_utf7imap.phpt @@ -0,0 +1,423 @@ +--TEST-- +GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1) +--EXTENSIONS-- +mbstring +--FILE-- + 'A & B', + 'non-base64 character after -' => 'A - B', + 'base64 character before &' => 'A 1& B', + 'base64 character before -' => 'A 1- B', + 'base64 character after &' => 'A &1 B', + 'base64 character after -' => 'A -1 B', + 'base64 character before and after &' => 'A 1&1 B', + 'base64 character before and after -' => 'A 1-1 B', + 'string ends with &' => 'A &', + 'string ends with -' => 'A -', + '& and -' => 'A &- B', + '- and &' => 'A -& B', + 'valid direct encoding character ~' => 'A ~ B', + 'invalid direct encoding character ESC' => "A \x1b B", + 'valid direct encoding character ~ after &' => 'A &~ B', + 'invalid direct encoding character ESC after &' => "A &\x1b B", + 'valid base64 character between & and -' => 'A &ZeVnLIqe- B', // 日本語 in UTF-16BE + 'invalid base64 character between & and -' => 'A &ZeVnLIq- B', // 日本語 in UTF-16BE without the last character + 'valid base64 character between & and non-base64 character' => 'A &ZeVnLIqe B', + 'invalid base64 character between & and non-base64 character' => 'A &ZeVnLIq B', + 'valid base64 character between & and base64 character' => 'A &ZeVnLIqe1 B', + 'invalid base64 character between & and base64 character' => 'A &ZeVnLIq1 B', + 'valid base64 character between & and end of string' => 'A &ZeVnLIqe', + 'invalid base64 character between & and end of string' => 'A &ZeVnLIq', + 'valid base64 character using surrogate pair between & and -' => 'A &2GfePQ- B', // 𩸽 in UTF-16BE + 'first 16 bits of base64 character using surrogate pair between & and -' => 'A &2Gc- B', // first 16 bits of 𩸽 in UTF-16BE + 'valid base64 character using surrogate pair between & 
and non-base64 character' => 'A &2GfePQ B', + 'first 16 bits of base64 character using surrogate pair between & and non-base64 character' => 'A &2Gc B', + 'valid base64 character using surrogate pair between & and base64 character' => 'A &2GfePQ1 B', + 'first 16 bits of base64 character using surrogate pair between & and base64 character' => 'A &2Gc1 B', + 'valid base64 character using surrogate pair between & and end of string' => 'A &2GfePQ', + 'first 16 bits of base64 character using surrogate pair between & and end of string' => 'A &2Gc', + 'invalid base64 character using surrogate pair in reverse order between & and -' => 'A &3j3YZw- B', // 𩸽 in reverse order in UTF-16BE + 'last 16 bits of base64 character using surrogate pair in reverse order between & and -' => 'A &3j0- B', // last 16 bits of 𩸽 in UTF-16BE + 'invalid base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j3YZw B', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j0 B', + 'invalid base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j3YZw1 B', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j01 B', + 'invalid base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j3YZw', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j0' +]; + +foreach ($testcases as $title => $case) { + echo $title . 
PHP_EOL; + var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', true)); + var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', false)); + var_dump(mb_detect_encoding($case, 'UTF7-IMAP', true)); + var_dump(mb_detect_encoding($case, 'UTF7-IMAP', false)); + var_dump(mb_check_encoding($case, 'UTF7-IMAP')); + var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF7-IMAP'), "\0..\37\177")); + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} + +?> +--EXPECT-- +non-base64 character after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(1) + +non-base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A - B" +int(1) + +base64 character before & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A 1?B" +int(2) + +base64 character before - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(6) "A 1- B" +int(2) + +base64 character after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(3) + +base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(6) "A -1 B" +int(3) + +base64 character before and after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A 1?B" +int(4) + +base64 character before and after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(7) "A 1-1 B" +int(4) + +string ends with & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(3) "A ?" 
+int(5) + +string ends with - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(3) "A -" +int(5) + +& and - +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A & B" +int(5) + +- and & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A -?B" +int(6) + +valid direct encoding character ~ +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A ~ B" +int(6) + +invalid direct encoding character ESC +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(7) + +valid direct encoding character ~ after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(8) + +invalid direct encoding character ESC after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(9) + +valid base64 character between & and - +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(13) "A 日本語 B" +int(9) + +invalid base64 character between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(11) "A 日本? 
B" +int(10) + +valid base64 character between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本語?B" +int(11) + +invalid base64 character between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(10) "A 日本?B" +int(12) + +valid base64 character between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本語?B" +int(13) + +invalid base64 character between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本誵?B" +int(14) + +valid base64 character between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(12) "A 日本語?" +int(15) + +invalid base64 character between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(9) "A 日本?" +int(16) + +valid base64 character using surrogate pair between & and - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(8) "A 𩸽 B" +int(16) + +first 16 bits of base64 character using surrogate pair between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? 
B" +int(17) + +valid base64 character using surrogate pair between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(8) "A 𩸽?B" +int(18) + +first 16 bits of base64 character using surrogate pair between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(19) + +valid base64 character using surrogate pair between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(8) "A 𩸽?B" +int(20) + +first 16 bits of base64 character using surrogate pair between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(21) + +valid base64 character using surrogate pair between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(7) "A 𩸽?" +int(22) + +first 16 bits of base64 character using surrogate pair between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ??" +int(24) + +invalid base64 character using surrogate pair in reverse order between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(6) "A ?? B" +int(26) + +last 16 bits of base64 character using surrogate pair in reverse order between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? 
B" +int(27) + +invalid base64 character using surrogate pair in reverse order between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(29) + +last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(31) + +invalid base64 character using surrogate pair in reverse order between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(33) + +last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(35) + +invalid base64 character using surrogate pair in reverse order between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ???" +int(38) + +last 16 bits of base64 character using surrogate pair in reverse order between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ??" 
+int(40) diff --git a/ext/mbstring/tests/gh10648.phpt b/ext/mbstring/tests/gh10648.phpt new file mode 100644 index 0000000000000..9f0b4b4db153a --- /dev/null +++ b/ext/mbstring/tests/gh10648.phpt @@ -0,0 +1,155 @@ +--TEST-- +GH-10648 (mb_check_encoding() returns true for incorrect but interpretable ISO-2022-JP byte sequences) +--EXTENSIONS-- +mbstring +--FILE-- +<?php +$testcases = [ + 'ISO-2022-JP bytes' => '1b244224221b2842', // 'あ' in ISO-2022-JP + 'ISO-2022-JP bytes without escape sequence' => '1b24422422', // 'あ' in ISO-2022-JP, but missing the closing ESC ( B that returns to ASCII + 'JIS X 0201 7bit kana with escape sequence' => '1b2849311b2842', // 'ア' in JIS + 'JIS X 0201 7bit kana with SO/SI' => '0e310f', // 'ア' in JIS + 'JIS X 0201 8bit kana' => 'b1', // 'ア' in JIS + 'JIS X 0201 7bit kana with SO and ESC' => '0e311b2842', // 'ア' in JIS + 'JIS X 0201 7bit kana with ESC and SI' => '1b2849310f', // 'ア' in JIS + 'JIS X 0208 character' => '1b244242641b2842', // '鯛' in JIS and ISO-2022-JP, included in JIS X 0208 + 'JIS X 0212 character' => '1b2428446a591b2842', // '鮋' in JIS, included in JIS X 0212 + 'JIS X 0213 character' => '1b2428507d4c1b2842', // '𩸽' in ISO-2022-JP-2004, included in JIS X 0213 + 'JIS C 6220-1969 ESC ( H' => '1b284a1b2848', // an escape sequence transitioning to ASCII + 'SO/SI when not in ASCII mode' => '1b284a0e0f1b2842', // SO and SI issued after ESC ( J (JIS X 0201 Roman mode), then ESC ( B back to ASCII +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + echo 'JIS:' . PHP_EOL; + var_dump(mb_check_encoding(hex2bin($case), 'JIS')); + echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'JIS'). PHP_EOL; + var_dump(mb_get_info('illegal_chars')); + echo 'ISO-2022-JP:' . PHP_EOL; + var_dump(mb_check_encoding(hex2bin($case), 'ISO-2022-JP')); + echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'ISO-2022-JP'). 
PHP_EOL; + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} +?> +--EXPECT-- +ISO-2022-JP bytes +JIS: +bool(true) +あ +int(0) +ISO-2022-JP: +bool(true) +あ +int(0) + +ISO-2022-JP bytes without escape sequence +JIS: +bool(false) +あ +int(0) +ISO-2022-JP: +bool(false) +あ +int(0) + +JIS X 0201 7bit kana with escape sequence +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with SO/SI +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 8bit kana +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with SO and ESC +JIS: +bool(false) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with ESC and SI +JIS: +bool(false) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0208 character +JIS: +bool(true) +鯛 +int(0) +ISO-2022-JP: +bool(true) +鯛 +int(0) + +JIS X 0212 character +JIS: +bool(true) +鮋 +int(0) +ISO-2022-JP: +bool(false) +鮋 +int(0) + +JIS X 0213 character +JIS: +bool(false) +?$(P}L +int(1) +ISO-2022-JP: +bool(false) +?$(P}L +int(2) + +JIS C 6220-1969 ESC ( H +JIS: +bool(true) + +int(2) +ISO-2022-JP: +bool(false) + +int(2) + +SO/SI when not in ASCII mode +JIS: +bool(false) + +int(2) +ISO-2022-JP: +bool(false) + +int(2) diff --git a/ext/mbstring/tests/iso2022jp_encoding.phpt b/ext/mbstring/tests/iso2022jp_encoding.phpt index 634f0976994c3..5da1899c855b1 100644 --- a/ext/mbstring/tests/iso2022jp_encoding.phpt +++ b/ext/mbstring/tests/iso2022jp_encoding.phpt @@ -50,11 +50,6 @@ function testValid($from, $to, $encoding, $bothWays = true) { /* ESC ( B at the beginning is redundant, since ASCII mode is the default */ if (substr($from, 0, 3) == "\x1B(B") $from = substr($from, 3, strlen($from) - 3); - /* If the string switches to a different charset, it should switch back to - * ASCII at the end */ - if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false) - $from .= "\x1B(B"; - convertValidString($to, $from, 
'UTF-16BE', $encoding, false); } } @@ -66,11 +61,11 @@ function testInvalid($from, $to, $encoding) { for ($i = 0; $i < 0x80; $i++) { if ($i == 0xE || $i == 0xF || $i == 0x1B) continue; - testValid(chr($i), "\x00" . chr($i), 'JIS'); - testValid("\x0F" . chr($i), "\x00" . chr($i), 'JIS'); /* 0xF is 'Shift Out' code */ - testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS'); - testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP'); - testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP'); + testValid(chr($i), "\x00" . chr($i), 'JIS'); + convertValidString("\x0F" . chr($i), "\x00" . chr($i), 'JIS', 'UTF-16BE', false); /* 0xF is 'Shift In' code */ + testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS'); + testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP'); + testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP'); } for ($i = 0x80; $i < 256; $i++) { @@ -92,27 +87,27 @@ echo "ASCII support OK\n"; foreach ($jisx0201Chars as $jisx0201 => $utf16BE) { if (ord($jisx0201) >= 128) { $kana = chr(ord($jisx0201) - 128); - testValid("\x1B(I" . $kana, $utf16BE, 'JIS', false); - testValid("\x0E" . $kana, $utf16BE, 'JIS', false); /* 0xE is 'Shift In' code */ + testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false); + testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */ testValid($jisx0201, $utf16BE, 'JIS', false); } else { - testValid("\x1B(J" . $jisx0201, $utf16BE, 'JIS', $utf16BE > "\x00\x80"); + testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80"); } } for ($i = 0x80; $i < 256; $i++) { if ($i >= 0xA1 && $i <= 0xDF) continue; - testInvalid("\x1B(I" . chr($i), "\x00%", 'JIS'); - testInvalid("\x1B(J" . chr($i), "\x00%", 'JIS'); + testInvalid("\x1B(I" . chr($i) . "\x1B(B", "\x00%", 'JIS'); + testInvalid("\x1B(J" . chr($i) . "\x1B(B", "\x00%", 'JIS'); } echo "JIS X 0201 support OK\n"; /* All valid JISX0208 characters */ foreach ($jisx0208Chars as $jisx0208 => $utf16BE) { - testValid("\x1B\$B" . 
$jisx0208, $utf16BE, 'JIS'); - testValid("\x1B\$B" . $jisx0208, $utf16BE, 'ISO-2022-JP'); + testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'JIS'); + testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'ISO-2022-JP'); } /* All invalid 2-byte JISX0208 characters */ @@ -120,8 +115,8 @@ for ($i = 0x21; $i <= 0x7E; $i++) { for ($j = 0; $j < 256; $j++) { $testString = chr($i) . chr($j); if (!isset($jisx0208Chars[$testString])) { - testInvalid("\x1B\$B" . $testString, "\x00%", 'JIS'); - testInvalid("\x1B\$B" . $testString, "\x00%", 'ISO-2022-JP'); + testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'JIS'); + testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'ISO-2022-JP'); } } } @@ -142,7 +137,7 @@ echo "JIS X 0208 support OK\n"; /* All valid JISX0212 characters */ foreach ($jisx0212Chars as $jisx0212 => $utf16BE) { - testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'JIS', false); + testValid("\x1B\$(D" . $jisx0212 . "\x1B(B", $utf16BE, 'JIS', false); } /* All invalid 2-byte JISX0212 characters */ @@ -150,14 +145,14 @@ for ($i = 0x21; $i <= 0x7E; $i++) { for ($j = 0; $j < 256; $j++) { $testString = chr($i) . chr($j); if (!isset($jisx0212Chars[$testString])) { - testInvalid("\x1B\$(D" . $testString, "\x00%", 'JIS'); + testInvalid("\x1B\$(D" . $testString . "\x1B(B", "\x00%", 'JIS'); } } } /* Try truncated JISX0212 characters */ for ($i = 0x21; $i <= 0x7E; $i++) { - testInvalid("\x1B\$(D" . chr($i), "\x00%", 'JIS'); + testInvalid("\x1B\$(D" . chr($i) . 
"\x1B(B", "\x00%\x00%", 'JIS'); } testValidString("\x00\xA1", "\x1B\$(D\x22\x42\x1B(B", "UTF-16BE", "JIS", false); @@ -167,29 +162,36 @@ convertInvalidString("\x00\xA1", "%", "UTF-16BE", "ISO-2022-JP", false); echo "JIS X 0212 support OK\n"; /* All possible escape sequences */ -$validEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true]; +$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true]; +$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true]; for ($i = 0; $i <= 0xFF; $i++) { for ($j = 0; $j <= 0xFF; $j++) { $escapeSequence = "\x1B" . chr($i) . chr($j); if ($escapeSequence === "\x1B\$(") continue; - if (isset($validEscapes[$escapeSequence])) { - testValid($escapeSequence, "", 'JIS', false); - testValid($escapeSequence, "", 'ISO-2022-JP', false); + if (isset($validJisEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'JIS', false); + } else { + identifyInvalidString($escapeSequence . "\x1B(B", 'JIS'); + } + if (isset($validIso2022jpEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false); } else { - identifyInvalidString($escapeSequence, 'JIS'); - identifyInvalidString($escapeSequence, 'ISO-2022-JP'); + identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP'); } } } for ($i = 0; $i <= 0xFF; $i++) { $escapeSequence = "\x1B\$(" . chr($i); - if (isset($validEscapes[$escapeSequence])) { - testValid($escapeSequence, "", 'JIS', false); - testValid($escapeSequence, "", 'ISO-2022-JP', false); + if (isset($validJisEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'JIS', false); + } else { + identifyInvalidString($escapeSequence . 
"\x1B(B", 'JIS'); + } + if (isset($validIso2022jpEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false); } else { - identifyInvalidString($escapeSequence, 'JIS'); - identifyInvalidString($escapeSequence, 'ISO-2022-JP'); + identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP'); } } /* Also try a bare ESC */ diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt index 2f050c657c2f3..23166142088cc 100644 --- a/ext/mbstring/tests/utf_encodings.phpt +++ b/ext/mbstring/tests/utf_encodings.phpt @@ -1011,17 +1011,8 @@ testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-' // (Just trying to be exhaustive here) testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false); -// + section terminated by a non-Base64 ASCII character which is NOT - -for ($i = 0; $i < 128; $i++) { - if ($i >= ord('A') && $i <= ord('Z')) - continue; - if ($i >= ord('a') && $i <= ord('z')) - continue; - if ($i >= ord('0') && $i <= ord('9')) - continue; - if ($i == ord('+') || $i == ord('/') || $i == ord('-') || $i == ord('\\') || $i == ord('~')) - continue; - $char = chr($i); +// + section terminated by a non-Base64 direct character which is NOT - +foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $char) { testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false); }