Skip to content

Commit be1a215

Browse files
committed
Optimize (AND FIX) mb_check_encoding (cut execution time by 50%+)
Previously, `mb_check_encoding` did an awful lot of unneeded work. In order to determine whether a string was valid or not, it would convert the whole string into wchar (code points), which required dynamically allocating a (potentially large) buffer. Then it would turn right around and convert that big ol' buffer of code points back to the original encoding again. Finally, it would check whether any invalid bytes were detected during that long and onerous process. The thing is, mbstring _already_ has machinery for detecting whether a string is valid in a certain encoding or not, and it doesn't require copying any data around or allocating buffers. Better yet, it can fail fast when an invalid byte is found. Why not use it? It sure is a lot faster! Further, the legacy code was also badly broken. Why? Because aside from checking whether illegal characters were detected, it would also check whether the conversion to and from wchars was lossless. But some text encodings have more than one valid byte sequence for the same character. In such cases, it is not possible to make the conversion to and from wchars lossless for every valid character. So `mb_check_encoding` would actually reject good strings in a lot of encodings!
1 parent 335c1b9 commit be1a215

File tree

1 file changed

+15
-51
lines changed

1 file changed

+15
-51
lines changed

ext/mbstring/mbstring.c

Lines changed: 15 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3856,84 +3856,50 @@ PHP_FUNCTION(mb_get_info)
38563856
/* }}} */
38573857

38583858

3859-
static inline mbfl_buffer_converter *php_mb_init_convd(const mbfl_encoding *encoding)
3859+
MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
38603860
{
3861-
mbfl_buffer_converter *convd;
3862-
3863-
convd = mbfl_buffer_converter_new(encoding, encoding, 0);
3864-
if (convd == NULL) {
3865-
return NULL;
3866-
}
3867-
mbfl_buffer_converter_illegal_mode(convd, MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE);
3868-
mbfl_buffer_converter_illegal_substchar(convd, 0);
3869-
return convd;
3870-
}
3871-
3872-
3873-
static inline int php_mb_check_encoding_impl(mbfl_buffer_converter *convd, const char *input, size_t length, const mbfl_encoding *encoding) {
3874-
mbfl_string string, result;
3875-
3876-
mbfl_string_init_set(&string, encoding);
3877-
mbfl_string_init(&result);
3878-
3879-
string.val = (unsigned char *) input;
3880-
string.len = length;
3881-
3882-
mbfl_string *ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
3883-
size_t illegalchars = mbfl_buffer_illegalchars(convd);
3884-
3885-
if (ret != NULL) {
3886-
if (illegalchars == 0 && string.len == result.len && memcmp(string.val, result.val, string.len) == 0) {
3887-
mbfl_string_clear(&result);
3888-
return 1;
3861+
mbfl_identify_filter *ident = mbfl_identify_filter_new2(encoding);
3862+
3863+
while (length--) {
3864+
unsigned char c = *input++;
3865+
(ident->filter_function)(c, ident);
3866+
if (ident->flag) {
3867+
mbfl_identify_filter_delete(ident);
3868+
return 0;
38893869
}
3890-
mbfl_string_clear(&result);
38913870
}
3892-
return 0;
3893-
}
3894-
3895-
MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
3896-
{
3897-
mbfl_buffer_converter *convd = php_mb_init_convd(encoding);
3898-
/* If this assertion fails this means some memory allocation failure which is a bug */
3899-
ZEND_ASSERT(convd != NULL);
39003871

3901-
int result = php_mb_check_encoding_impl(convd, input, length, encoding);
3902-
mbfl_buffer_converter_delete(convd);
3872+
/* String must not end in the middle of a multi-byte character */
3873+
int result = (ident->status == 0);
3874+
mbfl_identify_filter_delete(ident);
39033875
return result;
39043876
}
39053877

39063878
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
39073879
{
3908-
mbfl_buffer_converter *convd;
39093880
zend_long idx;
39103881
zend_string *key;
39113882
zval *entry;
39123883
int valid = 1;
39133884

3914-
(void)(idx);
3915-
3916-
convd = php_mb_init_convd(encoding);
3917-
/* If this assertion fails this means some memory allocation failure which is a bug */
3918-
ZEND_ASSERT(convd != NULL);
3885+
(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */
39193886

39203887
if (GC_IS_RECURSIVE(vars)) {
3921-
mbfl_buffer_converter_delete(convd);
39223888
php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
39233889
return 0;
39243890
}
39253891
GC_TRY_PROTECT_RECURSION(vars);
39263892
ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
39273893
ZVAL_DEREF(entry);
39283894
if (key) {
3929-
if (!php_mb_check_encoding_impl(convd, ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
3895+
if (!php_mb_check_encoding(ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
39303896
valid = 0;
39313897
break;
39323898
}
39333899
}
39343900
switch (Z_TYPE_P(entry)) {
39353901
case IS_STRING:
3936-
if (!php_mb_check_encoding_impl(convd, Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
3902+
if (!php_mb_check_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
39373903
valid = 0;
39383904
break;
39393905
}
@@ -3957,11 +3923,9 @@ static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding
39573923
}
39583924
} ZEND_HASH_FOREACH_END();
39593925
GC_TRY_UNPROTECT_RECURSION(vars);
3960-
mbfl_buffer_converter_delete(convd);
39613926
return valid;
39623927
}
39633928

3964-
39653929
/* {{{ Check if the string is valid for the specified encoding */
39663930
PHP_FUNCTION(mb_check_encoding)
39673931
{

0 commit comments

Comments
 (0)