Skip to content

Commit ea08ef8

Browse files
committed
Optimize mb_check_encoding (cut execution time by 50%+)
Previously, mb_check_encoding did an awful lot of unneeded work. In order to determine whether a string was valid or not, it would convert the whole string into wchar (code points), which required dynamically allocating a (potentially large) buffer. Then it would turn right around and convert that big ol' buffer of code points back to the original encoding again. Finally, it would check whether any invalid bytes were detected during that long and onerous process. The thing is, mbstring _already_ has machinery for detecting whether a string is valid in a certain encoding or not, and it doesn't require copying any data around or allocating buffers. Better yet, it can fail fast when an invalid byte is found. Why not use it? It's sure a lot faster!
1 parent f280c75 commit ea08ef8

File tree

1 file changed

+15
-51
lines changed

1 file changed

+15
-51
lines changed

ext/mbstring/mbstring.c

Lines changed: 15 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3980,84 +3980,50 @@ PHP_FUNCTION(mb_get_info)
39803980
/* }}} */
39813981

39823982

3983-
static inline mbfl_buffer_converter *php_mb_init_convd(const mbfl_encoding *encoding)
3983+
MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
39843984
{
3985-
mbfl_buffer_converter *convd;
3986-
3987-
convd = mbfl_buffer_converter_new(encoding, encoding, 0);
3988-
if (convd == NULL) {
3989-
return NULL;
3990-
}
3991-
mbfl_buffer_converter_illegal_mode(convd, MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE);
3992-
mbfl_buffer_converter_illegal_substchar(convd, 0);
3993-
return convd;
3994-
}
3995-
3996-
3997-
static inline int php_mb_check_encoding_impl(mbfl_buffer_converter *convd, const char *input, size_t length, const mbfl_encoding *encoding) {
3998-
mbfl_string string, result;
3999-
4000-
mbfl_string_init_set(&string, encoding);
4001-
mbfl_string_init(&result);
4002-
4003-
string.val = (unsigned char *) input;
4004-
string.len = length;
4005-
4006-
mbfl_string *ret = mbfl_buffer_converter_feed_result(convd, &string, &result);
4007-
size_t illegalchars = mbfl_buffer_illegalchars(convd);
4008-
4009-
if (ret != NULL) {
4010-
if (illegalchars == 0 && string.len == result.len && memcmp(string.val, result.val, string.len) == 0) {
4011-
mbfl_string_clear(&result);
4012-
return 1;
3985+
mbfl_identify_filter *ident = mbfl_identify_filter_new2(encoding);
3986+
3987+
while (length--) {
3988+
unsigned char c = *input++;
3989+
(ident->filter_function)(c, ident);
3990+
if (ident->flag) {
3991+
mbfl_identify_filter_delete(ident);
3992+
return 0;
40133993
}
4014-
mbfl_string_clear(&result);
40153994
}
4016-
return 0;
4017-
}
4018-
4019-
MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding)
4020-
{
4021-
mbfl_buffer_converter *convd = php_mb_init_convd(encoding);
4022-
/* If this assertion fails this means some memory allocation failure which is a bug */
4023-
ZEND_ASSERT(convd != NULL);
40243995

4025-
int result = php_mb_check_encoding_impl(convd, input, length, encoding);
4026-
mbfl_buffer_converter_delete(convd);
3996+
/* String must not end in the middle of a multi-byte character */
3997+
int result = (ident->status == 0);
3998+
mbfl_identify_filter_delete(ident);
40273999
return result;
40284000
}
40294001

40304002
static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding *encoding)
40314003
{
4032-
mbfl_buffer_converter *convd;
40334004
zend_long idx;
40344005
zend_string *key;
40354006
zval *entry;
40364007
int valid = 1;
40374008

4038-
(void)(idx);
4039-
4040-
convd = php_mb_init_convd(encoding);
4041-
/* If this assertion fails this means some memory allocation failure which is a bug */
4042-
ZEND_ASSERT(convd != NULL);
4009+
(void)(idx); /* Suppress spurious compiler warning that `idx` is not used */
40434010

40444011
if (GC_IS_RECURSIVE(vars)) {
4045-
mbfl_buffer_converter_delete(convd);
40464012
php_error_docref(NULL, E_WARNING, "Cannot not handle circular references");
40474013
return 0;
40484014
}
40494015
GC_TRY_PROTECT_RECURSION(vars);
40504016
ZEND_HASH_FOREACH_KEY_VAL(vars, idx, key, entry) {
40514017
ZVAL_DEREF(entry);
40524018
if (key) {
4053-
if (!php_mb_check_encoding_impl(convd, ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
4019+
if (!php_mb_check_encoding(ZSTR_VAL(key), ZSTR_LEN(key), encoding)) {
40544020
valid = 0;
40554021
break;
40564022
}
40574023
}
40584024
switch (Z_TYPE_P(entry)) {
40594025
case IS_STRING:
4060-
if (!php_mb_check_encoding_impl(convd, Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
4026+
if (!php_mb_check_encoding(Z_STRVAL_P(entry), Z_STRLEN_P(entry), encoding)) {
40614027
valid = 0;
40624028
break;
40634029
}
@@ -4081,11 +4047,9 @@ static int php_mb_check_encoding_recursive(HashTable *vars, const mbfl_encoding
40814047
}
40824048
} ZEND_HASH_FOREACH_END();
40834049
GC_TRY_UNPROTECT_RECURSION(vars);
4084-
mbfl_buffer_converter_delete(convd);
40854050
return valid;
40864051
}
40874052

4088-
40894053
/* {{{ Check if the string is valid for the specified encoding */
40904054
PHP_FUNCTION(mb_check_encoding)
40914055
{

0 commit comments

Comments
 (0)