Skip to content

Commit be5aa65

Browse files
committed
Take order of candidate encodings into account when guessing text encoding
1 parent a8838e3 commit be5aa65

File tree

3 files changed

+39
-14
lines changed

3 files changed

+39
-14
lines changed

ext/mbstring/mb_gpc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ const mbfl_encoding *_php_mb_encoding_handler_ex(const php_mb_encoding_handler_i
234234
} else if (info->num_from_encodings == 1) {
235235
from_encoding = info->from_encodings[0];
236236
} else {
237-
from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection));
237+
from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection), true);
238238
if (!from_encoding) {
239239
if (info->report_errors) {
240240
php_error_docref(NULL, E_WARNING, "Unable to detect encoding");

ext/mbstring/mbstring.c

Lines changed: 37 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
9090

9191
static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
9292

93-
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict);
93+
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
9494

9595
static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
9696

@@ -452,7 +452,7 @@ static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *a
452452
list_size = MBSTRG(current_detect_order_list_size);
453453
}
454454

455-
return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding **)list, list_size, false);
455+
return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding **)list, list_size, false, true);
456456
}
457457

458458
static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
@@ -2695,7 +2695,7 @@ MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t leng
26952695
from_encoding = *from_encodings;
26962696
} else {
26972697
/* auto detect */
2698-
from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection));
2698+
from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
26992699
if (!from_encoding) {
27002700
php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
27012701
return NULL;
@@ -2996,9 +2996,10 @@ struct candidate {
29962996
size_t in_len;
29972997
uint64_t demerits; /* Wide bit size to prevent overflow */
29982998
unsigned int state;
2999+
float multiplier;
29993000
};
30003001

3001-
static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict)
3002+
static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
30023003
{
30033004
size_t j = 0;
30043005

@@ -3018,6 +3019,10 @@ static size_t init_candidate_array(struct candidate *array, size_t length, const
30183019
array[j].enc = enc;
30193020
array[j].state = 0;
30203021
array[j].demerits = 0;
3022+
/* This multiplier can optionally be used to make candidate encodings listed
3023+
* first more likely to be chosen. It is a weight factor which multiplies
3024+
* the number of demerits counted for each candidate. */
3025+
array[j].multiplier = order_significant ? 1.0 + ((0.2 * i) / length) : 1.0;
30213026
j++;
30223027
skip_to_next: ;
30233028
}
@@ -3093,10 +3098,14 @@ static size_t count_demerits(struct candidate *array, size_t length, bool strict
30933098
}
30943099
}
30953100

3101+
for (size_t i = 0; i < length; i++) {
3102+
array[i].demerits *= array[i].multiplier;
3103+
}
3104+
30963105
return length;
30973106
}
30983107

3099-
MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict)
3108+
MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
31003109
{
31013110
if (elist_size == 0) {
31023111
return NULL;
@@ -3117,7 +3126,7 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c
31173126

31183127
/* Allocate on stack; when we return, this array is automatically freed */
31193128
struct candidate *array = alloca(elist_size * sizeof(struct candidate));
3120-
elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict);
3129+
elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
31213130

31223131
while (n--) {
31233132
start_string(array, elist_size, strings[n], str_lengths[n]);
@@ -3141,9 +3150,9 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c
31413150
/* When doing 'strict' detection, a candidate encoding is rejected as soon as any input
31423151
* string is invalid in it. With non-strict detection, we just continue, but apply demerits
31433152
* for each invalid byte sequence. */
3144-
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict)
3153+
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
31453154
{
3146-
return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict);
3155+
return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
31473156
}
31483157

31493158
/* {{{ Encodings of the given string is returned (as a string) */
@@ -3162,8 +3171,17 @@ PHP_FUNCTION(mb_detect_encoding)
31623171
Z_PARAM_BOOL(strict)
31633172
ZEND_PARSE_PARAMETERS_END();
31643173

3174+
/* Should we pay attention to the order of the provided candidate encodings and prefer
3175+
* the earlier ones (if more than one candidate encoding matches)?
3176+
* If the entire list of supported encodings returned by `mb_list_encodings` is passed
3177+
* in, then don't treat the order as significant */
3178+
bool order_significant = true;
3179+
31653180
/* make encoding list */
31663181
if (encoding_ht) {
3182+
if (encoding_ht == MBSTRG(all_encodings_list)) {
3183+
order_significant = false;
3184+
}
31673185
if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
31683186
RETURN_THROWS();
31693187
}
@@ -3195,7 +3213,7 @@ PHP_FUNCTION(mb_detect_encoding)
31953213
if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
31963214
ret = &mbfl_encoding_utf8;
31973215
} else {
3198-
ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict);
3216+
ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
31993217
}
32003218

32013219
efree(ZEND_VOIDP(elist));
@@ -3556,8 +3574,15 @@ PHP_FUNCTION(mb_convert_variables)
35563574

35573575
from_encoding = MBSTRG(current_internal_encoding);
35583576

3577+
bool order_significant = true;
3578+
35593579
/* pre-conversion encoding */
35603580
if (from_enc_ht) {
3581+
if (from_enc_ht == MBSTRG(all_encodings_list)) {
3582+
/* If entire list of supported encodings returned by `mb_list_encodings` is passed
3583+
* in, then don't treat the order of the list as significant */
3584+
order_significant = false;
3585+
}
35613586
if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
35623587
RETURN_THROWS();
35633588
}
@@ -3595,7 +3620,7 @@ PHP_FUNCTION(mb_convert_variables)
35953620
RETURN_FALSE;
35963621
}
35973622
}
3598-
from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection));
3623+
from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
35993624
efree(ZEND_VOIDP(val_list));
36003625
efree(len_list);
36013626
if (!from_encoding) {
@@ -4313,7 +4338,7 @@ PHP_FUNCTION(mb_send_mail)
43134338
/* Subject: */
43144339
const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
43154340
if (enc == &mbfl_encoding_pass) {
4316-
enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4341+
enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), true);
43174342
}
43184343
const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
43194344
size_t line_sep_len = strlen(line_sep);
@@ -4323,7 +4348,7 @@ PHP_FUNCTION(mb_send_mail)
43234348
/* message body */
43244349
const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
43254350
if (msg_enc == &mbfl_encoding_pass) {
4326-
msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
4351+
msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), true);
43274352
}
43284353

43294354
unsigned int num_errors = 0;

ext/mbstring/mbstring.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
6767
MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc);
6868
MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding);
6969

70-
MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict);
70+
MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
7171

7272
ZEND_BEGIN_MODULE_GLOBALS(mbstring)
7373
char *internal_encoding_name;

0 commit comments

Comments
 (0)