Take order of candidate encodings into account when guessing text encoding

alexdowad · alexdowad · commit 3ab10da758f0 · 2023-05-16T07:01:07.000-07:00
The documentation for mb_detect_encoding says that this function "Detects the most likely character encoding for string `string` from an ordered list of candidates". Prior to 28b346b, mb_detect_encoding did not really attempt to determine the "most likely" text encoding for the input string. It would just return the first candidate encoding for which the string was valid. In 28b346b, I amended this function so that it uses heuristics to try to guess which candidate encoding is "most likely". However, the caller did not have any way to indicate which candidate text encoding(s) they consider to be more likely, in case the heuristics applied are inconclusive. In the language of Bayesian probability, there was no way for the caller to indicate their 'prior' assignment of probabilities. Further, the documentation for mb_detect_encoding also says that the second parameter `encodings` is "a list of character encodings to try, in order". The documentation clearly implies that the order of the `encodings` argument should be significant. Therefore, amend mb_detect_encoding so that while it still uses heuristics to guess the most likely text encoding for the input string, it favors those which are earlier in the list of candidate encodings. One complication is that many callers of mb_detect_encoding use it in this way: mb_detect_encoding($string, mb_list_encodings()); In a majority of cases, this is bad code; mb_detect_encoding will both be much slower and the results will be less reliable than if a smaller list of candidates is used. However, since such code already exists and people are using it in production, we should not unnecessarily break it. The order of candidate encodings obviously does not express any prior belief of which candidates are more likely in this case, and treating it as if it did will degrade the accuracy of the result. Since mb_list_encodings now returns a single, immutable array on each call, we can avoid that problem by turning off the new behavior when we receive the array of encodings returned by mb_list_encodings. This implementation means that if the user does this: $a = mb_list_encodings(); mb_detect_encoding($string, $a); ...then the order of candidate encodings will not be considered. However, if the user explicitly initializes their own array of all supported legacy text encodings, then the order *will* be considered. The other functions which also follow this new behavior are: • mb_convert_variables • mb_convert_encoding (when multiple candidate input encodings are listed) Other places where "detection" (or really "guessing") of text encoding may be performed include: • mb_send_mail • Zend engine, when determining the encoding of a PHP script • mbstring processing of HTTP request contents, when http_input INI parameter is set to a list In these cases, the new logic based on order of candidate encodings is *not* enabled. It *might* be logical to consider the order of candidate encodings in some or all of these cases, but I'm not sure if that is true, so it seems wiser to avoid more behavior changes than is necessary. Further, ever since the new encoding detection heuristics were implemented in 28b346b, we have not received any complaints of user code being broken in these areas. So I am reluctant to "fix what isn't broken". Well, some might say that applying the new detection heuristics to mb_send_mail, etc. in 28b346b was "fixing what wasn't broken", but (cough cough) I don't have any comment on that...
diff --git a/ext/mbstring/mb_gpc.c b/ext/mbstring/mb_gpc.c
@@ -234,7 +234,7 @@ const mbfl_encoding *_php_mb_encoding_handler_ex(const php_mb_encoding_handler_i
 	} else if (info->num_from_encodings == 1) {
 		from_encoding = info->from_encodings[0];
 	} else {
-		from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection));
+		from_encoding = mb_guess_encoding_for_strings((const unsigned char**)val_list, len_list, num, info->from_encodings, info->num_from_encodings, MBSTRG(strict_detection), false);
 		if (!from_encoding) {
 			if (info->report_errors) {
 				php_error_docref(NULL, E_WARNING, "Unable to detect encoding");
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -90,7 +90,7 @@ static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
 
 static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encoding);
 
-static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict);
+static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
 
 static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
 
@@ -452,7 +452,7 @@ static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *a
 		list_size = MBSTRG(current_detect_order_list_size);
 	}
 
-	return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding **)list, list_size, false);
+	return (const zend_encoding*)mb_guess_encoding((unsigned char*)arg_string, arg_length, (const mbfl_encoding **)list, list_size, false, false);
 }
 
 static size_t php_mb_zend_encoding_converter(unsigned char **to, size_t *to_length, const unsigned char *from, size_t from_length, const zend_encoding *encoding_to, const zend_encoding *encoding_from)
@@ -2695,7 +2695,7 @@ MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t leng
 		from_encoding = *from_encodings;
 	} else {
 		/* auto detect */
-		from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection));
+		from_encoding = mb_guess_encoding((unsigned char*)input, length, from_encodings, num_from_encodings, MBSTRG(strict_detection), true);
 		if (!from_encoding) {
 			php_error_docref(NULL, E_WARNING, "Unable to detect character encoding");
 			return NULL;
@@ -2996,9 +2996,10 @@ struct candidate {
 	size_t in_len;
 	uint64_t demerits; /* Wide bit size to prevent overflow */
 	unsigned int state;
+	float multiplier;
 };
 
-static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict)
+static size_t init_candidate_array(struct candidate *array, size_t length, const mbfl_encoding **encodings, const unsigned char **in, size_t *in_len, size_t n, bool strict, bool order_significant)
 {
 	size_t j = 0;
 
@@ -3018,6 +3019,10 @@ static size_t init_candidate_array(struct candidate *array, size_t length, const
 		array[j].enc = enc;
 		array[j].state = 0;
 		array[j].demerits = 0;
+		/* This multiplier can optionally be used to make candidate encodings listed
+		 * first more likely to be chosen. It is a weight factor which multiplies
+		 * the number of demerits counted for each candidate. */
+		array[j].multiplier = order_significant ? 1.0 + ((0.3 * i) / length) : 1.0;
 		j++;
 skip_to_next: ;
 	}
@@ -3093,10 +3098,14 @@ static size_t count_demerits(struct candidate *array, size_t length, bool strict
 		}
 	}
 
+	for (size_t i = 0; i < length; i++) {
+		array[i].demerits *= array[i].multiplier;
+	}
+
 	return length;
 }
 
-MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict)
+MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
 {
 	if (elist_size == 0) {
 		return NULL;
@@ -3117,7 +3126,7 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c
 
 	/* Allocate on stack; when we return, this array is automatically freed */
 	struct candidate *array = alloca(elist_size * sizeof(struct candidate));
-	elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict);
+	elist_size = init_candidate_array(array, elist_size, elist, strings, str_lengths, n, strict, order_significant);
 
 	while (n--) {
 		start_string(array, elist_size, strings[n], str_lengths[n]);
@@ -3141,9 +3150,9 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c
 /* When doing 'strict' detection, any string which is invalid in the candidate encoding
  * is rejected. With non-strict detection, we just continue, but apply demerits for
  * each invalid byte sequence */
-static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict)
+static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant)
 {
-	return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict);
+	return mb_guess_encoding_for_strings((const unsigned char**)&in, &in_len, 1, elist, elist_size, strict, order_significant);
 }
 
 /* {{{ Encodings of the given string is returned (as a string) */
@@ -3162,8 +3171,17 @@ PHP_FUNCTION(mb_detect_encoding)
 		Z_PARAM_BOOL(strict)
 	ZEND_PARSE_PARAMETERS_END();
 
+	/* Should we pay attention to the order of the provided candidate encodings and prefer
+	 * the earlier ones (if more than one candidate encoding matches)?
+	 * If the entire list of supported encodings returned by `mb_list_encodings` is passed
+	 * in, then don't treat the order as significant */
+	bool order_significant = true;
+
 	/* make encoding list */
 	if (encoding_ht) {
+		if (encoding_ht == MBSTRG(all_encodings_list)) {
+			order_significant = false;
+		}
 		if (FAILURE == php_mb_parse_encoding_array(encoding_ht, &elist, &size, 2)) {
 			RETURN_THROWS();
 		}
@@ -3195,7 +3213,7 @@ PHP_FUNCTION(mb_detect_encoding)
 	if (size == 1 && *elist == &mbfl_encoding_utf8 && (GC_FLAGS(str) & IS_STR_VALID_UTF8)) {
 		ret = &mbfl_encoding_utf8;
 	} else {
-		ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict);
+		ret = mb_guess_encoding((unsigned char*)ZSTR_VAL(str), ZSTR_LEN(str), elist, size, strict, order_significant);
 	}
 
 	efree(ZEND_VOIDP(elist));
@@ -3556,8 +3574,15 @@ PHP_FUNCTION(mb_convert_variables)
 
 	from_encoding = MBSTRG(current_internal_encoding);
 
+	bool order_significant = true;
+
 	/* pre-conversion encoding */
 	if (from_enc_ht) {
+		if (from_enc_ht == MBSTRG(all_encodings_list)) {
+			/* If entire list of supported encodings returned by `mb_list_encodings` is passed
+			 * in, then don't treat the order of the list as significant */
+			order_significant = false;
+		}
 		if (php_mb_parse_encoding_array(from_enc_ht, &elist, &elistsz, 2) == FAILURE) {
 			RETURN_THROWS();
 		}
@@ -3595,7 +3620,7 @@ PHP_FUNCTION(mb_convert_variables)
 				RETURN_FALSE;
 			}
 		}
-		from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection));
+		from_encoding = mb_guess_encoding_for_strings(val_list, len_list, num, elist, elistsz, MBSTRG(strict_detection), order_significant);
 		efree(ZEND_VOIDP(val_list));
 		efree(len_list);
 		if (!from_encoding) {
@@ -4313,7 +4338,7 @@ PHP_FUNCTION(mb_send_mail)
 	/* Subject: */
 	const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
 	if (enc == &mbfl_encoding_pass) {
-		enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
+		enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
 	}
 	const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
 	size_t line_sep_len = strlen(line_sep);
@@ -4323,7 +4348,7 @@ PHP_FUNCTION(mb_send_mail)
 	/* message body */
 	const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
 	if (msg_enc == &mbfl_encoding_pass) {
-		msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
+		msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection), false);
 	}
 
 	unsigned int num_errors = 0;
diff --git a/ext/mbstring/mbstring.h b/ext/mbstring/mbstring.h
@@ -67,7 +67,7 @@ MBSTRING_API size_t php_mb_mbchar_bytes(const char *s, const mbfl_encoding *enc)
 MBSTRING_API size_t php_mb_stripos(bool mode, zend_string *haystack, zend_string *needle, zend_long offset, const mbfl_encoding *enc);
 MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const mbfl_encoding *encoding);
 
-MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict);
+MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned char **strings, size_t *str_lengths, size_t n, const mbfl_encoding **elist, unsigned int elist_size, bool strict, bool order_significant);
 
 ZEND_BEGIN_MODULE_GLOBALS(mbstring)
 	char *internal_encoding_name;
diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt
@@ -88,6 +88,17 @@ print("EUC-JP: " . mb_detect_encoding($euc_jp) . "\n");
 
 print("SJIS: " . mb_detect_encoding($sjis) . "\n");
 
+// Thanks to Ulrik Nielsen for the following tests; the hex strings are the same file, but in two
+// different text encodings
+// We do not have any strong hints showing that the second one is actually UTF-8...
+// but mb_detect_encoding still guesses UTF-8 because it is the first one in the list
+
+$win1252text = hex2bin("2320546869732066696c6520636f6e7461696e732057696e646f77732d3132353220656e636f646564206461746120616e642048544d4c20656e7469746965730a61626364650ae6f8e50af00a3c703e476f646461673c6272202f3e0a7b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e0a23205468697320697320746f20656e73757265207468617420646966666572656e74206b696e6473206f662048544d4c20656e74697469657320617265206265696e6720636f6e76657274656420636f72726563746c790af00ad00a2623383739313b0a262331373238373b0a262333383937393b0a2623353437333b0a616263646520e6f8e520f020d0203c703e476f646461673c6272202f3e207b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e202623383739313b20262331373238373b20262333383937393b202623353437333b0a232054686520666f6c6c6f77696e67206368617261637465727320617265207370656369616c206368617261637465727320746861742068617320646966666572656e7420636f646520706f696e747320696e2049534f2d383835392d3120616e642057696e646f77732d31323532202d207468617420776520646966666572656e746961746520636f72726563746c79206265747765656e2049534f2d383835392d3120616e642057696e646f77732d313235320a8c0a890a2320506f6c69736820737472696e670a50727a656a6426233337383b20646f2070727a65676c26233236313b64750a");
+echo mb_detect_encoding($win1252text, ['UTF-8', 'CP1252', 'ISO-8859-1'], true), "\n";
+
+$utf8text = hex2bin("2320546869732066696c6520636f6e7461696e73205554462d3820656e636f64656420646174610a61626364650ac3a6c3b8c3a50ac3b00a3c703e476f646461673c6272202f3e0a7b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e0a23205468697320697320746f20656e73757265207468617420646966666572656e74206b696e6473206f662048544d4c20656e74697469657320617265206265696e6720636f6e76657274656420636f72726563746c790ac3b00ac3900ae289970ae48e870ae9a1830ae195a10a616263646520c3a6c3b8c3a520c3b020c390203c703e476f646461673c6272202f3e207b726561646f626a206f626a65637469643d24726573756c745b305d2e706172656e7469642061737369676e3d22646f63227d3c6272202f3e20e2899720e48e8720e9a18320e195a10a232054686520666f6c6c6f77696e67206368617261637465727320617265207370656369616c206368617261637465727320746861742068617320646966666572656e7420636f646520706f696e747320696e2049534f2d383835392d3120616e642057696e646f77732d31323532202d207468617420776520646966666572656e746961746520636f72726563746c79206265747765656e2049534f2d383835392d3120616e642057696e646f77732d313235320ac5920ae280b00a2320506f6c69736820737472696e670a50727a656a64c5ba20646f2070727a65676cc48564750a");
+echo mb_detect_encoding($utf8text, ['UTF-8', 'CP1252', 'ISO-8859-1'], true), "\n";
+
 echo "== INVALID PARAMETER ==\n";
 
 print("INT: " . mb_detect_encoding(1234, 'EUC-JP') . "\n"); // EUC-JP
@@ -393,6 +404,8 @@ SJIS
 JIS: JIS
 EUC-JP: EUC-JP
 SJIS: SJIS
+Windows-1252
+UTF-8
 == INVALID PARAMETER ==
 INT: EUC-JP
 EUC-JP: EUC-JP