Use pakutoma's encoding check functions for mb_detect_encoding even in non-strict mode

alexdowad · alexdowad · commit 7914b8cefd21 · 2023-05-16T07:01:07.000-07:00
In 6fc8d01, pakutoma added specialized validity checking functions for some legacy text encodings like ISO-2022-JP and UTF-7. These check functions perform a more strict validity check than the encoding conversion functions for the same text encodings. For example, the check function for ISO-2022-JP verifies that the string ends in the correct state required by the specification for ISO-2022-JP. These check functions are already being used to make detection of text encoding more accurate when 'strict' detection mode is enabled. However, since the default is 'non-strict' detection (a bad API design but we're stuck with it now), most users will not benefit from pakutoma's work. I was previously reluctant to enable this new logic for non-strict detection mode. My intention was to reduce the scope of behavior changes, since almost *any* behavior change may affect *some* user in a way we don't expect. However, we definitely have users whose (production) code was broken by the changes I made in 28b346b, and enabling pakutoma's check functions for non-strict detection mode would un-break it. (See GH-10192 as an example.) The added checks do also make sense. In non-strict detection mode, we will not immediately reject candidate encodings whose validity check function returns false; but they will be much less likely to be selected. However, failure of the validity check function is weighted less heavily than an encoding error detected by the encoding conversion function.
diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
@@ -1816,7 +1816,6 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
 		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
 	}
 
-
 	uint32_t wchar_buf[128];
 	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
 	size_t in_len = ZSTR_LEN(string);
@@ -3006,19 +3005,24 @@ static size_t init_candidate_array(struct candidate *array, size_t length, const
 	for (size_t i = 0; i < length; i++) {
 		const mbfl_encoding *enc = encodings[i];
 
+		array[j].enc = enc;
+		array[j].state = 0;
+		array[j].demerits = 0;
+
 		/* If any candidate encodings have specialized validation functions, use them
 		 * to eliminate as many candidates as possible */
-		if (strict && enc->check != NULL) {
+		if (enc->check != NULL) {
 			for (size_t k = 0; k < n; k++) {
 				if (!enc->check((unsigned char*)in[k], in_len[k])) {
-					goto skip_to_next;
+					if (strict) {
+						goto skip_to_next;
+					} else {
+						array[j].demerits += 500;
+					}
 				}
 			}
 		}
 
-		array[j].enc = enc;
-		array[j].state = 0;
-		array[j].demerits = 0;
 		/* This multiplier can optionally be used to make candidate encodings listed
 		 * first more likely to be chosen. It is a weight factor which multiplies
 		 * the number of demerits counted for each candidate. */
diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt
@@ -75,7 +75,7 @@ foreach ($testcases as $title => $case) {
 --EXPECT--
 non-base64 character after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -93,7 +93,7 @@ int(0)
 
 base64 character before +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -174,7 +174,7 @@ int(2)
 
 - and +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -219,7 +219,7 @@ int(2)
 
 valid direct encoding character = after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -228,7 +228,7 @@ int(2)
 
 invalid direct encoding character ~ after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -237,7 +237,7 @@ int(2)
 
 invalid direct encoding character \ after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -246,7 +246,7 @@ int(2)
 
 invalid direct encoding character ESC after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt
@@ -78,6 +78,13 @@ echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-
 echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
 echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";
 
+// This test case courtesy of Adrien Foulon
+// It depends on the below use of '+' being recognized as invalid UTF-7
+$css = 'input[type="radio"]:checked + img {
+    border: 5px solid #0083ca;
+}';
+echo mb_detect_encoding($css, mb_list_encodings(), true), "\n";
+
 echo "== DETECT ORDER ==\n";
 
 mb_detect_order('auto');
@@ -400,6 +407,7 @@ UTF-8
 UTF-8
 UTF-8
 SJIS
+UTF-8
 == DETECT ORDER ==
 JIS: JIS
 EUC-JP: EUC-JP