@@ -90,7 +90,7 @@ static inline bool php_mb_is_no_encoding_utf8(enum mbfl_no_encoding no_enc);
90
90
91
91
static bool mb_check_str_encoding (zend_string * str , const mbfl_encoding * encoding );
92
92
93
- static const mbfl_encoding * mb_guess_encoding (unsigned char * in , size_t in_len , const mbfl_encoding * * elist , unsigned int elist_size , bool strict );
93
+ static const mbfl_encoding * mb_guess_encoding (unsigned char * in , size_t in_len , const mbfl_encoding * * elist , unsigned int elist_size , bool strict , bool order_significant );
94
94
95
95
static zend_string * mb_mime_header_encode (zend_string * input , const mbfl_encoding * incode , const mbfl_encoding * outcode , bool base64 , char * linefeed , size_t linefeed_len , zend_long indent );
96
96
@@ -452,7 +452,7 @@ static const zend_encoding *php_mb_zend_encoding_detector(const unsigned char *a
452
452
list_size = MBSTRG (current_detect_order_list_size );
453
453
}
454
454
455
- return (const zend_encoding * )mb_guess_encoding ((unsigned char * )arg_string , arg_length , (const mbfl_encoding * * )list , list_size , false);
455
+ return (const zend_encoding * )mb_guess_encoding ((unsigned char * )arg_string , arg_length , (const mbfl_encoding * * )list , list_size , false, true );
456
456
}
457
457
458
458
static size_t php_mb_zend_encoding_converter (unsigned char * * to , size_t * to_length , const unsigned char * from , size_t from_length , const zend_encoding * encoding_to , const zend_encoding * encoding_from )
@@ -2695,7 +2695,7 @@ MBSTRING_API zend_string* php_mb_convert_encoding(const char *input, size_t leng
2695
2695
from_encoding = * from_encodings ;
2696
2696
} else {
2697
2697
/* auto detect */
2698
- from_encoding = mb_guess_encoding ((unsigned char * )input , length , from_encodings , num_from_encodings , MBSTRG (strict_detection ));
2698
+ from_encoding = mb_guess_encoding ((unsigned char * )input , length , from_encodings , num_from_encodings , MBSTRG (strict_detection ), true );
2699
2699
if (!from_encoding ) {
2700
2700
php_error_docref (NULL , E_WARNING , "Unable to detect character encoding" );
2701
2701
return NULL ;
@@ -2996,9 +2996,10 @@ struct candidate {
2996
2996
size_t in_len ;
2997
2997
uint64_t demerits ; /* Wide bit size to prevent overflow */
2998
2998
unsigned int state ;
2999
+ float multiplier ;
2999
3000
};
3000
3001
3001
- static size_t init_candidate_array (struct candidate * array , size_t length , const mbfl_encoding * * encodings , const unsigned char * * in , size_t * in_len , size_t n , bool strict )
3002
+ static size_t init_candidate_array (struct candidate * array , size_t length , const mbfl_encoding * * encodings , const unsigned char * * in , size_t * in_len , size_t n , bool strict , bool order_significant )
3002
3003
{
3003
3004
size_t j = 0 ;
3004
3005
@@ -3018,6 +3019,10 @@ static size_t init_candidate_array(struct candidate *array, size_t length, const
3018
3019
array [j ].enc = enc ;
3019
3020
array [j ].state = 0 ;
3020
3021
array [j ].demerits = 0 ;
3022
+ /* This multiplier can optionally be used to make candidate encodings listed
3023
+ * first more likely to be chosen. It is a weight factor which multiplies
3024
+ * the number of demerits counted for each candidate. */
3025
+ array [j ].multiplier = order_significant ? 1.0 + ((0.2 * i ) / length ) : 1.0 ;
3021
3026
j ++ ;
3022
3027
skip_to_next : ;
3023
3028
}
@@ -3093,10 +3098,14 @@ static size_t count_demerits(struct candidate *array, size_t length, bool strict
3093
3098
}
3094
3099
}
3095
3100
3101
+ for (size_t i = 0 ; i < length ; i ++ ) {
3102
+ array [i ].demerits *= array [i ].multiplier ;
3103
+ }
3104
+
3096
3105
return length ;
3097
3106
}
3098
3107
3099
- MBSTRING_API const mbfl_encoding * mb_guess_encoding_for_strings (const unsigned char * * strings , size_t * str_lengths , size_t n , const mbfl_encoding * * elist , unsigned int elist_size , bool strict )
3108
+ MBSTRING_API const mbfl_encoding * mb_guess_encoding_for_strings (const unsigned char * * strings , size_t * str_lengths , size_t n , const mbfl_encoding * * elist , unsigned int elist_size , bool strict , bool order_significant )
3100
3109
{
3101
3110
if (elist_size == 0 ) {
3102
3111
return NULL ;
@@ -3117,7 +3126,7 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c
3117
3126
3118
3127
/* Allocate on stack; when we return, this array is automatically freed */
3119
3128
struct candidate * array = alloca (elist_size * sizeof (struct candidate ));
3120
- elist_size = init_candidate_array (array , elist_size , elist , strings , str_lengths , n , strict );
3129
+ elist_size = init_candidate_array (array , elist_size , elist , strings , str_lengths , n , strict , order_significant );
3121
3130
3122
3131
while (n -- ) {
3123
3132
start_string (array , elist_size , strings [n ], str_lengths [n ]);
@@ -3141,9 +3150,9 @@ MBSTRING_API const mbfl_encoding* mb_guess_encoding_for_strings(const unsigned c
3141
3150
/* When doing 'strict' detection, any string which is invalid in the candidate encoding
3142
3151
* is rejected. With non-strict detection, we just continue, but apply demerits for
3143
3152
* each invalid byte sequence */
3144
- static const mbfl_encoding * mb_guess_encoding (unsigned char * in , size_t in_len , const mbfl_encoding * * elist , unsigned int elist_size , bool strict )
3153
+ static const mbfl_encoding * mb_guess_encoding (unsigned char * in , size_t in_len , const mbfl_encoding * * elist , unsigned int elist_size , bool strict , bool order_significant )
3145
3154
{
3146
- return mb_guess_encoding_for_strings ((const unsigned char * * )& in , & in_len , 1 , elist , elist_size , strict );
3155
+ return mb_guess_encoding_for_strings ((const unsigned char * * )& in , & in_len , 1 , elist , elist_size , strict , order_significant );
3147
3156
}
3148
3157
3149
3158
/* {{{ Encoding of the given string is returned (as a string) */
@@ -3162,8 +3171,17 @@ PHP_FUNCTION(mb_detect_encoding)
3162
3171
Z_PARAM_BOOL (strict )
3163
3172
ZEND_PARSE_PARAMETERS_END ();
3164
3173
3174
+ /* Should we pay attention to the order of the provided candidate encodings and prefer
3175
+ * the earlier ones (if more than one candidate encoding matches)?
3176
+ * If the entire list of supported encodings returned by `mb_list_encodings` is passed
3177
+ * in, then don't treat the order as significant */
3178
+ bool order_significant = true;
3179
+
3165
3180
/* make encoding list */
3166
3181
if (encoding_ht ) {
3182
+ if (encoding_ht == MBSTRG (all_encodings_list )) {
3183
+ order_significant = false;
3184
+ }
3167
3185
if (FAILURE == php_mb_parse_encoding_array (encoding_ht , & elist , & size , 2 )) {
3168
3186
RETURN_THROWS ();
3169
3187
}
@@ -3195,7 +3213,7 @@ PHP_FUNCTION(mb_detect_encoding)
3195
3213
if (size == 1 && * elist == & mbfl_encoding_utf8 && (GC_FLAGS (str ) & IS_STR_VALID_UTF8 )) {
3196
3214
ret = & mbfl_encoding_utf8 ;
3197
3215
} else {
3198
- ret = mb_guess_encoding ((unsigned char * )ZSTR_VAL (str ), ZSTR_LEN (str ), elist , size , strict );
3216
+ ret = mb_guess_encoding ((unsigned char * )ZSTR_VAL (str ), ZSTR_LEN (str ), elist , size , strict , order_significant );
3199
3217
}
3200
3218
3201
3219
efree (ZEND_VOIDP (elist ));
@@ -3556,8 +3574,15 @@ PHP_FUNCTION(mb_convert_variables)
3556
3574
3557
3575
from_encoding = MBSTRG (current_internal_encoding );
3558
3576
3577
+ bool order_significant = true;
3578
+
3559
3579
/* pre-conversion encoding */
3560
3580
if (from_enc_ht ) {
3581
+ if (from_enc_ht == MBSTRG (all_encodings_list )) {
3582
+ /* If entire list of supported encodings returned by `mb_list_encodings` is passed
3583
+ * in, then don't treat the order of the list as significant */
3584
+ order_significant = false;
3585
+ }
3561
3586
if (php_mb_parse_encoding_array (from_enc_ht , & elist , & elistsz , 2 ) == FAILURE ) {
3562
3587
RETURN_THROWS ();
3563
3588
}
@@ -3595,7 +3620,7 @@ PHP_FUNCTION(mb_convert_variables)
3595
3620
RETURN_FALSE ;
3596
3621
}
3597
3622
}
3598
- from_encoding = mb_guess_encoding_for_strings (val_list , len_list , num , elist , elistsz , MBSTRG (strict_detection ));
3623
+ from_encoding = mb_guess_encoding_for_strings (val_list , len_list , num , elist , elistsz , MBSTRG (strict_detection ), order_significant );
3599
3624
efree (ZEND_VOIDP (val_list ));
3600
3625
efree (len_list );
3601
3626
if (!from_encoding ) {
@@ -4313,7 +4338,7 @@ PHP_FUNCTION(mb_send_mail)
4313
4338
/* Subject: */
4314
4339
const mbfl_encoding * enc = MBSTRG (current_internal_encoding );
4315
4340
if (enc == & mbfl_encoding_pass ) {
4316
- enc = mb_guess_encoding ((unsigned char * )ZSTR_VAL (subject ), ZSTR_LEN (subject ), MBSTRG (current_detect_order_list ), MBSTRG (current_detect_order_list_size ), MBSTRG (strict_detection ));
4341
+ enc = mb_guess_encoding ((unsigned char * )ZSTR_VAL (subject ), ZSTR_LEN (subject ), MBSTRG (current_detect_order_list ), MBSTRG (current_detect_order_list_size ), MBSTRG (strict_detection ), true );
4317
4342
}
4318
4343
const char * line_sep = PG (mail_mixed_lf_and_crlf ) ? "\n" : CRLF ;
4319
4344
size_t line_sep_len = strlen (line_sep );
@@ -4323,7 +4348,7 @@ PHP_FUNCTION(mb_send_mail)
4323
4348
/* message body */
4324
4349
const mbfl_encoding * msg_enc = MBSTRG (current_internal_encoding );
4325
4350
if (msg_enc == & mbfl_encoding_pass ) {
4326
- msg_enc = mb_guess_encoding ((unsigned char * )message , message_len , MBSTRG (current_detect_order_list ), MBSTRG (current_detect_order_list_size ), MBSTRG (strict_detection ));
4351
+ msg_enc = mb_guess_encoding ((unsigned char * )message , message_len , MBSTRG (current_detect_order_list ), MBSTRG (current_detect_order_list_size ), MBSTRG (strict_detection ), true );
4327
4352
}
4328
4353
4329
4354
unsigned int num_errors = 0 ;
0 commit comments