@@ -35,12 +35,14 @@ SUCH DAMAGE.
35
35
#include <string.h>
36
36
37
37
#ifdef __x86_64__
38
+ #include <cpuid.h>
38
39
#include <emmintrin.h>
39
40
#include <immintrin.h>
40
- #include <cpuid.h>
41
- #if ( __GNUC__ >= 7 || __GNUC__ == 6 && __GNUC_MINOR__ >= 3 || defined( __clang_major__ )) && !defined( __STDC_NO_ATOMICS__ )
42
- #include <tmmintrin.h>
41
+ #if ( __GNUC__ >= 7 || __GNUC__ == 6 && __GNUC_MINOR__ >= 3 || \
42
+ defined( __clang_major__ )) && \
43
+ !defined( __STDC_NO_ATOMICS__ )
43
44
#include <stdatomic.h>
45
+ #include <tmmintrin.h>
44
46
#else
45
47
// This is needed to support CentOS 7, which has a very old GCC.
46
48
#define CRUFTY_GCC
@@ -64,7 +66,8 @@ static inline uint64_t read_uint64(const uint64_t *p) {
64
66
return r ;
65
67
}
66
68
67
- static inline int is_valid_utf8_fallback (uint8_t const * const src , size_t const len ) {
69
+ static inline int is_valid_utf8_fallback (uint8_t const * const src ,
70
+ size_t const len ) {
68
71
uint8_t const * ptr = (uint8_t const * )src ;
69
72
// This is 'one past the end' to make loop termination and bounds checks
70
73
// easier.
@@ -83,10 +86,11 @@ static inline int is_valid_utf8_fallback(uint8_t const *const src, size_t const
83
86
// Non-ASCII bytes have a set MSB. Thus, if we AND with 0x80 in every
84
87
// 'lane', we will get 0 if everything is ASCII, and something else
85
88
// otherwise.
86
- uint64_t results [4 ] = {to_little_endian (read_uint64 (big_ptr )) & high_bits_mask ,
87
- to_little_endian (read_uint64 ((big_ptr + 1 ))) & high_bits_mask ,
88
- to_little_endian (read_uint64 ((big_ptr + 2 ))) & high_bits_mask ,
89
- to_little_endian (read_uint64 ((big_ptr + 3 ))) & high_bits_mask };
89
+ uint64_t results [4 ] = {
90
+ to_little_endian (read_uint64 (big_ptr )) & high_bits_mask ,
91
+ to_little_endian (read_uint64 ((big_ptr + 1 ))) & high_bits_mask ,
92
+ to_little_endian (read_uint64 ((big_ptr + 2 ))) & high_bits_mask ,
93
+ to_little_endian (read_uint64 ((big_ptr + 3 ))) & high_bits_mask };
90
94
if (results [0 ] == 0 ) {
91
95
ptr += 8 ;
92
96
if (results [1 ] == 0 ) {
@@ -331,10 +335,26 @@ static int8_t const ef_fe_lookup[16] = {
331
335
};
332
336
333
337
__attribute__((target ("ssse3" ))) static inline bool
334
- is_ascii_sse2 (__m128i const * src ) {
338
+ is_ascii_sse2 (__m128i const * src , __m128i const prev_first_len ) {
339
+ // Check if we have ASCII, and also that we don't have to treat the prior
340
+ // block as special.
341
+ // First, verify that we didn't see any non-ASCII bytes in the first half of
342
+ // the stride.
343
+ __m128i const first_half_clean = _mm_or_si128 (src [0 ], src [1 ]);
344
+ // Then do the same for the second half of the stride.
345
+ __m128i const second_half_clean = _mm_or_si128 (src [2 ], src [3 ]);
346
+ // Check cleanliness of the entire stride.
347
+ __m128i const stride_clean =
348
+ _mm_or_si128 (first_half_clean , second_half_clean );
349
+ // Finally, check that we didn't have any leftover marker bytes in the
350
+ // previous block: these are indicated by non-zeroes in prev_first_len. In
351
+ // order to trigger a failure, we have to have non-zeros set the high bit of
352
+ // the lane: we do this by doing a greater-than comparison with a block of
353
+ // zeroes.
354
+ __m128i const no_prior_dirt =
355
+ _mm_cmpgt_epi8 (prev_first_len , _mm_setzero_si128 ());
335
356
// OR together everything, then check for a high bit anywhere.
336
- __m128i const ored =
337
- _mm_or_si128 (_mm_or_si128 (src [0 ], src [1 ]), _mm_or_si128 (src [2 ], src [3 ]));
357
+ __m128i const ored = _mm_or_si128 (stride_clean , no_prior_dirt );
338
358
return (_mm_movemask_epi8 (ored ) == 0 );
339
359
}
340
360
@@ -415,7 +435,7 @@ is_valid_utf8_ssse3(uint8_t const *const src, size_t const len) {
415
435
_mm_loadu_si128 (big_ptr ), _mm_loadu_si128 (big_ptr + 1 ),
416
436
_mm_loadu_si128 (big_ptr + 2 ), _mm_loadu_si128 (big_ptr + 3 )};
417
437
// Check if we have ASCII.
418
- if (is_ascii_sse2 (inputs )) {
438
+ if (is_ascii_sse2 (inputs , prev_first_len )) {
419
439
// Prev_first_len cheaply.
420
440
prev_first_len =
421
441
_mm_shuffle_epi8 (first_len_tbl , high_nibbles_of (inputs [3 ]));
@@ -598,10 +618,26 @@ is_valid_utf8_avx2(uint8_t const *const src, size_t const len) {
598
618
__m256i const inputs [4 ] = {
599
619
_mm256_loadu_si256 (big_ptr ), _mm256_loadu_si256 (big_ptr + 1 ),
600
620
_mm256_loadu_si256 (big_ptr + 2 ), _mm256_loadu_si256 (big_ptr + 3 )};
601
- // Check if we have ASCII.
602
- bool is_ascii = _mm256_movemask_epi8 (_mm256_or_si256 (
603
- _mm256_or_si256 (inputs [0 ], inputs [1 ]),
604
- _mm256_or_si256 (inputs [2 ], inputs [3 ]))) == 0 ;
621
+ // Check if we have ASCII, and also that we don't have to treat the prior
622
+ // block as special.
623
+ // First, verify that we didn't see any non-ASCII bytes in the first half of
624
+ // the stride.
625
+ __m256i const first_half_clean = _mm256_or_si256 (inputs [0 ], inputs [1 ]);
626
+ // Then do the same for the second half of the stride.
627
+ __m256i const second_half_clean = _mm256_or_si256 (inputs [2 ], inputs [3 ]);
628
+ // Check cleanliness of the entire stride.
629
+ __m256i const stride_clean =
630
+ _mm256_or_si256 (first_half_clean , second_half_clean );
631
+ // Finally, check that we didn't have any leftover marker bytes in the
632
+ // previous block: these are indicated by non-zeroes in prev_first_len.
633
+ // In order to trigger a failure, we have to have non-zeros set the high bit
634
+ // of the lane: we do this by doing a greater-than comparison with a block
635
+ // of zeroes.
636
+ __m256i const no_prior_dirt =
637
+ _mm256_cmpgt_epi8 (prev_first_len , _mm256_setzero_si256 ());
638
+ // Combine all checks together, and check if any high bits are set.
639
+ bool is_ascii =
640
+ _mm256_movemask_epi8 (_mm256_or_si256 (stride_clean , no_prior_dirt )) == 0 ;
605
641
if (is_ascii ) {
606
642
// Prev_first_len cheaply
607
643
prev_first_len =
@@ -683,7 +719,7 @@ static inline bool has_avx2() {
683
719
}
684
720
#endif
685
721
686
- typedef int (* is_valid_utf8_t ) (uint8_t const * const , size_t const );
722
+ typedef int (* is_valid_utf8_t )(uint8_t const * const , size_t const );
687
723
688
724
int bytestring_is_valid_utf8 (uint8_t const * const src , size_t const len ) {
689
725
if (len == 0 ) {
@@ -693,7 +729,10 @@ int bytestring_is_valid_utf8(uint8_t const *const src, size_t const len) {
693
729
static _Atomic is_valid_utf8_t s_impl = (is_valid_utf8_t )NULL ;
694
730
is_valid_utf8_t impl = atomic_load_explicit (& s_impl , memory_order_relaxed );
695
731
if (!impl ) {
696
- impl = has_avx2 () ? is_valid_utf8_avx2 : (has_ssse3 () ? is_valid_utf8_ssse3 : (has_sse2 () ? is_valid_utf8_sse2 : is_valid_utf8_fallback ));
732
+ impl = has_avx2 () ? is_valid_utf8_avx2
733
+ : (has_ssse3 () ? is_valid_utf8_ssse3
734
+ : (has_sse2 () ? is_valid_utf8_sse2
735
+ : is_valid_utf8_fallback ));
697
736
atomic_store_explicit (& s_impl , impl , memory_order_relaxed );
698
737
}
699
738
return (* impl )(src , len );
0 commit comments