From bfffb438dbae14e39c53aa0950bc3c0f22eb7940 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Wed, 11 Dec 2019 16:02:00 +0100
Subject: [PATCH 01/11] Add SIMD based validate_ascii_fast and
 validate_utf8_fast

---
 simdasciicheck.h |  55 ++++++
 simdutf8check.h  | 458 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 513 insertions(+)
 create mode 100644 simdasciicheck.h
 create mode 100644 simdutf8check.h

diff --git a/simdasciicheck.h b/simdasciicheck.h
new file mode 100644
index 00000000000000..6c119ff3728963
--- /dev/null
+++ b/simdasciicheck.h
@@ -0,0 +1,55 @@
+#ifndef SIMDASCIICHECK_H
+#define SIMDASCIICHECK_H
+
+#include <emmintrin.h> // SSE2
+#include <stdbool.h>   // c99 bool
+#include <stddef.h>    // size_t
+
+// The function returns true (1) if all chars passed in src are
+// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
+static bool validate_ascii_fast(const char *src, size_t len) {
+  size_t i = 0;
+  __m128i has_error = _mm_setzero_si128();
+  if (len >= 16) {
+    for (; i <= len - 16; i += 16) {
+      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
+      has_error = _mm_or_si128(has_error, current_bytes);
+    }
+  }
+  int error_mask = _mm_movemask_epi8(has_error);
+
+  char tail_has_error = 0;
+  for (; i < len; i++) {
+    tail_has_error |= src[i];
+  }
+  error_mask |= (tail_has_error & 0x80);
+
+  return !error_mask;
+}
+
+#ifdef __AVX2__
+#include <x86intrin.h>
+// The function returns true (1) if all chars passed in src are
+// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
+static bool validate_ascii_fast_avx(const char *src, size_t len) {
+  size_t i = 0;
+  __m256i has_error = _mm256_setzero_si256();
+  if (len >= 32) {
+    for (; i <= len - 32; i += 32) {
+      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
+      has_error = _mm256_or_si256(has_error, current_bytes);
+    }
+  }
+  int error_mask = _mm256_movemask_epi8(has_error);
+
+  char tail_has_error = 0;
+  for (; i < len; i++) {
+    tail_has_error |= src[i];
+  }
+  error_mask |= (tail_has_error & 0x80);
+
+  return !error_mask;
+}
+#endif
+
+#endif
diff --git a/simdutf8check.h b/simdutf8check.h
new file mode 100644
index 00000000000000..3e16779830e3c0
--- /dev/null
+++ b/simdutf8check.h
@@ -0,0 +1,458 @@
+
+#ifndef SIMDUTF8CHECK_H
+#define SIMDUTF8CHECK_H
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <x86intrin.h>
+/*
+ * legal utf-8 byte sequence
+ * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
+ *
+ *  Code Points        1st       2s       3s       4s
+ * U+0000..U+007F     00..7F
+ * U+0080..U+07FF     C2..DF   80..BF
+ * U+0800..U+0FFF     E0       A0..BF   80..BF
+ * U+1000..U+CFFF     E1..EC   80..BF   80..BF
+ * U+D000..U+D7FF     ED       80..9F   80..BF
+ * U+E000..U+FFFF     EE..EF   80..BF   80..BF
+ * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
+ * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
+ * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
+ *
+ */
+
+// all byte values must be no larger than 0xF4
+static inline void checkSmallerThan0xF4(__m128i current_bytes,
+                                        __m128i *has_error) {
+  // unsigned, saturates to 0 below max
+  *has_error = _mm_or_si128(*has_error,
+                            _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4)));
+}
+
+static inline __m128i continuationLengths(__m128i high_nibbles) {
+  return _mm_shuffle_epi8(
+      _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
+                    0, 0, 0, 0,             // 10xx (continuation)
+                    2, 2,                   // 110x
+                    3,                      // 1110
+                    4), // 1111, next should be 0 (not checked here)
+      high_nibbles);
+}
+
+static inline __m128i carryContinuations(__m128i initial_lengths,
+                                         __m128i previous_carries) {
+
+  __m128i right1 =
+      _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
+                    _mm_set1_epi8(1));
+  __m128i sum = _mm_add_epi8(initial_lengths, right1);
+
+  __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2),
+                                 _mm_set1_epi8(2));
+  return _mm_add_epi8(sum, right2);
+}
+
+static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
+                                      __m128i *has_error) {
+
+  // overlap || underlap
+  // carry > length && length > 0 || !(carry > length) && !(length > 0)
+  // (carries > length) == (lengths > 0)
+  __m128i overunder =
+      _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths),
+                     _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()));
+
+  *has_error = _mm_or_si128(*has_error, overunder);
+}
+
+// when 0xED is found, next byte must be no larger than 0x9F
+// when 0xF4 is found, next byte must be no larger than 0x8F
+// next byte must be continuation, ie sign bit is set, so signed < is ok
+static inline void checkFirstContinuationMax(__m128i current_bytes,
+                                             __m128i off1_current_bytes,
+                                             __m128i *has_error) {
+  __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
+  __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
+
+  __m128i badfollowED =
+      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED);
+  __m128i badfollowF4 =
+      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4);
+
+  *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4));
+}
+
+// map off1_hibits => error condition
+// hibits     off1    cur
+// C       => < C2 && true
+// E       => < E1 && < A0
+// F       => < F1 && < 90
+// else      false && false
+static inline void checkOverlong(__m128i current_bytes,
+                                 __m128i off1_current_bytes, __m128i hibits,
+                                 __m128i previous_hibits, __m128i *has_error) {
+  __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
+  __m128i initial_mins = _mm_shuffle_epi8(
+      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+                    -128, -128, // 10xx => false
+                    0xC2, -128, // 110x
+                    0xE1,       // 1110
+                    0xF1),
+      off1_hibits);
+
+  __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
+
+  __m128i second_mins = _mm_shuffle_epi8(
+      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+                    -128, -128, // 10xx => false
+                    127, 127,   // 110x => true
+                    0xA0,       // 1110
+                    0x90),
+      off1_hibits);
+  __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
+  *has_error =
+      _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under));
+}
+
+struct processed_utf_bytes {
+  __m128i rawbytes;
+  __m128i high_nibbles;
+  __m128i carried_continuations;
+};
+
+static inline void count_nibbles(__m128i bytes,
+                                 struct processed_utf_bytes *answer) {
+  answer->rawbytes = bytes;
+  answer->high_nibbles =
+      _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
+}
+
+// check whether the current bytes are valid UTF-8
+// at the end of the function, previous gets updated
+static struct processed_utf_bytes
+checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
+               __m128i *has_error) {
+  struct processed_utf_bytes pb;
+  count_nibbles(current_bytes, &pb);
+
+  checkSmallerThan0xF4(current_bytes, has_error);
+
+  __m128i initial_lengths = continuationLengths(pb.high_nibbles);
+
+  pb.carried_continuations =
+      carryContinuations(initial_lengths, previous->carried_continuations);
+
+  checkContinuations(initial_lengths, pb.carried_continuations, has_error);
+
+  __m128i off1_current_bytes =
+      _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
+  checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
+
+  checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
+                previous->high_nibbles, has_error);
+  return pb;
+}
+
+static bool validate_utf8_fast(const char *src, size_t len) {
+  size_t i = 0;
+  __m128i has_error = _mm_setzero_si128();
+  struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(),
+                                         .high_nibbles = _mm_setzero_si128(),
+                                         .carried_continuations =
+                                             _mm_setzero_si128()};
+  if (len >= 16) {
+    for (; i <= len - 16; i += 16) {
+      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
+      previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
+    }
+  }
+
+  // last part
+  if (i < len) {
+    char buffer[16];
+    memset(buffer, 0, 16);
+    memcpy(buffer, src + i, len - i);
+    __m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer));
+    previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
+  } else {
+    has_error =
+        _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations,
+                                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                                  9, 9, 9, 9, 9, 1)),
+                     has_error);
+  }
+
+  return _mm_testz_si128(has_error, has_error);
+}
+
+#ifdef __AVX2__
+
+/*****************************/
+static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) {
+  return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 15);
+}
+
+static inline __m256i push_last_2bytes_of_a_to_b(__m256i a, __m256i b) {
+  return _mm256_alignr_epi8(b, _mm256_permute2x128_si256(a, b, 0x21), 14);
+}
+
+// all byte values must be no larger than 0xF4
+static inline void avxcheckSmallerThan0xF4(__m256i current_bytes,
+                                           __m256i *has_error) {
+  // unsigned, saturates to 0 below max
+  *has_error = _mm256_or_si256(
+      *has_error, _mm256_subs_epu8(current_bytes, _mm256_set1_epi8(0xF4)));
+}
+
+static inline __m256i avxcontinuationLengths(__m256i high_nibbles) {
+  return _mm256_shuffle_epi8(
+      _mm256_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
+                       0, 0, 0, 0,             // 10xx (continuation)
+                       2, 2,                   // 110x
+                       3,                      // 1110
+                       4, // 1111, next should be 0 (not checked here)
+                       1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
+                       0, 0, 0, 0,             // 10xx (continuation)
+                       2, 2,                   // 110x
+                       3,                      // 1110
+                       4 // 1111, next should be 0 (not checked here)
+                       ),
+      high_nibbles);
+}
+
+static inline __m256i avxcarryContinuations(__m256i initial_lengths,
+                                            __m256i previous_carries) {
+
+  __m256i right1 = _mm256_subs_epu8(
+      push_last_byte_of_a_to_b(previous_carries, initial_lengths),
+      _mm256_set1_epi8(1));
+  __m256i sum = _mm256_add_epi8(initial_lengths, right1);
+
+  __m256i right2 = _mm256_subs_epu8(
+      push_last_2bytes_of_a_to_b(previous_carries, sum), _mm256_set1_epi8(2));
+  return _mm256_add_epi8(sum, right2);
+}
+
+static inline void avxcheckContinuations(__m256i initial_lengths,
+                                         __m256i carries, __m256i *has_error) {
+
+  // overlap || underlap
+  // carry > length && length > 0 || !(carry > length) && !(length > 0)
+  // (carries > length) == (lengths > 0)
+  __m256i overunder = _mm256_cmpeq_epi8(
+      _mm256_cmpgt_epi8(carries, initial_lengths),
+      _mm256_cmpgt_epi8(initial_lengths, _mm256_setzero_si256()));
+
+  *has_error = _mm256_or_si256(*has_error, overunder);
+}
+
+// when 0xED is found, next byte must be no larger than 0x9F
+// when 0xF4 is found, next byte must be no larger than 0x8F
+// next byte must be continuation, ie sign bit is set, so signed < is ok
+static inline void avxcheckFirstContinuationMax(__m256i current_bytes,
+                                                __m256i off1_current_bytes,
+                                                __m256i *has_error) {
+  __m256i maskED =
+      _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xED));
+  __m256i maskF4 =
+      _mm256_cmpeq_epi8(off1_current_bytes, _mm256_set1_epi8(0xF4));
+
+  __m256i badfollowED = _mm256_and_si256(
+      _mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x9F)), maskED);
+  __m256i badfollowF4 = _mm256_and_si256(
+      _mm256_cmpgt_epi8(current_bytes, _mm256_set1_epi8(0x8F)), maskF4);
+
+  *has_error =
+      _mm256_or_si256(*has_error, _mm256_or_si256(badfollowED, badfollowF4));
+}
+
+// map off1_hibits => error condition
+// hibits     off1    cur
+// C       => < C2 && true
+// E       => < E1 && < A0
+// F       => < F1 && < 90
+// else      false && false
+static inline void avxcheckOverlong(__m256i current_bytes,
+                                    __m256i off1_current_bytes, __m256i hibits,
+                                    __m256i previous_hibits,
+                                    __m256i *has_error) {
+  __m256i off1_hibits = push_last_byte_of_a_to_b(previous_hibits, hibits);
+  __m256i initial_mins = _mm256_shuffle_epi8(
+      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, // 10xx => false
+                       0xC2, -128,       // 110x
+                       0xE1,             // 1110
+                       0xF1, -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, -128, // 10xx => false
+                       0xC2, -128,             // 110x
+                       0xE1,                   // 1110
+                       0xF1),
+      off1_hibits);
+
+  __m256i initial_under = _mm256_cmpgt_epi8(initial_mins, off1_current_bytes);
+
+  __m256i second_mins = _mm256_shuffle_epi8(
+      _mm256_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, // 10xx => false
+                       127, 127,         // 110x => true
+                       0xA0,             // 1110
+                       0x90, -128, -128, -128, -128, -128, -128, -128, -128,
+                       -128, -128, -128, -128, // 10xx => false
+                       127, 127,               // 110x => true
+                       0xA0,                   // 1110
+                       0x90),
+      off1_hibits);
+  __m256i second_under = _mm256_cmpgt_epi8(second_mins, current_bytes);
+  *has_error = _mm256_or_si256(*has_error,
+                               _mm256_and_si256(initial_under, second_under));
+}
+
+struct avx_processed_utf_bytes {
+  __m256i rawbytes;
+  __m256i high_nibbles;
+  __m256i carried_continuations;
+};
+
+static inline void avx_count_nibbles(__m256i bytes,
+                                     struct avx_processed_utf_bytes *answer) {
+  answer->rawbytes = bytes;
+  answer->high_nibbles =
+      _mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F));
+}
+
+// check whether the current bytes are valid UTF-8
+// at the end of the function, previous gets updated
+static struct avx_processed_utf_bytes
+avxcheckUTF8Bytes(__m256i current_bytes,
+                  struct avx_processed_utf_bytes *previous,
+                  __m256i *has_error) {
+  struct avx_processed_utf_bytes pb;
+  avx_count_nibbles(current_bytes, &pb);
+
+  avxcheckSmallerThan0xF4(current_bytes, has_error);
+
+  __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
+
+  pb.carried_continuations =
+      avxcarryContinuations(initial_lengths, previous->carried_continuations);
+
+  avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
+
+  __m256i off1_current_bytes =
+      push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
+  avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
+
+  avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
+                   previous->high_nibbles, has_error);
+  return pb;
+}
+
+// check whether the current bytes are valid UTF-8
+// at the end of the function, previous gets updated
+static struct avx_processed_utf_bytes
+avxcheckUTF8Bytes_asciipath(__m256i current_bytes,
+                            struct avx_processed_utf_bytes *previous,
+                            __m256i *has_error) {
+  if (_mm256_testz_si256(current_bytes,
+                         _mm256_set1_epi8(0x80))) { // fast ascii path
+    *has_error = _mm256_or_si256(
+        _mm256_cmpgt_epi8(previous->carried_continuations,
+                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 1)),
+        *has_error);
+    return *previous;
+  }
+
+  struct avx_processed_utf_bytes pb;
+  avx_count_nibbles(current_bytes, &pb);
+
+  avxcheckSmallerThan0xF4(current_bytes, has_error);
+
+  __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
+
+  pb.carried_continuations =
+      avxcarryContinuations(initial_lengths, previous->carried_continuations);
+
+  avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
+
+  __m256i off1_current_bytes =
+      push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
+  avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
+
+  avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
+                   previous->high_nibbles, has_error);
+  return pb;
+}
+
+static bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
+  size_t i = 0;
+  __m256i has_error = _mm256_setzero_si256();
+  struct avx_processed_utf_bytes previous = {
+      .rawbytes = _mm256_setzero_si256(),
+      .high_nibbles = _mm256_setzero_si256(),
+      .carried_continuations = _mm256_setzero_si256()};
+  if (len >= 32) {
+    for (; i <= len - 32; i += 32) {
+      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
+      previous =
+          avxcheckUTF8Bytes_asciipath(current_bytes, &previous, &has_error);
+    }
+  }
+
+  // last part
+  if (i < len) {
+    char buffer[32];
+    memset(buffer, 0, 32);
+    memcpy(buffer, src + i, len - i);
+    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
+    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+  } else {
+    has_error = _mm256_or_si256(
+        _mm256_cmpgt_epi8(previous.carried_continuations,
+                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 1)),
+        has_error);
+  }
+
+  return _mm256_testz_si256(has_error, has_error);
+}
+
+static bool validate_utf8_fast_avx(const char *src, size_t len) {
+  size_t i = 0;
+  __m256i has_error = _mm256_setzero_si256();
+  struct avx_processed_utf_bytes previous = {
+      .rawbytes = _mm256_setzero_si256(),
+      .high_nibbles = _mm256_setzero_si256(),
+      .carried_continuations = _mm256_setzero_si256()};
+  if (len >= 32) {
+    for (; i <= len - 32; i += 32) {
+      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
+      previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+    }
+  }
+
+  // last part
+  if (i < len) {
+    char buffer[32];
+    memset(buffer, 0, 32);
+    memcpy(buffer, src + i, len - i);
+    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
+    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+  } else {
+    has_error = _mm256_or_si256(
+        _mm256_cmpgt_epi8(previous.carried_continuations,
+                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                           9, 9, 9, 9, 9, 9, 9, 1)),
+        has_error);
+  }
+
+  return _mm256_testz_si256(has_error, has_error);
+}
+
+#endif // __AVX2__
+#endif

From 629653a1ec46227894cacf4756ceca7cffddcf1d Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Wed, 11 Dec 2019 16:02:15 +0100
Subject: [PATCH 02/11] Use validate_ascii_fast in coderange_scan

---
 string.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/string.c b/string.c
index 22ced326b8d0f5..31a7a098b4e5f7 100644
--- a/string.c
+++ b/string.c
@@ -21,6 +21,7 @@
 #include "id.h"
 #include "debug_counter.h"
 #include "ruby/util.h"
+#include "simdasciicheck.h"
 
 #define BEG(no) (regs->beg[(no)])
 #define END(no) (regs->end[(no)])
@@ -535,13 +536,19 @@ coderange_scan(const char *p, long len, rb_encoding *enc)
 {
     const char *e = p + len;
 
-    if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
+    switch (rb_enc_to_index(enc)) {
+      case ENCINDEX_ASCII:
         /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
-        p = search_nonascii(p, e);
-        return p ? ENC_CODERANGE_VALID : ENC_CODERANGE_7BIT;
+        return validate_ascii_fast(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
+      case ENCINDEX_US_ASCII:
+        return validate_ascii_fast(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_BROKEN;
     }
 
     if (rb_enc_asciicompat(enc)) {
+        if (validate_ascii_fast(p, len)) {
+            return ENC_CODERANGE_7BIT;
+        }
+
         p = search_nonascii(p, e);
         if (!p) return ENC_CODERANGE_7BIT;
         for (;;) {

From 7056c04e4c09fb36204aacbbf3b4606cd0eeb6ae Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Wed, 11 Dec 2019 16:49:02 +0100
Subject: [PATCH 03/11] Use validate_utf8_fast in coderange_scan

---
 string.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/string.c b/string.c
index 31a7a098b4e5f7..913a15da10177b 100644
--- a/string.c
+++ b/string.c
@@ -22,6 +22,7 @@
 #include "debug_counter.h"
 #include "ruby/util.h"
 #include "simdasciicheck.h"
+#include "simdutf8check.h"
 
 #define BEG(no) (regs->beg[(no)])
 #define END(no) (regs->end[(no)])
@@ -534,14 +535,18 @@ search_nonascii(const char *p, const char *e)
 static int
 coderange_scan(const char *p, long len, rb_encoding *enc)
 {
-    const char *e = p + len;
+    const char *e;
 
     switch (rb_enc_to_index(enc)) {
-      case ENCINDEX_ASCII:
-        /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
-        return validate_ascii_fast(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
-      case ENCINDEX_US_ASCII:
-        return validate_ascii_fast(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_BROKEN;
+        case ENCINDEX_ASCII:
+            /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
+            return validate_ascii_fast(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
+        case ENCINDEX_US_ASCII:
+            return validate_ascii_fast(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_BROKEN;
+        case RUBY_ENCINDEX_UTF_8:
+            if (validate_ascii_fast(p, len)) return ENC_CODERANGE_7BIT;
+            if (validate_utf8_fast(p, len)) return ENC_CODERANGE_VALID;
+            return ENC_CODERANGE_BROKEN;
     }
 
     if (rb_enc_asciicompat(enc)) {
@@ -549,6 +554,7 @@ coderange_scan(const char *p, long len, rb_encoding *enc)
             return ENC_CODERANGE_7BIT;
         }
 
+        e = p + len;
         p = search_nonascii(p, e);
         if (!p) return ENC_CODERANGE_7BIT;
         for (;;) {
@@ -561,6 +567,7 @@ coderange_scan(const char *p, long len, rb_encoding *enc)
         }
     }
     else {
+        e = p + len;
         while (p < e) {
             int ret = rb_enc_precise_mbclen(p, e, enc);
             if (!MBCLEN_CHARFOUND_P(ret)) return ENC_CODERANGE_BROKEN;

From b5836631be3997f84a3227e42b7da1f6f70e76b5 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Wed, 11 Dec 2019 17:31:43 +0100
Subject: [PATCH 04/11] Enable -msse4.1

---
 configure.ac | 1 +
 1 file changed, 1 insertion(+)

diff --git a/configure.ac b/configure.ac
index 83d6ff4e23bbba..6344fae7cd634f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1513,6 +1513,7 @@ RUBY_DECL_ATTRIBUTE([__nonnull__(n)], [RUBY_FUNC_NONNULL(n,x)], [rb_cv_func_nonn
 @%:@define n 1
 ])
 
+RUBY_APPEND_OPTION(XCFLAGS, -msse4.1)
 RUBY_APPEND_OPTION(XCFLAGS, -DRUBY_EXPORT)
 
 AC_ARG_ENABLE(mathn,

From 64aed65623a133987bbbb5dd437451c16da24b5e Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Thu, 12 Dec 2019 12:46:18 +0100
Subject: [PATCH 05/11] use AVX functions and add a benchmark

---
 benchmark/string_coderange_scan.rb |  5 +++++
 configure.ac                       |  1 +
 string.c                           | 10 +++++-----
 3 files changed, 11 insertions(+), 5 deletions(-)
 create mode 100644 benchmark/string_coderange_scan.rb

diff --git a/benchmark/string_coderange_scan.rb b/benchmark/string_coderange_scan.rb
new file mode 100644
index 00000000000000..95e3225cc0f804
--- /dev/null
+++ b/benchmark/string_coderange_scan.rb
@@ -0,0 +1,5 @@
+str = 'abc€›ﬁ!‰,' * 1_000_000
+1_000.times do
+  str.force_encoding(Encoding::UTF_8) # clear coderange
+  str.valid_encoding?
+end
diff --git a/configure.ac b/configure.ac
index 6344fae7cd634f..fa409ab16de147 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1514,6 +1514,7 @@ RUBY_DECL_ATTRIBUTE([__nonnull__(n)], [RUBY_FUNC_NONNULL(n,x)], [rb_cv_func_nonn
 ])
 
 RUBY_APPEND_OPTION(XCFLAGS, -msse4.1)
+RUBY_APPEND_OPTION(XCFLAGS, -mavx2)
 RUBY_APPEND_OPTION(XCFLAGS, -DRUBY_EXPORT)
 
 AC_ARG_ENABLE(mathn,
diff --git a/string.c b/string.c
index 913a15da10177b..1a9e5dbef0cb11 100644
--- a/string.c
+++ b/string.c
@@ -540,17 +540,17 @@ coderange_scan(const char *p, long len, rb_encoding *enc)
     switch (rb_enc_to_index(enc)) {
         case ENCINDEX_ASCII:
             /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
-            return validate_ascii_fast(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
+            return validate_ascii_fast_avx(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
         case ENCINDEX_US_ASCII:
-            return validate_ascii_fast(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_BROKEN;
+            return validate_ascii_fast_avx(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_BROKEN;
         case RUBY_ENCINDEX_UTF_8:
-            if (validate_ascii_fast(p, len)) return ENC_CODERANGE_7BIT;
-            if (validate_utf8_fast(p, len)) return ENC_CODERANGE_VALID;
+            if (validate_ascii_fast_avx(p, len)) return ENC_CODERANGE_7BIT;
+            if (validate_utf8_fast_avx(p, len)) return ENC_CODERANGE_VALID;
             return ENC_CODERANGE_BROKEN;
     }
 
     if (rb_enc_asciicompat(enc)) {
-        if (validate_ascii_fast(p, len)) {
+        if (validate_ascii_fast_avx(p, len)) {
             return ENC_CODERANGE_7BIT;
         }
 

From fb12cbdba8d9259354cdded812362373a4b74eed Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Thu, 12 Dec 2019 14:31:00 +0100
Subject: [PATCH 06/11] Refactor fast encoding check to automatically select
 the fastest available implementation

---
 benchmark/string_coderange_scan.rb       |   2 +-
 configure.ac                             |   3 +-
 simdutf8check.h => simd_encoding_check.h | 464 +++++++++++------------
 simdasciicheck.h                         |  55 ---
 string.c                                 |  49 ++-
 5 files changed, 268 insertions(+), 305 deletions(-)
 rename simdutf8check.h => simd_encoding_check.h (85%)
 delete mode 100644 simdasciicheck.h

diff --git a/benchmark/string_coderange_scan.rb b/benchmark/string_coderange_scan.rb
index 95e3225cc0f804..509709b475eecb 100644
--- a/benchmark/string_coderange_scan.rb
+++ b/benchmark/string_coderange_scan.rb
@@ -1,4 +1,4 @@
-str = 'abc€›ﬁ!‰,' * 1_000_000
+str = 'abc€›ﬁ!‰,' * 1_000
 1_000.times do
   str.force_encoding(Encoding::UTF_8) # clear coderange
   str.valid_encoding?
diff --git a/configure.ac b/configure.ac
index fa409ab16de147..579bac4370fd13 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1513,8 +1513,7 @@ RUBY_DECL_ATTRIBUTE([__nonnull__(n)], [RUBY_FUNC_NONNULL(n,x)], [rb_cv_func_nonn
 @%:@define n 1
 ])
 
-RUBY_APPEND_OPTION(XCFLAGS, -msse4.1)
-RUBY_APPEND_OPTION(XCFLAGS, -mavx2)
+RUBY_APPEND_OPTION(XCFLAGS, -march=native)
 RUBY_APPEND_OPTION(XCFLAGS, -DRUBY_EXPORT)
 
 AC_ARG_ENABLE(mathn,
diff --git a/simdutf8check.h b/simd_encoding_check.h
similarity index 85%
rename from simdutf8check.h
rename to simd_encoding_check.h
index 3e16779830e3c0..2a76d2ec4ac2e4 100644
--- a/simdutf8check.h
+++ b/simd_encoding_check.h
@@ -1,193 +1,41 @@
-
-#ifndef SIMDUTF8CHECK_H
-#define SIMDUTF8CHECK_H
+#ifndef SIMDASCIICHECK_H
+#define SIMDASCIICHECK_H
+
+#if defined __AVX2__ || defined __SSE4_1__
+#define SIMD_ENCODING_CHECK
+#include <emmintrin.h> // SSE2
+#include <stdbool.h>   // c99 bool
+#include <stddef.h>    // size_t
 #include <stdbool.h>
-#include <stddef.h>
 #include <stdint.h>
 #include <string.h>
-#include <x86intrin.h>
-/*
- * legal utf-8 byte sequence
- * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
- *
- *  Code Points        1st       2s       3s       4s
- * U+0000..U+007F     00..7F
- * U+0080..U+07FF     C2..DF   80..BF
- * U+0800..U+0FFF     E0       A0..BF   80..BF
- * U+1000..U+CFFF     E1..EC   80..BF   80..BF
- * U+D000..U+D7FF     ED       80..9F   80..BF
- * U+E000..U+FFFF     EE..EF   80..BF   80..BF
- * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
- * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
- * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
- *
- */
-
-// all byte values must be no larger than 0xF4
-static inline void checkSmallerThan0xF4(__m128i current_bytes,
-                                        __m128i *has_error) {
-  // unsigned, saturates to 0 below max
-  *has_error = _mm_or_si128(*has_error,
-                            _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4)));
-}
-
-static inline __m128i continuationLengths(__m128i high_nibbles) {
-  return _mm_shuffle_epi8(
-      _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
-                    0, 0, 0, 0,             // 10xx (continuation)
-                    2, 2,                   // 110x
-                    3,                      // 1110
-                    4), // 1111, next should be 0 (not checked here)
-      high_nibbles);
-}
-
-static inline __m128i carryContinuations(__m128i initial_lengths,
-                                         __m128i previous_carries) {
-
-  __m128i right1 =
-      _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
-                    _mm_set1_epi8(1));
-  __m128i sum = _mm_add_epi8(initial_lengths, right1);
-
-  __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2),
-                                 _mm_set1_epi8(2));
-  return _mm_add_epi8(sum, right2);
-}
-
-static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
-                                      __m128i *has_error) {
-
-  // overlap || underlap
-  // carry > length && length > 0 || !(carry > length) && !(length > 0)
-  // (carries > length) == (lengths > 0)
-  __m128i overunder =
-      _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths),
-                     _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()));
-
-  *has_error = _mm_or_si128(*has_error, overunder);
-}
-
-// when 0xED is found, next byte must be no larger than 0x9F
-// when 0xF4 is found, next byte must be no larger than 0x8F
-// next byte must be continuation, ie sign bit is set, so signed < is ok
-static inline void checkFirstContinuationMax(__m128i current_bytes,
-                                             __m128i off1_current_bytes,
-                                             __m128i *has_error) {
-  __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
-  __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
-
-  __m128i badfollowED =
-      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED);
-  __m128i badfollowF4 =
-      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4);
-
-  *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4));
-}
-
-// map off1_hibits => error condition
-// hibits     off1    cur
-// C       => < C2 && true
-// E       => < E1 && < A0
-// F       => < F1 && < 90
-// else      false && false
-static inline void checkOverlong(__m128i current_bytes,
-                                 __m128i off1_current_bytes, __m128i hibits,
-                                 __m128i previous_hibits, __m128i *has_error) {
-  __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
-  __m128i initial_mins = _mm_shuffle_epi8(
-      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-                    -128, -128, // 10xx => false
-                    0xC2, -128, // 110x
-                    0xE1,       // 1110
-                    0xF1),
-      off1_hibits);
-
-  __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
-
-  __m128i second_mins = _mm_shuffle_epi8(
-      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
-                    -128, -128, // 10xx => false
-                    127, 127,   // 110x => true
-                    0xA0,       // 1110
-                    0x90),
-      off1_hibits);
-  __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
-  *has_error =
-      _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under));
-}
-
-struct processed_utf_bytes {
-  __m128i rawbytes;
-  __m128i high_nibbles;
-  __m128i carried_continuations;
-};
-
-static inline void count_nibbles(__m128i bytes,
-                                 struct processed_utf_bytes *answer) {
-  answer->rawbytes = bytes;
-  answer->high_nibbles =
-      _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
-}
-
-// check whether the current bytes are valid UTF-8
-// at the end of the function, previous gets updated
-static struct processed_utf_bytes
-checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
-               __m128i *has_error) {
-  struct processed_utf_bytes pb;
-  count_nibbles(current_bytes, &pb);
-
-  checkSmallerThan0xF4(current_bytes, has_error);
-
-  __m128i initial_lengths = continuationLengths(pb.high_nibbles);
-
-  pb.carried_continuations =
-      carryContinuations(initial_lengths, previous->carried_continuations);
-
-  checkContinuations(initial_lengths, pb.carried_continuations, has_error);
-
-  __m128i off1_current_bytes =
-      _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
-  checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
+#endif
 
-  checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
-                previous->high_nibbles, has_error);
-  return pb;
-}
+#ifdef __AVX2__
+#include <x86intrin.h>
 
-static bool validate_utf8_fast(const char *src, size_t len) {
+// The function returns true (1) if all chars passed in src are
+// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
+static bool validate_ascii_fast(const char *src, size_t len) {
   size_t i = 0;
-  __m128i has_error = _mm_setzero_si128();
-  struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(),
-                                         .high_nibbles = _mm_setzero_si128(),
-                                         .carried_continuations =
-                                             _mm_setzero_si128()};
-  if (len >= 16) {
-    for (; i <= len - 16; i += 16) {
-      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
-      previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
+  __m256i has_error = _mm256_setzero_si256();
+  if (len >= 32) {
+    for (; i <= len - 32; i += 32) {
+      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
+      has_error = _mm256_or_si256(has_error, current_bytes);
     }
   }
+  int error_mask = _mm256_movemask_epi8(has_error);
 
-  // last part
-  if (i < len) {
-    char buffer[16];
-    memset(buffer, 0, 16);
-    memcpy(buffer, src + i, len - i);
-    __m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer));
-    previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
-  } else {
-    has_error =
-        _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations,
-                                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                                  9, 9, 9, 9, 9, 1)),
-                     has_error);
+  char tail_has_error = 0;
+  for (; i < len; i++) {
+    tail_has_error |= src[i];
   }
+  error_mask |= (tail_has_error & 0x80);
 
-  return _mm_testz_si128(has_error, has_error);
+  return !error_mask;
 }
 
-#ifdef __AVX2__
 
 /*****************************/
 static inline __m256i push_last_byte_of_a_to_b(__m256i a, __m256i b) {
@@ -349,45 +197,7 @@ avxcheckUTF8Bytes(__m256i current_bytes,
   return pb;
 }
 
-// check whether the current bytes are valid UTF-8
-// at the end of the function, previous gets updated
-static struct avx_processed_utf_bytes
-avxcheckUTF8Bytes_asciipath(__m256i current_bytes,
-                            struct avx_processed_utf_bytes *previous,
-                            __m256i *has_error) {
-  if (_mm256_testz_si256(current_bytes,
-                         _mm256_set1_epi8(0x80))) { // fast ascii path
-    *has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(previous->carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),
-        *has_error);
-    return *previous;
-  }
-
-  struct avx_processed_utf_bytes pb;
-  avx_count_nibbles(current_bytes, &pb);
-
-  avxcheckSmallerThan0xF4(current_bytes, has_error);
-
-  __m256i initial_lengths = avxcontinuationLengths(pb.high_nibbles);
-
-  pb.carried_continuations =
-      avxcarryContinuations(initial_lengths, previous->carried_continuations);
-
-  avxcheckContinuations(initial_lengths, pb.carried_continuations, has_error);
-
-  __m256i off1_current_bytes =
-      push_last_byte_of_a_to_b(previous->rawbytes, pb.rawbytes);
-  avxcheckFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
-
-  avxcheckOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
-                   previous->high_nibbles, has_error);
-  return pb;
-}
-
-static bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
+static bool validate_utf8_fast(const char *src, size_t len) {
   size_t i = 0;
   __m256i has_error = _mm256_setzero_si256();
   struct avx_processed_utf_bytes previous = {
@@ -397,8 +207,7 @@ static bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
   if (len >= 32) {
     for (; i <= len - 32; i += 32) {
       __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
-      previous =
-          avxcheckUTF8Bytes_asciipath(current_bytes, &previous, &has_error);
+      previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
     }
   }
 
@@ -421,38 +230,213 @@ static bool validate_utf8_fast_avx_asciipath(const char *src, size_t len) {
   return _mm256_testz_si256(has_error, has_error);
 }
 
-static bool validate_utf8_fast_avx(const char *src, size_t len) {
+#else
+
+#ifdef __SSE4_1__
+// The function returns true (1) if all chars passed in src are
+// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
+static bool validate_ascii_fast(const char *src, size_t len) {
   size_t i = 0;
-  __m256i has_error = _mm256_setzero_si256();
-  struct avx_processed_utf_bytes previous = {
-      .rawbytes = _mm256_setzero_si256(),
-      .high_nibbles = _mm256_setzero_si256(),
-      .carried_continuations = _mm256_setzero_si256()};
-  if (len >= 32) {
-    for (; i <= len - 32; i += 32) {
-      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
-      previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+  __m128i has_error = _mm_setzero_si128();
+  if (len >= 16) {
+    for (; i <= len - 16; i += 16) {
+      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
+      has_error = _mm_or_si128(has_error, current_bytes);
+    }
+  }
+  int error_mask = _mm_movemask_epi8(has_error);
+
+  char tail_has_error = 0;
+  for (; i < len; i++) {
+    tail_has_error |= src[i];
+  }
+  error_mask |= (tail_has_error & 0x80);
+
+  return !error_mask;
+}
+
+/*
+ * legal utf-8 byte sequence
+ * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
+ *
+ *  Code Points        1st       2s       3s       4s
+ * U+0000..U+007F     00..7F
+ * U+0080..U+07FF     C2..DF   80..BF
+ * U+0800..U+0FFF     E0       A0..BF   80..BF
+ * U+1000..U+CFFF     E1..EC   80..BF   80..BF
+ * U+D000..U+D7FF     ED       80..9F   80..BF
+ * U+E000..U+FFFF     EE..EF   80..BF   80..BF
+ * U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
+ * U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
+ * U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
+ *
+ */
+
+// all byte values must be no larger than 0xF4
+static inline void checkSmallerThan0xF4(__m128i current_bytes,
+                                        __m128i *has_error) {
+  // unsigned, saturates to 0 below max
+  *has_error = _mm_or_si128(*has_error,
+                            _mm_subs_epu8(current_bytes, _mm_set1_epi8(0xF4)));
+}
+
+static inline __m128i continuationLengths(__m128i high_nibbles) {
+  return _mm_shuffle_epi8(
+      _mm_setr_epi8(1, 1, 1, 1, 1, 1, 1, 1, // 0xxx (ASCII)
+                    0, 0, 0, 0,             // 10xx (continuation)
+                    2, 2,                   // 110x
+                    3,                      // 1110
+                    4), // 1111, next should be 0 (not checked here)
+      high_nibbles);
+}
+
+static inline __m128i carryContinuations(__m128i initial_lengths,
+                                         __m128i previous_carries) {
+
+  __m128i right1 =
+      _mm_subs_epu8(_mm_alignr_epi8(initial_lengths, previous_carries, 16 - 1),
+                    _mm_set1_epi8(1));
+  __m128i sum = _mm_add_epi8(initial_lengths, right1);
+
+  __m128i right2 = _mm_subs_epu8(_mm_alignr_epi8(sum, previous_carries, 16 - 2),
+                                 _mm_set1_epi8(2));
+  return _mm_add_epi8(sum, right2);
+}
+
+static inline void checkContinuations(__m128i initial_lengths, __m128i carries,
+                                      __m128i *has_error) {
+
+  // overlap || underlap
+  // carry > length && length > 0 || !(carry > length) && !(length > 0)
+  // (carries > length) == (lengths > 0)
+  __m128i overunder =
+      _mm_cmpeq_epi8(_mm_cmpgt_epi8(carries, initial_lengths),
+                     _mm_cmpgt_epi8(initial_lengths, _mm_setzero_si128()));
+
+  *has_error = _mm_or_si128(*has_error, overunder);
+}
+
+// when 0xED is found, next byte must be no larger than 0x9F
+// when 0xF4 is found, next byte must be no larger than 0x8F
+// next byte must be continuation, ie sign bit is set, so signed < is ok
+static inline void checkFirstContinuationMax(__m128i current_bytes,
+                                             __m128i off1_current_bytes,
+                                             __m128i *has_error) {
+  __m128i maskED = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xED));
+  __m128i maskF4 = _mm_cmpeq_epi8(off1_current_bytes, _mm_set1_epi8(0xF4));
+
+  __m128i badfollowED =
+      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x9F)), maskED);
+  __m128i badfollowF4 =
+      _mm_and_si128(_mm_cmpgt_epi8(current_bytes, _mm_set1_epi8(0x8F)), maskF4);
+
+  *has_error = _mm_or_si128(*has_error, _mm_or_si128(badfollowED, badfollowF4));
+}
+
+// map off1_hibits => error condition
+// hibits     off1    cur
+// C       => < C2 && true
+// E       => < E1 && < A0
+// F       => < F1 && < 90
+// else      false && false
+static inline void checkOverlong(__m128i current_bytes,
+                                 __m128i off1_current_bytes, __m128i hibits,
+                                 __m128i previous_hibits, __m128i *has_error) {
+  __m128i off1_hibits = _mm_alignr_epi8(hibits, previous_hibits, 16 - 1);
+  __m128i initial_mins = _mm_shuffle_epi8(
+      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+                    -128, -128, // 10xx => false
+                    0xC2, -128, // 110x
+                    0xE1,       // 1110
+                    0xF1),
+      off1_hibits);
+
+  __m128i initial_under = _mm_cmpgt_epi8(initial_mins, off1_current_bytes);
+
+  __m128i second_mins = _mm_shuffle_epi8(
+      _mm_setr_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128,
+                    -128, -128, // 10xx => false
+                    127, 127,   // 110x => true
+                    0xA0,       // 1110
+                    0x90),
+      off1_hibits);
+  __m128i second_under = _mm_cmpgt_epi8(second_mins, current_bytes);
+  *has_error =
+      _mm_or_si128(*has_error, _mm_and_si128(initial_under, second_under));
+}
+
+struct processed_utf_bytes {
+  __m128i rawbytes;
+  __m128i high_nibbles;
+  __m128i carried_continuations;
+};
+
+static inline void count_nibbles(__m128i bytes,
+                                 struct processed_utf_bytes *answer) {
+  answer->rawbytes = bytes;
+  answer->high_nibbles =
+      _mm_and_si128(_mm_srli_epi16(bytes, 4), _mm_set1_epi8(0x0F));
+}
+
+// check whether the current bytes are valid UTF-8
+// at the end of the function, previous gets updated
+static struct processed_utf_bytes
+checkUTF8Bytes(__m128i current_bytes, struct processed_utf_bytes *previous,
+               __m128i *has_error) {
+  struct processed_utf_bytes pb;
+  count_nibbles(current_bytes, &pb);
+
+  checkSmallerThan0xF4(current_bytes, has_error);
+
+  __m128i initial_lengths = continuationLengths(pb.high_nibbles);
+
+  pb.carried_continuations =
+      carryContinuations(initial_lengths, previous->carried_continuations);
+
+  checkContinuations(initial_lengths, pb.carried_continuations, has_error);
+
+  __m128i off1_current_bytes =
+      _mm_alignr_epi8(pb.rawbytes, previous->rawbytes, 16 - 1);
+  checkFirstContinuationMax(current_bytes, off1_current_bytes, has_error);
+
+  checkOverlong(current_bytes, off1_current_bytes, pb.high_nibbles,
+                previous->high_nibbles, has_error);
+  return pb;
+}
+
+static bool validate_utf8_fast(const char *src, size_t len) {
+  size_t i = 0;
+  __m128i has_error = _mm_setzero_si128();
+  struct processed_utf_bytes previous = {.rawbytes = _mm_setzero_si128(),
+                                         .high_nibbles = _mm_setzero_si128(),
+                                         .carried_continuations =
+                                             _mm_setzero_si128()};
+  if (len >= 16) {
+    for (; i <= len - 16; i += 16) {
+      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
+      previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
     }
   }
 
   // last part
   if (i < len) {
-    char buffer[32];
-    memset(buffer, 0, 32);
+    char buffer[16];
+    memset(buffer, 0, 16);
     memcpy(buffer, src + i, len - i);
-    __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(buffer));
-    previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+    __m128i current_bytes = _mm_loadu_si128((const __m128i *)(buffer));
+    previous = checkUTF8Bytes(current_bytes, &previous, &has_error);
   } else {
-    has_error = _mm256_or_si256(
-        _mm256_cmpgt_epi8(previous.carried_continuations,
-                          _mm256_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
-                                           9, 9, 9, 9, 9, 9, 9, 1)),
-        has_error);
+    has_error =
+        _mm_or_si128(_mm_cmpgt_epi8(previous.carried_continuations,
+                                    _mm_setr_epi8(9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+                                                  9, 9, 9, 9, 9, 1)),
+                     has_error);
   }
 
-  return _mm256_testz_si256(has_error, has_error);
+  return _mm_testz_si128(has_error, has_error);
 }
 
-#endif // __AVX2__
 #endif
+
+#endif
+#endif
\ No newline at end of file
diff --git a/simdasciicheck.h b/simdasciicheck.h
deleted file mode 100644
index 6c119ff3728963..00000000000000
--- a/simdasciicheck.h
+++ /dev/null
@@ -1,55 +0,0 @@
-#ifndef SIMDASCIICHECK_H
-#define SIMDASCIICHECK_H
-
-#include <emmintrin.h> // SSE2
-#include <stdbool.h>   // c99 bool
-#include <stddef.h>    // size_t
-
-// The function returns true (1) if all chars passed in src are
-// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
-static bool validate_ascii_fast(const char *src, size_t len) {
-  size_t i = 0;
-  __m128i has_error = _mm_setzero_si128();
-  if (len >= 16) {
-    for (; i <= len - 16; i += 16) {
-      __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
-      has_error = _mm_or_si128(has_error, current_bytes);
-    }
-  }
-  int error_mask = _mm_movemask_epi8(has_error);
-
-  char tail_has_error = 0;
-  for (; i < len; i++) {
-    tail_has_error |= src[i];
-  }
-  error_mask |= (tail_has_error & 0x80);
-
-  return !error_mask;
-}
-
-#ifdef __AVX2__
-#include <x86intrin.h>
-// The function returns true (1) if all chars passed in src are
-// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
-static bool validate_ascii_fast_avx(const char *src, size_t len) {
-  size_t i = 0;
-  __m256i has_error = _mm256_setzero_si256();
-  if (len >= 32) {
-    for (; i <= len - 32; i += 32) {
-      __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
-      has_error = _mm256_or_si256(has_error, current_bytes);
-    }
-  }
-  int error_mask = _mm256_movemask_epi8(has_error);
-
-  char tail_has_error = 0;
-  for (; i < len; i++) {
-    tail_has_error |= src[i];
-  }
-  error_mask |= (tail_has_error & 0x80);
-
-  return !error_mask;
-}
-#endif
-
-#endif
diff --git a/string.c b/string.c
index 1a9e5dbef0cb11..dc416557a2f228 100644
--- a/string.c
+++ b/string.c
@@ -21,8 +21,7 @@
 #include "id.h"
 #include "debug_counter.h"
 #include "ruby/util.h"
-#include "simdasciicheck.h"
-#include "simdutf8check.h"
+#include "simd_encoding_check.h"
 
 #define BEG(no) (regs->beg[(no)])
 #define END(no) (regs->end[(no)])
@@ -532,6 +531,38 @@ search_nonascii(const char *p, const char *e)
     }
 }
 
+static inline bool
+is_valid_ascii(const char *p, long len)
+{
+#ifdef SIMD_ENCODING_CHECK
+    return validate_ascii_fast(p, len);
+#else
+    return !search_nonascii(p, p + len);
+#endif
+}
+
+static inline bool
+is_valid_utf8(const char *p, long len)
+{
+#ifdef SIMD_ENCODING_CHECK
+    return validate_utf8_fast(p, len);
+#else
+    const char *e;
+    e = p + len;
+    p = search_nonascii(p, e);
+    if (!p) return true;
+    for (;;) {
+        int ret = rb_enc_precise_mbclen(p, e, enc);
+        if (!MBCLEN_CHARFOUND_P(ret)) return false;
+        p += MBCLEN_CHARFOUND_LEN(ret);
+        if (p == e) break;
+        p = search_nonascii(p, e);
+        if (!p) break;
+    }
+    return true;
+#endif
+}
+
 static int
 coderange_scan(const char *p, long len, rb_encoding *enc)
 {
@@ -540,19 +571,23 @@ coderange_scan(const char *p, long len, rb_encoding *enc)
     switch (rb_enc_to_index(enc)) {
         case ENCINDEX_ASCII:
             /* enc is ASCII-8BIT.  ASCII-8BIT string never be broken. */
-            return validate_ascii_fast_avx(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
+            return is_valid_ascii(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
         case ENCINDEX_US_ASCII:
-            return validate_ascii_fast_avx(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_BROKEN;
+            return is_valid_ascii(p, len) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_BROKEN;
+        #ifdef SIMD_ENCODING_CHECK
         case RUBY_ENCINDEX_UTF_8:
-            if (validate_ascii_fast_avx(p, len)) return ENC_CODERANGE_7BIT;
-            if (validate_utf8_fast_avx(p, len)) return ENC_CODERANGE_VALID;
+            if (is_valid_ascii(p, len)) return ENC_CODERANGE_7BIT;
+            if (is_valid_utf8(p, len)) return ENC_CODERANGE_VALID;
             return ENC_CODERANGE_BROKEN;
+        #endif
     }
 
     if (rb_enc_asciicompat(enc)) {
-        if (validate_ascii_fast_avx(p, len)) {
+        #ifdef SIMD_ENCODING_CHECK
+        if (is_valid_ascii(p, len)) {
             return ENC_CODERANGE_7BIT;
         }
+        #endif
 
         e = p + len;
         p = search_nonascii(p, e);

From 37182a03da21b84de1d0c1d8a8a17b7489f77520 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Thu, 12 Dec 2019 15:19:00 +0100
Subject: [PATCH 07/11] Use faster is_valid_ascii in
 rb_enc_cr_str_copy_for_substr and rb_external_str_new_with_enc

---
 string.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/string.c b/string.c
index dc416557a2f228..361211a70bd3fa 100644
--- a/string.c
+++ b/string.c
@@ -684,11 +684,10 @@ rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
 	ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
 	break;
       case ENC_CODERANGE_VALID:
-	if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
-	    search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
-	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
-	else
+	if (rb_enc_asciicompat(STR_ENC_GET(src)) && is_valid_ascii(RSTRING_PTR(dest), RSTRING_LEN(dest)))
 	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_7BIT);
+	else
+	    ENC_CODERANGE_SET(dest, ENC_CODERANGE_VALID);
 	break;
       default:
 	break;
@@ -1094,7 +1093,7 @@ rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
 
     /* ASCII-8BIT case, no conversion */
     if ((eidx == rb_ascii8bit_encindex()) ||
-	(eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
+        (eidx == rb_usascii_encindex() && !is_valid_ascii(ptr, len))) {
         return rb_str_new(ptr, len);
     }
     /* no default_internal or same encoding, no conversion */
@@ -1105,8 +1104,8 @@ rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
     /* ASCII compatible, and ASCII only string, no conversion in
      * default_internal */
     if ((eidx == rb_ascii8bit_encindex()) ||
-	(eidx == rb_usascii_encindex()) ||
-	(rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
+        (eidx == rb_usascii_encindex()) ||
+        (rb_enc_asciicompat(eenc) && is_valid_ascii(ptr, len))) {
         return rb_enc_str_new(ptr, len, ienc);
     }
     /* convert from the given encoding to default_internal */
@@ -1114,7 +1113,7 @@ rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *eenc)
     /* when the conversion failed for some reason, just ignore the
      * default_internal and result in the given encoding as-is. */
     if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
-	rb_str_initialize(str, ptr, len, eenc);
+        rb_str_initialize(str, ptr, len, eenc);
     }
     return str;
 }

From f6a57cedc3e73a46dc35b9ae8d6bebdc4e1c8402 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Thu, 12 Dec 2019 17:16:04 +0100
Subject: [PATCH 08/11] Fix simd_encoding_check.h when only SSE4.1 is available

---
 simd_encoding_check.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/simd_encoding_check.h b/simd_encoding_check.h
index 2a76d2ec4ac2e4..3f98995cec3036 100644
--- a/simd_encoding_check.h
+++ b/simd_encoding_check.h
@@ -9,10 +9,10 @@
 #include <stdbool.h>
 #include <stdint.h>
 #include <string.h>
+#include <x86intrin.h>
 #endif
 
 #ifdef __AVX2__
-#include <x86intrin.h>
 
 // The function returns true (1) if all chars passed in src are
 // 7-bit values (0x00..0x7F). Otherwise, it returns false (0).

From 30f17d13d48e0105cecd670eae7d59e7d6aa2326 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Thu, 12 Dec 2019 17:16:18 +0100
Subject: [PATCH 09/11] More coderange benchmarks

---
 benchmark/string_coderange_scan_ascii.rb                     | 5 +++++
 ...tring_coderange_scan.rb => string_coderange_scan_utf8.rb} | 0
 2 files changed, 5 insertions(+)
 create mode 100644 benchmark/string_coderange_scan_ascii.rb
 rename benchmark/{string_coderange_scan.rb => string_coderange_scan_utf8.rb} (100%)

diff --git a/benchmark/string_coderange_scan_ascii.rb b/benchmark/string_coderange_scan_ascii.rb
new file mode 100644
index 00000000000000..c4f7ba1d4fe186
--- /dev/null
+++ b/benchmark/string_coderange_scan_ascii.rb
@@ -0,0 +1,5 @@
+str = 'abcdefgh' * 1_000
+1_000.times do
+  str.force_encoding(Encoding::UTF_8) # clear coderange
+  str.valid_encoding?
+end
diff --git a/benchmark/string_coderange_scan.rb b/benchmark/string_coderange_scan_utf8.rb
similarity index 100%
rename from benchmark/string_coderange_scan.rb
rename to benchmark/string_coderange_scan_utf8.rb

From 0ba7d62d3ec59c6a6fe7fcf149c7b9b1cb65d0d8 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Thu, 12 Dec 2019 17:39:15 +0100
Subject: [PATCH 10/11] Fix fallback version of is_valid_utf8

---
 string.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/string.c b/string.c
index 361211a70bd3fa..94f3c222ae5022 100644
--- a/string.c
+++ b/string.c
@@ -552,7 +552,7 @@ is_valid_utf8(const char *p, long len)
     p = search_nonascii(p, e);
     if (!p) return true;
     for (;;) {
-        int ret = rb_enc_precise_mbclen(p, e, enc);
+        int ret = rb_enc_precise_mbclen(p, e, rb_utf8_encoding());
         if (!MBCLEN_CHARFOUND_P(ret)) return false;
         p += MBCLEN_CHARFOUND_LEN(ret);
         if (p == e) break;

From a410ce5373c9792924c7493e030f7a9160305015 Mon Sep 17 00:00:00 2001
From: Jean Boussier <jean.boussier@gmail.com>
Date: Fri, 13 Dec 2019 13:28:50 +0100
Subject: [PATCH 11/11] More benchmarks and add shortcircuiting logic

---
 benchmark/string_coderange_scan.yml      | 26 +++++++++++++++++++++
 benchmark/string_coderange_scan_ascii.rb |  5 ----
 benchmark/string_coderange_scan_utf8.rb  |  5 ----
 simd_encoding_check.h                    | 29 ++++++++++++++----------
 4 files changed, 43 insertions(+), 22 deletions(-)
 create mode 100644 benchmark/string_coderange_scan.yml
 delete mode 100644 benchmark/string_coderange_scan_ascii.rb
 delete mode 100644 benchmark/string_coderange_scan_utf8.rb

diff --git a/benchmark/string_coderange_scan.yml b/benchmark/string_coderange_scan.yml
new file mode 100644
index 00000000000000..2fca47362804de
--- /dev/null
+++ b/benchmark/string_coderange_scan.yml
@@ -0,0 +1,26 @@
+prelude: |
+  ascii1 = [*"a".."m",*"N".."Z",*"0".."9"].join("")
+  ascii10 = ascii1 * 10
+  ascii100 = ascii10 * 10
+  ascii1000 = ascii100 * 10
+
+  utf81 = [*"a".."m",*"N".."Z", "éà€‹›Ç☃"].join("")
+  utf810 = utf81 * 10
+  utf8100 = utf810 * 10
+  utf81000 = utf8100 * 10
+
+  invalid_ascii_last = ascii1000 + "☃"
+  invalid_ascii_first = "☃" + ascii1000
+benchmark:
+  ascii-coderange-scan-1: ascii1.force_encoding(Encoding::UTF_8).valid_encoding?
+  ascii-coderange-scan-10: ascii10.force_encoding(Encoding::UTF_8).valid_encoding?
+  ascii-coderange-scan-100: ascii100.force_encoding(Encoding::UTF_8).valid_encoding?
+  ascii-coderange-scan-1000: ascii1000.force_encoding(Encoding::UTF_8).valid_encoding?
+  
+  utf8-coderange-scan-1: utf81.force_encoding(Encoding::UTF_8).valid_encoding?
+  utf8-coderange-scan-10: utf810.force_encoding(Encoding::UTF_8).valid_encoding?
+  utf8-coderange-scan-100: utf8100.force_encoding(Encoding::UTF_8).valid_encoding?
+  utf8-coderange-scan-1000: utf81000.force_encoding(Encoding::UTF_8).valid_encoding?
+
+  ascii-coderange-scan-first-1: invalid_ascii_first.force_encoding(Encoding::ASCII).valid_encoding?
+  ascii-coderange-scan-last-1: invalid_ascii_last.force_encoding(Encoding::ASCII).valid_encoding?
diff --git a/benchmark/string_coderange_scan_ascii.rb b/benchmark/string_coderange_scan_ascii.rb
deleted file mode 100644
index c4f7ba1d4fe186..00000000000000
--- a/benchmark/string_coderange_scan_ascii.rb
+++ /dev/null
@@ -1,5 +0,0 @@
-str = 'abcdefgh' * 1_000
-1_000.times do
-  str.force_encoding(Encoding::UTF_8) # clear coderange
-  str.valid_encoding?
-end
diff --git a/benchmark/string_coderange_scan_utf8.rb b/benchmark/string_coderange_scan_utf8.rb
deleted file mode 100644
index 509709b475eecb..00000000000000
--- a/benchmark/string_coderange_scan_utf8.rb
+++ /dev/null
@@ -1,5 +0,0 @@
-str = 'abc€›ﬁ!‰,' * 1_000
-1_000.times do
-  str.force_encoding(Encoding::UTF_8) # clear coderange
-  str.valid_encoding?
-end
diff --git a/simd_encoding_check.h b/simd_encoding_check.h
index 3f98995cec3036..d9755df777216a 100644
--- a/simd_encoding_check.h
+++ b/simd_encoding_check.h
@@ -23,17 +23,19 @@ static bool validate_ascii_fast(const char *src, size_t len) {
     for (; i <= len - 32; i += 32) {
       __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
       has_error = _mm256_or_si256(has_error, current_bytes);
+      if (_mm256_movemask_epi8(has_error)) {
+          return false;
+      }
     }
   }
-  int error_mask = _mm256_movemask_epi8(has_error);
 
-  char tail_has_error = 0;
   for (; i < len; i++) {
-    tail_has_error |= src[i];
+    if (src[i] & 0x80) {
+      return false;
+    }
   }
-  error_mask |= (tail_has_error & 0x80);
 
-  return !error_mask;
+  return true;
 }
 
 
@@ -170,8 +172,6 @@ static inline void avx_count_nibbles(__m256i bytes,
       _mm256_and_si256(_mm256_srli_epi16(bytes, 4), _mm256_set1_epi8(0x0F));
 }
 
-// check whether the current bytes are valid UTF-8
-// at the end of the function, previous gets updated
 static struct avx_processed_utf_bytes
 avxcheckUTF8Bytes(__m256i current_bytes,
                   struct avx_processed_utf_bytes *previous,
@@ -208,6 +208,9 @@ static bool validate_utf8_fast(const char *src, size_t len) {
     for (; i <= len - 32; i += 32) {
       __m256i current_bytes = _mm256_loadu_si256((const __m256i *)(src + i));
       previous = avxcheckUTF8Bytes(current_bytes, &previous, &has_error);
+      if (_mm256_movemask_epi8(has_error)) {
+        return false;
+      }
     }
   }
 
@@ -242,17 +245,19 @@ static bool validate_ascii_fast(const char *src, size_t len) {
     for (; i <= len - 16; i += 16) {
       __m128i current_bytes = _mm_loadu_si128((const __m128i *)(src + i));
       has_error = _mm_or_si128(has_error, current_bytes);
+      if (_mm_movemask_epi8(has_error)) {
+        return false;
+      }
     }
   }
-  int error_mask = _mm_movemask_epi8(has_error);
 
-  char tail_has_error = 0;
   for (; i < len; i++) {
-    tail_has_error |= src[i];
+    if (src[i] & 0x80) {
+      return false;
+    }
   }
-  error_mask |= (tail_has_error & 0x80);
 
-  return !error_mask;
+  return true;
 }
 
 /*