38
38
#include < cstring>
39
39
#include < climits>
40
40
41
+ #if defined(__aarch64__) || defined(_M_ARM64)
42
+ #define NODE_HAS_SIMD_NEON 1
43
+ #endif
44
+
45
+ #if NODE_HAS_SIMD_NEON
46
+ #include < arm_neon.h>
47
+ #endif
48
+
41
49
#define THROW_AND_RETURN_UNLESS_BUFFER (env, obj ) \
42
50
THROW_AND_RETURN_IF_NOT_BUFFER (env, obj, " argument" ) \
43
51
@@ -741,6 +749,37 @@ void SlowByteLengthUtf8(const FunctionCallbackInfo<Value>& args) {
741
749
args.GetReturnValue ().Set (args[0 ].As <String>()->Utf8Length (env->isolate ()));
742
750
}
743
751
752
+ #if NODE_HAS_SIMD_NEON
753
+ uint32_t FastByteLengthUtf8 (Local<Value> receiver,
754
+ const v8::FastOneByteString& source) {
755
+ const auto data = reinterpret_cast <const uint8_t *>(source.data );
756
+ const uint8x16_t mask = vdupq_n_u8 (0x80 );
757
+ uint8x16_t result_vector = vdupq_n_u8 (0 );
758
+ size_t i = 0 ;
759
+
760
+ for (; i < source.length ; i += 16 ) {
761
+ // load 16 bytes from data
762
+ uint8x16_t values = vld1q_u8 (data + i);
763
+
764
+ // extract the high bits using 0x80 mask
765
+ uint8x16_t high_bits = vcgeq_u8 (values, mask);
766
+
767
+ // accumulate the high bits to result_vector
768
+ result_vector = vqaddq_u8 (result_vector, high_bits);
769
+ }
770
+
771
+ // sum the elements in the result_vector
772
+ uint64x2_t sum64 = vpaddlq_u32 (vpaddlq_u16 (vpaddlq_u8 (result_vector)));
773
+
774
+ uint32_t answer = vgetq_lane_u64 (sum64, 0 ) + vgetq_lane_u64 (sum64, 1 );
775
+
776
+ for (; i < source.length ; ++i) {
777
+ answer += (data[i] >> 7 );
778
+ }
779
+
780
+ return answer + source.length ;
781
+ }
782
+ #else
744
783
uint32_t FastByteLengthUtf8 (Local<Value> receiver,
745
784
const v8::FastOneByteString& source) {
746
785
uint32_t result = 0 ;
@@ -752,6 +791,7 @@ uint32_t FastByteLengthUtf8(Local<Value> receiver,
752
791
result += length;
753
792
return result;
754
793
}
794
+ #endif
755
795
756
796
// File-local v8::CFunction built from FastByteLengthUtf8 via
// v8::CFunction::Make. Which FastByteLengthUtf8 definition is used depends on
// the NODE_HAS_SIMD_NEON preprocessor branch above.
static v8::CFunction fast_byte_length_utf8 (
757
797
v8::CFunction::Make (FastByteLengthUtf8));
0 commit comments