Skip to content

Commit 8dee453

Browse files
committed
Added zend_simd.h
1 parent 4f32443 commit 8dee453

File tree

1 file changed

+119
-0
lines changed

1 file changed

+119
-0
lines changed

Zend/zend_simd.h

+119
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
/*
2+
+----------------------------------------------------------------------+
3+
| Zend Engine |
4+
+----------------------------------------------------------------------+
5+
| Copyright (c) Zend Technologies Ltd. (http://www.zend.com) |
6+
+----------------------------------------------------------------------+
7+
| This source file is subject to version 2.00 of the Zend license, |
8+
| that is bundled with this package in the file LICENSE, and is |
9+
| available through the world-wide-web at the following url: |
10+
| http://www.zend.com/license/2_00.txt. |
11+
| If you did not receive a copy of the Zend license and are unable to |
12+
| obtain it through the world-wide-web, please send a note to |
13+
| license@zend.com so we can mail you a copy immediately. |
14+
+----------------------------------------------------------------------+
15+
| Authors: Saki Takamachi <saki@php.net> |
16+
+----------------------------------------------------------------------+
17+
*/
18+
19+
#ifndef ZEND_SIMD_H
20+
#define ZEND_SIMD_H
21+
22+
#if defined(__SSE2__) || defined(__aarch64__) || defined(_M_ARM64)
23+
24+
/* Feature flag: defined only when the enclosing check found SSE2 or an AArch64 target,
 * i.e. when the zend_vec_* wrappers in this header are available. */
#define ZEND_HAVE_SIMD
25+
26+
#ifdef __SSE2__
27+
#include <emmintrin.h>
28+
29+
/* SSE2 has a single 128-bit integer vector type, so every lane layout
 * (8x16, 16x8, 32x4, 64x2) aliases __m128i and the names are interchangeable here. */
typedef __m128i zend_vec_8x16_t;
30+
typedef __m128i zend_vec_16x8_t;
31+
typedef __m128i zend_vec_32x4_t;
32+
typedef __m128i zend_vec_64x2_t;
33+
34+
/* All-zero vector. */
#define zend_vec_setzero_8x16() _mm_setzero_si128()
35+
/* Broadcast the byte x to all 16 lanes. */
#define zend_vec_set_8x16(x) _mm_set1_epi8(x)
36+
/* Build a vector from eight 16-bit values; arguments are forwarded in the order
 * _mm_set_epi16 expects, so x0 is the most significant lane and x7 the least. */
#define zend_vec_set_8x16_from_16x8(x0, x1, x2, x3, x4, x5, x6, x7) _mm_set_epi16(x0, x1, x2, x3, x4, x5, x6, x7)
37+
/* Build a vector from four 32-bit values; x0 is the most significant lane, x3 the least. */
#define zend_vec_set_8x16_from_32x4(x0, x1, x2, x3) _mm_set_epi32(x0, x1, x2, x3)
38+
/* Build a vector from two 64-bit integers: x0 becomes the upper 64 bits, x1 the lower 64 bits.
 * _mm_set_epi64x() is used because it accepts plain 64-bit integers (matching the int64_t
 * parameters of the NEON implementation); _mm_set_epi64() would require __m64 operands. */
#define zend_vec_set_8x16_from_64x2(x0, x1) _mm_set_epi64x(x0, x1)
39+
/* Load 16 bytes from x; forwards to the aligned-load intrinsic, so x must be 16-byte aligned. */
#define zend_vec_load_8x16(x) _mm_load_si128((const __m128i *) (x))
40+
/* Load 16 bytes from x using the unaligned-load intrinsic (no alignment requirement). */
#define zend_vec_loadu_8x16(x) _mm_loadu_si128((const __m128i *) (x))
41+
/* Store the vector x to `to`; forwards to the aligned-store intrinsic, so `to` must be 16-byte aligned. */
#define zend_vec_store_8x16(to, x) _mm_store_si128((__m128i *) (to), x)
42+
/* Store the vector x to `to` using the unaligned-store intrinsic. */
#define zend_vec_storeu_8x16(to, x) _mm_storeu_si128((__m128i *) (to), x)
43+
44+
/* Bitwise OR / XOR / AND over the full 128 bits. */
#define zend_vec_or_8x16(a, b) _mm_or_si128(a, b)
45+
#define zend_vec_xor_8x16(a, b) _mm_xor_si128(a, b)
46+
#define zend_vec_and_8x16(a, b) _mm_and_si128(a, b)
47+
/* Shift the whole 128-bit value right by `bytes` bytes, shifting in zeros.
 * The underlying intrinsic takes the count as an immediate, so `bytes` must be a compile-time constant. */
#define zend_vec_rshift_128_from_8x16(x, bytes) _mm_srli_si128(x, bytes)
48+
/* Shift the whole 128-bit value left by `bytes` bytes, shifting in zeros (same constant requirement). */
#define zend_vec_lshift_128_from_8x16(x, bytes) _mm_slli_si128(x, bytes)
49+
50+
/* Lane-wise 8-bit addition. */
#define zend_vec_add_8x16(a, b) _mm_add_epi8(a, b)
51+
52+
/* Lane-wise 8-bit comparisons; cmplt/cmpgt use the signed-byte intrinsics.
 * Each result lane is all-ones when the comparison holds and zero otherwise. */
#define zend_vec_cmpeq_8x16(a, b) _mm_cmpeq_epi8(a, b)
53+
#define zend_vec_cmplt_8x16(a, b) _mm_cmplt_epi8(a, b)
54+
#define zend_vec_cmpgt_8x16(a, b) _mm_cmpgt_epi8(a, b)
55+
56+
/* Collect the most significant bit of each of the 16 bytes into an integer mask (bit i <- byte i). */
#define zend_vec_movemask_8x16(x) _mm_movemask_epi8(x)
57+
58+
59+
#elif defined(__aarch64__) || defined(_M_ARM64)
60+
#include <arm_neon.h>
61+
62+
/* NEON has a distinct signed vector type per lane width, unlike the SSE2 branch
 * where all four names alias one type; values must be reinterpreted to move between them. */
typedef int8x16_t zend_vec_8x16_t;
63+
typedef int16x8_t zend_vec_16x8_t;
64+
typedef int32x4_t zend_vec_32x4_t;
65+
typedef int64x2_t zend_vec_64x2_t;
66+
67+
/* All-zero vector (zero byte duplicated into every lane). */
#define zend_vec_setzero_8x16() vdupq_n_s8(0)
68+
/* Broadcast the byte x to all 16 lanes. */
#define zend_vec_set_8x16(x) vdupq_n_s8(x)
69+
/*
 * Build a byte vector from eight 16-bit values.
 *
 * x0 is placed in the highest 16-bit lane (lane 7) and x7 in the lowest
 * (lane 0), i.e. the arguments are stored in reverse lane order.
 *
 * The lanes are loaded from an array with vld1q_s16() rather than written as a
 * vector literal "(int16x8_t) { ... }": the literal form is a GNU compiler
 * extension, while this header is also enabled for _M_ARM64 toolchains.
 */
static inline int8x16_t zend_vec_set_8x16_from_16x8(
	int16_t x0, int16_t x1, int16_t x2, int16_t x3,
	int16_t x4, int16_t x5, int16_t x6, int16_t x7
) {
	/* Array index == lane index after vld1q_s16(). */
	const int16_t lanes[8] = { x7, x6, x5, x4, x3, x2, x1, x0 };

	return vreinterpretq_s8_s16(vld1q_s16(lanes));
}
75+
/*
 * Build a byte vector from four 32-bit values.
 *
 * x0 is placed in the highest 32-bit lane (lane 3) and x3 in the lowest
 * (lane 0), i.e. the arguments are stored in reverse lane order.
 *
 * Uses vld1q_s32() on a lane array instead of the GNU-only vector literal
 * "(int32x4_t) { ... }", since this header is also enabled for _M_ARM64.
 */
static inline int8x16_t zend_vec_set_8x16_from_32x4(int32_t x0, int32_t x1, int32_t x2, int32_t x3)
{
	/* Array index == lane index after vld1q_s32(). */
	const int32_t lanes[4] = { x3, x2, x1, x0 };

	return vreinterpretq_s8_s32(vld1q_s32(lanes));
}
79+
/*
 * Build a byte vector from two 64-bit values.
 *
 * x0 is placed in the upper 64-bit lane (lane 1) and x1 in the lower one
 * (lane 0).
 *
 * Uses vld1q_s64() on a lane array instead of the GNU-only vector literal
 * "(int64x2_t) { ... }", consistent with the other zend_vec_set_8x16_from_*
 * helpers and usable on _M_ARM64 toolchains as well.
 */
static inline int8x16_t zend_vec_set_8x16_from_64x2(int64_t x0, int64_t x1)
{
	/* Array index == lane index after vld1q_s64(). */
	const int64_t lanes[2] = { x1, x0 };

	return vreinterpretq_s8_s64(vld1q_s64(lanes));
}
83+
/* Load 16 bytes from x via vld1q_s8. */
#define zend_vec_load_8x16(x) vld1q_s8((const int8_t *) (x))
84+
/* The unaligned variant forwards to the same load: this branch makes no aligned/unaligned distinction. */
#define zend_vec_loadu_8x16(x) zend_vec_load_8x16(x)
85+
/* Store the vector x to `to` via vst1q_s8. */
#define zend_vec_store_8x16(to, x) vst1q_s8((int8_t *) (to), x)
86+
/* The unaligned variant forwards to the same store. */
#define zend_vec_storeu_8x16(to, x) zend_vec_store_8x16(to, x)
87+
88+
/* Bitwise OR / XOR / AND on int8x16_t operands. */
#define zend_vec_or_8x16(a, b) vorrq_s8(a, b)
89+
#define zend_vec_xor_8x16(a, b) veorq_s8(a, b)
90+
#define zend_vec_and_8x16(a, b) vandq_s8(a, b)
91+
/*
 * Shift the whole 128-bit value right by `bytes` bytes, filling the vacated
 * high bytes with zero. This is the NEON counterpart of the SSE2 branch, where
 * the same name maps to _mm_srli_si128().
 *
 * vextq_u8(a, b, n) returns bytes n..15 of `a` followed by bytes 0..n-1 of
 * `b`, so `x` must be the first operand and the zero vector the second.
 *
 * This is a macro (like the SSE2 version) because vextq_u8() requires its
 * index to be an integer constant expression; `bytes` must be a compile-time
 * constant in the range 0..15.
 */
#define zend_vec_rshift_128_from_8x16(x, bytes) \
	vreinterpretq_s8_u8(vextq_u8(vreinterpretq_u8_s8(x), vdupq_n_u8(0), (bytes)))
95+
/*
 * Shift the whole 128-bit value left by `bytes` bytes, filling the vacated
 * low bytes with zero. This is the NEON counterpart of the SSE2 branch, where
 * the same name maps to _mm_slli_si128().
 *
 * vextq_u8(a, b, n) returns bytes n..15 of `a` followed by bytes 0..n-1 of
 * `b`; with a zero vector first and index 16 - bytes, the result starts with
 * `bytes` zero bytes followed by the low bytes of `x`.
 *
 * This is a macro (like the SSE2 version) because vextq_u8() requires its
 * index to be an integer constant expression; `bytes` must be a compile-time
 * constant in the range 1..16 so that the index stays within 0..15.
 */
#define zend_vec_lshift_128_from_8x16(x, bytes) \
	vreinterpretq_s8_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_s8(x), 16 - (bytes)))
99+
100+
/* Lane-wise 8-bit addition. */
#define zend_vec_add_8x16(a, b) vaddq_s8(a, b)
101+
102+
/* Lane-wise signed 8-bit comparisons. The NEON compare intrinsics return an
 * unsigned vector, which is reinterpreted so the result is again an int8x16_t. */
#define zend_vec_cmpeq_8x16(a, b) (vreinterpretq_s8_u8(vceqq_s8(a, b)))
103+
#define zend_vec_cmplt_8x16(a, b) (vreinterpretq_s8_u8(vcltq_s8(a, b)))
104+
#define zend_vec_cmpgt_8x16(a, b) (vreinterpretq_s8_u8(vcgtq_s8(a, b)))
105+
106+
/*
 * Gather the top bit of each of the 16 bytes of x into an integer mask,
 * taking byte lane 0 of the folded result as bits 0..7 and byte lane 8 as
 * bits 8..15.
 *
 * Each byte is first reduced to its top bit (0 or 1). Three
 * shift-right-and-accumulate steps on progressively wider lanes (16, 32,
 * 64 bit) then fold neighbouring results together, so that the lowest byte of
 * each 64-bit half ends up holding the eight bits of that half.
 */
static inline int zend_vec_movemask_8x16(int8x16_t x)
{
	/* 0 or 1 per byte: the byte's most significant bit. */
	uint8x16_t msb = vshrq_n_u8(vreinterpretq_u8_s8(x), 7);

	/* 16-bit lanes: add (lane >> 7) to the lane itself. */
	uint16x8_t lanes16 = vreinterpretq_u16_u8(msb);
	uint16x8_t folded16 = vsraq_n_u16(lanes16, lanes16, 7);

	/* 32-bit lanes: add (lane >> 14) to the lane itself. */
	uint32x4_t lanes32 = vreinterpretq_u32_u16(folded16);
	uint32x4_t folded32 = vsraq_n_u32(lanes32, lanes32, 14);

	/* 64-bit lanes: add (lane >> 28) to the lane itself. */
	uint64x2_t lanes64 = vreinterpretq_u64_u32(folded32);
	uint64x2_t folded64 = vsraq_n_u64(lanes64, lanes64, 28);

	uint8x16_t packed = vreinterpretq_u8_u64(folded64);

	/* Byte lane 0 supplies mask bits 0..7, byte lane 8 supplies bits 8..15. */
	int low_bits = vgetq_lane_u8(packed, 0);
	int high_bits = (int) vgetq_lane_u8(packed, 8) << 8;

	return low_bits | high_bits;
}
114+
115+
#endif
116+
117+
#endif /* defined(__SSE2__) || defined(__aarch64__) || defined(_M_ARM64) */
118+
119+
#endif /* ZEND_SIMD_H */

0 commit comments

Comments
 (0)