16
16
#include "pyconfig.h"
17
17
#include "Python.h"
18
18
#include "hashlib.h"
19
- #include "pycore_strhex.h" // _Py_strhex()
19
+ #include "pycore_cpuinfo.h" // py_cpuid_features
20
+ #include "pycore_strhex.h" // _Py_strhex()
20
21
#include "pycore_typeobject.h"
21
22
#include "pycore_moduleobject.h"
22
23
23
- // QUICK CPU AUTODETECTION
24
- //
25
- // See https://github.com/python/cpython/pull/119316 -- we only enable
26
- // vectorized versions for Intel CPUs, even though HACL*'s "vec128" modules also
27
- // run on ARM NEON. (We could enable them on POWER -- but I don't have access to
28
- // a test machine to see if that speeds anything up.)
29
- //
30
- // Note that configure.ac and the rest of the build are written in such a way
31
- // that if the configure script finds suitable flags to compile HACL's SIMD128
32
- // (resp. SIMD256) files, then Hacl_Hash_Blake2b_Simd128.c (resp. ...) will be
33
- // pulled into the build automatically, and then only the CPU autodetection will
34
- // need to be updated here.
35
-
36
- #if defined(__x86_64__ ) && defined(__GNUC__ )
37
- #include <cpuid.h>
38
- #elif defined(_M_X64 )
39
- #include <intrin.h>
40
- #endif
41
-
42
24
#include <stdbool.h>
43
25
44
26
// SIMD256 can't be compiled on macOS ARM64, and performance of SIMD128 isn't
51
33
# undef HACL_CAN_COMPILE_SIMD256
52
34
#endif
53
35
54
- // ECX
55
- #define ECX_SSE3 (1 << 0)
56
- #define ECX_SSSE3 (1 << 9)
57
- #define ECX_SSE4_1 (1 << 19)
58
- #define ECX_SSE4_2 (1 << 20)
59
- #define ECX_AVX (1 << 28)
60
-
61
- // EBX
62
- #define EBX_AVX2 (1 << 5)
63
-
64
- // EDX
65
- #define EDX_SSE (1 << 25)
66
- #define EDX_SSE2 (1 << 26)
67
- #define EDX_CMOV (1 << 15)
68
-
69
- // zero-initialized by default
70
- typedef struct {
71
- bool sse , sse2 , sse3 , sse41 , sse42 , cmov , avx , avx2 ;
72
- bool done ;
73
- } cpu_flags ;
74
-
75
- void detect_cpu_features (cpu_flags * flags ) {
76
- if (!flags -> done ) {
77
- int eax1 = 0 , ebx1 = 0 , ecx1 = 0 , edx1 = 0 ;
78
- int eax7 = 0 , ebx7 = 0 , ecx7 = 0 , edx7 = 0 ;
79
- #if defined(__x86_64__ ) && defined(__GNUC__ )
80
- __cpuid_count (1 , 0 , eax1 , ebx1 , ecx1 , edx1 );
81
- __cpuid_count (7 , 0 , eax7 , ebx7 , ecx7 , edx7 );
82
- #elif defined(_M_X64 )
83
- int info1 [4 ] = { 0 };
84
- int info7 [4 ] = { 0 };
85
- __cpuidex (info1 , 1 , 0 );
86
- __cpuidex (info7 , 7 , 0 );
87
- eax1 = info1 [0 ];
88
- ebx1 = info1 [1 ];
89
- ecx1 = info1 [2 ];
90
- edx1 = info1 [3 ];
91
- eax7 = info7 [0 ];
92
- ebx7 = info7 [1 ];
93
- ecx7 = info7 [2 ];
94
- edx7 = info7 [3 ];
95
- #endif
96
- (void ) eax1 ; (void ) ebx1 ; (void ) ecx1 ; (void ) edx1 ;
97
- (void ) eax7 ; (void ) ebx7 ; (void ) ecx7 ; (void ) edx7 ;
98
-
99
-
100
- flags -> avx = (ecx1 & ECX_AVX ) != 0 ;
101
-
102
- flags -> avx2 = (ebx7 & EBX_AVX2 ) != 0 ;
103
-
104
- flags -> sse = (edx1 & EDX_SSE ) != 0 ;
105
- flags -> sse2 = (edx1 & EDX_SSE2 ) != 0 ;
106
- flags -> cmov = (edx1 & EDX_CMOV ) != 0 ;
107
-
108
- flags -> sse3 = (ecx1 & ECX_SSE3 ) != 0 ;
109
- /* ssse3 = (ecx1 & ECX_SSSE3) != 0; */
110
- flags -> sse41 = (ecx1 & ECX_SSE4_1 ) != 0 ;
111
- flags -> sse42 = (ecx1 & ECX_SSE4_2 ) != 0 ;
112
-
113
- flags -> done = true;
114
- }
115
- }
116
-
117
- #ifdef HACL_CAN_COMPILE_SIMD128
118
- static inline bool has_simd128 (cpu_flags * flags ) {
119
- // For now this is Intel-only, could conceivably be #ifdef'd to something
120
- // else.
121
- return flags -> sse && flags -> sse2 && flags -> sse3 && flags -> sse41 && flags -> sse42 && flags -> cmov ;
122
- }
123
- #endif
124
-
125
- #ifdef HACL_CAN_COMPILE_SIMD256
126
- static inline bool has_simd256 (cpu_flags * flags ) {
127
- return flags -> avx && flags -> avx2 ;
128
- }
129
- #endif
130
-
131
36
// Small mismatch between the variable names Python defines as part of configure
132
37
// and the ones HACL* expects to be set in order to enable those headers.
133
38
#define HACL_CAN_COMPILE_VEC128 HACL_CAN_COMPILE_SIMD128
@@ -154,9 +59,31 @@ PyDoc_STRVAR(blake2mod__doc__,
154
59
// Per-module state for the BLAKE2 extension module; an instance is obtained
// from a module object through blake2_get_state().
typedef struct {
155
60
// Type object stored by blake2_exec() from
// PyType_FromModuleAndSpec(m, &blake2b_type_spec, NULL).
PyTypeObject * blake2b_type ;
156
61
// NOTE(review): presumably the blake2s counterpart of blake2b_type; its
// assignment is not visible in this excerpt -- confirm.
PyTypeObject * blake2s_type ;
157
- cpu_flags flags ;
62
+
63
// Run-time SIMD capability flags, written by blake2_init_cpu_features().
// type_to_impl() reads can_run_simd128 to choose Blake2s_128 and
// can_run_simd256 to choose Blake2b_256.
+ bool can_run_simd128 ;
64
+ bool can_run_simd256 ;
158
65
} Blake2State ;
159
66
67
+ static void
68
+ blake2_init_cpu_features (Blake2State * state )
69
+ {
70
+ py_cpuid_features flags ;
71
+ _Py_cpuid_detect_features (& flags );
72
+ #if HACL_CAN_COMPILE_SIMD128
73
+ state -> can_run_simd128 = flags .sse && flags .sse2 && flags .sse3
74
+ && flags .sse41 && flags .sse42
75
+ && flags .cmov ;
76
+ #else
77
+ state -> can_run_simd128 = false;
78
+ #endif
79
+
80
+ #if HACL_CAN_COMPILE_SIMD256
81
+ state -> can_run_simd256 = flags .avx && flags .avx2 ;
82
+ #else
83
+ state -> can_run_simd256 = false;
84
+ #endif
85
+ }
86
+
160
87
static inline Blake2State *
161
88
blake2_get_state (PyObject * module )
162
89
{
@@ -224,10 +151,7 @@ static int
224
151
blake2_exec (PyObject * m )
225
152
{
226
153
Blake2State * st = blake2_get_state (m );
227
-
228
- // This is called at module initialization-time, and so appears to be as
229
- // good a place as any to probe the CPU flags.
230
- detect_cpu_features (& st -> flags );
154
+ blake2_init_cpu_features (st );
231
155
232
156
st -> blake2b_type = (PyTypeObject * )PyType_FromModuleAndSpec (
233
157
m , & blake2b_type_spec , NULL );
@@ -332,14 +256,14 @@ static inline blake2_impl type_to_impl(PyTypeObject *type) {
332
256
#endif
333
257
if (!strcmp (type -> tp_name , blake2b_type_spec .name )) {
334
258
#ifdef HACL_CAN_COMPILE_SIMD256
335
- if (has_simd256 ( & st -> flags ) )
259
+ if (st -> can_run_simd256 )
336
260
return Blake2b_256 ;
337
261
else
338
262
#endif
339
263
return Blake2b ;
340
264
} else if (!strcmp (type -> tp_name , blake2s_type_spec .name )) {
341
265
#ifdef HACL_CAN_COMPILE_SIMD128
342
- if (has_simd128 ( & st -> flags ) )
266
+ if (st -> can_run_simd128 )
343
267
return Blake2s_128 ;
344
268
else
345
269
#endif
0 commit comments