Skip to content

Commit 19b7d86

Browse files
committed
TMP: usage proof-of-concept
1 parent 4109d90 commit 19b7d86

File tree

2 files changed

+35
-164
lines changed

2 files changed

+35
-164
lines changed

Modules/blake2module.c

+28-104
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,11 @@
1616
#include "pyconfig.h"
1717
#include "Python.h"
1818
#include "hashlib.h"
19-
#include "pycore_strhex.h" // _Py_strhex()
19+
#include "pycore_cpuinfo.h" // py_cpuid_features
20+
#include "pycore_strhex.h" // _Py_strhex()
2021
#include "pycore_typeobject.h"
2122
#include "pycore_moduleobject.h"
2223

23-
// QUICK CPU AUTODETECTION
24-
//
25-
// See https://github.com/python/cpython/pull/119316 -- we only enable
26-
// vectorized versions for Intel CPUs, even though HACL*'s "vec128" modules also
27-
// run on ARM NEON. (We could enable them on POWER -- but I don't have access to
28-
// a test machine to see if that speeds anything up.)
29-
//
30-
// Note that configure.ac and the rest of the build are written in such a way
31-
// that if the configure script finds suitable flags to compile HACL's SIMD128
32-
// (resp. SIMD256) files, then Hacl_Hash_Blake2b_Simd128.c (resp. ...) will be
33-
// pulled into the build automatically, and then only the CPU autodetection will
34-
// need to be updated here.
35-
36-
#if defined(__x86_64__) && defined(__GNUC__)
37-
#include <cpuid.h>
38-
#elif defined(_M_X64)
39-
#include <intrin.h>
40-
#endif
41-
4224
#include <stdbool.h>
4325

4426
// SIMD256 can't be compiled on macOS ARM64, and performance of SIMD128 isn't
@@ -51,83 +33,6 @@
5133
# undef HACL_CAN_COMPILE_SIMD256
5234
#endif
5335

54-
// ECX
55-
#define ECX_SSE3 (1 << 0)
56-
#define ECX_SSSE3 (1 << 9)
57-
#define ECX_SSE4_1 (1 << 19)
58-
#define ECX_SSE4_2 (1 << 20)
59-
#define ECX_AVX (1 << 28)
60-
61-
// EBX
62-
#define EBX_AVX2 (1 << 5)
63-
64-
// EDX
65-
#define EDX_SSE (1 << 25)
66-
#define EDX_SSE2 (1 << 26)
67-
#define EDX_CMOV (1 << 15)
68-
69-
// zero-initialized by default
70-
typedef struct {
71-
bool sse, sse2, sse3, sse41, sse42, cmov, avx, avx2;
72-
bool done;
73-
} cpu_flags;
74-
75-
void detect_cpu_features(cpu_flags *flags) {
76-
if (!flags->done) {
77-
int eax1 = 0, ebx1 = 0, ecx1 = 0, edx1 = 0;
78-
int eax7 = 0, ebx7 = 0, ecx7 = 0, edx7 = 0;
79-
#if defined(__x86_64__) && defined(__GNUC__)
80-
__cpuid_count(1, 0, eax1, ebx1, ecx1, edx1);
81-
__cpuid_count(7, 0, eax7, ebx7, ecx7, edx7);
82-
#elif defined(_M_X64)
83-
int info1[4] = { 0 };
84-
int info7[4] = { 0 };
85-
__cpuidex(info1, 1, 0);
86-
__cpuidex(info7, 7, 0);
87-
eax1 = info1[0];
88-
ebx1 = info1[1];
89-
ecx1 = info1[2];
90-
edx1 = info1[3];
91-
eax7 = info7[0];
92-
ebx7 = info7[1];
93-
ecx7 = info7[2];
94-
edx7 = info7[3];
95-
#endif
96-
(void) eax1; (void) ebx1; (void) ecx1; (void) edx1;
97-
(void) eax7; (void) ebx7; (void) ecx7; (void) edx7;
98-
99-
100-
flags->avx = (ecx1 & ECX_AVX) != 0;
101-
102-
flags->avx2 = (ebx7 & EBX_AVX2) != 0;
103-
104-
flags->sse = (edx1 & EDX_SSE) != 0;
105-
flags->sse2 = (edx1 & EDX_SSE2) != 0;
106-
flags->cmov = (edx1 & EDX_CMOV) != 0;
107-
108-
flags->sse3 = (ecx1 & ECX_SSE3) != 0;
109-
/* ssse3 = (ecx1 & ECX_SSSE3) != 0; */
110-
flags->sse41 = (ecx1 & ECX_SSE4_1) != 0;
111-
flags->sse42 = (ecx1 & ECX_SSE4_2) != 0;
112-
113-
flags->done = true;
114-
}
115-
}
116-
117-
#ifdef HACL_CAN_COMPILE_SIMD128
118-
static inline bool has_simd128(cpu_flags *flags) {
119-
// For now this is Intel-only, could conceivably be #ifdef'd to something
120-
// else.
121-
return flags->sse && flags->sse2 && flags->sse3 && flags->sse41 && flags->sse42 && flags->cmov;
122-
}
123-
#endif
124-
125-
#ifdef HACL_CAN_COMPILE_SIMD256
126-
static inline bool has_simd256(cpu_flags *flags) {
127-
return flags->avx && flags->avx2;
128-
}
129-
#endif
130-
13136
// Small mismatch between the variable names Python defines as part of configure
13237
// and the ones HACL* expects to be set in order to enable those headers.
13338
#define HACL_CAN_COMPILE_VEC128 HACL_CAN_COMPILE_SIMD128
@@ -154,9 +59,31 @@ PyDoc_STRVAR(blake2mod__doc__,
15459
typedef struct {
15560
PyTypeObject* blake2b_type;
15661
PyTypeObject* blake2s_type;
157-
cpu_flags flags;
62+
63+
bool can_run_simd128;
64+
bool can_run_simd256;
15865
} Blake2State;
15966

67+
static void
68+
blake2_init_cpu_features(Blake2State *state)
69+
{
70+
py_cpuid_features flags;
71+
_Py_cpuid_detect_features(&flags);
72+
#if HACL_CAN_COMPILE_SIMD128
73+
state->can_run_simd128 = flags.sse && flags.sse2 && flags.sse3
74+
&& flags.sse41 && flags.sse42
75+
&& flags.cmov;
76+
#else
77+
state->can_run_simd128 = false;
78+
#endif
79+
80+
#if HACL_CAN_COMPILE_SIMD256
81+
state->can_run_simd256 = flags.avx && flags.avx2;
82+
#else
83+
state->can_run_simd256 = false;
84+
#endif
85+
}
86+
16087
static inline Blake2State*
16188
blake2_get_state(PyObject *module)
16289
{
@@ -224,10 +151,7 @@ static int
224151
blake2_exec(PyObject *m)
225152
{
226153
Blake2State* st = blake2_get_state(m);
227-
228-
// This is called at module initialization-time, and so appears to be as
229-
// good a place as any to probe the CPU flags.
230-
detect_cpu_features(&st->flags);
154+
blake2_init_cpu_features(st);
231155

232156
st->blake2b_type = (PyTypeObject *)PyType_FromModuleAndSpec(
233157
m, &blake2b_type_spec, NULL);
@@ -332,14 +256,14 @@ static inline blake2_impl type_to_impl(PyTypeObject *type) {
332256
#endif
333257
if (!strcmp(type->tp_name, blake2b_type_spec.name)) {
334258
#ifdef HACL_CAN_COMPILE_SIMD256
335-
if (has_simd256(&st->flags))
259+
if (st->can_run_simd256)
336260
return Blake2b_256;
337261
else
338262
#endif
339263
return Blake2b;
340264
} else if (!strcmp(type->tp_name, blake2s_type_spec.name)) {
341265
#ifdef HACL_CAN_COMPILE_SIMD128
342-
if (has_simd128(&st->flags))
266+
if (st->can_run_simd128)
343267
return Blake2s_128;
344268
else
345269
#endif

Modules/hmacmodule.c

+7-60
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#endif
1818

1919
#include "Python.h"
20+
#include "pycore_cpuinfo.h" // py_cpuid_features
2021
#include "pycore_hashtable.h"
2122
#include "pycore_strhex.h" // _Py_strhex()
2223

@@ -1682,73 +1683,19 @@ hmacmodule_init_strings(hmacmodule_state *state)
16821683
static void
16831684
hmacmodule_init_cpu_features(hmacmodule_state *state)
16841685
{
1685-
int eax1 = 0, ebx1 = 0, ecx1 = 0, edx1 = 0;
1686-
int eax7 = 0, ebx7 = 0, ecx7 = 0, edx7 = 0;
1687-
#if defined(__x86_64__) && defined(__GNUC__)
1688-
__cpuid_count(1, 0, eax1, ebx1, ecx1, edx1);
1689-
__cpuid_count(7, 0, eax7, ebx7, ecx7, edx7);
1690-
#elif defined(_M_X64)
1691-
int info1[4] = { 0 };
1692-
__cpuidex(info1, 1, 0);
1693-
eax1 = info1[0], ebx1 = info1[1], ecx1 = info1[2], edx1 = info1[3];
1694-
1695-
int info7[4] = { 0 };
1696-
__cpuidex(info7, 7, 0);
1697-
eax7 = info7[0], ebx7 = info7[1], ecx7 = info7[2], edx7 = info7[3];
1698-
#endif
1699-
// fmt: off
1700-
(void)eax1; (void)ebx1; (void)ecx1; (void)edx1;
1701-
(void)eax7; (void)ebx7; (void)ecx7; (void)edx7;
1702-
// fmt: on
1703-
1704-
#define EBX_AVX2 (1 << 5)
1705-
#define ECX_SSE3 (1 << 0)
1706-
#define ECX_SSSE3 (1 << 9)
1707-
#define ECX_SSE4_1 (1 << 19)
1708-
#define ECX_SSE4_2 (1 << 20)
1709-
#define ECX_AVX (1 << 28)
1710-
#define EDX_SSE (1 << 25)
1711-
#define EDX_SSE2 (1 << 26)
1712-
#define EDX_CMOV (1 << 15)
1713-
1714-
bool avx = (ecx1 & ECX_AVX) != 0;
1715-
bool avx2 = (ebx7 & EBX_AVX2) != 0;
1716-
1717-
bool sse = (edx1 & EDX_SSE) != 0;
1718-
bool sse2 = (edx1 & EDX_SSE2) != 0;
1719-
bool cmov = (edx1 & EDX_CMOV) != 0;
1720-
1721-
bool sse3 = (ecx1 & ECX_SSE3) != 0;
1722-
bool sse41 = (ecx1 & ECX_SSE4_1) != 0;
1723-
bool sse42 = (ecx1 & ECX_SSE4_2) != 0;
1724-
1725-
#undef EDX_CMOV
1726-
#undef EDX_SSE2
1727-
#undef EDX_SSE
1728-
#undef ECX_AVX
1729-
#undef ECX_SSE4_2
1730-
#undef ECX_SSE4_1
1731-
#undef ECX_SSSE3
1732-
#undef ECX_SSE3
1733-
#undef EBX_AVX2
1734-
1686+
py_cpuid_features flags;
1687+
_Py_cpuid_detect_features(&flags);
17351688
#if HACL_CAN_COMPILE_SIMD128
1736-
// TODO(picnixz): use py_cpuid_features (gh-125022) to improve detection
1737-
state->can_run_simd128 = sse && sse2 && sse3 && sse41 && sse42 && cmov;
1689+
state->can_run_simd128 = flags.sse && flags.sse2 && flags.sse3
1690+
&& flags.sse41 && flags.sse42
1691+
&& flags.cmov;
17381692
#else
1739-
// fmt: off
1740-
(void)sse; (void)sse2; (void)sse3; (void)sse41; (void)sse42; (void)cmov;
1741-
// fmt: on
17421693
state->can_run_simd128 = false;
17431694
#endif
17441695

17451696
#if HACL_CAN_COMPILE_SIMD256
1746-
// TODO(picnixz): use py_cpuid_features (gh-125022) to improve detection
1747-
state->can_run_simd256 = state->can_run_simd128 && avx && avx2;
1697+
state->can_run_simd256 = flags.avx && flags.avx2;
17481698
#else
1749-
// fmt: off
1750-
(void)avx; (void)avx2;
1751-
// fmt: on
17521699
state->can_run_simd256 = false;
17531700
#endif
17541701
}

0 commit comments

Comments
 (0)