Skip to content

Commit f9fb37c

Browse files
colesbury authored and facebook-github-bot committed
Guard Denormals-Are-Zero with runtime CPU check (#12386)
Summary: Previously, we were only enabling Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) when compiling with SSE3 enabled. After Christian's patch (#12109), we won't be compiling core files with SSE3 or SSE4 enabled, to better support older AMD processors. This moves the FTZ and DAZ code behind a runtime CPU check in preparation for that change. Pull Request resolved: #12386 Differential Revision: D10222237 Pulled By: colesbury fbshipit-source-id: 7ffe32561ab965e1e5f9eb6e679602bbf4775538
1 parent bd09ab6 commit f9fb37c

File tree

5 files changed

+51
-18
lines changed

5 files changed

+51
-18
lines changed

aten/src/ATen/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
2020
CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h")
2121

2222
# NB: If you edit these globs, you'll have to update setup.py package_data as well
23-
FILE(GLOB base_h "*.h" "detail/*.h")
24-
FILE(GLOB base_cpp "*.cpp" "detail/*.cpp")
23+
FILE(GLOB base_h "*.h" "detail/*.h" "cpu/*.h")
24+
FILE(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp")
2525
add_subdirectory(core)
2626
FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh")
2727
FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp")

aten/src/ATen/Context.cpp

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,10 @@
1313
#include "ATen/CPUGenerator.h"
1414
#include "ATen/RegisterCPU.h"
1515
#include "ATen/Tensor.h"
16+
#include <ATen/cpu/FlushDenormal.h>
1617

1718
#include "TH/TH.h" // for USE_LAPACK
1819

19-
#ifdef USE_SSE3
20-
#include <pmmintrin.h>
21-
#endif
22-
2320
namespace at {
2421

2522
static inline void errorHandler(const char * msg, void * data) {
@@ -94,18 +91,7 @@ bool Context::hasLAPACK() const {
9491
}
9592

9693
bool Context::setFlushDenormal(bool on) {
97-
#ifdef USE_SSE3
98-
// Setting flush-to-zero (FTZ) flag
99-
_MM_SET_FLUSH_ZERO_MODE(on ? _MM_FLUSH_ZERO_ON
100-
: _MM_FLUSH_ZERO_OFF);
101-
102-
// Setting denormals-are-zero (DAZ) flag
103-
_MM_SET_DENORMALS_ZERO_MODE(on ? _MM_DENORMALS_ZERO_ON
104-
: _MM_DENORMALS_ZERO_OFF);
105-
return true;
106-
#else
107-
return false;
108-
#endif
94+
return at::cpu::set_flush_denormal(on);
10995
}
11096

11197
TypeExtendedInterface& getType(TensorOptions options) {

aten/src/ATen/cpu/FlushDenormal.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#include <ATen/cpu/FlushDenormal.h>
2+
3+
#include <ATen/cpu/vec256/intrinsics.h>
4+
#include <cpuinfo.h>
5+
6+
namespace at { namespace cpu {
7+
8+
static constexpr unsigned int DENORMALS_ZERO = 0x0040;
9+
static constexpr unsigned int FLUSH_ZERO = 0x8000;
10+
11+
// Enables (on == true) or disables (on == false) both Flush-To-Zero and
// Denormals-Are-Zero by rewriting the FLUSH_ZERO and DENORMALS_ZERO bits of
// the SSE control/status register.
//
// Returns true if the register was updated. Returns false, leaving the
// register untouched, when the file is built without SSE support or when
// cpuinfo reports at runtime that the CPU has no DAZ support.
bool set_flush_denormal(bool on) {
12+
// Compile if we have SSE support (GCC), x86-64 (MSVC), or x86 with SSE (MSVC)
13+
#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
14+
// Denormals-Are-Zero is supported by most SSE2 processors, with the exception
15+
// of some early Pentium 4 processors. We guard it with a runtime check.
16+
// Flush-To-Zero (FTZ) only requires SSE.
17+
if (cpuinfo_has_x86_daz()) {  // NOTE: FTZ is gated on DAZ support too, so both flags change together or not at all
18+
unsigned int csr = _mm_getcsr();  // read the current control/status register value
19+
csr &= ~DENORMALS_ZERO;  // clear both bits first so the result depends only on `on`
20+
csr &= ~FLUSH_ZERO;
21+
if (on) {
22+
csr |= DENORMALS_ZERO;
23+
csr |= FLUSH_ZERO;
24+
}
25+
_mm_setcsr(csr);  // write back; all other bits of the register are preserved
26+
return true;
27+
}
28+
#endif
29+
return false;  // no SSE at compile time, or no DAZ at runtime: nothing was changed
30+
}
31+
32+
}} // namespace at::cpu

aten/src/ATen/cpu/FlushDenormal.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
/// Flush-To-Zero and Denormals-Are-Zero mode
2+
///
3+
/// Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) are modes that bypass
4+
/// IEEE 754 methods of dealing with denormal floating-point numbers on x86-64
5+
/// and some x86 CPUs. They result in reduced precision for values near zero,
6+
/// but increased performance.
7+
///
8+
/// See https://software.intel.com/en-us/articles/x87-and-sse-floating-point-assists-in-ia-32-flush-to-zero-ftz-and-denormals-are-zero-daz
9+
10+
namespace at { namespace cpu {
11+
12+
/// Turns Flush-To-Zero and Denormals-Are-Zero on (`on == true`) or off
/// (`on == false`) together.
///
/// Returns true if the mode was applied. Returns false, without changing any
/// floating-point state, when the library was built without SSE support or
/// the CPU does not report DAZ support at runtime.
bool set_flush_denormal(bool on);
13+
14+
}} // namespace at::cpu

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,6 +1195,7 @@ def make_relative_rpath(path):
11951195
'lib/torch_shm_manager',
11961196
'lib/*.h',
11971197
'lib/include/ATen/*.h',
1198+
'lib/include/ATen/cpu/*.h',
11981199
'lib/include/ATen/core/*.h',
11991200
'lib/include/ATen/cuda/*.cuh',
12001201
'lib/include/ATen/cuda/*.h',

0 commit comments

Comments
 (0)