Guard Denormals-Are-Zero with runtime CPU check (#12386)

colesbury · facebook-github-bot · commit f9fb37ca792f · 2018-10-05T14:54:54.000-07:00
Summary: Previously, we were only enabling Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) when compiling with SSE3 enabled. After Christian's patch (#12109), we won't be compiling core files with SSE3 or SSE4 enabled, to better support older AMD processors. This moves the FTZ and DAZ code behind a runtime CPU check in preparation for that change. Pull Request resolved: #12386 Differential Revision: D10222237 Pulled By: colesbury fbshipit-source-id: 7ffe32561ab965e1e5f9eb6e679602bbf4775538
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
@@ -20,8 +20,8 @@ CONFIGURE_FILE(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")
 CONFIGURE_FILE(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h")
 
 # NB: If you edit these globs, you'll have to update setup.py package_data as well
-FILE(GLOB base_h "*.h" "detail/*.h")
-FILE(GLOB base_cpp "*.cpp" "detail/*.cpp")
+FILE(GLOB base_h "*.h" "detail/*.h" "cpu/*.h")
+FILE(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp")
 add_subdirectory(core)
 FILE(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh")
 FILE(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp")
diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
@@ -13,13 +13,10 @@
 #include "ATen/CPUGenerator.h"
 #include "ATen/RegisterCPU.h"
 #include "ATen/Tensor.h"
+#include <ATen/cpu/FlushDenormal.h>
 
 #include "TH/TH.h"  // for USE_LAPACK
 
-#ifdef USE_SSE3
-#include <pmmintrin.h>
-#endif
-
 namespace at {
 
 static inline void errorHandler(const char * msg, void * data) {
@@ -94,18 +91,7 @@ bool Context::hasLAPACK() const {
 }
 
 bool Context::setFlushDenormal(bool on) {
-#ifdef USE_SSE3
-  // Setting flush-to-zero (FTZ) flag
-  _MM_SET_FLUSH_ZERO_MODE(on ? _MM_FLUSH_ZERO_ON
-                             : _MM_FLUSH_ZERO_OFF);
-
-  // Setting denormals-are-zero (DAZ) flag
-  _MM_SET_DENORMALS_ZERO_MODE(on ? _MM_DENORMALS_ZERO_ON
-                                 : _MM_DENORMALS_ZERO_OFF);
-  return true;
-#else
-  return false;
-#endif
+  return at::cpu::set_flush_denormal(on);
 }
 
 TypeExtendedInterface& getType(TensorOptions options) {
diff --git a/aten/src/ATen/cpu/FlushDenormal.cpp b/aten/src/ATen/cpu/FlushDenormal.cpp
@@ -0,0 +1,32 @@
+#include <ATen/cpu/FlushDenormal.h>
+
+#include <ATen/cpu/vec256/intrinsics.h>
+#include <cpuinfo.h>
+
+namespace at { namespace cpu {
+
+static constexpr unsigned int DENORMALS_ZERO = 0x0040;
+static constexpr unsigned int FLUSH_ZERO = 0x8000;
+
+// Turns Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) on or off together
+// via _mm_getcsr/_mm_setcsr. Returns true if the mode was applied; returns
+// false (changing nothing) without SSE support or the runtime DAZ capability.
+bool set_flush_denormal(bool on) {
+  // Compile if we have SSE support (GCC), x86-64 (MSVC), or x86 with SSE (MSVC)
+#if defined(__SSE__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
+  // Denormals-Are-Zero is supported by most SSE2 processors, with the exception
+  // of some early Pentium 4 processors. We guard it with a runtime check.
+  // Flush-To-Zero (FTZ) only requires SSE, but is gated on the same check here.
+  // cpuinfo_initialize() must succeed before cpuinfo feature queries are valid.
+  if (cpuinfo_initialize() && cpuinfo_has_x86_daz()) {
+    constexpr unsigned int flags = DENORMALS_ZERO | FLUSH_ZERO;
+    // Clear both bits, then set them again only when enabling.
+    const unsigned int csr = (_mm_getcsr() & ~flags) | (on ? flags : 0u);
+    _mm_setcsr(csr);
+    return true;
+  }
+#endif
+  return false;
+}
+
+}}  // namespace at::cpu
diff --git a/aten/src/ATen/cpu/FlushDenormal.h b/aten/src/ATen/cpu/FlushDenormal.h
@@ -0,0 +1,20 @@
+/// Flush-To-Zero and Denormals-Are-Zero mode
+///
+/// Flush-To-Zero (FTZ) and Denormals-Are-Zero (DAZ) are modes that bypass
+/// IEEE 754 methods of dealing with denormal floating-point numbers on x86-64
+/// and some x86 CPUs. They result in reduced precision for values near zero,
+/// but increased performance.
+///
+/// See https://software.intel.com/en-us/articles/x87-and-sse-floating-point-assists-in-ia-32-flush-to-zero-ftz-and-denormals-are-zero-daz
+
+#pragma once
+
+namespace at { namespace cpu {
+
+/// Enables (`on == true`) or disables (`on == false`) FTZ and DAZ together.
+///
+/// Returns true if the mode was applied; returns false, leaving the
+/// floating-point control state untouched, when it is not supported.
+bool set_flush_denormal(bool on);
+
+}}  // namespace at::cpu
diff --git a/setup.py b/setup.py
@@ -1195,6 +1195,7 @@ def make_relative_rpath(path):
                 'lib/torch_shm_manager',
                 'lib/*.h',
                 'lib/include/ATen/*.h',
+                'lib/include/ATen/cpu/*.h',
                 'lib/include/ATen/core/*.h',
                 'lib/include/ATen/cuda/*.cuh',
                 'lib/include/ATen/cuda/*.h',