 * LICENSE file in the root directory of this source tree.
 */
#include "fbgemm/Utils.h"
+#include "TransposeUtils.h"
#include <cpuinfo.h>
#include <immintrin.h>
#include <cassert>
@@ -156,16 +157,7 @@ template void printMatrix<int32_t>(
    size_t ld,
    std::string name);

-/**
- * @brief Reference implementation of matrix transposition: B = A^T.
- * @param M The height of the matrix.
- * @param N The width of the matrix.
- * @param src The memory buffer of the source matrix A.
- * @param ld_src The leading dimension of the source matrix A.
- * @param dst The memory buffer of the destination matrix B.
- * @param ld_dst The leading dimension of the destination matrix B.
- */
-inline void transpose_ref(
+void transpose_ref(
    int M,
    int N,
    const float* src,
@@ -179,161 +171,6 @@ inline void transpose_ref(
  } // for each output row
}

-inline void
-transpose_kernel_4x4_sse(const float* src, int ld_src, float* dst, int ld_dst) {
-  // load from src to registers
-  // a : a0 a1 a2 a3
-  // b : b0 b1 b2 b3
-  // c : c0 c1 c2 c3
-  // d : d0 d1 d2 d3
-  __m128 a = _mm_loadu_ps(&src[0 * ld_src]);
-  __m128 b = _mm_loadu_ps(&src[1 * ld_src]);
-  __m128 c = _mm_loadu_ps(&src[2 * ld_src]);
-  __m128 d = _mm_loadu_ps(&src[3 * ld_src]);
-
-  // transpose the 4x4 matrix formed by 32-bit elements: Macro from SSE
-  // a : a0 b0 c0 d0
-  // b : a1 b1 c1 d1
-  // c : a2 b2 c2 d2
-  // d : a3 b3 c3 d3
-  _MM_TRANSPOSE4_PS(a, b, c, d);
-
-  // store from registers to dst
-  _mm_storeu_ps(&dst[0 * ld_dst], a);
-  _mm_storeu_ps(&dst[1 * ld_dst], b);
-  _mm_storeu_ps(&dst[2 * ld_dst], c);
-  _mm_storeu_ps(&dst[3 * ld_dst], d);
-}
-inline void transpose_4x4(
-    int M,
-    int N,
-    const float* src,
-    int ld_src,
-    float* dst,
-    int ld_dst) {
-  int ib = 0, jb = 0;
-  for (ib = 0; ib + 4 <= M; ib += 4) {
-    for (jb = 0; jb + 4 <= N; jb += 4) {
-      transpose_kernel_4x4_sse(
-          &src[ib * ld_src + jb], ld_src, &dst[ib + jb * ld_dst], ld_dst);
-    }
-  }
-  transpose_ref(ib, N - jb, &src[jb], ld_src, &dst[jb * ld_dst], ld_dst);
-  transpose_ref(M - ib, N, &src[ib * ld_src], ld_src, &dst[ib], ld_dst);
-}
-
-inline void transpose_kernel_8x8_avx2(
-    const float* src,
-    int ld_src,
-    float* dst,
-    int ld_dst) {
-  // load from src to registers
-  // a : a0 a1 a2 a3 a4 a5 a6 a7
-  // b : b0 b1 b2 b3 b4 b5 b6 b7
-  // c : c0 c1 c2 c3 c4 c5 c6 c7
-  // d : d0 d1 d2 d3 d4 d5 d6 d7
-  // e : e0 e1 e2 e3 e4 e5 e6 e7
-  // f : f0 f1 f2 f3 f4 f5 f6 f7
-  // g : g0 g1 g2 g3 g4 g5 g6 g7
-  // h : h0 h1 h2 h3 h4 h5 h6 h7
-  __m256 a = _mm256_loadu_ps(&src[0 * ld_src]);
-  __m256 b = _mm256_loadu_ps(&src[1 * ld_src]);
-  __m256 c = _mm256_loadu_ps(&src[2 * ld_src]);
-  __m256 d = _mm256_loadu_ps(&src[3 * ld_src]);
-  __m256 e = _mm256_loadu_ps(&src[4 * ld_src]);
-  __m256 f = _mm256_loadu_ps(&src[5 * ld_src]);
-  __m256 g = _mm256_loadu_ps(&src[6 * ld_src]);
-  __m256 h = _mm256_loadu_ps(&src[7 * ld_src]);
-
-  __m256 ab0145, ab2367, cd0145, cd2367, ef0145, ef2367, gh0145, gh2367;
-  __m256 abcd04, abcd15, efgh04, efgh15, abcd26, abcd37, efgh26, efgh37;
-  // unpacking and interleaving 32-bit elements
-  // ab0145 : a0 b0 a1 b1 a4 b4 a5 b5
-  // ab2367 : a2 b2 a3 b3 a6 b6 a7 b7
-  // cd0145 : c0 d0 c1 d1 c4 d4 c5 d5
-  // cd2367 : c2 d2 c3 d3 c6 d6 c7 d7
-  // ef0145 : e0 f0 e1 f1 e4 f4 e5 f5
-  // ef2367 : e2 f2 e3 f3 e6 f6 e7 f7
-  // gh0145 : g0 h0 g1 h1 g4 h4 g5 h5
-  // gh2367 : g2 h2 g3 h3 g6 h6 g7 h7
-  ab0145 = _mm256_unpacklo_ps(a, b);
-  ab2367 = _mm256_unpackhi_ps(a, b);
-  cd0145 = _mm256_unpacklo_ps(c, d);
-  cd2367 = _mm256_unpackhi_ps(c, d);
-  ef0145 = _mm256_unpacklo_ps(e, f);
-  ef2367 = _mm256_unpackhi_ps(e, f);
-  gh0145 = _mm256_unpacklo_ps(g, h);
-  gh2367 = _mm256_unpackhi_ps(g, h);
-
-  // shuffling the 32-bit elements
-  // abcd04 : a0 b0 c0 d0 a4 b4 c4 d4
-  // abcd15 : a1 b1 c1 d1 a5 b5 c5 d5
-  // efgh04 : e0 f0 g0 h0 e4 f4 g4 h4
-  // efgh15 : e1 f1 g1 h1 e5 f5 g5 h5
-  // abcd26 : a2 b2 c2 d2 a6 b6 c6 d6
-  // abcd37 : a3 b3 c3 d3 a7 b7 c7 d7
-  // efgh26 : e2 f2 g2 h2 e6 f6 g6 h6
-  // efgh37 : e3 f3 g3 h3 e7 f7 g7 h7
-  abcd04 = _mm256_shuffle_ps(ab0145, cd0145, 0x44);
-  abcd15 = _mm256_shuffle_ps(ab0145, cd0145, 0xee);
-  efgh04 = _mm256_shuffle_ps(ef0145, gh0145, 0x44);
-  efgh15 = _mm256_shuffle_ps(ef0145, gh0145, 0xee);
-  abcd26 = _mm256_shuffle_ps(ab2367, cd2367, 0x44);
-  abcd37 = _mm256_shuffle_ps(ab2367, cd2367, 0xee);
-  efgh26 = _mm256_shuffle_ps(ef2367, gh2367, 0x44);
-  efgh37 = _mm256_shuffle_ps(ef2367, gh2367, 0xee);
-
-  // shuffling 128-bit elements
-  // a : a0 b0 c0 d0 e0 f0 g0 h0
-  // b : a1 b1 c1 d1 e1 f1 g1 h1
-  // c : a2 b2 c2 d2 e2 f2 g2 h2
-  // d : a3 b3 c3 d3 e3 f3 g3 h3
-  // e : a4 b4 c4 d4 e4 f4 g4 h4
-  // f : a5 b5 c5 d5 e5 f5 g5 h5
-  // g : a6 b6 c6 d6 e6 f6 g6 h6
-  // h : a7 b7 c7 d7 e7 f7 g7 h7
-  a = _mm256_permute2f128_ps(efgh04, abcd04, 0x02);
-  b = _mm256_permute2f128_ps(efgh15, abcd15, 0x02);
-  c = _mm256_permute2f128_ps(efgh26, abcd26, 0x02);
-  d = _mm256_permute2f128_ps(efgh37, abcd37, 0x02);
-  e = _mm256_permute2f128_ps(efgh04, abcd04, 0x13);
-  f = _mm256_permute2f128_ps(efgh15, abcd15, 0x13);
-  g = _mm256_permute2f128_ps(efgh26, abcd26, 0x13);
-  h = _mm256_permute2f128_ps(efgh37, abcd37, 0x13);
-
-  // store from registers to dst
-  _mm256_storeu_ps(&dst[0 * ld_dst], a);
-  _mm256_storeu_ps(&dst[1 * ld_dst], b);
-  _mm256_storeu_ps(&dst[2 * ld_dst], c);
-  _mm256_storeu_ps(&dst[3 * ld_dst], d);
-  _mm256_storeu_ps(&dst[4 * ld_dst], e);
-  _mm256_storeu_ps(&dst[5 * ld_dst], f);
-  _mm256_storeu_ps(&dst[6 * ld_dst], g);
-  _mm256_storeu_ps(&dst[7 * ld_dst], h);
-}
-
-namespace internal {
-
-void transpose_8x8(
-    int M,
-    int N,
-    const float* src,
-    int ld_src,
-    float* dst,
-    int ld_dst) {
-  int ib = 0, jb = 0;
-  for (ib = 0; ib + 8 <= M; ib += 8) {
-    for (jb = 0; jb + 8 <= N; jb += 8) {
-      transpose_kernel_8x8_avx2(
-          &src[ib * ld_src + jb], ld_src, &dst[ib + jb * ld_dst], ld_dst);
-    }
-  }
-  transpose_4x4(ib, N - jb, &src[jb], ld_src, &dst[jb * ld_dst], ld_dst);
-  transpose_4x4(M - ib, N, &src[ib * ld_src], ld_src, &dst[ib], ld_dst);
-}
-
-} // namespace internal
-
void transpose_simd(
    int M,
    int N,
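Side note, not part of the commit above: a minimal standalone sketch of the 4x4 SSE transpose idiom that transpose_kernel_4x4_sse is built on, handy for trying the intrinsic sequence in isolation. The main() harness and the fixed 4x4 buffers are illustrative only.

// Standalone sketch (not from this commit): one 4x4 row-major tile,
// transposed entirely in SSE registers via _MM_TRANSPOSE4_PS.
#include <immintrin.h>
#include <cstdio>

int main() {
  float src[16], dst[16];
  for (int i = 0; i < 16; ++i) {
    src[i] = static_cast<float>(i); // row-major: element (r, c) holds 4*r + c
  }
  // Load the four source rows into registers.
  __m128 a = _mm_loadu_ps(&src[0]);
  __m128 b = _mm_loadu_ps(&src[4]);
  __m128 c = _mm_loadu_ps(&src[8]);
  __m128 d = _mm_loadu_ps(&src[12]);
  // Transpose the 4x4 tile of 32-bit elements in registers.
  _MM_TRANSPOSE4_PS(a, b, c, d);
  // Store the transposed rows (former columns).
  _mm_storeu_ps(&dst[0], a);
  _mm_storeu_ps(&dst[4], b);
  _mm_storeu_ps(&dst[8], c);
  _mm_storeu_ps(&dst[12], d);
  // Now dst[j * 4 + i] == src[i * 4 + j].
  for (int i = 0; i < 4; ++i) {
    printf("%.0f %.0f %.0f %.0f\n",
           dst[i * 4 + 0], dst[i * 4 + 1], dst[i * 4 + 2], dst[i * 4 + 3]);
  }
  return 0;
}

As in the removed transpose_4x4 above, full 4x4 tiles go through this in-register transpose, and leftover rows/columns fall back to the scalar transpose_ref.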