Skip to content

Commit 292ebef

Browse files
authored
Some misc swizzle changes (#2138)
1 parent 19e5af7 commit 292ebef

15 files changed

+815
-383
lines changed

build_variables.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -751,6 +751,7 @@ libtorch_cuda_core_sources = [
751751
"torch/csrc/jit/codegen/cuda/scheduler/registry.cpp",
752752
"torch/csrc/jit/codegen/cuda/scheduler/utils.cpp",
753753
"torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp",
754+
"torch/csrc/jit/codegen/cuda/swizzle.cpp",
754755
"torch/csrc/jit/codegen/cuda/type_inference.cpp",
755756
"torch/csrc/jit/codegen/cuda/type_promotion.cpp",
756757
"torch/csrc/jit/codegen/cuda/fusion_segmenter.cpp",

test/cpp/jit/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ if(USE_CUDA)
102102
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu1.cpp)
103103
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu2.cpp)
104104
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu3.cpp)
105+
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_swizzle.cpp)
105106
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp)
106107
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp)
107108
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp)

torch/csrc/jit/codegen/cuda/arith.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -948,6 +948,7 @@ NVFUSER_DEFINE_BINARY_FLOAT_OP(atan2, Atan2)
948948
}
949949

950950
// Integer binary ops
951+
NVFUSER_DEFINE_BINARY_CAST_OP(cpp_div, Div)
951952
NVFUSER_DEFINE_BINARY_CAST_OP(mod, Mod)
952953
NVFUSER_DEFINE_BINARY_CAST_OP(ceilDiv, CeilDiv)
953954
NVFUSER_DEFINE_BINARY_CAST_OP(add, Add)

torch/csrc/jit/codegen/cuda/arith.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -350,6 +350,11 @@ TORCH_CUDA_CU_API Val* div(Val* v1, Val* v2);
350350
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, Val* v2);
351351
TORCH_CUDA_CU_API TensorView* div(Val* v1, TensorView* v2);
352352
TORCH_CUDA_CU_API TensorView* div(TensorView* v1, TensorView* v2);
353+
// cpp_div: similar to div, but don't promote to float
354+
TORCH_CUDA_CU_API Val* cpp_div(Val* v1, Val* v2);
355+
TORCH_CUDA_CU_API TensorView* cpp_div(TensorView* v1, Val* v2);
356+
TORCH_CUDA_CU_API TensorView* cpp_div(Val* v1, TensorView* v2);
357+
TORCH_CUDA_CU_API TensorView* cpp_div(TensorView* v1, TensorView* v2);
353358
// fmod
354359
TORCH_CUDA_CU_API Val* fmod(Val* v1, Val* v2);
355360
TORCH_CUDA_CU_API TensorView* fmod(TensorView* v1, Val* v2);

torch/csrc/jit/codegen/cuda/dynamic_type.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,18 @@ class TORCH_CUDA_CU_API IntOrDouble {
132132
}
133133
TORCH_INTERNAL_ASSERT(false);
134134
}
135+
// Bitwise xor of two dynamic scalars.
// Only defined when both operands currently hold an int64_t; xor has no
// meaning for a double payload, so any other combination is a hard error.
IntOrDouble operator^(const IntOrDouble& other) const {
  if (is_int() && other.is_int()) {
    return IntOrDouble(as<int64_t>() ^ other.as<int64_t>());
  }
  // Give the assertion a message so a failure identifies the bad operand
  // types instead of firing silently.
  TORCH_INTERNAL_ASSERT(
      false, "Bitwise xor is only defined for integer operands");
}
141+
// Bitwise xor with a plain int64_t.
// Only defined when this object currently holds an int64_t.
IntOrDouble operator^(int64_t other) const {
  if (is_int()) {
    return IntOrDouble(as<int64_t>() ^ other);
  }
  // Messaged assert for diagnosability (was a bare assert(false)).
  TORCH_INTERNAL_ASSERT(
      false, "Bitwise xor is only defined for integer operands");
}
135147

136148
#define DEFINE_COMPARE_OP(op) \
137149
bool operator op(const IntOrDouble& other) const { \

torch/csrc/jit/codegen/cuda/expr_evaluator.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,8 @@ void ExpressionEvaluator::handle(UnaryOp* uop) {
146146
break;
147147
default:
148148
TORCH_CHECK(
149-
!"Unexpected operator type ",
149+
false,
150+
"Unexpected operator type ",
150151
uop->getUnaryOpType(),
151152
" in ",
152153
uop->toString());
@@ -190,8 +191,16 @@ void ExpressionEvaluator::handle(BinaryOp* bop) {
190191
case BinaryOpType::Min:
191192
known_values_[bop->out()] = min(*lhs, *rhs);
192193
break;
194+
case BinaryOpType::Xor:
195+
known_values_[bop->out()] = *lhs ^ *rhs;
196+
break;
193197
default:
194-
TORCH_CHECK(!"Unexpected operator type");
198+
TORCH_CHECK(
199+
false,
200+
"Unexpected operator type: ",
201+
bop->getBinaryOpType(),
202+
" in ",
203+
bop->toString());
195204
}
196205
}
197206
}

torch/csrc/jit/codegen/cuda/index_compute.cpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
2020
#include <torch/csrc/jit/codegen/cuda/lower_validation.h>
2121
#include <torch/csrc/jit/codegen/cuda/root_domain_map.h>
22+
#include <torch/csrc/jit/codegen/cuda/swizzle.h>
2223
#include <torch/csrc/jit/codegen/cuda/transform_iter.h>
2324
#include <torch/csrc/jit/codegen/cuda/transform_replay.h>
2425

@@ -552,18 +553,14 @@ void IndexCompute::handle(Swizzle2D* swizzle_2d) {
552553
// Generate integer swizzle math if the
553554
// swizzle is activated. See also
554555
// [Note on swizzle mode].
555-
556-
auto out_pair = IrBuilder::swizzle2DIntExpr(
556+
std::pair<Val*, Val*> swizzled_index = dispatchSwizzle(
557+
swizzle_2d->swizzleType(),
557558
out_x_ind,
558559
out_y_ind,
559560
getExtent(out_x_id),
560-
getExtent(out_y_id),
561-
swizzle_2d->swizzleType());
562-
563-
index_map_[in_x_id] =
564-
IrBuilder::pairSelectExpr(out_pair, kir::PairSelect::Selection::X);
565-
index_map_[in_y_id] =
566-
IrBuilder::pairSelectExpr(out_pair, kir::PairSelect::Selection::Y);
561+
getExtent(out_y_id));
562+
index_map_[in_x_id] = swizzled_index.first;
563+
index_map_[in_y_id] = swizzled_index.second;
567564
}
568565
}
569566

torch/csrc/jit/codegen/cuda/ir_utils.cpp

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -941,7 +941,8 @@ struct ReplaceValInIndexVal : public OptInDispatch {
941941

942942
void handle(Val* val) override {
943943
TORCH_INTERNAL_ASSERT(
944-
val->isA<Int>() || val->isA<NamedScalar>() || val->isA<kir::IntPair>(),
944+
val->isA<Int>() || val->isA<Bool>() || val->isA<NamedScalar>() ||
945+
val->isA<kir::IntPair>(),
945946
"Invalid Val type: ",
946947
val->toString());
947948

@@ -960,6 +961,7 @@ struct ReplaceValInIndexVal : public OptInDispatch {
960961
switch (def->etype()) {
961962
case ExprType::UnaryOp:
962963
case ExprType::BinaryOp:
964+
case ExprType::TernaryOp:
963965
case ExprType::Swizzle2DInt:
964966
case ExprType::PairSelect:
965967
handle(val->definition());
@@ -978,7 +980,10 @@ struct ReplaceValInIndexVal : public OptInDispatch {
978980
void handle(UnaryOp* uop) override {
979981
handle(uop->in());
980982
auto inp = last_visited_val_;
981-
TORCH_INTERNAL_ASSERT(uop->out()->isA<Int>());
983+
TORCH_INTERNAL_ASSERT(
984+
uop->out()->isA<Int>() || uop->out()->isA<Bool>(),
985+
"Unknown output type for expr ",
986+
uop->toInlineString());
982987
auto out = IrBuilder::create<Int>(c10::nullopt);
983988
IrBuilder::create<UnaryOp>(uop->getUnaryOpType(), out, inp);
984989
last_visited_val_ = out;
@@ -990,12 +995,32 @@ struct ReplaceValInIndexVal : public OptInDispatch {
990995
auto lhs = last_visited_val_;
991996
handle(bop->rhs());
992997
auto rhs = last_visited_val_;
993-
TORCH_INTERNAL_ASSERT(bop->out()->isA<Int>());
998+
TORCH_INTERNAL_ASSERT(
999+
bop->out()->isA<Int>() || bop->out()->isA<Bool>(),
1000+
"Unknown output type for expr ",
1001+
bop->toInlineString());
9941002
auto out = IrBuilder::create<Int>(c10::nullopt);
9951003
IrBuilder::create<BinaryOp>(bop->getBinaryOpType(), out, lhs, rhs);
9961004
last_visited_val_ = out;
9971005
}
9981006

1007+
// Clone expression after recursively replacing inputs
1008+
void handle(TernaryOp* top) override {
  // Recursively rebuild each of the three inputs, capturing the
  // replacement value produced for every one.
  handle(top->in1());
  auto replaced_in1 = last_visited_val_;
  handle(top->in2());
  auto replaced_in2 = last_visited_val_;
  handle(top->in3());
  auto replaced_in3 = last_visited_val_;
  TORCH_INTERNAL_ASSERT(
      top->out()->isA<Int>() || top->out()->isA<Bool>(),
      "Unknown output type for expr ",
      top->toInlineString());
  // Clone the op with a fresh output fed by the replaced inputs.
  auto out = IrBuilder::create<Int>(c10::nullopt);
  IrBuilder::create<TernaryOp>(
      top->getTernaryOpType(), out, replaced_in1, replaced_in2, replaced_in3);
  last_visited_val_ = out;
}
1023+
9991024
// Clone expression after recurisvely replacing inputs
10001025
void handle(kir::Swizzle2DInt* swizzle_2d) override {
10011026
handle(swizzle_2d->inX());

torch/csrc/jit/codegen/cuda/lower_validation.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,8 @@ class VectorizeValidator : public OptInDispatch {
312312
}
313313

314314
void handle(Swizzle2D* swizzle) final {
315-
if (swizzle->outX() == vectorized_id_ || swizzle->inX() == vectorized_id_) {
315+
if (swizzle->outX() == vectorized_id_ || swizzle->inX() == vectorized_id_ ||
316+
swizzle->outY() == vectorized_id_ || swizzle->inY() == vectorized_id_) {
316317
// Do not (yet) allow vectorization across any swizzled id.
317318
is_valid = false;
318319
}
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#include <torch/csrc/jit/codegen/cuda/swizzle.h>
2+
3+
#include <torch/csrc/jit/codegen/cuda/arith.h>
4+
#include <torch/csrc/jit/codegen/cuda/ir_builder.h>
5+
6+
namespace torch {
7+
namespace jit {
8+
namespace fuser {
9+
namespace cuda {
10+
namespace swizzles {
11+
12+
// ------------------------------------------------------------
13+
// Swizzle Definitions
14+
// for each swizzle name:
15+
// un(Swizzle Name) e.g. unZShape is the inverse of ZShape,
16+
// (unswizzle is needed for inlining and is currently not actively used.)
17+
// ------------------------------------------------------------
18+
19+
// Unit Z swizzle:
20+
// Alternate directions of Y dimension:
21+
// 1 2 3 1 2 3
22+
// 4 5 6 => 6 5 4
23+
// 7 8 9 7 8 9
24+
std::pair<Val*, Val*> ZShape(Val* x, Val* y, Val* size_y) {
25+
auto zero = x->fusion()->zeroVal();
26+
auto one = x->fusion()->oneVal();
27+
auto two = IrBuilder::create<Int>(2);
28+
return {x, where(eq(mod(x, two), zero), y, sub(sub(size_y, one), y))};
29+
}
30+
31+
// ZShape is inverse of itself
32+
std::pair<Val*, Val*> unZShape(Val* x, Val* y, Val* size_y) {
33+
return ZShape(x, y, size_y);
34+
}
35+
36+
// Block cyclic Xor swizzle: (bank conflict removal)
37+
// Apply cyclic Xor within blocks:
38+
// Example: cyclic Xor
39+
// 1 2 3 4 1 2 3 4
40+
// 5 6 7 8 6 5 8 7
41+
// 9 10 11 12 => 11 12 9 10
42+
// 13 14 15 16 16 15 14 13
43+
std::pair<Val*, Val*> Xor(Val* x, Val* y) {
44+
// Need to validate in swizzle configuration:
45+
// size_x == size_y
46+
return {x, bitwise_xor(x, y)};
47+
}
48+
49+
// Xor is inverse of itself
50+
std::pair<Val*, Val*> unXor(Val* x, Val* y) {
51+
return Xor(x, y);
52+
}
53+
54+
// Block cyclic shift swizzle: (bank conflict removal)
55+
// Apply cyclic shift within blocks:
56+
// Example: cyclic shift
57+
// 1 2 3 4 1 2 3 4
58+
// 5 6 7 8 8 5 6 7
59+
// 9 10 11 12 => 11 12 9 10
60+
// 13 14 15 16 14 15 16 13
61+
std::pair<Val*, Val*> CyclicShift(Val* x, Val* y, Val* size_x) {
62+
return {x, mod(add(x, y), size_x)};
63+
}
64+
65+
// Inverse of CyclicShift: y = (size_x + y' - x) mod size_x.
// size_x is added before subtracting x so the dividend stays non-negative.
std::pair<Val*, Val*> unCyclicShift(Val* x, Val* y, Val* size_x) {
  Val* unrotated_y = mod(sub(add(size_x, y), x), size_x);
  return {x, unrotated_y};
}
68+
69+
// Scatter swizzle:
70+
// Corresponds to the data layout out of ldmatrix intrinsic.
71+
// supported dimensions are : 8x4, 16x4, 32x4
72+
std::pair<Val*, Val*> Scatter(Val* x, Val* y, int size_x) {
73+
TORCH_CHECK(
74+
size_x == 8 || size_x == 16 || size_x == 32,
75+
"Unsupported Scatter swizzle size");
76+
Val* size_x_val = IrBuilder::create<Int>(size_x);
77+
auto four = IrBuilder::create<Int>(4);
78+
return {cpp_div(add(mul(y, size_x_val), x), four), mod(x, four)};
79+
}
80+
81+
// Inverse of the Scatter swizzle; only valid for the same fixed sizes.
std::pair<Val*, Val*> unScatter(Val* x, Val* y, int size_x) {
  TORCH_CHECK(
      size_x == 8 || size_x == 16 || size_x == 32,
      "Unsupported Scatter swizzle size");
  Val* size_x_div_4 = IrBuilder::create<Int>(size_x / 4);
  Val* four = IrBuilder::create<Int>(4);
  // Undo the re-tiling: recover the original row from y and the low part
  // of x, and the original column from the high part of x.
  Val* unswizzled_x = add(y, mul(mod(x, size_x_div_4), four));
  Val* unswizzled_y = cpp_div(x, size_x_div_4);
  return {unswizzled_x, unswizzled_y};
}
89+
90+
} // namespace swizzles
91+
92+
// Forward swizzle dispatch: routes (x, y) through the implementation
// selected by `type`. The size arguments are only consulted by the
// swizzles that need them (ZShape reads maybe_size_y; CyclicShift and
// Scatter read maybe_size_x — Scatter requires it to evaluate to a
// constant, see evaluateInt()).
std::pair<Val*, Val*> dispatchSwizzle(
    Swizzle2DType type,
    Val* x,
    Val* y,
    Val* maybe_size_x,
    Val* maybe_size_y) {
  switch (type) {
    case Swizzle2DType::ZShape:
      return swizzles::ZShape(x, y, maybe_size_y);
    case Swizzle2DType::XOR:
      return swizzles::Xor(x, y);
    case Swizzle2DType::CyclicShift:
      return swizzles::CyclicShift(x, y, maybe_size_x);
    case Swizzle2DType::Scatter:
      return swizzles::Scatter(x, y, maybe_size_x->evaluateInt());
    default:
      TORCH_INTERNAL_ASSERT(false, "Unsupported swizzle type");
  }
}
111+
112+
// Inverse swizzle dispatch: applies the un-swizzle that undoes
// dispatchSwizzle for the same `type` and size arguments.
std::pair<Val*, Val*> dispatchUnSwizzle(
    Swizzle2DType type,
    Val* x,
    Val* y,
    Val* maybe_size_x,
    Val* maybe_size_y) {
  switch (type) {
    case Swizzle2DType::ZShape:
      return swizzles::unZShape(x, y, maybe_size_y);
    case Swizzle2DType::XOR:
      return swizzles::unXor(x, y);
    case Swizzle2DType::CyclicShift:
      return swizzles::unCyclicShift(x, y, maybe_size_x);
    case Swizzle2DType::Scatter:
      return swizzles::unScatter(x, y, maybe_size_x->evaluateInt());
    default:
      TORCH_INTERNAL_ASSERT(false, "Unsupported swizzle type");
  }
}
131+
132+
} // namespace cuda
133+
} // namespace fuser
134+
} // namespace jit
135+
} // namespace torch

0 commit comments

Comments
 (0)