Closed
Changes from all commits
86 commits
840da18
Major refactor of code lowering and associated parts.
csarofeen Apr 3, 2020
f2b3982
Don't remove guards around gpu tests.
csarofeen Apr 3, 2020
a568e7c
Minor revisions.
csarofeen Apr 4, 2020
f24db56
Option to return vals from fusion in registered order.
csarofeen Apr 4, 2020
fdadf51
Clang.
csarofeen Apr 4, 2020
a632541
Continue lowering refactor, split out loop nest generator, create sco…
csarofeen Apr 5, 2020
d083d82
ForLoop::range renamed to ForLoop::iter_domain
csarofeen Apr 5, 2020
c87d495
Rename IterDomain::size -> IterDomain::extent.
csarofeen Apr 5, 2020
28ddcf4
Last working test before unrolling. Add incrementally better scalar c…
csarofeen Apr 6, 2020
e761dd7
Add basic infrastructure for unrolling pass.
csarofeen Apr 6, 2020
bb995a8
Factor out ir utilities that can be reused during lowering.
csarofeen Apr 6, 2020
d6531e9
Unrolling loops seemingly working.
csarofeen Apr 8, 2020
7e52a32
Clang.
csarofeen Apr 8, 2020
81e8de4
Test fix.
csarofeen Apr 9, 2020
c984dc8
Major refactor of code lowering and associated parts.
csarofeen Apr 3, 2020
f82a603
Minor revisions.
csarofeen Apr 4, 2020
2373ae6
Improve const scalar check. Add some parallelization guards. Move con…
csarofeen Apr 4, 2020
517f15c
Option to return vals from fusion in registered order.
csarofeen Apr 4, 2020
0df14ef
Clang.
csarofeen Apr 4, 2020
3da13d9
Continue lowering refactor, split out loop nest generator, create sco…
csarofeen Apr 5, 2020
ed0d394
Refactor split/merge/reorder so they can be called directly on tensorD…
csarofeen Apr 9, 2020
6ea8820
tmp, working.
csarofeen Apr 9, 2020
1a4c089
Transform iter now based on tensor domains, not tensor views.
csarofeen Apr 10, 2020
413e649
Rename TensorDomain::size() -> ::nDims
csarofeen Apr 10, 2020
05467e9
Make transformations based on TensorDomains, not TensorViews. TensorV…
csarofeen Apr 10, 2020
07dfe39
TensorIndex::size renamed to ::nDims.
csarofeen Apr 10, 2020
4c735d3
Re-enable being able to compile a fusion multiple times.
csarofeen Apr 10, 2020
87ac1e0
Major refactor of code lowering and associated parts.
csarofeen Apr 3, 2020
a13162e
Minor revisions.
csarofeen Apr 4, 2020
84b689d
Improve const scalar check. Add some parallelization guards. Move con…
csarofeen Apr 4, 2020
9ac7841
Option to return vals from fusion in registered order.
csarofeen Apr 4, 2020
3745274
Clang.
csarofeen Apr 4, 2020
acf9a25
Improve error message in promote. Multiply IterDomain->size() for loc…
csarofeen Apr 4, 2020
b762029
Unroll 2.0
csarofeen Apr 10, 2020
04ce65a
Found an indexing mistake. Fixed.
csarofeen Apr 10, 2020
ec95211
Minor test cleanup.
csarofeen Apr 10, 2020
fe5b7c2
Clang.
csarofeen Apr 10, 2020
bc0dd59
Post rebase cleanup.
csarofeen Apr 10, 2020
7dab57d
Clang.
csarofeen Apr 10, 2020
2191e8c
[Integration code refactor]
jjsjann123 Apr 1, 2020
34bf12d
Test cleanup, merge cleanup, clang format.
csarofeen Apr 11, 2020
0776504
Add unrolling to pointwise kernels in the fuser.
csarofeen Apr 11, 2020
e35219c
Clang.
csarofeen Apr 11, 2020
83302d1
Flake.
csarofeen Apr 11, 2020
3c319bd
Switch to int64 indexing.
csarofeen Apr 13, 2020
bb14652
Refactor kernel argument parsing.
csarofeen Apr 14, 2020
985f290
Clang.
csarofeen Apr 14, 2020
a1f84b9
Jie review.
csarofeen Apr 14, 2020
b81e4b7
Clang.
csarofeen Apr 14, 2020
76290fa
clang-tidy, build warning->error.
csarofeen Apr 14, 2020
dba93b4
Clang tidy.
csarofeen Apr 14, 2020
c7d7488
Missed clang-tidy.
csarofeen Apr 14, 2020
5b20c5c
Merge branch 'master' of https://www.github.com/pytorch/pytorch into …
csarofeen Apr 14, 2020
c54f2b3
Clang.
csarofeen Apr 14, 2020
5f98e98
Foundation to start working on adding reduction support.
csarofeen Apr 15, 2020
be76a33
Basic reduction, no parallelization.
csarofeen Apr 17, 2020
788f0c0
Working towards block reductions, added reduction template kernels to…
csarofeen Apr 19, 2020
abfd4df
[WIP] rfactor replay.
csarofeen Apr 22, 2020
677583f
Fix circular compute at references. Extra error checking on codegen u…
csarofeen Apr 22, 2020
0cf75c1
Continue to fix computeAt support.
csarofeen Apr 23, 2020
34c9d8a
Further fixes/improvements for replay rfactor.
csarofeen Apr 23, 2020
dd707c1
Remove stop condition in iter_visitor.
csarofeen Apr 24, 2020
369e10f
Refactor iter visitor. Using a fully tracked stack approach.
csarofeen Apr 25, 2020
9d27318
Refactor dependency checking.
csarofeen Apr 25, 2020
e409ba3
Traverse all dependency chains in computeAt.
csarofeen Apr 25, 2020
1e0d775
Refactor computeAt, works with multiple consumers correctly now. Does…
csarofeen Apr 25, 2020
3e48c88
Clang.
csarofeen Apr 25, 2020
f3e6fd7
Add unrolling test, update test reference code.
csarofeen Apr 25, 2020
89c12aa
Refactor computeAt again, taking a whole graph approach, back prop co…
csarofeen Apr 27, 2020
08fc770
Finish computeAt refactor without rfactor. Tests working again added …
csarofeen Apr 27, 2020
cf438ba
Remove some non-deterministic behavior, re-add block binding, add mor…
csarofeen Apr 28, 2020
c1e0c7e
Support floating point intermediate values in codegen.
csarofeen Apr 29, 2020
641e2ac
Add rfactor tracking in IterDomain.
csarofeen Apr 29, 2020
9905cd2
Refactor transform replay and transform iter. Remove transform iter f…
csarofeen Apr 30, 2020
0bf9264
Move replay functionality to transform iter.
csarofeen Apr 30, 2020
0b0ffc0
Move backward replay to transform iter, continue refactoring of trans…
csarofeen Apr 30, 2020
f1f7ebc
Continue transform replay refactoring, in preparation for rfactor.
csarofeen May 1, 2020
8aa712c
Another fix to transform replay, add some comments.
csarofeen May 1, 2020
7a25bcc
Rewrite replay(reorder) for the 42nd time.
csarofeen May 2, 2020
0ad949c
Initial RFactor transform replay validation.
csarofeen May 3, 2020
03503e3
Remove old transform replay functions.
csarofeen May 3, 2020
66ad362
Rename maps axis2pos -> old2new, pos2axis -> new2old.
csarofeen May 3, 2020
c731936
RFactor transforms with computeAt all seem correct, need to work on c…
csarofeen May 3, 2020
91908d9
Rework for reductions/rfactor, workout code lowering to work with red…
csarofeen May 7, 2020
5a712df
Clang.
csarofeen May 7, 2020
9d3ce77
Remove printing in test.
csarofeen May 7, 2020
4 changes: 4 additions & 0 deletions caffe2/CMakeLists.txt
@@ -586,8 +586,11 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/ir_iostream.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/iter_visitor.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/kernel_cache.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/manager.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/mutator.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_loops.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower_utils.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/lower2device.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/parser.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/partition.cpp
@@ -596,6 +599,7 @@ if(NOT INTERN_BUILD_MOBILE OR NOT BUILD_CAFFE2_MOBILE)
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/tensor_view.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/transform_iter.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/transform_replay.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/transform_rfactor.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/type.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/utils.cpp
${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/register_interface.cpp
904 changes: 728 additions & 176 deletions test/cpp/jit/test_gpu.cpp

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion test/cpp/jit/tests.h
@@ -117,7 +117,12 @@ namespace jit {
   _(GPU_FusionCodeGen2) \
   _(GPU_FusionSimplePWise) \
   _(GPU_FusionExecKernel) \
-  _(GPU_FusionForLoop)
+  _(GPU_FusionForLoop) \
+  _(GPU_FusionLoopUnroll) \
+  _(GPU_FusionAdvancedComputeAt) \
+  _(GPU_FusionScalarInputs) \
+  _(GPU_FusionRFactorReplay) \
+  _(GPU_FusionSimpleReduction)
 #else
 #define TH_FORALL_TESTS_CUDA(_) \
   _(ArgumentSpec) \
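The bodies of the newly registered GPU_Fusion* tests live in test_gpu.cpp, whose 904-line diff is not rendered above. As a rough sketch only (Fusion, FusionGuard, and the makeDummyTensor helper are assumed from this branch's codegen/cuda headers and test file; the real test bodies will differ), a reduction test in this API takes roughly this shape:

    // Hedged sketch of a GPU_FusionSimpleReduction-style body; illustrative
    // only, the actual implementation is in the un-rendered test_gpu.cpp diff.
    void testGPU_FusionSimpleReduction() {
      Fusion fusion;
      FusionGuard fg(&fusion);              // make `fusion` the active fusion

      TensorView* tv0 = makeDummyTensor(2); // 2D symbolic input (helper assumed)
      fusion.addInput(tv0);

      Val* tv1 = sum(tv0, {1});             // the new reduction API from arith.h
      fusion.addOutput(tv1);

      // ...lower to CUDA, run, and compare against at::sum on a real tensor...
    }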
45 changes: 41 additions & 4 deletions test/test_jit_cuda_fuser.py
@@ -86,20 +86,57 @@ def t(x, y, z, q):
     @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
     @skipIfRocm
     def test_scalar_input(self):
-        def t(x, y, z):
-            # type: (Tensor, Tensor, float) -> Tensor
+        def t(x : torch.Tensor, y : torch.Tensor, z : float):
             o = x + y
             o = o + z
             return o
         t_jit = torch.jit.script(t)
-        x = torch.randn(4, 8, dtype=torch.float, device="cuda")
-        y = torch.randn(4, 8, dtype=torch.float, device="cuda")
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(4, 8, 1, 32, dtype=torch.float, device="cuda")
+        y = y.expand(4, 8, 32, 32)
         jit_o = t_jit(x, y, 2.0)
         jit_o = t_jit(x, y, 2.0)
         o = t(x, y, 2.0)
         self.assertEqual(o, jit_o)
         self.assertTrue(self._has_cuda_fusion_group(t_jit.graph_for(x, y, 2.0)))
+
+    @unittest.skipIf(not RUN_CUDA, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
+    @skipIfRocm
+    def test_broadcasting(self):
+        def t(x : torch.Tensor, y : torch.Tensor, z : float):
+            o = x + y
+            o = o + z
+            return o
+        t_jit = torch.jit.script(t)
+        x = torch.randn(4, 8, 32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, 2.0)
+        jit_o = t_jit(x, y, 2.0)
+        o = t(x, y, 2.0)
+        self.assertEqual(o, jit_o)
+        self.assertTrue(self._has_cuda_fusion_group(t_jit.graph_for(x, y, 2.0)))
+
+    @unittest.skipIf(not RUN_CUDA, "requires CUDA")
+    @unittest.skipIf(GRAPH_EXECUTOR != ProfilingMode.PROFILING, "Requires profiling node to run cuda fuser")
+    @skipIfRocm
+    def test_broadcasting_multiple_output_shape(self):
+        def t(x : torch.Tensor, y : torch.Tensor, z : torch.Tensor):
+            o = x + 12
+            o1 = o + y
+            o2 = o + z
+            oo = o1.sum() + o2.sum()
+            return oo
+        t_jit = torch.jit.script(t)
+        x = torch.randn(32, 32, dtype=torch.float, device="cuda")
+        y = torch.randn(2, 32, 32, dtype=torch.float, device="cuda")
+        z = torch.randn(4, 32, 32, dtype=torch.float, device="cuda")
+        jit_o = t_jit(x, y, z)
+        jit_o = t_jit(x, y, z)
+        o = t(x, y, z)
+        self.assertEqual(o, jit_o)
+        # Currently cannot fuse this
+        self.assertFalse(self._has_cuda_fusion_group(t_jit.graph_for(x, y, z)))
 
 if __name__ == '__main__':
     run_tests()
4 changes: 4 additions & 0 deletions tools/build_variables.bzl
@@ -239,6 +239,9 @@ libtorch_cuda_sources = [
"torch/csrc/jit/codegen/cuda/ir_iostream.cpp",
"torch/csrc/jit/codegen/cuda/iter_visitor.cpp",
"torch/csrc/jit/codegen/cuda/kernel.cpp",
"torch/csrc/jit/codegen/cuda/kernel_cache.cpp",
"torch/csrc/jit/codegen/cuda/lower_loops.cpp",
"torch/csrc/jit/codegen/cuda/lower_utils.cpp",
"torch/csrc/jit/codegen/cuda/lower2device.cpp",
"torch/csrc/jit/codegen/cuda/manager.cpp",
"torch/csrc/jit/codegen/cuda/mutator.cpp",
@@ -249,6 +252,7 @@ libtorch_cuda_sources = [
"torch/csrc/jit/codegen/cuda/tensor_view.cpp",
"torch/csrc/jit/codegen/cuda/transform_iter.cpp",
"torch/csrc/jit/codegen/cuda/transform_replay.cpp",
"torch/csrc/jit/codegen/cuda/transform_rfactor.cpp",
"torch/csrc/jit/codegen/cuda/type.cpp",
"torch/csrc/jit/codegen/cuda/utils.cpp",
"torch/csrc/jit/codegen/cuda/register_interface.cpp",
87 changes: 87 additions & 0 deletions torch/csrc/jit/codegen/cuda/arith.cpp
@@ -57,6 +57,36 @@ TORCH_CUDA_API Val* promoteNew(Val* v1, Val* v2) {
return newValLike(v1, out_dtype);
}

Val* newConstScalar(DataType dtype, long int val) {
switch (dtype) {
case (DataType::Int):
return new Int((int)val);
default:
break;
}
TORCH_CHECK(
false,
"Could not generate a new Scalar with data type ",
dtype,
"and constant value: ",
val);
}

Val* newConstScalar(DataType dtype, double val) {
switch (dtype) {
case (DataType::Float):
return new Float(val);
default:
break;
}
TORCH_CHECK(
false,
"Could not generate a new Scalar with data type ",
dtype,
"and constant value: ",
val);
}

TORCH_CUDA_API Val* castOp(DataType dtype, Val* v1) {
if (v1->getDataType().value() == dtype)
return v1;
@@ -75,12 +105,16 @@ TORCH_CUDA_API Val* castOp(DataType dtype, Val* v1) {
return out;
}

// UNARY OPERATIONS

TORCH_CUDA_API Val* unaryOp(UnaryOpType type, Val* v1) {
Val* out = newValLike(v1);
Statement* expr = new UnaryOp(type, out, v1);
return out;
}

// BINARY OPERATIONS

TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2) {
Val* out = promoteNew(v1, v2);
if (type >= BinaryOpType::Mod) {
@@ -123,6 +157,59 @@ TORCH_CUDA_API Val* andOp(Val* v1, Val* v2) {
return binaryOp(BinaryOpType::And, v1, v2);
}

// REDUCTION OPERATIONS

Val* reductionOp(
BinaryOpType reduction_op_type,
std::vector<int> axes,
Val* init,
Val* v1) {
TORCH_CHECK(
v1->getValType().value() == ValType::TensorView,
"Cannot reduce on values that are not TensorViews, but recieved type ",
v1->getValType().value());

TORCH_CHECK(
init->isConstScalar(),
"Cannot create a reduction operation where the initial value is not a const scalar.");

TensorView* tv = static_cast<TensorView*>(v1);

TORCH_CHECK(
tv->getRootDomain() == tv->domain(),
"Reducing a tensor once it's gone under transformations is not permitted at this time. Please set reductions before calling split/merge/reorder/computeAt.");

std::vector<unsigned int> uint_axes;
for (int axis : axes) {
if (axis < 0)
axis += int(tv->nDims());

TORCH_CHECK(
axis >= 0 && axis < tv->nDims(),
"Reduction on invalid axis, recieved: ",
axis,
" however tensor view only has ",
tv->nDims(),
" dims.");

uint_axes.push_back((unsigned int)axis);
}

Val* out = tv->newForReduction(uint_axes);
if (init->getDataType().value() != v1->getDataType().value())
init = castOp(v1->getDataType().value(), init);
new ReductionOp(reduction_op_type, init, out, v1);
return out;
}

TORCH_CUDA_API Val* sum(Val* v1, std::vector<int> axes) {
return reductionOp(
BinaryOpType::Add,
axes,
newConstScalar(v1->getDataType().value(), 0.0),
v1);
}

} // namespace fuser
} // namespace jit
} // namespace torch
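sum() above shows the intended pattern: a reduction is a binary op paired with that op's identity as the init value. Purely as an illustration (no such op is added in this PR, and BinaryOpType::Mul is assumed to be handled like Add), a product reduction would be built the same way:

    // Hypothetical prod(), not part of this PR; it reuses reductionOp() exactly
    // as sum() does, swapping in multiply and its identity element.
    TORCH_CUDA_API Val* prod(Val* v1, std::vector<int> axes) {
      return reductionOp(
          BinaryOpType::Mul,                               // combine by multiplying
          axes,
          newConstScalar(v1->getDataType().value(), 1.0),  // 1 is Mul's identity
          v1);
    }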
9 changes: 8 additions & 1 deletion torch/csrc/jit/codegen/cuda/arith.h
@@ -32,6 +32,7 @@ TORCH_CUDA_API Val* unaryOp(UnaryOpType type, Val* v1);
// Mod, CeilDiv, and LT are considered Int only output operations for now.
TORCH_CUDA_API Val* binaryOp(BinaryOpType type, Val* v1, Val* v2);

// Binary operations
TORCH_CUDA_API Val* add(Val* v1, Val* v2);
TORCH_CUDA_API Val* sub(Val* v1, Val* v2);
TORCH_CUDA_API Val* mul(Val* v1, Val* v2);
@@ -40,7 +41,13 @@ TORCH_CUDA_API Val* mod(Val* v1, Val* v2);
TORCH_CUDA_API Val* lt(Val* v1, Val* v2);
TORCH_CUDA_API Val* ceilDiv(Val* v1, Val* v2);
TORCH_CUDA_API Val* andOp(Val* v1, Val* v2);

TORCH_CUDA_API Val* reductionOp(
BinaryOpType reduction_op_type,
std::vector<int> axes,
Val* init,
Val* v1);
// REDUCTION OPERATIONS
TORCH_CUDA_API Val* sum(Val* v1, std::vector<int> reduction_axes);
} // namespace fuser
} // namespace jit
} // namespace torch
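Note the ordering contract these entry points inherit from reductionOp(): the root-domain check means a reduction must be declared before any split/merge/reorder/computeAt. A minimal call-side sketch (tv is assumed to be a 2D TensorView* fusion input; split() per this branch's TensorView API):

    // Declare the reduction while the tensor is still on its root domain,
    // then schedule the result.
    TensorView* out = static_cast<TensorView*>(sum(tv, {1}));
    out->split(0, 128);  // scheduling after the reduction is fine
    // Calling sum() on a tensor that was already split/merged/reordered would
    // trip the root-domain TORCH_CHECK inside reductionOp().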
11 changes: 0 additions & 11 deletions torch/csrc/jit/codegen/cuda/data_struct_str.h

This file was deleted.

9 changes: 8 additions & 1 deletion torch/csrc/jit/codegen/cuda/dispatch.cpp
@@ -1,6 +1,5 @@
 #include <torch/csrc/jit/codegen/cuda/fusion.h>
 #include <torch/csrc/jit/codegen/cuda/ir_all_nodes.h>
-#include <torch/csrc/jit/codegen/cuda/tensor.h>
 #include <torch/csrc/jit/codegen/cuda/type.h>
 
 #include <torch/csrc/jit/codegen/cuda/dispatch.h>
@@ -93,6 +92,9 @@ void Expr::dispatch(T handler, Expr* expr) {
case ExprType::BinaryOp:
ptr(handler)->handle(static_cast<BinaryOp*>(expr));
return;
case ExprType::ReductionOp:
ptr(handler)->handle(static_cast<ReductionOp*>(expr));
return;
case ExprType::ForLoop:
ptr(handler)->handle(static_cast<ForLoop*>(expr));
return;
@@ -170,6 +172,9 @@ void Expr::constDispatch(T handler, const Expr* const expr) {
case ExprType::BinaryOp:
ptr(handler)->handle(static_cast<const BinaryOp* const>(expr));
return;
case ExprType::ReductionOp:
ptr(handler)->handle(static_cast<const ReductionOp* const>(expr));
return;
case ExprType::ForLoop:
ptr(handler)->handle(static_cast<const ForLoop* const>(expr));
return;
@@ -246,6 +251,8 @@ Statement* Expr::mutatorDispatch(T mutator, Expr* expr) {
return ptr(mutator)->mutate(static_cast<UnaryOp*>(expr));
case ExprType::BinaryOp:
return ptr(mutator)->mutate(static_cast<BinaryOp*>(expr));
case ExprType::ReductionOp:
return ptr(mutator)->mutate(static_cast<ReductionOp*>(expr));
case ExprType::ForLoop:
return ptr(mutator)->mutate(static_cast<ForLoop*>(expr));
case ExprType::IfThenElse:
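Each added case follows the file's one pattern: switch on ExprType, static_cast, and forward to handle()/mutate(), so a pass picks up ReductionOp by overriding a single hook. A hedged sketch of a consumer, assuming the OptOutConstDispatch defaults declared in dispatch.h:

    // Counts ReductionOps seen during const dispatch; every other node type
    // falls through to OptOutConstDispatch's empty default handlers.
    struct ReductionCounter : public OptOutConstDispatch {
      int count = 0;
      void handle(const ReductionOp* const) override {
        ++count;
      }
    };
    // Hypothetical driver: for each Expr* e in a fusion's exprs,
    //   Expr::constDispatch(&counter, e);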
13 changes: 13 additions & 0 deletions torch/csrc/jit/codegen/cuda/dispatch.h
@@ -70,6 +70,7 @@ struct Merge;
struct Reorder;
struct UnaryOp;
struct BinaryOp;
struct ReductionOp;
struct ForLoop;
struct IfThenElse;
struct Allocate;
@@ -108,6 +109,7 @@ struct TORCH_CUDA_API OptOutConstDispatch {
virtual void handle(const Reorder* const) {}
virtual void handle(const UnaryOp* const) {}
virtual void handle(const BinaryOp* const) {}
virtual void handle(const ReductionOp* const) {}
virtual void handle(const ForLoop* const) {}
virtual void handle(const IfThenElse* const) {}
virtual void handle(const Allocate* const) {}
@@ -143,6 +145,7 @@ struct TORCH_CUDA_API OptOutDispatch {
virtual void handle(Reorder*) {}
virtual void handle(UnaryOp*) {}
virtual void handle(BinaryOp*) {}
virtual void handle(ReductionOp*) {}
virtual void handle(ForLoop*) {}
virtual void handle(IfThenElse*) {}
virtual void handle(Allocate*) {}
@@ -202,6 +205,9 @@ struct TORCH_CUDA_API OptInConstDispatch {
virtual void handle(const BinaryOp* const) {
TORCH_INTERNAL_ASSERT(false, "Handle not overridden for BinaryOp.");
}
virtual void handle(const ReductionOp* const) {
TORCH_INTERNAL_ASSERT(false, "Handle not overridden for ReductionOp.");
}
virtual void handle(const ForLoop* const) {
AT_ERROR("Handle not overriden for ForLoop.");
}
@@ -267,6 +273,9 @@ struct TORCH_CUDA_API OptInDispatch {
virtual void handle(BinaryOp*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overridden for BinaryOp.");
}
virtual void handle(ReductionOp*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overridden for ReductionOp.");
}
virtual void handle(ForLoop*) {
TORCH_INTERNAL_ASSERT(false, "Handle not overridden for ForLoop.");
}
@@ -332,6 +341,7 @@ struct TORCH_CUDA_API OptOutMutator {
virtual Statement* mutate(Reorder*);
virtual Statement* mutate(UnaryOp*);
virtual Statement* mutate(BinaryOp*);
virtual Statement* mutate(ReductionOp*);
virtual Statement* mutate(ForLoop*);
virtual Statement* mutate(IfThenElse*);
virtual Statement* mutate(Allocate*);
@@ -401,6 +411,9 @@ struct TORCH_CUDA_API OptInMutator {
virtual Statement* mutate(BinaryOp*) {
TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for BinaryOp.");
}
virtual Statement* mutate(ReductionOp*) {
TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for ReductionOp.");
}
virtual Statement* mutate(ForLoop*) {
TORCH_INTERNAL_ASSERT(false, "Mutate not overridden for ForLoop.");
}
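The split between the OptOut* and OptIn* hierarchies is what makes adding a node type like ReductionOp tractable: OptOut* bases default to no-ops so existing passes keep compiling, while OptIn* bases assert on anything unhandled, flagging every pass that must now deal with reductions. A sketch of the opt-in side (hypothetical pass name; the real reduction lowering lands in lower_loops.cpp/lower2device.cpp):

    // Deriving from OptInDispatch turns "forgot to handle ReductionOp" into a
    // TORCH_INTERNAL_ASSERT at dispatch time instead of a silent skip.
    struct ReductionLoweringPass : public OptInDispatch {
      using OptInDispatch::handle;  // keep the asserting defaults for other nodes
      void handle(ReductionOp* rop) override {
        // ...emit the reduction loop nest for rop here (sketch only)...
      }
    };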