18 changes: 9 additions & 9 deletions test/cpp/jit/test_gpu.cpp
@@ -5008,7 +5008,7 @@ void testGPU_FusionReductionScheduler() {

const auto options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::rand({bid_x, tid_x}, options);
at::Tensor input = at::randn({bid_x, tid_x}, options);

// Apply reduction heuristic
const at::ArrayRef<c10::IValue> inputs({input});
@@ -5024,7 +5024,7 @@ void testGPU_FusionReductionScheduler() {
auto aten_output = input.sum({red_dim});

TORCH_CHECK(
aten_output.allclose(outputs[0]),
aten_output.allclose(outputs[0], 1e-04, 1e-04),
"Error of: ",
aten_output.sub(outputs[0]).abs().max());
}
@@ -5100,7 +5100,7 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() {

const auto options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::rand(tensor_dims_in, options);
at::Tensor input = at::randn(tensor_dims_in, options);
at::Tensor cg_output = at::empty(tensor_dims_out, options);

// Apply reduction heuristic
@@ -5117,7 +5117,7 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() {
auto aten_output = input.sum(red_dims64);

TORCH_CHECK(
aten_output.allclose(outputs[0]),
aten_output.allclose(outputs[0], 1e-04, 1e-04),
"Error of: ",
aten_output.sub(outputs[0]).abs().max());
}
@@ -5142,7 +5142,7 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() {

const auto options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::rand(tensor_dims_in, options);
at::Tensor input = at::randn(tensor_dims_in, options);

TORCH_CHECK(
cuda::scheduleReduction(&fusion, {input}, tv1),
@@ -5155,7 +5155,7 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() {
auto aten_output = input.sum(red_dims64);

TORCH_CHECK(
aten_output.allclose(outputs[0]),
aten_output.allclose(outputs[0], 1e-05, 1e-05),
"Error of: ",
aten_output.sub(outputs[0]).abs().max());
}
@@ -5205,8 +5205,8 @@ void testGPU_FusionReductionSchedulerDimShmoo() {
.dtype((fp16 ? at::kHalf : at::kFloat))
.device(at::kCUDA, 0);
at::Tensor input =
(axis ? at::rand({odim, rdim}, options)
: at::rand({rdim, odim}, options));
(axis ? at::randn({odim, rdim}, options)
: at::randn({rdim, odim}, options));

const at::ArrayRef<c10::IValue> inputs({input});

@@ -5236,7 +5236,7 @@ void testGPU_FusionReductionSchedulerDimShmoo() {
auto aten_output = input.sum({axis});

TORCH_CHECK(
aten_output.allclose(cg_output[0]),
aten_output.allclose(cg_output[0], 1e-03, 1e-03),
"Error of: ",
aten_output.sub(cg_output[0]).abs().max());
}
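The test changes above swap at::rand (uniform on [0,1)) for at::randn (standard normal) and loosen the allclose tolerances, since normally distributed values summed in a different order by the scheduled kernel accumulate more rounding error than the defaults (rtol=1e-05, atol=1e-08) allow. As a minimal sketch of what the relaxed check means, assuming the usual elementwise definition |self - other| <= atol + rtol * |other| and a hypothetical helper name (not PR code):

#include <ATen/ATen.h>

// Hand-rolled equivalent of a.allclose(b, 1e-04, 1e-04); the second argument
// of allclose is rtol and the third is atol.
bool withinTolerance(
    const at::Tensor& a, // corresponds to `self`, e.g. aten_output
    const at::Tensor& b, // corresponds to `other`, e.g. outputs[0]
    double rtol = 1e-4,
    double atol = 1e-4) {
  return (a - b).abs().le(b.abs().mul(rtol).add(atol)).all().item<bool>();
}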
25 changes: 25 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor.cpp
@@ -36,6 +36,31 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) {
return code;
}

void FusionExecutor::compileFusionFromStr(
Reviewer comment (Collaborator): Are these only used for debugging? I don't see any real code using them. Pointing it out in case this is accidentally checked in.

Fusion* fusion,
const std::string& code,
const std::string& name,
int id,
CompileOptions options) {
fusion_ = *fusion;
FusionGuard fg(&fusion_);
options_ = options;

const char* debug_env = getenv("PYTORCH_CUDA_FUSER_DEBUG");
if (debug_env && atoi(debug_env)) {
std::cout << "\n==== codegen output for kernel: " << kernelName()
<< " ====" << std::endl
<< code << std::endl
<< "=====*===============================" << std::endl;
}

fusion_id_ = id;
has_random_ = fusion->hasRNG();
lowered_ = GpuLower(&fusion_);
compiled_kernel_ = executor_utils::nvrtcCompile(code, name, fusion_id_);
compiled_ = true;
}

void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) {
TORCH_INTERNAL_ASSERT(
!fusion->outputs().empty(), "No output found for this kernel, aborting.");
6 changes: 6 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor.h
@@ -23,6 +23,12 @@ struct TORCH_CUDA_API CompileOptions {

class TORCH_CUDA_API FusionExecutor : public NonCopyable {
public:
void compileFusionFromStr(
Fusion* fusion,
const std::string& code,
const std::string& name,
int id,
CompileOptions options = CompileOptions());
void compileFusion(Fusion* fusion, CompileOptions options = CompileOptions());

std::vector<at::Tensor> runFusion(
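compileFusionFromStr above bypasses code generation and hands an externally supplied kernel string straight to NVRTC; as the review comment notes, this PR adds no in-tree caller, so it is presumably a debugging hook for hand-edited kernels. A hypothetical usage sketch, not from the PR: the wrapper name, kernel name, and id are made up, and runFusion's full signature is outside this diff.

#include <torch/csrc/jit/codegen/cuda/executor.h>

// Hypothetical helper: compile a hand-written kernel source for an existing
// Fusion and run it on the given inputs.
std::vector<at::Tensor> runFromSource(
    torch::jit::fuser::cuda::Fusion* fusion,
    const std::string& kernel_src,
    const at::ArrayRef<c10::IValue>& inputs) {
  torch::jit::fuser::cuda::FusionExecutor fe;
  // name/id are arbitrary placeholders here.
  fe.compileFusionFromStr(fusion, kernel_src, /*name=*/"kernel1", /*id=*/1);
  return fe.runFusion(inputs);
}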
83 changes: 28 additions & 55 deletions torch/csrc/jit/codegen/cuda/scheduler.cpp
@@ -239,7 +239,10 @@ ReductionParams reductionHeuristic(

// Is fastest dimension a reduction dimension?
if (rparams.fastest_dim) {
bdimx = red_elems;
if (red_elems < rparams.loop_unroll) {
rparams.loop_unroll = 1;
}
bdimx = ceilDiv(red_elems, rparams.loop_unroll);
bdimy = red_outputs;
} else {
bdimx = red_outputs;
@@ -426,22 +429,12 @@ c10::optional<ReductionParams> scheduleReduction(
// Do multiple reductions per block
if (rparams.mul_reds_per_blk) {
// Reduction Splits
// [outputs, |rF-Leftover, rf-Unroll, X-Warp|]
// Idx: 0 | 1(-1) 2(-2) 3(-1) |
// [outputs, |rF-Leftover, X-Warp, rf-Unroll|]
// Idx: 0 | 1(-1) 2(-2) 3(-1) |
// --------------------------------
// Reduction Dimensions
red_tv->split(1, rparams.loop_unroll);
red_tv->split(1, rparams.lparams.bdimx());
red_tv->split(1, kLoopUnrollSplit);

// Reordering the Unroll dimension eases applying computeAt()
// for preceeding operations and the rFactored Tensor.
// |- Reordered -|
// V V
// [outputs, |rF-Leftover, X-Warp, rF-Unroll|]
// Idx: 0 | 1(-3) 2(-2) 3(-1) |
// --------------------------------
// Reduction Dimensions
red_tv->reorder({{-1, -2}, {-2, -1}});

// Output Splits
// [|Out-Leftover, Out-PerBlock|, <Reduction Dims>]
@@ -454,8 +447,8 @@ c10::optional<ReductionParams> scheduleReduction(

// WARNING: computeAt will coalesce the rFactored dimensions
// rFactored Reduction Tensor after computeAt():
// [<output dims>, |X-Warp, rF-Leftover, rF-Unroll|]
// Idx: 0 -- 1 | 2(-3) 3(-2) 4(-1) |
// [<output dims>, | rF-Leftover, X-Warp, rF-Unroll|]
// Idx: 0 -- 1 | 2(-3) 3(-2) 4(-1) |
// ---------------------------------
// Reduction Dimensions
red_tv_rf->computeAt(red_tv, -1);
@@ -481,47 +474,37 @@ c10::optional<ReductionParams> scheduleReduction(
} else {
if (rparams.cross_grid) {
// Reduction Splits
// [outputs, |rF-Leftover, rf-Unroll, X-Grid, X-Block, X-Warp|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// [outputs, |rF-Leftover, X-Grid, X-Block, X-Warp, rf-Unroll|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// -------------------------------------------------
// Reduction Dimensions
red_tv->split(1, rparams.loop_unroll);
red_tv->split(1, rparams.lparams.bdimx());
red_tv->split(1, rparams.lparams.bdimy());
red_tv->split(1, rparams.lparams.gdimy());
red_tv->split(1, kLoopUnrollSplit);

// Reordering the Unroll dimension eases applying computeAt()
// for preceeding operations and the rFactored Tensor.
// |------ Reordered --------|
// V V
// [outputs, |rF-Leftover, X-Warp, X-Grid, X-Block, rf-Unroll|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// -------------------------------------------------
// Reduction Dimensions
red_tv->reorder({{-1, -4}, {-4, -1}});

auto red_tv_rf = red_tv->rFactor(
{-5, -1}); // NOLINT(cppcoreguidelines-avoid-magic-numbers)

// WARNING: computeAt will coalesce the rFactored dimensions
// rFactored Reduction Tensor after computeAt():
// [Outputs, |X-Warp, X-Grid, X-Block, rF-Leftover, rF-Unroll|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// [Outputs, |X-Grid, X-Block, X-Warp, rF-Leftover, rF-Unroll|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// -------------------------------------------------
// Reduction Dimensions
red_tv_rf->computeAt(red_tv, -1);

// After the Reduction Tensor has rFactoring applied
// Reduction Output Tensor:
// [Outputs, X-Warp, X-Grid, X-Block]
// Idx: 0 1(-3) 2(-2) 3(-1)
// [Outputs, X-Grid, X-Block, X-Warp]
// Idx: 0 1(-3) 2(-2) 3(-1)

red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll);

red_tv->axis(0)->parallelize(ParallelType::BIDx);
red_tv->axis(-3)->parallelize(ParallelType::TIDx);
red_tv->axis(-1)->parallelize(ParallelType::TIDx);
red_tv->axis(-2)->parallelize(ParallelType::BIDy);
red_tv->axis(-1)->parallelize(ParallelType::TIDy);
red_tv->axis(-3)->parallelize(ParallelType::TIDy);

// Bind Inputs to Reduction
for (auto input : fusion->inputsOf(red_tv_rf)) {
@@ -531,44 +514,34 @@ c10::optional<ReductionParams> scheduleReduction(
}
} else {
// Reduction Splits
// [outputs, |rF-Leftover, rf-Unroll, X-Block, X-Warp|]
// Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) |
// [outputs, |rF-Leftover, X-Block, X-Warp, rf-Unroll|]
// Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) |
// -----------------------------------------
// Reduction Dimensions
red_tv->split(1, rparams.loop_unroll);
red_tv->split(1, rparams.lparams.bdimx());
red_tv->split(1, rparams.lparams.bdimy());
red_tv->split(1, kLoopUnrollSplit);

// Reordering the Unroll dimension eases applying computeAt()
// for preceeding operations and the rFactored Tensor.
// |--- Reordered ----|
// V V
// [outputs, |rF-Leftover, X-Warp, X-Block, rF-Unroll|]
// Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) |
// -----------------------------------------
// Reduction Dimensions
red_tv->reorder({{-1, -3}, {-3, -1}});

auto red_tv_rf = red_tv->rFactor({-4, -1});

// WARNING: computeAt will coalesce the rFactored dimensions
// rFactored Reduction Tensor after computeAt():
// [Outputs, |X-Warp, X-Block, rF-Leftover, rF-Unroll|]
// [Outputs, |X-Block, X-Warp, rF-Leftover, rF-Unroll|]
// Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) |
// -----------------------------------------
// Reduction Dimensions
red_tv_rf->computeAt(red_tv, -1);

// After the Reduction Tensor has rFactoring applied
// Reduction Output Tensor:
// [Outputs, X-Warp, X-Block]
// Idx: 0 1(-2) 2(-1)
// [Outputs, X-Block, X-Warp]
// Idx: 0 1(-2) 2(-1)

red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll);

red_tv->axis(0)->parallelize(ParallelType::BIDx);
red_tv->axis(-2)->parallelize(ParallelType::TIDx);
red_tv->axis(-1)->parallelize(ParallelType::TIDy);
red_tv->axis(-1)->parallelize(ParallelType::TIDx);
red_tv->axis(-2)->parallelize(ParallelType::TIDy);

// Bind Inputs to Reduction
for (auto input : fusion->inputsOf(red_tv_rf)) {
@@ -625,7 +598,7 @@ c10::optional<ReductionParams> scheduleReduction(
red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll);

red_tv->axis(0)->parallelize(ParallelType::BIDx);
red_tv->axis(1)->parallelize(ParallelType::TIDx);
red_tv->axis(-3)->parallelize(ParallelType::TIDx);
red_tv->axis(-2)->parallelize(ParallelType::TIDy);
red_tv->axis(-1)->parallelize(ParallelType::BIDy);

@@ -679,7 +652,7 @@ c10::optional<ReductionParams> scheduleReduction(
red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll);

red_tv->axis(0)->parallelize(ParallelType::BIDx);
red_tv->axis(1)->parallelize(ParallelType::TIDx);
red_tv->axis(-2)->parallelize(ParallelType::TIDx);
red_tv->axis(-1)->parallelize(ParallelType::TIDy);

// Bind Inputs to Reduction
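The scheduler changes replace the fixed kLoopUnrollSplit with the new rparams.loop_unroll, apply the unroll split first so the unroll factor already sits innermost (making the old reorder() calls and their comment blocks unnecessary), and divide the reduction extent by the unroll factor when sizing bdimx. A small worked sketch of that arithmetic with illustrative numbers, assuming bdimx is not clamped further elsewhere in the heuristic (not PR code):

#include <iostream>

// ceilDiv as used by the heuristic.
constexpr int ceilDiv(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int red_elems = 4096; // reduction extent, hypothetical
  int loop_unroll = 4;        // ReductionParams::loop_unroll default
  if (red_elems < loop_unroll) {
    loop_unroll = 1;          // same clamp as in reductionHeuristic
  }
  const int bdimx = ceilDiv(red_elems, loop_unroll); // 1024 threads in x

  // red_tv->split(1, loop_unroll); red_tv->split(1, bdimx); then yields
  // [outputs, rF-Leftover, X-Warp, rf-Unroll] with the unroll factor innermost,
  // so no reorder() is required before rFactor()/computeAt().
  std::cout << "[outputs, " << ceilDiv(red_elems, bdimx * loop_unroll) << ", "
            << bdimx << ", " << loop_unroll << "]\n"; // [outputs, 1, 1024, 4]
  return 0;
}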
7 changes: 5 additions & 2 deletions torch/csrc/jit/codegen/cuda/scheduler.h
@@ -24,12 +24,15 @@ struct ReductionParams {
bool cross_grid = false;
bool mul_reds_per_blk = false;

int loop_unroll = 4;

LaunchParams lparams;

bool operator==(const ReductionParams& other) const {
bool attr_equal = other.fastest_dim == fastest_dim &&
other.cross_block == cross_block && other.cross_grid == cross_grid &&
other.mul_reds_per_blk == mul_reds_per_blk;
other.mul_reds_per_blk == mul_reds_per_blk &&
other.loop_unroll == loop_unroll;
return attr_equal && lparams == other.lparams;
}
};
@@ -38,7 +41,7 @@ class ReductionParamsHash {
public:
size_t operator()(const ReductionParams& rp) const {
size_t lp_hash = rp.lparams.gdimx() ^ rp.lparams.gdimy() ^
rp.lparams.bdimx() ^ rp.lparams.bdimy();
rp.lparams.bdimx() ^ rp.lparams.bdimy() ^ rp.loop_unroll;
constexpr size_t bits = sizeof(std::size_t) * 8;
size_t attr_hash = static_cast<size_t>(rp.fastest_dim) << (bits - 1) |
static_cast<size_t>(rp.cross_block) << (bits - 2) |
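Because the new loop_unroll field changes how the kernel is scheduled, it also has to participate in ReductionParams equality and hashing; otherwise two parameter sets differing only in unroll factor would look identical to any container keyed on ReductionParams. A minimal sketch of the distinction, assuming ReductionParams lives in the torch::jit::fuser::cuda namespace and is default-constructible as declared above (not PR code):

#include <torch/csrc/jit/codegen/cuda/scheduler.h>

// Two parameter sets that differ only in unroll factor must compare unequal
// (and, via ReductionParamsHash, will usually hash differently), so a cache
// keyed on ReductionParams cannot hand back a kernel scheduled with the wrong
// unroll factor.
bool unrollDistinguishesParams() {
  torch::jit::fuser::cuda::ReductionParams a;
  torch::jit::fuser::cuda::ReductionParams b;
  b.loop_unroll = 1;  // a keeps the default of 4
  return !(a == b);   // true with this change
}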