From 3306d75d2a8505a446aa80dc8756b21f56cf4f93 Mon Sep 17 00:00:00 2001
From: Christian Sarofeen
Date: Wed, 22 Jul 2020 09:54:23 -0400
Subject: [PATCH 1/2] Allocations must use a mix of local information and
 computeAt information.

---
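Notes (not part of the commit message): the allocation position and the
allocation size now come from different sources. The position is found by
walking the compute-at axes, stopping at the compute-at point and, with this
change, also before any axis parallelized as Unroll, so the buffer is declared
outside the unrolled loop. The size filters out reduction and broadcast axes
using the tensor's own domain (local_dim), but takes the extents and the
thread/block-binding checks from the compute-at domain (compute_at_dim). A
rough sketch of the intended placement in a generated kernel; the names and
extents below are invented for illustration, not real codegen output:

  // hypothetical lowered loop nest for a local (register) buffer T2
  for (...) {           // blockIdx.x (BIDx): not allocated over
    float T2[4];        // placed above the unrolled loop; sized by the
                        // Unroll extent (4), the TIDx extent is skipped
                        // because T2 lives in local memory
    #pragma unroll
    for (size_t i = 0; i < 4; ++i) {
      // compute T2[i], one unrolled element per iteration ...
    }
    // ... consume T2 under the same unroll/TIDx structure
  }
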
 torch/csrc/jit/codegen/cuda/lower_loops.cpp | 24 +++++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp
index 98d1cea9bf39..0a078fec8ee1 100644
--- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp
@@ -25,26 +25,35 @@ Expr* LoopNestGenerator::pushAlloc(TensorView* tv) {
     if (tv->hasComputeAt() && alloc_pos == tv->getThisComputeAtAxis()) {
       break;
     }
+    // If we found an unroll, we want to place the allocation outside the unroll
+    if (alloc_pos < tv->nDims() &&
+        tv->getComputeAtAxis(alloc_pos).first->parallel_method() ==
+            ParallelType::Unroll) {
+      break;
+    }
     alloc_pos++;
   }
 
   // Grab the dimensions the allocation will be based on
   std::vector<Val*> alloc_dims;
   for (auto i = alloc_pos; i < tv->nDims(); i++) {
-    IterDomain* dim = tv->getComputeAtAxis(i).first;
+    IterDomain* compute_at_dim = tv->getComputeAtAxis(i).first;
+    IterDomain* local_dim = tv->axis(i);
     if (
         // If shared memory, don't use any IDs bound to a grid dimension
-        (tv->memory_type_ == MemoryType::Shared && dim->isBlockDim()) ||
+        (tv->memory_type_ == MemoryType::Shared &&
+         compute_at_dim->isBlockDim()) ||
         // If local memory, don't use any IDs bound to a grid or block dimension
-        (tv->memory_type_ == MemoryType::Local && dim->isThread()) ||
+        (tv->memory_type_ == MemoryType::Local && compute_at_dim->isThread()) ||
         // If we're reducing this dimension, don't use it in the allocation
         // computation
-        dim->isReduction() ||
+        local_dim->isReduction() ||
         // If this is a broadcast dimension, don't use it in the allocation
         // computation
-        dim->isBroadcast())
+        local_dim->isBroadcast()) {
       continue;
-    alloc_dims.push_back(dim->extent());
+    }
+    alloc_dims.push_back(compute_at_dim->extent());
   }
 
   // Multiply all the dimensions we're going to use for the allocation together
@@ -282,8 +291,9 @@ void LoopNestGenerator::handle(Expr* expr) {
   Expr* alloc_stmt = nullptr;
   // 3) Allocate the output.
   if (!FusionGuard::getCurFusion()->hasInput(out) &&
-      !FusionGuard::getCurFusion()->hasOutput(out))
+      !FusionGuard::getCurFusion()->hasOutput(out)) {
     alloc_stmt = pushAlloc(out);
+  }
 
   // 4) If this is a reduction, initialize the output (open for loops to inner
   // most, predicate, initialize, place next after allocation if exists, close

From 17fdadb657ac3781c808a179d4c83eeac45938d9 Mon Sep 17 00:00:00 2001
From: Christian Sarofeen
Date: Wed, 22 Jul 2020 10:01:58 -0400
Subject: [PATCH 2/2] Update test.

---
 test/cpp/jit/test_gpu.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp
index d9d151a4da94..df214bbd784d 100644
--- a/test/cpp/jit/test_gpu.cpp
+++ b/test/cpp/jit/test_gpu.cpp
@@ -1219,8 +1219,8 @@ void testGPU_FusionSimplePWise() {
   tv3->merge(0);
 
   // Split by n_threads
-  tv3->split(-1, 128 * 2);
-  tv3->split(-1, 128);
+  tv3->split(0, 128);
+  tv3->split(0, 4);
 
   // For all inputs, computeAt the output inline, temporaries should be squeezed
   // between them
@@ -1229,7 +1229,7 @@
 
   // Parallelize TV3
   tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(-2)->parallelize(ParallelType::TIDy);
+  tv3->axis(-2)->parallelize(ParallelType::Unroll);
   tv3->axis(-1)->parallelize(ParallelType::TIDx);
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
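
Note on the updated schedule (the calls are from the test; the annotations
and the [outer, inner] split convention are my reading, added for
illustration): after the merge, tv3 is one-dimensional with extent N, and the
two splits plus the parallelization produce:

  tv3->split(0, 128);   // [ceilDiv(N, 128), 128]
  tv3->split(0, 4);     // [ceilDiv(N, 512), 4, 128]
  tv3->axis(0)->parallelize(ParallelType::BIDx);    // grid dimension
  tv3->axis(-2)->parallelize(ParallelType::Unroll); // unrolled 4-wide loop
  tv3->axis(-1)->parallelize(ParallelType::TIDx);   // 128 threads per block

Replacing TIDy with Unroll makes the intermediates computeAt'd into tv3
exercise the new allocation logic from the first patch: their buffers must be
allocated above the unrolled loop rather than inside it.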