From 3306d75d2a8505a446aa80dc8756b21f56cf4f93 Mon Sep 17 00:00:00 2001
From: Christian Sarofeen
Date: Wed, 22 Jul 2020 09:54:23 -0400
Subject: [PATCH 1/2] Allocations must use a mix of local information and
 computeAt information.

---
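Notes (not part of the commit message): the allocation position and the
allocation size now come from different sources. The position is found by
walking the compute-at axes, stopping at the compute-at point and, with this
change, also before any axis parallelized as Unroll, so the buffer is declared
outside the unrolled loop. The size filters out reduction and broadcast axes
using the tensor's own domain (local_dim), but takes the extents and the
thread/block-binding checks from the compute-at domain (compute_at_dim). A
rough sketch of the intended placement in a generated kernel; the names and
extents below are invented for illustration, not real codegen output:

  // hypothetical lowered loop nest for a local (register) buffer T2
  for (...) {           // blockIdx.x (BIDx): not allocated over
    float T2[4];        // placed above the unrolled loop; sized by the
                        // Unroll extent (4), the TIDx extent is skipped
                        // because T2 lives in local memory
    #pragma unroll
    for (size_t i = 0; i < 4; ++i) {
      // compute T2[i], one unrolled element per iteration ...
    }
    // ... consume T2 under the same unroll/TIDx structure
  }
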
 torch/csrc/jit/codegen/cuda/lower_loops.cpp | 24 +++++++++++++++------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/lower_loops.cpp b/torch/csrc/jit/codegen/cuda/lower_loops.cpp
index 98d1cea9bf39..0a078fec8ee1 100644
--- a/torch/csrc/jit/codegen/cuda/lower_loops.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_loops.cpp
@@ -25,26 +25,35 @@ Expr* LoopNestGenerator::pushAlloc(TensorView* tv) {
     if (tv->hasComputeAt() && alloc_pos == tv->getThisComputeAtAxis()) {
       break;
     }
+    // If we found an unroll, we want to place the allocation outside the unroll
+    if (alloc_pos < tv->nDims() &&
+        tv->getComputeAtAxis(alloc_pos).first->parallel_method() ==
+            ParallelType::Unroll) {
+      break;
+    }
     alloc_pos++;
   }
 
   // Grab the dimensions the allocation will be based on
   std::vector<Val*> alloc_dims;
   for (auto i = alloc_pos; i < tv->nDims(); i++) {
-    IterDomain* dim = tv->getComputeAtAxis(i).first;
+    IterDomain* compute_at_dim = tv->getComputeAtAxis(i).first;
+    IterDomain* local_dim = tv->axis(i);
     if (
         // If shared memory, don't use any IDs bound to a grid dimension
-        (tv->memory_type_ == MemoryType::Shared && dim->isBlockDim()) ||
+        (tv->memory_type_ == MemoryType::Shared &&
+         compute_at_dim->isBlockDim()) ||
         // If local memory, don't use any IDs bound to a grid or block dimension
-        (tv->memory_type_ == MemoryType::Local && dim->isThread()) ||
+        (tv->memory_type_ == MemoryType::Local && compute_at_dim->isThread()) ||
         // If we're reducing this dimension, don't use it in the allocation
         // computation
-        dim->isReduction() ||
+        local_dim->isReduction() ||
         // If this is a broadcast dimension, don't use it in the allocation
         // computation
-        dim->isBroadcast())
+        local_dim->isBroadcast()) {
       continue;
-    alloc_dims.push_back(dim->extent());
+    }
+    alloc_dims.push_back(compute_at_dim->extent());
   }
 
   // Multiply all the dimensions we're going to use for the allocation together
@@ -282,8 +291,9 @@ void LoopNestGenerator::handle(Expr* expr) {
   Expr* alloc_stmt = nullptr;
   // 3) Allocate the output.
   if (!FusionGuard::getCurFusion()->hasInput(out) &&
-      !FusionGuard::getCurFusion()->hasOutput(out))
+      !FusionGuard::getCurFusion()->hasOutput(out)) {
     alloc_stmt = pushAlloc(out);
+  }
 
   // 4) If this is a reduction, initialize the output (open for loops to inner
   // most, predicate, initialize, place next after allocation if exists, close

From 17fdadb657ac3781c808a179d4c83eeac45938d9 Mon Sep 17 00:00:00 2001
From: Christian Sarofeen
Date: Wed, 22 Jul 2020 10:01:58 -0400
Subject: [PATCH 2/2] Update test.

---
 test/cpp/jit/test_gpu.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/cpp/jit/test_gpu.cpp b/test/cpp/jit/test_gpu.cpp
index d9d151a4da94..df214bbd784d 100644
--- a/test/cpp/jit/test_gpu.cpp
+++ b/test/cpp/jit/test_gpu.cpp
@@ -1219,8 +1219,8 @@ void testGPU_FusionSimplePWise() {
   tv3->merge(0);
 
   // Split by n_threads
-  tv3->split(-1, 128 * 2);
-  tv3->split(-1, 128);
+  tv3->split(0, 128);
+  tv3->split(0, 4);
 
   // For all inputs, computeAt the output inline, temporaries should be squeezed
   // between them
@@ -1229,7 +1229,7 @@
 
   // Parallelize TV3
   tv3->axis(0)->parallelize(ParallelType::BIDx);
-  tv3->axis(-2)->parallelize(ParallelType::TIDy);
+  tv3->axis(-2)->parallelize(ParallelType::Unroll);
   tv3->axis(-1)->parallelize(ParallelType::TIDx);
 
   auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
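
Note on the updated schedule (the calls are from the test; the annotations
and the [outer, inner] split convention are my reading, added for
illustration): after the merge, tv3 is one-dimensional with extent N, and the
two splits plus the parallelization produce:

  tv3->split(0, 128);   // [ceilDiv(N, 128), 128]
  tv3->split(0, 4);     // [ceilDiv(N, 512), 4, 128]
  tv3->axis(0)->parallelize(ParallelType::BIDx);    // grid dimension
  tv3->axis(-2)->parallelize(ParallelType::Unroll); // unrolled 4-wide loop
  tv3->axis(-1)->parallelize(ParallelType::TIDx);   // 128 threads per block

Replacing TIDy with Unroll makes the intermediates computeAt'd into tv3
exercise the new allocation logic from the first patch: their buffers must be
allocated above the unrolled loop rather than inside it.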