 #include <torch/csrc/jit/codegen/cuda/scheduler/reduction_utils.h>

+#include <torch/csrc/jit/codegen/cuda/arith.h>
 #include <torch/csrc/jit/codegen/cuda/expr_evaluator.h>
 #include <torch/csrc/jit/codegen/cuda/inlining.h>
 #include <torch/csrc/jit/codegen/cuda/ir_cloner.h>

@@ -493,8 +494,9 @@ TensorView* sortAndRFactor(TensorView* reference_tv) {
   return ir_utils::rfactorHelper(reference_tv, rfactor_axes);
 }

-void projectPersistentBuffers(Fusion* fusion) {
+std::vector<TensorView*> projectPersistentBuffers(Fusion* fusion) {
   auto persistent_info = scheduler_utils::persistentBuffers(fusion);
+  std::vector<TensorView*> dummy_outputs;

   // Convenience accessors
   const auto& persistent_buffers = persistent_info.persistent_buffers;
@@ -562,10 +564,39 @@ void projectPersistentBuffers(Fusion* fusion) {
       for (auto use : persistent_use_of_buffer) {
         TORCH_INTERNAL_ASSERT(use->definition() != nullptr);
         auto buffer_replicate = RecomputeTv::recompute(buffer);
+        // Create a shortcut buffer <--> buffer_replicate for propagation.
+        // Why is this needed?
+        // Consider that we have a fusion
+        //
+        //   T0[I]
+        //   T1[b b I] = broadcast(T0)
+        //   T2[b b r] = reduction(T1)
+        //   T3[b b b] = broadcast(T2)
+        //   T4[b, b, I] = T1 + T3
+        //   T5[b, b, r] = reduction(T4)
+        //
+        // After projection, it becomes
+        //
+        //   T0[I]
+        //   T1[b b I] = broadcast(T0)
+        //   T2[b b r] = reduction(T1)
+        //   T3[b b b] = broadcast(T2)
+        //   T6[b b I] = broadcast(T0)
+        //   T4[b, b, I] = T6 + T3
+        //   T5[b, b, r] = reduction(T4)
+        //
+        // During scheduling, we need to propagate from T2 to T5. However, in
+        // the resulting DAG, neither the propagation path T2->T3->T4->T5 nor
+        // T2->T1->T0->T6->T4->T5 works, because both are missing root domain
+        // information. Adding `T7 = T1 + T6` creates a new propagation path
+        // `T2->T1->T7->T6->T4->T5` which has all root domain information.
+        // See FusionBroadcastPersistentReduction_CUDA for an example.
+        dummy_outputs.emplace_back(add(buffer_replicate, buffer));
         ir_utils::replaceValInExpr(use->definition(), buffer, buffer_replicate);
       }
     }
   }
+  return dummy_outputs;
 }

 } // namespace reduction_scheduler_utils
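
For context on how the new return value is meant to be consumed, here is a minimal caller-side sketch. The helper name and the surrounding scheduler code are illustrative assumptions, not part of this diff; only projectPersistentBuffers, Fusion::addOutput, and Fusion::removeOutput come from the existing codebase. The idea is that the returned shortcut tensors are temporarily registered as fusion outputs so transform propagation can traverse the `T2->T1->T7->T6->T4->T5` path described in the comment, and are dropped again once scheduling is done.

// Hypothetical caller, for illustration only (not part of this change).
void scheduleWithProjectedBuffers(Fusion* fusion) {
  // Project persistent buffers back to the fusion inputs and collect the
  // shortcut tensors (buffer + buffer_replicate) created above.
  auto dummy_outputs =
      reduction_scheduler_utils::projectPersistentBuffers(fusion);

  // Temporarily register the shortcuts as outputs so that transform
  // propagation can see the extra path between the original buffer and its
  // recomputed replicate.
  for (auto output : dummy_outputs) {
    fusion->addOutput(output);
  }

  // ... run the usual reduction scheduling and propagation here ...

  // The shortcuts are only propagation aids; remove them so they do not
  // remain as real kernel outputs.
  for (auto output : dummy_outputs) {
    fusion->removeOutput(output);
  }
}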