Skip to content

Commit fa4e6a4

Browse files
authored
Check siblings in getMaxPosAll (csarofeen#1805)
1 parent 025c840 commit fa4e6a4

File tree

3 files changed

+31
-2
lines changed

3 files changed

+31
-2
lines changed

torch/csrc/jit/codegen/cuda/inline_propagator.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,12 +132,17 @@ size_t MaxPosCalculator::getMaxProducerPosFromConsumer(
132132
return producer->nDims();
133133
}
134134

135-
size_t InlinePropagator::getMaxPosAll(TensorView* tv) {
135+
size_t InlinePropagator::getMaxPosAll(TensorView* tv, bool check_siblings) {
136136
auto max_pos = max_pos_calc.getMaxPosSelf(tv, false, false, false);
137137
for (auto consumer_tv : ir_utils::consumerTvsOf(tv)) {
138138
max_pos = std::min<size_t>(
139139
max_pos, max_pos_calc.getMaxProducerPosFromConsumer(tv, consumer_tv));
140140
}
141+
if (check_siblings) {
142+
for (auto sibling_tv : ir_utils::siblingTvsOf(tv)) {
143+
max_pos = std::min<size_t>(max_pos, getMaxPosAll(sibling_tv, false));
144+
}
145+
}
141146
return max_pos;
142147
}
143148

torch/csrc/jit/codegen/cuda/inline_propagator.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ class MaxPosCalculator {
7070
class InlinePropagator : public MaxInfoSpanningTree::Propagator {
7171
// Checks producers and consumers to see what the maximum position in tv is
7272
// that can be shared across both directions.
73-
size_t getMaxPosAll(TensorView* tv);
73+
size_t getMaxPosAll(TensorView* tv, bool check_siblings = true);
7474

7575
// We use mapped_reference_pos_ to keep track of the outer axes information of
7676
// the reference tensor. That is, mapped_reference_pos_[tv] answers the

torch/csrc/jit/codegen/cuda/test/test_gpu.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24167,6 +24167,30 @@ TEST_F(NVFuserTest, FusionSkipReplay_CUDA) {
2416724167
}
2416824168
}
2416924169

24170+
TEST_F(NVFuserTest, FusionInlineRepro1803_CUDA) {
24171+
Fusion fusion;
24172+
FusionGuard fg(&fusion);
24173+
24174+
TensorView* tv0 = makeContigTensor(2);
24175+
24176+
fusion.addInput(tv0);
24177+
auto tv1 = set(tv0);
24178+
auto tvs = Welford(tv1, {1});
24179+
auto tvo = set(tvs.var_sum);
24180+
fusion.addOutput(tvo);
24181+
24182+
tvo->split(0, 16);
24183+
tvo->axis(1)->parallelize(ParallelType::Unroll);
24184+
24185+
tv0->computeAt(tvo, -1, ComputeAtMode::BestEffort);
24186+
24187+
TORCH_CHECK(
24188+
tvs.var_sum->getComputeAtPosition() == tvs.avg->getComputeAtPosition());
24189+
TORCH_CHECK(
24190+
tvs.var_sum->getComputeAtPosition() == tvs.n->getComputeAtPosition());
24191+
TORCH_CHECK(tvs.var_sum->getComputeAtPosition() == 1);
24192+
}
24193+
2417024194
} // namespace jit
2417124195
} // namespace torch
2417224196
#endif // #if defined(USE_CUDA)

0 commit comments

Comments (0)