Skip to content

Commit 93505bc

Browse files
authored
WAR on index mapping when exact and permissive maps differ (#1960)
1 parent 45e95fd commit 93505bc

File tree

2 files changed

+75
-2
lines changed

2 files changed

+75
-2
lines changed

torch/csrc/jit/codegen/cuda/lower_index_compute.cpp

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -732,8 +732,36 @@ void LoopIndexingAnalysis::constructLoopDomains() {
732732
!concrete_id_to_consumer_.count(concrete_id) &&
733733
// Use permissive map so the selected ID indeed represents the
734734
// loop.
735-
GpuLower::current()->caMap()->areMapped(
736-
concrete_id, loop_id, IdMappingMode::PERMISSIVE);
735+
// Note: see PR https://github.com/csarofeen/pytorch/pull/1960
736+
// and issue https://github.com/csarofeen/pytorch/issues/1873
737+
// This mapping lookup is part of a staged indexing scheme.
738+
// When we find a replayed exact id that exactly maps to the loop
739+
// id, this means that we can resolve indexing involved in this
740+
// loop "locally", i.e. using only the iterdomains
741+
// on the
742+
//
743+
// given consumer tv.
744+
// When we cannot find an exact mapping, the permissive mapping
745+
// would
746+
// help deferring the indexing resolution for this loop nest
747+
// level to other iterdomain expressions from tv's that are
748+
// further concretized and usually they are further down the
749+
// consumer chain of the given consumer tv.
750+
//
751+
// Intuitively exact mapping of two iterdomains should imply
752+
// permissive mapping
753+
// of them as well and if that was the case, only looking up
754+
// permissive mapping would be enough to address both of the
755+
// cases above.
756+
// FIXME: But currently exact mapping does not imply permissive
757+
// mapping (See issue:
758+
// https://github.com/csarofeen/pytorch/issues/1963)
759+
// This means we should check both exact and permissive mapping
760+
// here.
761+
(GpuLower::current()->caMap()->areMapped(
762+
concrete_id, loop_id, IdMappingMode::EXACT) ||
763+
GpuLower::current()->caMap()->areMapped(
764+
concrete_id, loop_id, IdMappingMode::PERMISSIVE));
737765
});
738766

739767
TORCH_INTERNAL_ASSERT(

torch/csrc/jit/codegen/cuda/test/test_gpu.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25747,6 +25747,51 @@ TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction2_CUDA) {
2574725747
fusion, {out}, {t0, t1}, {t1 + t0.squeeze(-1)}, __LINE__, __FILE__);
2574825748
}
2574925749

25750+
TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
25751+
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
25752+
auto fusion = fusion_ptr.get();
25753+
FusionGuard fg(fusion);
25754+
25755+
TensorView* tv0 = makeConcreteTensor({1, 1});
25756+
TensorView* tv1 = makeConcreteTensor({-1, 1, 1});
25757+
fusion->addInput(tv0);
25758+
fusion->addInput(tv1);
25759+
auto tv2 = set(tv0);
25760+
auto tv3 = broadcast(tv2, {true, false, false});
25761+
auto tv4 = add(tv3, tv1);
25762+
25763+
fusion->addOutput(tv4);
25764+
25765+
tv4->merge(-2);
25766+
tv4->merge(-1);
25767+
25768+
tv0->computeAt(tv4, -1);
25769+
tv1->computeAt(tv4, -1);
25770+
25771+
ComputeAtMap ca_map(fusion);
25772+
25773+
// FIXME: This is the concerning part that would motivate some
25774+
// more formalization on concrete/permissive mapping:
25775+
// exact mapping should ideally imply permissive mapping.
25776+
auto tv4_inner_node = tv4->axis(0)->definition()->input(1)->as<IterDomain>();
25777+
TORCH_CHECK(
25778+
ca_map.areMapped(tv2->axis(0), tv4_inner_node, IdMappingMode::EXACT));
25779+
TORCH_CHECK(!ca_map.areMapped(
25780+
tv2->axis(0), tv4_inner_node, IdMappingMode::PERMISSIVE));
25781+
25782+
auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
25783+
at::Tensor t0 = at::randn({1, 1}, options);
25784+
at::Tensor t1 = at::randn({2, 1, 1}, options);
25785+
25786+
FusionExecutor fe;
25787+
fe.compileFusion(fusion, {t0, t1});
25788+
auto cg_outputs = fe.runFusion({t0, t1});
25789+
auto out = cg_outputs[0];
25790+
25791+
testValidate(
25792+
fusion, {out}, {t0, t1}, {t1 + t0.squeeze(0)}, __LINE__, __FILE__);
25793+
}
25794+
2575025795
} // namespace jit
2575125796
} // namespace torch
2575225797
#endif // #if defined(USE_CUDA)

0 commit comments

Comments
 (0)