diff --git a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
index e7d53ca59a93..513066a5c71c 100644
--- a/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
+++ b/torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
@@ -732,8 +732,36 @@ void LoopIndexingAnalysis::constructLoopDomains() {
              !concrete_id_to_consumer_.count(concrete_id) &&
              // Use permissive map so the selected ID indeed represents the
              // loop.
-             GpuLower::current()->caMap()->areMapped(
-                 concrete_id, loop_id, IdMappingMode::PERMISSIVE);
+             // Note: see PR https://github.com/csarofeen/pytorch/pull/1960
+             // and issue https://github.com/csarofeen/pytorch/issues/1873.
+             // This mapping lookup is part of a staged indexing scheme.
+             //
+             // When we find a replayed exact id that exactly maps to the
+             // loop id, it means we can resolve the indexing involved in
+             // this loop "locally", i.e. with only the iterdomains on the
+             // given consumer tv.
+             //
+             // When we cannot find an exact mapping, the permissive mapping
+             // helps defer the indexing resolution for this loop nest level
+             // to iterdomain expressions from tv's that are further
+             // concretized, usually further down the consumer chain of the
+             // given consumer tv.
+             //
+             // Intuitively, exact mapping of two iterdomains should imply
+             // permissive mapping of them as well, and if that were the
+             // case, looking up only the permissive mapping would be enough
+             // to address both of the cases above.
+             //
+             // FIXME: Currently exact mapping does not imply permissive
+             // mapping (see issue:
+             // https://github.com/csarofeen/pytorch/issues/1963), which
+             // means we should check both exact and permissive mapping
+             // here.
+             (GpuLower::current()->caMap()->areMapped(
+                  concrete_id, loop_id, IdMappingMode::EXACT) ||
+              GpuLower::current()->caMap()->areMapped(
+                  concrete_id, loop_id, IdMappingMode::PERMISSIVE));
         });
 
     TORCH_INTERNAL_ASSERT(
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
index be190bbb520e..799b22043865 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -25747,6 +25747,51 @@ TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction2_CUDA) {
       fusion, {out}, {t0, t1}, {t1 + t0.squeeze(-1)}, __LINE__, __FILE__);
 }
 
+TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeConcreteTensor({1, 1});
+  TensorView* tv1 = makeConcreteTensor({-1, 1, 1});
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  auto tv2 = set(tv0);
+  auto tv3 = broadcast(tv2, {true, false, false});
+  auto tv4 = add(tv3, tv1);
+
+  fusion->addOutput(tv4);
+
+  tv4->merge(-2);
+  tv4->merge(-1);
+
+  tv0->computeAt(tv4, -1);
+  tv1->computeAt(tv4, -1);
+
+  ComputeAtMap ca_map(fusion);
+
+  // FIXME: This is the concerning part that would motivate some more
+  // formalization of the concrete/permissive mapping relation:
+  // exact mapping should ideally imply permissive mapping.
+  auto tv4_inner_node = tv4->axis(0)->definition()->input(1)->as<IterDomain>();
+  TORCH_CHECK(
+      ca_map.areMapped(tv2->axis(0), tv4_inner_node, IdMappingMode::EXACT));
+  TORCH_CHECK(!ca_map.areMapped(
+      tv2->axis(0), tv4_inner_node, IdMappingMode::PERMISSIVE));
+
+  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({1, 1}, options);
+  at::Tensor t1 = at::randn({2, 1, 1}, options);
+
+  FusionExecutor fe;
+  fe.compileFusion(fusion, {t0, t1});
+  auto cg_outputs = fe.runFusion({t0, t1});
+  auto out = cg_outputs[0];
+
+  testValidate(
+      fusion, {out}, {t0, t1}, {t1 + t0.squeeze(0)}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)