32 changes: 30 additions & 2 deletions torch/csrc/jit/codegen/cuda/lower_index_compute.cpp
@@ -732,8 +732,36 @@ void LoopIndexingAnalysis::constructLoopDomains() {
!concrete_id_to_consumer_.count(concrete_id) &&
// Use permissive map so the selected ID indeed represents the
// loop.
GpuLower::current()->caMap()->areMapped(
concrete_id, loop_id, IdMappingMode::PERMISSIVE);
// Note: see PR https://github.com/csarofeen/pytorch/pull/1960
// and issue https://github.com/csarofeen/pytorch/issues/1873
// This mapping lookup is part of a staged indexing scheme.
// When we find a replayed exact id that exactly maps to the
// loop id, it means we can resolve the indexing for this loop
// "locally", i.e. with only the iterdomains on the given
// consumer tv.
// When we cannot find an exact mapping, the permissive mapping
// helps defer the indexing resolution for this loop nest level
// to other iterdomain expressions from tv's that are further
// concretized, usually further down the consumer chain of the
// given consumer tv.
//
// Intuitively, exact mapping of two iterdomains should imply
// permissive mapping of them as well, and if that were the
// case, looking up only the permissive mapping would be enough
// to cover both of the cases above.
// FIXME: Currently exact mapping does not imply permissive
// mapping (see issue:
// https://github.com/csarofeen/pytorch/issues/1963),
// so we check both exact and permissive mapping here.
(GpuLower::current()->caMap()->areMapped(
concrete_id, loop_id, IdMappingMode::EXACT) ||
GpuLower::current()->caMap()->areMapped(
concrete_id, loop_id, IdMappingMode::PERMISSIVE));
});

TORCH_INTERNAL_ASSERT(
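For readability, the new predicate boils down to the standalone sketch below. This is a paraphrase of the diff above for illustration only; the helper name isLoopRepresentative is hypothetical, while GpuLower, caMap(), areMapped(), and IdMappingMode are used exactly as in the diff.

// Sketch only, not code in this PR.
bool isLoopRepresentative(IterDomain* concrete_id, IterDomain* loop_id) {
  auto ca_map = GpuLower::current()->caMap();
  // Exact match: indexing for this loop can be resolved locally,
  // using only the iterdomains on the given consumer tv.
  // Permissive match: resolution is deferred to further-concretized
  // iterdomains down the consumer chain.
  return ca_map->areMapped(concrete_id, loop_id, IdMappingMode::EXACT) ||
      ca_map->areMapped(concrete_id, loop_id, IdMappingMode::PERMISSIVE);
}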
45 changes: 45 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -25747,6 +25747,51 @@ TEST_F(NVFuserTest, FusionMergeBroadcastingTrivialReduction2_CUDA) {
fusion, {out}, {t0, t1}, {t1 + t0.squeeze(-1)}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

TensorView* tv0 = makeConcreteTensor({1, 1});
TensorView* tv1 = makeConcreteTensor({-1, 1, 1});
fusion->addInput(tv0);
fusion->addInput(tv1);
auto tv2 = set(tv0);
auto tv3 = broadcast(tv2, {true, false, false});
auto tv4 = add(tv3, tv1);

fusion->addOutput(tv4);

tv4->merge(-2);
tv4->merge(-1);

tv0->computeAt(tv4, -1);
tv1->computeAt(tv4, -1);

ComputeAtMap ca_map(fusion);

// FIXME: This is the concerning part that motivates some more
// formalization of concrete/permissive mapping:
// exact mapping should ideally imply permissive mapping.
// Inner input of the merge expression that produced tv4->axis(0).
auto tv4_inner_node = tv4->axis(0)->definition()->input(1)->as<IterDomain>();
TORCH_CHECK(
ca_map.areMapped(tv2->axis(0), tv4_inner_node, IdMappingMode::EXACT));
TORCH_CHECK(!ca_map.areMapped(
tv2->axis(0), tv4_inner_node, IdMappingMode::PERMISSIVE));

auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({1, 1}, options);
at::Tensor t1 = at::randn({2, 1, 1}, options);

FusionExecutor fe;
fe.compileFusion(fusion, {t0, t1});
auto cg_outputs = fe.runFusion({t0, t1});
auto out = cg_outputs[0];

testValidate(
fusion, {out}, {t0, t1}, {t1 + t0.squeeze(0)}, __LINE__, __FILE__);
}

} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)
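As a closing note on the FIXME in the test: the property tracked by issue https://github.com/csarofeen/pytorch/issues/1963 can be stated as a one-line implication. A minimal sketch, assuming only the ComputeAtMap::areMapped call already used in the test; the helper name exactImpliesPermissive is hypothetical and not part of this PR.

// Hypothetical helper: once issue 1963 is resolved, this should hold for
// every pair of iterdomains, making the extra EXACT lookup in
// lower_index_compute.cpp redundant.
bool exactImpliesPermissive(
    ComputeAtMap& ca_map,
    IterDomain* a,
    IterDomain* b) {
  // "a EXACT-maps to b" implies "a PERMISSIVE-maps to b".
  return !ca_map.areMapped(a, b, IdMappingMode::EXACT) ||
      ca_map.areMapped(a, b, IdMappingMode::PERMISSIVE);
}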