From 63e3e762905a236c014505abe461211e12f3cfea Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Thu, 25 Aug 2022 11:45:19 -0700
Subject: [PATCH 1/5] Allow splitting inner-most ID to create virtual innermost ID

---
 .../codegen/cuda/scheduler/pointwise_utils.h  | 10 +++
 .../jit/codegen/cuda/scheduler/transpose.cpp  | 72 ++++++++++++++++---
 .../codegen/cuda/test/test_gpu_transpose.cpp  | 31 ++++++++
 3 files changed, 102 insertions(+), 11 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
index 7947a27f48360..8b40a306922ee 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
@@ -21,9 +21,19 @@ class DomainMap {
   virtual ~DomainMap() = default;

   bool areExactMapped(IterDomain* id1, IterDomain* id2) const {
+    if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) {
+      return false;
+    }
     return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT);
   }

+  bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const {
+    if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) {
+      return false;
+    }
+    return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE);
+  }
+
   const ComputeAtMap& getComputeAtMap() const {
     return ca_map_;
   }
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 5ef502321b773..6a0a932481157 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -59,9 +59,42 @@ class DomainMap : public pointwise_utils::DomainMap {
   }

   int getPosMappedTo(TensorView* tv, IterDomain* id) const {
+    // Find the root id mapped to `id`
+    const auto& root_dom = tv->getRootDomain();
+    IterDomain* mapped_id = nullptr;
+    for (auto i : c10::irange(root_dom.size())) {
+      if (arePermissiveMapped(root_dom[i], id)) {
+        mapped_id = root_dom[i];
+        break;
+      }
+    }
+    TORCH_INTERNAL_ASSERT(
+        mapped_id != nullptr,
+        "Can not find ID mapped to ",
+        id,
+        " in tensor ",
+        tv);
+    // Project the root id to leaf id
+    while (!mapped_id->uses().empty()) {
+      TORCH_INTERNAL_ASSERT(mapped_id->uses().size() == 1);
+      auto expr = mapped_id->uses()[0];
+      if (expr->isA<Split>()) {
+        mapped_id = expr->as<Split>()->inner();
+      } else {
+        auto merge = expr->as<Merge>();
+        TORCH_INTERNAL_ASSERT(
+            mapped_id == merge->inner(),
+            "Can not find ID mapped to ",
+            id,
+            " in tensor ",
+            tv);
+        mapped_id = merge->out();
+      }
+    }
+    // Find the position of the leaf id
     const auto& dom = tv->domain()->domain();
     for (auto i : c10::irange(dom.size())) {
-      if (areExactMapped(id, tv->axis(i))) {
+      if (dom[i] == mapped_id) {
         return i;
       }
     }
@@ -240,22 +273,35 @@ void maybeBuildVirtualInnerDims(
   // both virtual innermost dim.
   // 2. The satisfied one did not merge in anything. For example,
   //   T0[I0{1024*1024}, I1{2}]
+  // If this is the case, this means that we need to split the large
+  // inner-most dimension to satisfy the small innermost dimension
   int64_t large_dim;
   int64_t split_factor;
+  bool split_inner_most;
   if (merged_size1 < params.tile_size1) {
     if (params.dims_merged_with_2.empty()) {
       // case 2
-      return;
+      split_inner_most = true;
+      large_dim = inner_most2;
+      split_factor = params.tile_size2;
+    } else {
+      // case 1
+      split_inner_most = false;
+      large_dim = params.dims_merged_with_2.back();
+      split_factor = ceilDiv(params.tile_size1, merged_size1);
     }
-    large_dim = params.dims_merged_with_2.back();
-    split_factor = ceilDiv(params.tile_size1, merged_size1);
   } else {
     if (params.dims_merged_with_1.empty()) {
       // case 2
-      return;
+      split_inner_most = true;
+      large_dim = inner_most1;
+      split_factor = params.tile_size1;
+    } else {
+      // case 1
+      split_inner_most = false;
+      large_dim = params.dims_merged_with_1.back();
+      split_factor = ceilDiv(params.tile_size2, merged_size2);
     }
-    large_dim = params.dims_merged_with_1.back();
-    split_factor = ceilDiv(params.tile_size2, merged_size2);
   }
   params.split_before_tiling.push_back({large_dim, split_factor});
   // adjust all dims to after-split
@@ -271,12 +317,16 @@ void maybeBuildVirtualInnerDims(
   }
   // Give the split-out dim to the unsatisfied one, so that both are satisfied.
   if (merged_size1 < params.tile_size1) {
-    params.dims_merged_with_2.pop_back();
-    params.dims_merged_with_2.push_back(large_dim + 1);
+    if (!split_inner_most) {
+      params.dims_merged_with_2.pop_back();
+      params.dims_merged_with_2.push_back(large_dim + 1);
+    }
     params.dims_merged_with_1.push_back(large_dim);
   } else {
-    params.dims_merged_with_1.pop_back();
-    params.dims_merged_with_1.push_back(large_dim + 1);
+    if (!split_inner_most) {
+      params.dims_merged_with_1.pop_back();
+      params.dims_merged_with_1.push_back(large_dim + 1);
+    }
     params.dims_merged_with_2.push_back(large_dim);
   }
 }
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
index b9d8e9d294782..d5823c22683c0 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
@@ -932,6 +932,37 @@ TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize3_CUDA) {
   testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__);
 }

+// x->sin->transpose->cos->y
+TEST_F(NVFuserTest, FusionScheduleTranspose2DSmallInnerSize_CUDA) {
+  std::array<std::vector<int64_t>, 2> shapes{
+      std::vector<int64_t>{1024 * 1024 * 128, 2},
+      std::vector<int64_t>{2, 1024 * 1024 * 128}};
+  for (const auto& shape : shapes) {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+
+    auto tv0 = makeContigTensor(2);
+    fusion.addInput(tv0);
+    auto tv1 = sin(tv0);
+    auto tv2 = transpose(tv1, 0, 1);
+    auto tv3 = cos(tv2);
+    fusion.addOutput(tv3);
+
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor input = at::randn(shape, options);
+
+    auto lparams = scheduleTranspose(&fusion, {input});
+
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {input}, lparams);
+    auto outputs = fe.runFusion({input}, lparams);
+
+    auto tv_ref = input.sin().transpose(0, 1).cos();
+
+    testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__);
+  }
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)

From 5a423a01d7181f883e2c49fb5da6be9674da9f52 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Thu, 25 Aug 2022 16:54:03 -0700
Subject: [PATCH 2/5] remove obsolete comment

---
 torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 6a0a932481157..0db554ebb9849 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -419,12 +419,6 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   if (n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize) {
     params->tile_size1 = 8;
     params->tile_size2 = 8;
-    // TODO: I was trying the following but I got silent wrong result
-    // params->tile_size1 = 8;
-    // params->tile_size2 = 4;
-    // This should not happen, because the correctness should be irrevalent to
-    // schedulers. We don't have to use tile size (8, 4), but we need to fix our
-    // bug in codegen.
   }

   // Expand inner-most dims to virtual inner-most dims so that the inner-most

From 79b7ce63d54344532e1467dd077b0a237fe00091 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Tue, 6 Sep 2022 11:45:40 -0700
Subject: [PATCH 3/5] getInnerLeafDim

---
 .../jit/codegen/cuda/scheduler/transpose.cpp | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index b197924d5d368..1cad4f3061eab 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -58,12 +58,12 @@ class DomainMap : public pointwise_utils::DomainMap {
         domain_map.findReferenceFor(grouped_inputs_outputs[1]) != nullptr;
   }

-  int getPosMappedTo(TensorView* tv, IterDomain* id) const {
-    // Find the root id mapped to `id`
+  int getInnerLeafDim(TensorView* tv, IterDomain* root_dim) const {
+    // Find the root id mapped to `root_dim`
     const auto& root_dom = tv->getRootDomain();
     IterDomain* mapped_id = nullptr;
     for (auto i : c10::irange(root_dom.size())) {
-      if (arePermissiveMapped(root_dom[i], id)) {
+      if (arePermissiveMapped(root_dom[i], root_dim)) {
         mapped_id = root_dom[i];
         break;
       }
@@ -71,7 +71,7 @@ class DomainMap : public pointwise_utils::DomainMap {
     TORCH_INTERNAL_ASSERT(
         mapped_id != nullptr,
         "Can not find ID mapped to ",
-        id,
+        root_dim,
         " in tensor ",
         tv);
     // Project the root id to leaf id
@@ -85,7 +85,7 @@ class DomainMap : public pointwise_utils::DomainMap {
         TORCH_INTERNAL_ASSERT(
             mapped_id == merge->inner(),
             "Can not find ID mapped to ",
-            id,
+            root_dim,
             " in tensor ",
             tv);
         mapped_id = merge->out();
@@ -99,7 +99,7 @@ class DomainMap : public pointwise_utils::DomainMap {
       }
     }
     TORCH_INTERNAL_ASSERT(
-        false, "Can not find ID mapped to ", id, " in tensor ", tv);
+        false, "Can not find ID mapped to ", root_dim, " in tensor ", tv);
   }

   // Group inputs and outputs of a fusion by its inner most domain. For example
@@ -427,9 +427,9 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2);

   auto inner_most_pos1_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id1);
+      domain_map.getInnerLeafDim(reference1, inner_most_id1);
   auto inner_most_pos2_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id2);
+      domain_map.getInnerLeafDim(reference1, inner_most_id2);

   // See note [Supporting small transpose dimensions]
   maybeBuildVirtualInnerDims(
@@ -687,9 +687,9 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) {
   // merge with inner most dims to get virtual inner most dims
   size_t inner_most_pos1_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id1);
+      domain_map.getInnerLeafDim(reference1, inner_most_id1);
   size_t inner_most_pos2_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id2);
+      domain_map.getInnerLeafDim(reference1, inner_most_id2);

   if (merged1.has_value()) {
     if (inner_most_pos1_in_ref1 < *merged1) {
       reference1->reorder(

From 32e954cf08606a9cffccaa5e42fd20e6d3bea60d Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Tue, 6 Sep 2022 12:20:40 -0700
Subject: [PATCH 4/5] fix

---
 torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 1cad4f3061eab..eaa6e8df1929a 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -288,7 +288,8 @@ void maybeBuildVirtualInnerDims(
       // case 1
       split_inner_most = false;
       large_dim = params.dims_merged_with_2.back();
-      split_factor = ceilDiv(params.tile_size1, merged_size1);
+      auto prev_merged_size2 = merged_size2 / shape_in_ref1[large_dim];
+      split_factor = ceilDiv(params.tile_size2, prev_merged_size2);
     }
   } else {
@@ -300,7 +301,8 @@ void maybeBuildVirtualInnerDims(
       // case 1
       split_inner_most = false;
       large_dim = params.dims_merged_with_1.back();
-      split_factor = ceilDiv(params.tile_size2, merged_size2);
+      auto prev_merged_size1 = merged_size1 / shape_in_ref1[large_dim];
+      split_factor = ceilDiv(params.tile_size1, prev_merged_size1);
     }
   }
   params.split_before_tiling.push_back({large_dim, split_factor});
   // adjust all dims to after-split

From fff3396f53c4468467e9d3204ecaaef60525fcb6 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Tue, 6 Sep 2022 12:31:43 -0700
Subject: [PATCH 5/5] remove thin wrapper

---
 .../jit/codegen/cuda/scheduler/pointwise_utils.h  | 14 --------------
 .../csrc/jit/codegen/cuda/scheduler/transpose.cpp |  3 ++-
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
index 8b40a306922ee..6cc4b1b8b93bd 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
@@ -20,20 +20,6 @@ class DomainMap {
   }
   virtual ~DomainMap() = default;

-  bool areExactMapped(IterDomain* id1, IterDomain* id2) const {
-    if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) {
-      return false;
-    }
-    return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT);
-  }
-
-  bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const {
-    if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) {
-      return false;
-    }
-    return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE);
-  }
-
   const ComputeAtMap& getComputeAtMap() const {
     return ca_map_;
   }
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index eaa6e8df1929a..bc8c3b4c71c99 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -63,7 +63,8 @@ class DomainMap : public pointwise_utils::DomainMap {
     const auto& root_dom = tv->getRootDomain();
     IterDomain* mapped_id = nullptr;
     for (auto i : c10::irange(root_dom.size())) {
-      if (arePermissiveMapped(root_dom[i], root_dim)) {
+      if (ca_map_.idGraph().permissiveNodes().permissiveAreMapped(
+              root_dom[i], root_dim)) {
         mapped_id = root_dom[i];
         break;
       }
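
Editorial note, not part of the series: the standalone sketch below only illustrates the "case 2" split-factor arithmetic that patch 1 adds to maybeBuildVirtualInnerDims and that the new FusionScheduleTranspose2DSmallInnerSize_CUDA test exercises. The variable names and hard-coded extents are assumptions chosen to mirror that test, not code from the scheduler.

// Illustrative sketch only: how a [1024*1024*128, 2] transpose can keep an
// 8x8 tile even though one group's inner-most extent is just 2.
#include <cstdint>
#include <iostream>

namespace {
// Same rounding-up division the scheduler uses when computing split factors.
int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}
} // namespace

int main() {
  // Extents mirroring the new test: T0[1024*1024*128, 2] with 8x8 tiles.
  const int64_t inner_size1 = 2; // inner-most extent of the small group
  const int64_t inner_size2 = 1024 * 1024 * 128; // inner-most extent of the large group
  const int64_t tile_size1 = 8;
  const int64_t tile_size2 = 8;

  if (inner_size1 < tile_size1) {
    // "Case 2": the small group cannot reach its tile size and the large
    // group has no extra dimension to donate, so the large group's
    // inner-most ID is split by its own tile size.
    const int64_t split_factor = tile_size2;
    const int64_t outer_extent = ceilDiv(inner_size2, split_factor);
    // The inner part of the split stays the large group's inner-most ID,
    // and the outer part is handed to the small group, whose virtual
    // inner-most ID then easily covers tile_size1.
    const int64_t virtual_inner1 = outer_extent * inner_size1;
    std::cout << "split: I0{" << inner_size2 << "} -> I0o{" << outer_extent
              << "} * I0i{" << split_factor << "}\n";
    std::cout << "virtual inner-most of the small group: I0o * I1{"
              << inner_size1 << "} = " << virtual_inner1 << "\n";
  }
  return 0;
}

Running the sketch prints the extents of the split and of the resulting virtual inner-most ID, which is roughly the shape the scheduler's virtual inner-most construction aims for on this input.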