From 63e3e762905a236c014505abe461211e12f3cfea Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Thu, 25 Aug 2022 11:45:19 -0700
Subject: [PATCH 1/5] Allow splitting inner-most ID to create virtual innermost ID

---
 .../codegen/cuda/scheduler/pointwise_utils.h  | 10 +++
 .../jit/codegen/cuda/scheduler/transpose.cpp  | 72 ++++++++++++++++---
 .../codegen/cuda/test/test_gpu_transpose.cpp  | 31 ++++++++
 3 files changed, 102 insertions(+), 11 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
index 7947a27f48360..8b40a306922ee 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
@@ -21,9 +21,19 @@ class DomainMap {
   virtual ~DomainMap() = default;

   bool areExactMapped(IterDomain* id1, IterDomain* id2) const {
+    if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) {
+      return false;
+    }
     return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT);
   }

+  bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const {
+    if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) {
+      return false;
+    }
+    return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE);
+  }
+
   const ComputeAtMap& getComputeAtMap() const {
     return ca_map_;
   }
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 5ef502321b773..6a0a932481157 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -59,9 +59,42 @@ class DomainMap : public pointwise_utils::DomainMap {
   }

   int getPosMappedTo(TensorView* tv, IterDomain* id) const {
+    // Find the root id mapped to `id`
+    const auto& root_dom = tv->getRootDomain();
+    IterDomain* mapped_id = nullptr;
+    for (auto i : c10::irange(root_dom.size())) {
+      if (arePermissiveMapped(root_dom[i], id)) {
+        mapped_id = root_dom[i];
+        break;
+      }
+    }
+    TORCH_INTERNAL_ASSERT(
+        mapped_id != nullptr,
+        "Can not find ID mapped to ",
+        id,
+        " in tensor ",
+        tv);
+    // Project the root id to leaf id
+    while (!mapped_id->uses().empty()) {
+      TORCH_INTERNAL_ASSERT(mapped_id->uses().size() == 1);
+      auto expr = mapped_id->uses()[0];
+      if (expr->isA<Split>()) {
+        mapped_id = expr->as<Split>()->inner();
+      } else {
+        auto merge = expr->as<Merge>();
+        TORCH_INTERNAL_ASSERT(
+            mapped_id == merge->inner(),
+            "Can not find ID mapped to ",
+            id,
+            " in tensor ",
+            tv);
+        mapped_id = merge->out();
+      }
+    }
+    // Find the position of the leaf id
     const auto& dom = tv->domain()->domain();
     for (auto i : c10::irange(dom.size())) {
-      if (areExactMapped(id, tv->axis(i))) {
+      if (dom[i] == mapped_id) {
         return i;
       }
     }
@@ -240,22 +273,35 @@ void maybeBuildVirtualInnerDims(
   // both virtual innermost dim.
   // 2. The satisfied one did not merge in anything. For example,
   //   T0[I0{1024*1024}, I1{2}]
+  // If this is the case, this means that we need to split the large
+  // inner-most dimension to satisfy the small innermost dimension
   int64_t large_dim;
   int64_t split_factor;
+  bool split_inner_most;
   if (merged_size1 < params.tile_size1) {
     if (params.dims_merged_with_2.empty()) {
       // case 2
-      return;
+      split_inner_most = true;
+      large_dim = inner_most2;
+      split_factor = params.tile_size2;
+    } else {
+      // case 1
+      split_inner_most = false;
+      large_dim = params.dims_merged_with_2.back();
+      split_factor = ceilDiv(params.tile_size1, merged_size1);
     }
-    large_dim = params.dims_merged_with_2.back();
-    split_factor = ceilDiv(params.tile_size1, merged_size1);
   } else {
     if (params.dims_merged_with_1.empty()) {
       // case 2
-      return;
+      split_inner_most = true;
+      large_dim = inner_most1;
+      split_factor = params.tile_size1;
+    } else {
+      // case 1
+      split_inner_most = false;
+      large_dim = params.dims_merged_with_1.back();
+      split_factor = ceilDiv(params.tile_size2, merged_size2);
     }
-    large_dim = params.dims_merged_with_1.back();
-    split_factor = ceilDiv(params.tile_size2, merged_size2);
   }
   params.split_before_tiling.push_back({large_dim, split_factor});
   // adjust all dims to after-split
@@ -271,12 +317,16 @@ void maybeBuildVirtualInnerDims(
   }
   // Give the split-out dim to the unsatisfied one, so that both are satisfied.
   if (merged_size1 < params.tile_size1) {
-    params.dims_merged_with_2.pop_back();
-    params.dims_merged_with_2.push_back(large_dim + 1);
+    if (!split_inner_most) {
+      params.dims_merged_with_2.pop_back();
+      params.dims_merged_with_2.push_back(large_dim + 1);
+    }
     params.dims_merged_with_1.push_back(large_dim);
   } else {
-    params.dims_merged_with_1.pop_back();
-    params.dims_merged_with_1.push_back(large_dim + 1);
+    if (!split_inner_most) {
+      params.dims_merged_with_1.pop_back();
+      params.dims_merged_with_1.push_back(large_dim + 1);
+    }
     params.dims_merged_with_2.push_back(large_dim);
   }
 }
diff --git a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
index b9d8e9d294782..d5823c22683c0 100644
--- a/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp
@@ -932,6 +932,37 @@ TEST_F(NVFuserTest, FusionScheduleTransposeSmallInnerSize3_CUDA) {
   testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__);
 }

+// x->sin->transpose->cos->y
+TEST_F(NVFuserTest, FusionScheduleTranspose2DSmallInnerSize_CUDA) {
+  std::array<std::vector<int64_t>, 2> shapes{
+      std::vector<int64_t>{1024 * 1024 * 128, 2},
+      std::vector<int64_t>{2, 1024 * 1024 * 128}};
+  for (const auto& shape : shapes) {
+    Fusion fusion;
+    FusionGuard fg(&fusion);
+
+    auto tv0 = makeContigTensor(2);
+    fusion.addInput(tv0);
+    auto tv1 = sin(tv0);
+    auto tv2 = transpose(tv1, 0, 1);
+    auto tv3 = cos(tv2);
+    fusion.addOutput(tv3);
+
+    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
+    at::Tensor input = at::randn(shape, options);
+
+    auto lparams = scheduleTranspose(&fusion, {input});
+
+    FusionExecutor fe;
+    fe.compileFusion(&fusion, {input}, lparams);
+    auto outputs = fe.runFusion({input}, lparams);
+
+    auto tv_ref = input.sin().transpose(0, 1).cos();
+
+    testValidate(&fusion, outputs, {input}, {tv_ref}, __LINE__, __FILE__);
+  }
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)

From 5a423a01d7181f883e2c49fb5da6be9674da9f52 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Thu, 25 Aug 2022 16:54:03 -0700
Subject: [PATCH 2/5] remove obsolete comment

---
 torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 6a0a932481157..0db554ebb9849 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -419,12 +419,6 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   if (n_elems < device_multiprocessor_count * kMaxTileSize * kMaxTileSize) {
     params->tile_size1 = 8;
     params->tile_size2 = 8;
-    // TODO: I was trying the following but I got silent wrong result
-    // params->tile_size1 = 8;
-    // params->tile_size2 = 4;
-    // This should not happen, because the correctness should be irrevalent to
-    // schedulers. We don't have to use tile size (8, 4), but we need to fix our
-    // bug in codegen.
   }

   // Expand inner-most dims to virtual inner-most dims so that the inner-most

From 79b7ce63d54344532e1467dd077b0a237fe00091 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Tue, 6 Sep 2022 11:45:40 -0700
Subject: [PATCH 3/5] getInnerLeafDim

---
 .../jit/codegen/cuda/scheduler/transpose.cpp | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index b197924d5d368..1cad4f3061eab 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -58,12 +58,12 @@ class DomainMap : public pointwise_utils::DomainMap {
         domain_map.findReferenceFor(grouped_inputs_outputs[1]) != nullptr;
   }

-  int getPosMappedTo(TensorView* tv, IterDomain* id) const {
-    // Find the root id mapped to `id`
+  int getInnerLeafDim(TensorView* tv, IterDomain* root_dim) const {
+    // Find the root id mapped to `root_dim`
     const auto& root_dom = tv->getRootDomain();
     IterDomain* mapped_id = nullptr;
     for (auto i : c10::irange(root_dom.size())) {
-      if (arePermissiveMapped(root_dom[i], id)) {
+      if (arePermissiveMapped(root_dom[i], root_dim)) {
         mapped_id = root_dom[i];
         break;
       }
@@ -71,7 +71,7 @@ class DomainMap : public pointwise_utils::DomainMap {
     TORCH_INTERNAL_ASSERT(
         mapped_id != nullptr,
         "Can not find ID mapped to ",
-        id,
+        root_dim,
         " in tensor ",
         tv);
     // Project the root id to leaf id
@@ -85,7 +85,7 @@ class DomainMap : public pointwise_utils::DomainMap {
         TORCH_INTERNAL_ASSERT(
             mapped_id == merge->inner(),
             "Can not find ID mapped to ",
-            id,
+            root_dim,
             " in tensor ",
             tv);
         mapped_id = merge->out();
@@ -99,7 +99,7 @@ class DomainMap : public pointwise_utils::DomainMap {
       }
     }
     TORCH_INTERNAL_ASSERT(
-        false, "Can not find ID mapped to ", id, " in tensor ", tv);
+        false, "Can not find ID mapped to ", root_dim, " in tensor ", tv);
   }

   // Group inputs and outputs of a fusion by its inner most domain. For example
@@ -427,9 +427,9 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   auto inner_most_id2 = scheduler_utils::innerMostRootDim(reference2);

   auto inner_most_pos1_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id1);
+      domain_map.getInnerLeafDim(reference1, inner_most_id1);
   auto inner_most_pos2_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id2);
+      domain_map.getInnerLeafDim(reference1, inner_most_id2);

   // See note [Supporting small transpose dimensions]
   maybeBuildVirtualInnerDims(
@@ -687,9 +687,9 @@ void scheduleTranspose(Fusion* fusion, TransposeParams params) {
   // merge with inner most dims to get virtual inner most dims
   size_t inner_most_pos1_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id1);
+      domain_map.getInnerLeafDim(reference1, inner_most_id1);
   size_t inner_most_pos2_in_ref1 =
-      domain_map.getPosMappedTo(reference1, inner_most_id2);
+      domain_map.getInnerLeafDim(reference1, inner_most_id2);

   if (merged1.has_value()) {
     if (inner_most_pos1_in_ref1 < *merged1) {
       reference1->reorder(

From 32e954cf08606a9cffccaa5e42fd20e6d3bea60d Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Tue, 6 Sep 2022 12:20:40 -0700
Subject: [PATCH 4/5] fix

---
 torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 1cad4f3061eab..eaa6e8df1929a 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -288,7 +288,8 @@ void maybeBuildVirtualInnerDims(
       // case 1
       split_inner_most = false;
       large_dim = params.dims_merged_with_2.back();
-      split_factor = ceilDiv(params.tile_size1, merged_size1);
+      auto prev_merged_size2 = merged_size2 / shape_in_ref1[large_dim];
+      split_factor = ceilDiv(params.tile_size2, prev_merged_size2);
     }
   } else {
@@ -300,7 +301,8 @@ void maybeBuildVirtualInnerDims(
       // case 1
       split_inner_most = false;
       large_dim = params.dims_merged_with_1.back();
-      split_factor = ceilDiv(params.tile_size2, merged_size2);
+      auto prev_merged_size1 = merged_size1 / shape_in_ref1[large_dim];
+      split_factor = ceilDiv(params.tile_size1, prev_merged_size1);
     }
   }
   params.split_before_tiling.push_back({large_dim, split_factor});
   // adjust all dims to after-split

From fff3396f53c4468467e9d3204ecaaef60525fcb6 Mon Sep 17 00:00:00 2001
From: Xiang Gao
Date: Tue, 6 Sep 2022 12:31:43 -0700
Subject: [PATCH 5/5] remove thin wrapper

---
 .../jit/codegen/cuda/scheduler/pointwise_utils.h  | 14 --------------
 .../csrc/jit/codegen/cuda/scheduler/transpose.cpp |  3 ++-
 2 files changed, 2 insertions(+), 15 deletions(-)

diff --git a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
index 8b40a306922ee..6cc4b1b8b93bd 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
+++ b/torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
@@ -20,20 +20,6 @@ class DomainMap {
   }
   virtual ~DomainMap() = default;

-  bool areExactMapped(IterDomain* id1, IterDomain* id2) const {
-    if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) {
-      return false;
-    }
-    return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT);
-  }
-
-  bool arePermissiveMapped(IterDomain* id1, IterDomain* id2) const {
-    if (!ca_map_.idExistsInMap(id1) || !ca_map_.idExistsInMap(id2)) {
-      return false;
-    }
-    return ca_map_.areMapped(id1, id2, IdMappingMode::PERMISSIVE);
-  }
-
   const ComputeAtMap& getComputeAtMap() const {
     return ca_map_;
   }
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index eaa6e8df1929a..bc8c3b4c71c99 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -63,7 +63,8 @@ class DomainMap : public pointwise_utils::DomainMap {
     const auto& root_dom = tv->getRootDomain();
     IterDomain* mapped_id = nullptr;
     for (auto i : c10::irange(root_dom.size())) {
-      if (arePermissiveMapped(root_dom[i], root_dim)) {
+      if (ca_map_.idGraph().permissiveNodes().permissiveAreMapped(
+              root_dom[i], root_dim)) {
         mapped_id = root_dom[i];
         break;
       }
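
Editorial note, not part of the series: the standalone sketch below only illustrates the "case 2" split-factor arithmetic that patch 1 adds to maybeBuildVirtualInnerDims and that the new FusionScheduleTranspose2DSmallInnerSize_CUDA test exercises. The variable names and hard-coded extents are assumptions chosen to mirror that test, not code from the scheduler.

// Illustrative sketch only: how a [1024*1024*128, 2] transpose can keep an
// 8x8 tile even though one group's inner-most extent is just 2.
#include <cstdint>
#include <iostream>

namespace {
// Same rounding-up division the scheduler uses when computing split factors.
int64_t ceilDiv(int64_t a, int64_t b) {
  return (a + b - 1) / b;
}
} // namespace

int main() {
  // Extents mirroring the new test: T0[1024*1024*128, 2] with 8x8 tiles.
  const int64_t inner_size1 = 2; // inner-most extent of the small group
  const int64_t inner_size2 = 1024 * 1024 * 128; // inner-most extent of the large group
  const int64_t tile_size1 = 8;
  const int64_t tile_size2 = 8;

  if (inner_size1 < tile_size1) {
    // "Case 2": the small group cannot reach its tile size and the large
    // group has no extra dimension to donate, so the large group's
    // inner-most ID is split by its own tile size.
    const int64_t split_factor = tile_size2;
    const int64_t outer_extent = ceilDiv(inner_size2, split_factor);
    // The inner part of the split stays the large group's inner-most ID,
    // and the outer part is handed to the small group, whose virtual
    // inner-most ID then easily covers tile_size1.
    const int64_t virtual_inner1 = outer_extent * inner_size1;
    std::cout << "split: I0{" << inner_size2 << "} -> I0o{" << outer_extent
              << "} * I0i{" << split_factor << "}\n";
    std::cout << "virtual inner-most of the small group: I0o * I1{"
              << inner_size1 << "} = " << virtual_inner1 << "\n";
  }
  return 0;
}

Running the sketch prints the extents of the split and of the resulting virtual inner-most ID, which is roughly the shape the scheduler's virtual inner-most construction aims for on this input.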