Merged

Commits (64)
bb3940f
Move MaxProducerPosUpdater into InlinePropagator::tearDown
zasdfgbnm Jul 14, 2022
644b81d
cleanup
zasdfgbnm Jul 15, 2022
d552315
add null scheduler and matching for "statically null"
shmsong Jul 15, 2022
0ea589f
Merge branch 'devel' of github.com:csarofeen/pytorch into ip-td
zasdfgbnm Jul 16, 2022
07fe5b7
Merge branch 'ip-td' into transpose-schedule
zasdfgbnm Jul 17, 2022
09a9843
save
zasdfgbnm Jul 17, 2022
887d9fa
draft that compiles
zasdfgbnm Jul 21, 2022
225a63e
save
zasdfgbnm Jul 21, 2022
9f533e9
save
zasdfgbnm Jul 21, 2022
6c52fe7
More test
zasdfgbnm Jul 21, 2022
e5b513a
cleanup tests
zasdfgbnm Jul 21, 2022
f993007
save
zasdfgbnm Jul 21, 2022
96bdcb5
fix
zasdfgbnm Jul 21, 2022
0dc891c
Merge branch 'devel' of github.com:csarofeen/pytorch into transpose-s…
zasdfgbnm Jul 21, 2022
858d6ce
more test
zasdfgbnm Jul 21, 2022
7df6e2f
new
zasdfgbnm Jul 22, 2022
9b0f02b
fix
zasdfgbnm Jul 22, 2022
419d4be
Merge branch 'devel' of github.com:csarofeen/pytorch into transpose-s…
zasdfgbnm Jul 22, 2022
0c9cdd4
fix
zasdfgbnm Jul 22, 2022
7d28da1
save
zasdfgbnm Jul 22, 2022
498ab05
lint
zasdfgbnm Jul 22, 2022
b1ed2a7
cleanup
zasdfgbnm Jul 22, 2022
a6e2215
writings
zasdfgbnm Jul 22, 2022
3d94ecc
save
zasdfgbnm Jul 22, 2022
536817a
save
zasdfgbnm Jul 22, 2022
eb97cbd
save
zasdfgbnm Jul 22, 2022
000840c
Merge branch 'devel' of github.com:csarofeen/pytorch into transpose-s…
zasdfgbnm Jul 25, 2022
275fa94
fix conflicts
zasdfgbnm Jul 25, 2022
51346fe
cleanups
zasdfgbnm Jul 25, 2022
48ecc4b
save
zasdfgbnm Jul 25, 2022
3ce7360
// TODO: support symbolic tile size
zasdfgbnm Jul 25, 2022
9bbf0cf
save
zasdfgbnm Jul 25, 2022
52e4d63
fix
zasdfgbnm Jul 26, 2022
595b5bf
fix
zasdfgbnm Jul 26, 2022
4fc2223
inline-propagator most inlined
zasdfgbnm Jul 26, 2022
2e8ce81
cleanup
zasdfgbnm Jul 26, 2022
ccfe5db
Merge branch 'devel' of github.com:csarofeen/pytorch into transpose-s…
zasdfgbnm Jul 27, 2022
276f9d5
save
zasdfgbnm Jul 27, 2022
ff54dc5
save
zasdfgbnm Jul 27, 2022
7d91989
add cache
zasdfgbnm Jul 27, 2022
84f25f4
reject trivial reduction and view in canScheduleCompileTime
zasdfgbnm Jul 27, 2022
47d516a
reorder all_heuristics
zasdfgbnm Jul 27, 2022
aa2752d
pushing some failing tests
zasdfgbnm Jul 28, 2022
d0a026b
fix reference tensor finding
zasdfgbnm Jul 28, 2022
9205565
make broadcasting test work
zasdfgbnm Jul 28, 2022
cacd1f7
cleanup
zasdfgbnm Jul 28, 2022
fc65d23
Merge branch 'devel' of github.com:csarofeen/pytorch into transpose-s…
zasdfgbnm Jul 29, 2022
cd01c0f
revert
zasdfgbnm Jul 29, 2022
9e5d394
clean
zasdfgbnm Jul 29, 2022
821d027
enable view without testing
zasdfgbnm Jul 29, 2022
1b425d2
merge all dims
zasdfgbnm Jul 29, 2022
eab982d
disable FusionScheduleTransposeBroadcast_CUDA
zasdfgbnm Jul 29, 2022
9dd5aac
cleanup & simplify things
zasdfgbnm Jul 31, 2022
d7f0ea4
Merge branch 'devel' of github.com:csarofeen/pytorch into transpose-s…
zasdfgbnm Aug 1, 2022
c19bbf1
Merge branch 'transpose-schedule' of github.com:csarofeen/pytorch int…
zasdfgbnm Aug 1, 2022
a5861a0
skip FusionScheduleTransposeBroadcast_CUDA
zasdfgbnm Aug 1, 2022
65598b2
war for transpose split support
shmsong Aug 2, 2022
9bea3bf
fix
zasdfgbnm Aug 4, 2022
55c8298
FusionScheduleTransposeComplexDAG1_CUDA
zasdfgbnm Aug 4, 2022
4dfd569
Merge branch 'devel' of github.com:csarofeen/pytorch into transpose-s…
zasdfgbnm Aug 11, 2022
6881cc6
manual test
zasdfgbnm Aug 11, 2022
4539019
save
zasdfgbnm Aug 11, 2022
9545036
save
zasdfgbnm Aug 11, 2022
6c369cf
save
zasdfgbnm Aug 11, 2022
1 change: 1 addition & 0 deletions build_variables.bzl
@@ -718,6 +718,7 @@ libtorch_cuda_core_sources = [
"torch/csrc/jit/codegen/cuda/root_domain_map.cpp",
"torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp",
"torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.cpp",
"torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp",
"torch/csrc/jit/codegen/cuda/scheduler/normalization.cpp",
"torch/csrc/jit/codegen/cuda/scheduler/reduction.cpp",
"torch/csrc/jit/codegen/cuda/scheduler/matmul.cpp",
1 change: 1 addition & 0 deletions test/cpp/jit/CMakeLists.txt
@@ -101,6 +101,7 @@ if(USE_CUDA)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensorcore.cpp)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_view.cpp)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_transpose.cpp)
list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_rng.cu)
endif()

4 changes: 3 additions & 1 deletion torch/csrc/jit/codegen/cuda/scheduler/all_schedulers.h
@@ -2,6 +2,7 @@
#include <torch/csrc/jit/codegen/cuda/scheduler/normalization.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/reduction.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>

namespace torch {
namespace jit {
@@ -12,7 +13,8 @@ enum class TORCH_CUDA_CU_API ScheduleHeuristic {
None,
PointWise,
Reduction,
Persistent
Persistent,
Transpose
};
}
} // namespace fuser
10 changes: 10 additions & 0 deletions torch/csrc/jit/codegen/cuda/scheduler/compile_time_info.h
@@ -28,6 +28,7 @@ enum class CompileTimeEntryType {
DOMAIN_MAP,
REFERENCE_TENSORS,
VECTORIZABLE_INPUTS_AND_OUTPUTS,
INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS,
UNROLLABLE_INPUTS_AND_OUTPUTS,
REDUCTION_TVS,
PERSISTENT_BUFFER_INFO,
@@ -62,6 +63,15 @@ class VectorizableInputsAndOutputs {
CompileTimeEntryType::VECTORIZABLE_INPUTS_AND_OUTPUTS;
};

//! Entry type definition class for `INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS`,
//! stores the fusion's inputs and outputs grouped by inner most dimension.
class InputsOutputsInnerDimGroups {
public:
using DataType = std::vector<std::vector<TensorView*>>;
static const CompileTimeEntryType EntryType =
CompileTimeEntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS;
};

//! Entry type definition class for `UNROLLABLE_INPUTS_AND_OUTPUTS`,
//! stores the unrollable TensorViews on a fusion's inputs and outputs.
class UnrollableInputsAndOutputs {
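To illustrate what the new INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS entry caches, here is a minimal sketch of grouping fusion inputs and outputs by the exact-mapped concrete ID of their innermost dimension. It only uses calls that appear elsewhere in this diff (getMaybeRFactorDomain, getConcreteMappedID, ir_utils::filterByType); the function name groupByInnerDim and the grouping details (e.g. how broadcasts are handled) are assumptions for illustration, not the code added by this PR.

// Illustrative sketch only: group tensors by the concrete ID of their
// innermost root/rfactor dimension, the shape of data cached under
// INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS.
std::vector<std::vector<TensorView*>> groupByInnerDim(
    Fusion* fusion,
    const ComputeAtMap& ca_map) {
  std::unordered_map<IterDomain*, std::vector<TensorView*>> groups;
  auto collect = [&](TensorView* tv) {
    auto dom = tv->getMaybeRFactorDomain();
    if (dom.empty()) {
      return; // zero-dim tensor: nothing to group by
    }
    // Tensors whose innermost dimensions are exact-mapped share a group.
    auto inner_id =
        ca_map.getConcreteMappedID(dom.back(), IdMappingMode::EXACT);
    groups[inner_id].push_back(tv);
  };
  for (auto tv : ir_utils::filterByType<TensorView>(fusion->inputs())) {
    collect(tv);
  }
  for (auto tv : ir_utils::filterByType<TensorView>(fusion->outputs())) {
    collect(tv);
  }
  std::vector<std::vector<TensorView*>> result;
  for (auto& kv : groups) {
    result.push_back(std::move(kv.second));
  }
  return result;
}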
24 changes: 1 addition & 23 deletions torch/csrc/jit/codegen/cuda/scheduler/pointwise.cpp
@@ -6,6 +6,7 @@
#include <torch/csrc/jit/codegen/cuda/ir_iostream.h>
#include <torch/csrc/jit/codegen/cuda/ir_utils.h>
#include <torch/csrc/jit/codegen/cuda/lower_utils.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.h>
@@ -57,29 +58,6 @@ class DomainMap : public pointwise_utils::DomainMap {
return domain_map.findReferenceTensorView() != nullptr;
}

// Determine if output TensorView is a valid reference tensor for this fusion.
// The reference tensor must map to all the iterDomains in each input.
bool isValidReference(TensorView* output_tv) const {
if (output_tv->isFusionInput()) {
return false;
}
for (auto input_tv :
ir_utils::filterByType<TensorView>(fusion_->inputs())) {
if (input_tv->uses().empty()) {
continue;
}

if (fusion_->getOutputAlias(output_tv) == input_tv) {
continue;
}

if (!areAllInputIdsMappedToOutput(input_tv, output_tv)) {
return false;
}
}
return true;
}

private:
bool hasMinimumSize(TensorView* tv, int num_axes) const {
TORCH_INTERNAL_ASSERT(tv != nullptr);
46 changes: 24 additions & 22 deletions torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.cpp
@@ -6,19 +6,9 @@ namespace fuser {
namespace cuda {
namespace pointwise_utils {

DomainMap::DomainMap(Fusion* fusion)
: fusion_(fusion), ca_map_(ComputeAtMap(fusion)) {
view_tvs_ = scheduler_utils::getViewTVs(fusion);
}

bool DomainMap::areExactMapped(IterDomain* id1, IterDomain* id2) {
return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT);
}

// Determine if all IterDomains in input are mapped to output
bool DomainMap::areAllInputIdsMappedToOutput(
TensorView* input_tv,
TensorView* output_tv) const {
// Determine if all IterDomains in input are mapped to the given tensor
bool DomainMap::areAllInputIdsMappedTo(TensorView* input_tv, TensorView* tv)
const {
// Get concrete IDs for input root or rfactor domain
std::unordered_set<IterDomain*> in_concrete_ids;
for (auto in_id : input_tv->getMaybeRFactorDomain()) {
@@ -30,11 +20,9 @@ bool DomainMap::areAllInputIdsMappedToOutput(

// Erase all input concrete IDs mapped to the output domain
// Ignore unresolved broadcast dimensions
for (auto out_id : output_tv->getMaybeRFactorDomain()) {
if (!out_id->isBroadcast()) {
if (!eraseIfMapped(in_concrete_ids, out_id)) {
eraseIfInputMappedThroughViewToOutput(in_concrete_ids, out_id);
}
for (auto id : tv->getMaybeRFactorDomain()) {
if (!eraseIfMapped(in_concrete_ids, id)) {
eraseIfInputMappedThroughViewTo(in_concrete_ids, id);
}
}
return in_concrete_ids.empty();
@@ -45,7 +33,7 @@ bool DomainMap::eraseIfMapped(
std::unordered_set<IterDomain*>& in_concrete_ids,
IterDomain* out_id) const {
auto out_concrete_id =
ca_map_.getConcreteMappedID(out_id, IdMappingMode::EXACT);
ca_map_.getConcreteMappedID(out_id, IdMappingMode::PERMISSIVE);
auto in_concrete_id_iter = in_concrete_ids.find(out_concrete_id);
bool found_match = in_concrete_id_iter != in_concrete_ids.end();
if (found_match) {
@@ -58,12 +46,12 @@ bool DomainMap::eraseIfMapped(
// Currently this function only allow having one view on the path from input to
// output. If there are multiple views, then likely the pointwise scheduler will
// reject the fusion because we can not correctly find a reference tensor.
void DomainMap::eraseIfInputMappedThroughViewToOutput(
void DomainMap::eraseIfInputMappedThroughViewTo(
std::unordered_set<IterDomain*>& in_concrete_ids,
IterDomain* out_id) const {
IterDomain* id) const {
for (auto view : view_tvs_) {
// Find any ID in view rfactor domain that is mapped to output ID
auto view_rfactor_id = anyMapped(view->getRFactorDomain(), out_id);
auto view_rfactor_id = anyMapped(view->getRFactorDomain(), id);
if (view_rfactor_id == nullptr) {
continue;
}
@@ -94,6 +82,20 @@ IterDomain* DomainMap::anyMapped(
return nullptr;
}

// Determine if output TensorView is a valid reference tensor for this fusion.
// The reference tensor must map to all the iterDomains in each input.
bool DomainMap::isValidReference(TensorView* tv) const {
for (auto input_tv : ir_utils::filterByType<TensorView>(fusion_->inputs())) {
if (input_tv->uses().empty()) {
continue;
}
if (!areAllInputIdsMappedTo(input_tv, tv)) {
return false;
}
}
return true;
}

} // namespace pointwise_utils
} // namespace cuda
} // namespace fuser
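For context on the relocated DomainMap::isValidReference, a minimal sketch of how a scheduler might use it to pick a reference TensorView. This mirrors what findReferenceTensorView in pointwise.cpp is used for, but the code below is an assumption for illustration, not taken from this PR.

// Sketch: pick the first fusion output that maps to all input iterDomains.
TensorView* findReference(Fusion* fusion) {
  pointwise_utils::DomainMap domain_map(fusion);
  for (auto output_tv :
       ir_utils::filterByType<TensorView>(fusion->outputs())) {
    // Outputs that are also fusion inputs cannot serve as a reference.
    if (output_tv->isFusionInput()) {
      continue;
    }
    if (domain_map.isValidReference(output_tv)) {
      return output_tv;
    }
  }
  return nullptr; // no valid reference: the scheduler should reject the fusion
}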
22 changes: 15 additions & 7 deletions torch/csrc/jit/codegen/cuda/scheduler/pointwise_utils.h
@@ -15,29 +15,37 @@ namespace pointwise_utils {
// that maps to all IterDomains in the fusion.
class DomainMap {
public:
DomainMap(Fusion* fusion);
DomainMap(Fusion* fusion) : fusion_(fusion), ca_map_(fusion) {
view_tvs_ = scheduler_utils::getViewTVs(fusion);
}
virtual ~DomainMap() = default;

bool areExactMapped(IterDomain* id1, IterDomain* id2);
bool areExactMapped(IterDomain* id1, IterDomain* id2) const {
return ca_map_.areMapped(id1, id2, IdMappingMode::EXACT);
}

const ComputeAtMap& getComputeAtMap() const {
return ca_map_;
}

// Determine if a TensorView is a valid reference tensor for this fusion.
// The reference tensor must map to all the iterDomains in each input.
bool isValidReference(TensorView* tv) const;

protected:
// Determine if all iterDomains are mapped between input and output tvs
bool areAllInputIdsMappedToOutput(TensorView* input_tv, TensorView* output_tv)
// Determine if all IterDomains are mapped between input and the given tvs
bool areAllInputIdsMappedTo(TensorView* input_tv, TensorView* output_tv)
const;

// Erase input concrete ID if it is mapped to output ID
bool eraseIfMapped(
std::unordered_set<IterDomain*>& in_concrete_ids,
IterDomain* out_id) const;

// Check if in_id is mapped to out_id through any view rfactor domain
void eraseIfInputMappedThroughViewToOutput(
// Check if in_id is mapped to id through any view rfactor domain
void eraseIfInputMappedThroughViewTo(
std::unordered_set<IterDomain*>& in_concrete_ids,
IterDomain* out_id) const;
IterDomain* id) const;

// Find any id in domain that maps with target id
IterDomain* anyMapped(
86 changes: 86 additions & 0 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -9,6 +9,7 @@
#include <torch/csrc/jit/codegen/cuda/scheduler/debug_utils.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/pointwise.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/registry.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/transpose.h>
#include <torch/csrc/jit/codegen/cuda/scheduler/utils.h>

#include <limits>
@@ -1244,10 +1245,75 @@ class PersistentKernelScheduler : public SchedulerEntry {
}
};

class TransposeScheduler : public SchedulerEntry {
public:
explicit TransposeScheduler(
Fusion* fusion,
SchedulerRuntimeInfo& runtime_info,
HeuristicSummary* data_cache = nullptr)
: SchedulerEntry(ScheduleHeuristic::Transpose) {
computeHeuristics(fusion, runtime_info, data_cache);
}

static bool canScheduleCompileTime(Fusion* fusion) {
// Not enabling this yet. Needs more validation.
return false;
#if 0
if (!hasAtLeastTwoValidGroups(fusion)) {
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::Transpose,
"cannot find two mismatching inner most dimensions");
return false;
}

// TODO: add support for trivial reduction
auto reduction_ops =
ir_utils::getReductionOps(fusion, false /* ignore_trivial */);

if (!reduction_ops.empty()) {
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::Transpose, "no support for reduction ops");
return false;
}

if (hasNonUniqueBcast(fusion)) {
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::Transpose,
"Broadcasting dimension might be broadcasting to multiple sizes.");
return false;
}

return true;
#endif
}

static bool canScheduleRunTime(
Fusion* fusion,
SchedulerRuntimeInfo& runtime_info,
HeuristicSummary* data_cache = nullptr) {
return true;
[inline review comment on this line]
Might want to evaluate a bit in some size-dependent scenarios. Maybe when the inner dimensions are small.
But this function is hot path so the simpler the better.
}

void schedule(Fusion* fusion) override {
FUSER_PERF_SCOPE("Schedule Transpose Fusion");
scheduleTranspose(fusion, transposeParams());
}

private:
void computeHeuristics(
Fusion* fusion,
SchedulerRuntimeInfo& runtime_info,
HeuristicSummary* data_cache = nullptr) {
params_ = getTransposeHeuristics(fusion, runtime_info, data_cache);
TORCH_INTERNAL_ASSERT(params_ != nullptr);
}
};

// Schedule Table
const std::vector<ScheduleHeuristic>& all_heuristics() {
static const std::vector<ScheduleHeuristic> hlist = {
ScheduleHeuristic::Reduction,
ScheduleHeuristic::Transpose,
ScheduleHeuristic::PointWise,
ScheduleHeuristic::Persistent};
return hlist;
@@ -1294,6 +1360,9 @@ bool SchedulerEntry::canSchedule(
case ScheduleHeuristic::Persistent:
return checkCanSchedule<PersistentKernelScheduler>(
fusion, runtime_info, data_cache);
case ScheduleHeuristic::Transpose:
return checkCanSchedule<TransposeScheduler>(
fusion, runtime_info, data_cache);
default:
TORCH_INTERNAL_ASSERT(false, "unreachable");
return false;
@@ -1320,6 +1389,10 @@ std::unique_ptr<SchedulerEntry> SchedulerEntry::makeEntry(
scheduler_entry = std::make_unique<PersistentKernelScheduler>(
fusion, runtime_info, data_cache);
break;
case ScheduleHeuristic::Transpose:
scheduler_entry = std::make_unique<TransposeScheduler>(
fusion, runtime_info, data_cache);
break;
default:
TORCH_INTERNAL_ASSERT(false, "unreachable");
}
@@ -1353,6 +1426,8 @@ std::string toString(ScheduleHeuristic sh) {
return "reduction";
case ScheduleHeuristic::Persistent:
return "persistent";
case ScheduleHeuristic::Transpose:
return "transpose";
default:
TORCH_INTERNAL_ASSERT(false, "undefined schedule");
}
@@ -1405,6 +1480,10 @@ HeuristicSummary::HeuristicSummary(
getPersistentHeuristics(fusion, runtime_info, this);
PersistentKernelScheduler::canScheduleRunTime(fusion, runtime_info, this);
break;
case ScheduleHeuristic::Transpose:
getTransposeHeuristics(fusion, runtime_info, this);
TransposeScheduler::canScheduleRunTime(fusion, runtime_info, this);
break;
default:
TORCH_INTERNAL_ASSERT(false, "unknown heuristic");
}
@@ -1451,6 +1530,11 @@ void HeuristicSummary::validate() const {
entry_type_map_.count(EntryType::SCOPE_PERSISTENT_FACTOR_INFO));
break;
}
case ScheduleHeuristic::Transpose: {
TORCH_INTERNAL_ASSERT(entry_type_map_.count(
EntryType::INPUTS_AND_OUTPUTS_INNER_DIM_GROUPS));
break;
}
default:
TORCH_INTERNAL_ASSERT(false, "unknown heuristic");
}
@@ -1490,6 +1574,8 @@ template class HeuristicSummaryEntry<HeuristicCompileTime::DomainMap>;
template class HeuristicSummaryEntry<HeuristicCompileTime::ReferenceTensors>;
template class HeuristicSummaryEntry<
HeuristicCompileTime::VectorizableInputsAndOutputs>;
template class HeuristicSummaryEntry<
HeuristicCompileTime::InputsOutputsInnerDimGroups>;
template class HeuristicSummaryEntry<
HeuristicCompileTime::UnrollableInputsAndOutputs>;
template class HeuristicSummaryEntry<HeuristicCompileTime::ReductionTVs>;
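Regarding the inline review comment above about size-dependent checks in canScheduleRunTime, here is a hypothetical sketch of what such a check could look like. Both helpers (getInnerDimGroups, innerDimExtent) and the threshold are invented for illustration; nothing like this is in the PR, which currently returns true unconditionally.

// Hypothetical only: reject the transpose scheduler at runtime when the
// innermost dimensions are very small, as the review comment suggests.
// getInnerDimGroups and innerDimExtent are assumed helpers, not PR code.
static bool canScheduleRunTime(
    Fusion* fusion,
    SchedulerRuntimeInfo& runtime_info,
    HeuristicSummary* data_cache = nullptr) {
  constexpr int64_t kMinInnerExtent = 32; // illustrative threshold only
  for (const auto& group : getInnerDimGroups(fusion, data_cache)) {
    if (innerDimExtent(group, runtime_info) < kMinInnerExtent) {
      return false; // tiny inner dims: leave it to the pointwise scheduler
    }
  }
  return true; // keep this path cheap; it runs on every fusion execution
}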
7 changes: 7 additions & 0 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.h
@@ -187,6 +187,13 @@ class TORCH_CUDA_CU_API SchedulerEntry {
return *pparams;
}

const TransposeParams& transposeParams() const {
auto tparams = std::dynamic_pointer_cast<TransposeParams>(params_);
TORCH_INTERNAL_ASSERT(
tparams != nullptr, "Heuristic parameter is not a transpose parameter");
return *tparams;
}

void updateLaunchConstraint(const LaunchParams& launch_params) {
params_->lparams = launch_params;
}
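Putting the registry changes together, a rough sketch of how a caller would exercise the new entries. The static signatures follow the uses visible in this diff (all_heuristics, SchedulerEntry::canSchedule, SchedulerEntry::makeEntry); the surrounding driver code is assumed, not part of the PR.

// Sketch: try heuristics in the order of all_heuristics(); Transpose is now
// considered after Reduction and before PointWise.
std::unique_ptr<SchedulerEntry> tryMakeEntry(
    Fusion* fusion,
    SchedulerRuntimeInfo& runtime_info) {
  for (auto heuristic : all_heuristics()) {
    if (SchedulerEntry::canSchedule(heuristic, fusion, runtime_info)) {
      // For ScheduleHeuristic::Transpose this builds a TransposeScheduler,
      // whose schedule() uses the new transposeParams() accessor to drive
      // scheduleTranspose(fusion, transposeParams()).
      return SchedulerEntry::makeEntry(heuristic, fusion, runtime_info);
    }
  }
  return nullptr; // no scheduler accepted the fusion
}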