
Commit deb58f8

naoyam, zasdfgbnm, and csarofeen authored
Vec perf testing devel merge (#2041)
* Fix vectorize size calculation (#2035)
* Allow non-root trivial reductions (#2037). Fixes #2008.
* Test file cleanup (#2040): move test_gpu.cpp to test_gpu1.cpp, then split it into test_gpu1.cpp, test_gpu2.cpp, and test_gpu3.cpp. Each file should be at most 10K LoC; new tests should be added to test_gpu3.cpp until it reaches 10K LoC.

Co-authored-by: Gao, Xiang <[email protected]>
Co-authored-by: Christian Sarofeen <[email protected]>
1 parent: 8bbb00e

12 files changed: +26592 -26205 lines


test/cpp/jit/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
@@ -99,7 +99,9 @@ if(USE_CUDA)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_definition.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_cache.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/python_frontend/test/test_nvfuser_fusion_record.cpp)
-  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu.cpp)
+  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu1.cpp)
+  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu2.cpp)
+  list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu3.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_tensor_factories.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_fused_reduction.cpp)
   list(APPEND JIT_TEST_SRCS ${TORCH_SRC_DIR}/csrc/jit/codegen/cuda/test/test_gpu_shift.cpp)

torch/csrc/jit/codegen/cuda/ir_internal_nodes.h

Lines changed: 2 additions & 10 deletions
@@ -1416,16 +1416,8 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
   }

   //! Check if IterDomain is a reduction axis with size of 1, i.e.
-  //! a "squeeze" operator.
-  //!
-  //! NOTE: Detection of trivial reduction here is not
-  //! comprehensive. See detectTrivialReductionDerivedDomains for more
-  //! comprehensive analysis. We typically use this for root domain trivial
-  //! reduction checks. So we ship to the correct scheduler. It may
-  //! not be incredibly robust, but it makes sense to keep it for now.
-  bool isTrivialReduction() const {
-    return isReduction() && extent()->isOneInt();
-  }
+  //! a "squeeze" operator, or solely derived from such axes.
+  bool isTrivialReduction() const;

   //! Split for stride by a given factor. It effectively does an inner
   //! split by the factor and sets the inner domain as a Stride

torch/csrc/jit/codegen/cuda/ir_nodes.cpp

Lines changed: 36 additions & 1 deletion
@@ -1720,6 +1720,37 @@ IterDomain* IterDomain::cloneWithoutRFactor() const {
   return cloned;
 }

+bool IterDomain::isTrivialReduction() const {
+  if (!isReduction()) {
+    return false;
+  }
+
+  if (extent()->isOneInt()) {
+    return true;
+  }
+
+  // If this domain is an output of an expression, i.e., not a root
+  // domain, check if all root domains are trivial reductions. This is
+  // almost the same as the analysis done in TrivialReductionInfo, but
+  // is limited within a single tensor, whereas TrivialReductionInfo
+  // does more expensive analysis potentially traversing through
+  // rfactor domains
+  if (definition()) {
+    // Note: There's no const version of IterVisitor.
+    auto id_inputs = InputsOf::output(fusion(), const_cast<IterDomain*>(this));
+    if (std::all_of(
+            ir_utils::filterByType<IterDomain>(id_inputs).begin(),
+            ir_utils::filterByType<IterDomain>(id_inputs).end(),
+            [](IterDomain* root_id) {
+              return root_id->isReduction() && root_id->extent()->isOneInt();
+            })) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
 std::vector<IterDomain*> IterDomain::clone(
     const std::vector<IterDomain*>& domains) {
   std::vector<IterDomain*> cloned_domains;
@@ -1744,7 +1775,11 @@ IterDomain* IterDomain::merge(IterDomain* outer, IterDomain* inner) {
       outer->isReduction() == inner->isReduction() ||
           (!outer->isReduction() && inner->isTrivialReduction()) ||
           (outer->isTrivialReduction() && !inner->isReduction()),
-      "Merging IterDomains requires that their iteration types match.");
+      "Merging IterDomains requires that their iteration types match. ",
+      "Outer: ",
+      outer->toString(),
+      ", Inner: ",
+      inner->toString());
   TORCH_CHECK(
       (outer->isGather() && inner->isGather()) ||
           (!outer->isGather() && !inner->isGather()),
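The out-of-line definition above is the heart of #2037: a reduction domain is trivial not only when its extent is statically 1, but also when it is derived solely from size-1 reduction root domains, which matters when the derived extent is symbolic and cannot be constant-folded. A minimal standalone sketch of that rule, with a plain struct standing in for IterDomain (this is an illustration, not the nvFuser API):

#include <algorithm>
#include <vector>

// Simplified stand-in for IterDomain. extent is -1 when not statically
// known; nvFuser instead asks extent()->isOneInt() symbolically.
struct Domain {
  bool is_reduction = false;
  long extent = -1;
  std::vector<Domain*> root_inputs; // empty for a root domain
};

bool isTrivialReduction(const Domain& d) {
  if (!d.is_reduction) {
    return false;
  }
  if (d.extent == 1) {
    return true;
  }
  // Non-root domain whose extent did not fold to 1: trivial iff every
  // root domain it derives from is a size-1 reduction.
  return !d.root_inputs.empty() &&
      std::all_of(d.root_inputs.begin(), d.root_inputs.end(), [](Domain* r) {
        return r->is_reduction && r->extent == 1;
      });
}

For example, a domain merged from two size-1 reduction axes may carry an extent expression that does not fold to a constant, yet the root-input check still classifies it as trivial.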

torch/csrc/jit/codegen/cuda/scheduler/registry.cpp

Lines changed: 25 additions & 0 deletions
@@ -463,6 +463,24 @@ void SchedulerRuntimeInfo::initialize(
     auto fusion_inp = complete_fusion_->inputs()[inp_i];
     auto data_ptr = tensor_arg_abstract->getPointer();
     input_ptrs_[fusion_inp] = (size_t)data_ptr;
+
+    // find and push discontiguous stride
+    auto dtype_size = dataTypeSize(tensor_arg_abstract->getDataType());
+    input_discontig_strides_[fusion_inp] = {};
+    auto dims = tensor_arg_abstract->getRank();
+    auto expected_stride = 1;
+    for (auto dim = dims - 1; dim >= 0; dim--) {
+      auto size = tensor_arg_abstract->getSize(dim);
+      if (size <= 1) {
+        continue;
+      }
+      auto stride = tensor_arg_abstract->getStride(dim);
+      if (stride != expected_stride) {
+        input_discontig_strides_[fusion_inp].push_back(stride * dtype_size);
+        expected_stride = stride;
+      }
+      expected_stride *= size;
+    }
   }
 }
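The loop above walks dimensions innermost-first, ignores size-0/1 dimensions, and records the byte stride of every dimension that breaks the running contiguous pattern. Extracted into a runnable trace (the sizes and strides here are made up for illustration):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical float tensor: sizes [8, 4, 16], strides [128, 16, 1]
  // (in elements), e.g. produced by slicing dim 1 of an [8, 8, 16] tensor.
  std::vector<int64_t> sizes = {8, 4, 16};
  std::vector<int64_t> strides = {128, 16, 1};
  const int64_t dtype_size = 4; // sizeof(float)

  std::vector<int64_t> discontig_strides;
  int64_t expected_stride = 1;
  for (int64_t dim = (int64_t)sizes.size() - 1; dim >= 0; dim--) {
    if (sizes[dim] <= 1) {
      continue; // size-1 dims impose no stride constraint
    }
    if (strides[dim] != expected_stride) {
      // Discontiguity: record the byte stride and restart the pattern here.
      discontig_strides.push_back(strides[dim] * dtype_size);
      expected_stride = strides[dim];
    }
    expected_stride *= sizes[dim];
  }

  for (auto s : discontig_strides) {
    std::cout << s << "\n"; // prints 512: only dim 0 breaks contiguity
  }
}

Dims 2 and 1 match the expected strides 1 and 16, so only dim 0 (stride 128 elements, 512 bytes, where 64 was expected) is recorded.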

@@ -529,6 +547,13 @@ size_t SchedulerRuntimeInfo::getAlignmentSize(TensorView* tv) {
   }

   auto alignment_size = SchedulerRuntimeInfo::computeAlignmentSize(ptrOf(tv));
+  auto strides_it = input_discontig_strides_.find(tv);
+  if (strides_it != input_discontig_strides_.end()) {
+    for (auto stride : strides_it->second) {
+      alignment_size = std::min(
+          alignment_size, SchedulerRuntimeInfo::computeAlignmentSize(stride));
+    }
+  }
   alignment_map_[tv] = alignment_size;
   return alignment_size;
 }
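computeAlignmentSize is not shown in this diff; assuming it returns the largest power-of-two alignment, capped at the 16-byte max vector width, that evenly divides its argument, the change means a discontiguous inner stride can now only lower the alignment reported for a tensor, since a vectorized access must not straddle a discontiguous jump. A sketch under that assumption:

#include <algorithm>
#include <cstddef>
#include <vector>

// Assumed behavior (not part of this diff): largest power-of-two
// alignment up to 16 bytes that evenly divides an address or stride.
size_t computeAlignmentSize(size_t bytes) {
  size_t alignment = 16;
  while (alignment > 1 && bytes % alignment != 0) {
    alignment /= 2;
  }
  return alignment;
}

// Mirrors the updated getAlignmentSize: the pointer's alignment is
// clamped by every discontiguous byte stride recorded for the input.
size_t effectiveAlignment(
    size_t ptr,
    const std::vector<size_t>& discontig_strides) {
  size_t alignment = computeAlignmentSize(ptr);
  for (auto stride : discontig_strides) {
    alignment = std::min(alignment, computeAlignmentSize(stride));
  }
  return alignment;
}

With the earlier example, a 16-byte-aligned pointer and a 512-byte discontiguous stride keep 16-byte vectorization legal; a 4-byte stride would cap it at 4.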

torch/csrc/jit/codegen/cuda/scheduler/registry.h

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,7 @@ class ExpressionEvaluator;
 //! segmenter and schedulers.
 //! It is important that input id encoding should be up to date with any change
 //! of this class to avoid launching compiled kernels with illegal inputs.
+
 class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable {
  public:
   // Max vector size we will consider, in bytes,

@@ -112,6 +113,9 @@ class TORCH_CUDA_CU_API SchedulerRuntimeInfo : public NonCopyable {
   // TODO: Support output tensor pointers
   std::unordered_map<Val*, size_t> input_ptrs_;

+  // Copy of aten input tensor strides (in bytes)
+  std::unordered_map<Val*, std::vector<size_t>> input_discontig_strides_;
+
   // Cache for getAlignmentSize
   std::unordered_map<TensorView*, size_t> alignment_map_;
   // Cache for getMaxVectorizableWidth

torch/csrc/jit/codegen/cuda/scheduler/vectorize_helper.cpp

Lines changed: 14 additions & 16 deletions
@@ -82,18 +82,6 @@ size_t collectMaxVectorizeSizeWithContigMerge(
     size_t max_vector_size_in_byte,
     ExpressionEvaluator& expression_evaluator,
     DataType index_type) {
-  // Maybe too conservative, but only handles fully contiguous tensors
-  // TODO: Relax the contiguity constraint to be similar to that in index
-  // computing. Just looking for all merged root domains in the right order,
-  // all merged root dimensions are contiguous, all merged root dimensions are
-  // next to eachother (exlcuding broadcast).
-  if (std::any_of(
-          tv->domain()->contiguity().begin(),
-          tv->domain()->contiguity().end(),
-          [](const auto contig) { return !contig; })) {
-    return 1;
-  }
-
   auto dtype_size = dataTypeSize(tv->dtype(), index_type);
   const size_t max_vector_size = max_vector_size_in_byte / dtype_size;

@@ -205,8 +193,16 @@ size_t expandVectorizationToContigMergedDomains(

   // Merge the domains right of the break point
   const auto& ref_root = reference_tv->getMaybeRFactorDomain();
-  const int num_merged_domains =
+  const int max_num_merged_domains =
       static_cast<int>(ref_root.size()) - static_cast<int>(break_point);
+  int64_t num_merged_domains = 0;
+  while (num_merged_domains < max_num_merged_domains) {
+    auto pos = (int64_t)ref_root.size() - 1 - num_merged_domains;
+    if (!reference_tv->domain()->contiguity()[pos]) {
+      break;
+    }
+    num_merged_domains++;
+  }

   // No expansion with no merged domain
   if (num_merged_domains == 0) {

@@ -245,14 +241,16 @@ size_t expandVectorizationToContigMergedDomains(
   const auto& tv_root = tv->getMaybeRFactorDomain();

   int tv_num_merged_domains = 0;
-  for (const auto i : c10::irange(num_merged_domains)) {
+  for (const auto i : c10::irange(max_num_merged_domains)) {
     if (i == tv_root.size()) {
       break;
     }
     auto ref_id = ref_root.at(ref_root.size() - 1 - i);
-    IterDomain* tv_id = tv_root.at(tv_root.size() - 1 - i);
+    auto pos = tv_root.size() - 1 - i;
+    IterDomain* tv_id = tv_root.at(pos);
     // If not mapped, stop expanding.
-    if (!ca_map.areMapped(ref_id, tv_id, IdMappingMode::EXACT)) {
+    if (!ca_map.areMapped(ref_id, tv_id, IdMappingMode::EXACT) ||
+        !tv->domain()->contiguity()[pos]) {
       break;
     } else {
       ++tv_num_merged_domains;