csarofeen · zasdfgbnm · Mar 1, 2023 · Feb 21, 2023 · Feb 21, 2023 · Feb 21, 2023
diff --git a/third_party/nvfuser/CMakeLists.txt b/third_party/nvfuser/CMakeLists.txt
@@ -72,6 +72,7 @@ list(APPEND NVFUSER_SRCS
     ${NVFUSER_SRCS_DIR}/lower_scalar_hoist.cpp
     ${NVFUSER_SRCS_DIR}/lower_insert_syncs.cpp
     ${NVFUSER_SRCS_DIR}/lower_instrument.cpp
+    ${NVFUSER_SRCS_DIR}/lower_loop_rotation.cpp
     ${NVFUSER_SRCS_DIR}/lower_loops.cpp
     ${NVFUSER_SRCS_DIR}/lower_magic_zero.cpp
     ${NVFUSER_SRCS_DIR}/lower_misaligned_vectorization.cpp
@@ -341,6 +342,7 @@ if(BUILD_TEST)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensor_factories.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_fused_reduction.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_outer_reduction.cpp)
+  list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_loop_rotation.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_shift.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensorcore.cpp)
   list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_matmul_sass.cpp)

diff --git a/third_party/nvfuser/csrc/executor.h b/third_party/nvfuser/csrc/executor.h
@@ -16,6 +16,24 @@ namespace nvfuser {
 TORCH_CUDA_CU_API bool shouldFillAllocationWithNan();
 TORCH_CUDA_CU_API void setFillAllocationWithNan(bool value);
 
+// Note [Limitation of boundary assert]:
+// When set to true, a boundary check is added to the generated kernel's
+// Tensor::operator[]. However, this check is not always accurate: it can
+// produce both false positives and false negatives.
+//
+// False positive:
+// For some cases, such as reduction, we generate code like
+//  int index = 1025;
+//  blockReduce(/*reference*/T0[index], ..., index < 1024);
+// In the above example, we do not really read from T0[index] thanks to the
+// predicate; however, the boundary check in operator[] is still executed.
+// As a result, this would cause a false alarm.
+//
+// False negative:
+// Not all global memory accesses use operator[]; for example, vectorized
+// access uses loadGeneric on pointers. The check can thus miss cases like
+//   int index = 1024;
+//   loadGeneric<dtype, 4>(dest, &T0[index]); // T0.size[0] == 1026
 TORCH_CUDA_CU_API bool shouldAssertOutOfBound();
 TORCH_CUDA_CU_API void setAssertOutOfBound(bool value);
 

diff --git a/third_party/nvfuser/csrc/fusion.cpp b/third_party/nvfuser/csrc/fusion.cpp
@@ -80,6 +80,8 @@ IrCloner Fusion::copy(const Fusion* from, Fusion* to) {
     to->io_alias_[copied_output] = copied_input;
   }
 
+  to->loop_rotation_param_ = ir_cloner.clone(from->loop_rotation_param_);
+
   to->permuted_input_map_ = from->permuted_input_map_;
   to->permuted_output_map_ = from->permuted_output_map_;
 
@@ -138,6 +140,7 @@ void Fusion::clear() noexcept {
 
   permuted_input_map_.clear();
   permuted_output_map_.clear();
+  loop_rotation_param_.clear();
 
   all_tv_uses_valid_ = false;
   is_during_update_uses_ = false;

diff --git a/third_party/nvfuser/csrc/fusion.h b/third_party/nvfuser/csrc/fusion.h
@@ -235,6 +235,23 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
     return io_alias_;
   }
 
+  // Vector of (tv, axis, selection) tuples.
+  // For each entry in the vector, the selected tv/expr in the loop of
+  // tv->axis(axis) will be rotated.
+  using LoopRotationParam = std::vector<
+      std::tuple<TensorView*, int64_t, std::unordered_set<Statement*>>>;
+
+  const LoopRotationParam& getLoopRotationParam() const {
+    return loop_rotation_param_;
+  }
+
+  void rotateLoop(
+      TensorView* loop_tv,
+      int64_t axis,
+      std::unordered_set<Statement*> selection) {
+    loop_rotation_param_.emplace_back(loop_tv, axis, std::move(selection));
+  }
+
  protected:
   friend SegmentCandidateFinder;
   friend SegmentedFusion;
@@ -285,6 +302,10 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
   //  the states are either all valid or all invalid
   bool all_tv_uses_valid_ = false;
   bool is_during_update_uses_ = false;
+
+  // Compilation parameters guiding the loop rotation pass. See note
+  // [Loop Rotation] for detail.
+  LoopRotationParam loop_rotation_param_;
 };
 
 } // namespace nvfuser