Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
24dad4f
New lowering pass: loop rotation
zasdfgbnm Feb 21, 2023
37ed0ed
save
zasdfgbnm Feb 21, 2023
337d780
cleanup
zasdfgbnm Feb 21, 2023
b3b99cf
cleanup
zasdfgbnm Feb 21, 2023
1789922
better comment
zasdfgbnm Feb 21, 2023
335e443
more cleanup
zasdfgbnm Feb 22, 2023
5eb594f
cleanup
zasdfgbnm Feb 22, 2023
c98b7d2
Revert "cleanup"
zasdfgbnm Feb 22, 2023
d3f2e60
fix
zasdfgbnm Feb 22, 2023
f517acd
adjust tests
zasdfgbnm Feb 22, 2023
3a39b8e
better predicate printting
zasdfgbnm Feb 22, 2023
3cd238e
fix canOmitStopPredicate
zasdfgbnm Feb 22, 2023
5445186
Merge branch 'devel' of github.com:csarofeen/pytorch into loop-rotati…
zasdfgbnm Feb 22, 2023
a806b13
save
zasdfgbnm Feb 22, 2023
a1311b2
test
zasdfgbnm Feb 22, 2023
d71980f
save
zasdfgbnm Feb 22, 2023
d2b3f8e
comment tests
zasdfgbnm Feb 22, 2023
1ee9a4b
save
zasdfgbnm Feb 22, 2023
abefa1a
fix
zasdfgbnm Feb 27, 2023
655f5a7
update
zasdfgbnm Feb 27, 2023
9c17d49
comment
zasdfgbnm Feb 27, 2023
836cbbd
cleanup
zasdfgbnm Feb 27, 2023
b077be4
Merge branch 'devel' of github.com:csarofeen/pytorch into loop-rotati…
zasdfgbnm Feb 27, 2023
a86d11e
fix is_same
zasdfgbnm Feb 27, 2023
8ce278d
check instead of warn
zasdfgbnm Feb 28, 2023
f7257ed
fix
zasdfgbnm Feb 28, 2023
d9c1efd
save
zasdfgbnm Feb 28, 2023
67f7df3
Support multiple nested
zasdfgbnm Feb 28, 2023
80d7763
canOmitStopPredicate
zasdfgbnm Mar 1, 2023
896b867
fix
zasdfgbnm Mar 1, 2023
20564e3
IrBuilder
zasdfgbnm Mar 1, 2023
e083f84
scheduler_utils::rotateLoop
zasdfgbnm Mar 1, 2023
9a85085
fix include
zasdfgbnm Mar 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions third_party/nvfuser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/lower_scalar_hoist.cpp
${NVFUSER_SRCS_DIR}/lower_insert_syncs.cpp
${NVFUSER_SRCS_DIR}/lower_instrument.cpp
${NVFUSER_SRCS_DIR}/lower_loop_rotation.cpp
${NVFUSER_SRCS_DIR}/lower_loops.cpp
${NVFUSER_SRCS_DIR}/lower_magic_zero.cpp
${NVFUSER_SRCS_DIR}/lower_misaligned_vectorization.cpp
Expand Down Expand Up @@ -341,6 +342,7 @@ if(BUILD_TEST)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensor_factories.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_fused_reduction.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_outer_reduction.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_loop_rotation.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_shift.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_tensorcore.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_matmul_sass.cpp)
Expand Down
18 changes: 18 additions & 0 deletions third_party/nvfuser/csrc/executor.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,24 @@ namespace nvfuser {
TORCH_CUDA_CU_API bool shouldFillAllocationWithNan();
TORCH_CUDA_CU_API void setFillAllocationWithNan(bool value);

// Note [Limitation of boundary assert]:
// When set to true, we will add a boundary check to the generated kernel's
// Tensor::operator[]. However, this does not always work and can have false
// positives and false negatives.
//
// False positive:
// For some cases, such as reduction, we generate code like
// int index = 1025;
// blockReduce(/*reference*/T0[index], ..., index < 1024);
// In the above example, we do not really read from T0[index], thanks to the
// predicate, however, the boundary check in operator[] is still executed.
// As a result, this would cause a false alarm.
//
// False negative:
// Not all global memory accesses use operator[], for example, vectorized access
// uses loadGeneric on pointers. And this might miss cases like
// int index = 1024;
// loadGeneric<dtype, 4>(dest, &T0[index]); // T0.size[0] == 1026
TORCH_CUDA_CU_API bool shouldAssertOutOfBound();
TORCH_CUDA_CU_API void setAssertOutOfBound(bool value);

Expand Down
3 changes: 3 additions & 0 deletions third_party/nvfuser/csrc/fusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ IrCloner Fusion::copy(const Fusion* from, Fusion* to) {
to->io_alias_[copied_output] = copied_input;
}

to->loop_rotation_param_ = ir_cloner.clone(from->loop_rotation_param_);

to->permuted_input_map_ = from->permuted_input_map_;
to->permuted_output_map_ = from->permuted_output_map_;

Expand Down Expand Up @@ -138,6 +140,7 @@ void Fusion::clear() noexcept {

permuted_input_map_.clear();
permuted_output_map_.clear();
loop_rotation_param_.clear();

all_tv_uses_valid_ = false;
is_during_update_uses_ = false;
Expand Down
21 changes: 21 additions & 0 deletions third_party/nvfuser/csrc/fusion.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,23 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
return io_alias_;
}

// vector of (tv, dim, selection)
// For each entry in the vector, the selected tv/expr in loop tv->axis(dim)
// will be rotated
using LoopRotationParam = std::vector<
std::tuple<TensorView*, int64_t, std::unordered_set<Statement*>>>;

// Returns a read-only reference to the stored loop rotation entries. Each
// entry is a (tv, dim, selection) tuple. The reference points at the member
// loop_rotation_param_ itself, so no copy is made.
const LoopRotationParam& getLoopRotationParam() const {
return loop_rotation_param_;
}

// Records a loop rotation request by appending the tuple
// (loop_tv, axis, selection) to loop_rotation_param_. This function only
// stores the entry; it does not perform any rotation itself.
//
// loop_tv:   the tensor view whose loop at `axis` is the rotation target
// axis:      the dimension of loop_tv identifying the loop
// selection: the statements selected for rotation in that loop; taken by
//            value so it can be moved into the stored entry without an
//            additional copy
void rotateLoop(
TensorView* loop_tv,
int64_t axis,
std::unordered_set<Statement*> selection) {
loop_rotation_param_.emplace_back(loop_tv, axis, std::move(selection));
}

protected:
friend SegmentCandidateFinder;
friend SegmentedFusion;
Expand Down Expand Up @@ -285,6 +302,10 @@ class TORCH_CUDA_CU_API Fusion : public IrContainer {
// the states are either all valid or all invalid
bool all_tv_uses_valid_ = false;
bool is_during_update_uses_ = false;

// Compilation parameters guiding the loop rotation pass. See note
// [Loop Rotation] for detail.
LoopRotationParam loop_rotation_param_;
};

} // namespace nvfuser
Loading