18 changes: 9 additions & 9 deletions test/cpp/jit/test_gpu.cpp
@@ -5008,7 +5008,7 @@ void testGPU_FusionReductionScheduler() {

const auto options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::rand({bid_x, tid_x}, options);
at::Tensor input = at::randn({bid_x, tid_x}, options);

// Apply reduction heuristic
const at::ArrayRef<c10::IValue> inputs({input});
@@ -5024,7 +5024,7 @@ void testGPU_FusionReductionScheduler() {
auto aten_output = input.sum({red_dim});

TORCH_CHECK(
aten_output.allclose(outputs[0]),
aten_output.allclose(outputs[0], 1e-04, 1e-04),
"Error of: ",
aten_output.sub(outputs[0]).abs().max());
}
@@ -5100,7 +5100,7 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() {

const auto options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::rand(tensor_dims_in, options);
at::Tensor input = at::randn(tensor_dims_in, options);
at::Tensor cg_output = at::empty(tensor_dims_out, options);

// Apply reduction heuristic
@@ -5117,7 +5117,7 @@ void testGPU_FusionReductionSchedulerMultiDimNonFastest() {
auto aten_output = input.sum(red_dims64);

TORCH_CHECK(
aten_output.allclose(outputs[0]),
aten_output.allclose(outputs[0], 1e-04, 1e-04),
"Error of: ",
aten_output.sub(outputs[0]).abs().max());
}
@@ -5142,7 +5142,7 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() {

const auto options =
at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor input = at::rand(tensor_dims_in, options);
at::Tensor input = at::randn(tensor_dims_in, options);

TORCH_CHECK(
cuda::scheduleReduction(&fusion, {input}, tv1),
@@ -5155,7 +5155,7 @@ void testGPU_FusionReductionSchedulerMultiDimFastest() {
auto aten_output = input.sum(red_dims64);

TORCH_CHECK(
aten_output.allclose(outputs[0]),
aten_output.allclose(outputs[0], 1e-05, 1e-05),
"Error of: ",
aten_output.sub(outputs[0]).abs().max());
}
@@ -5205,8 +5205,8 @@ void testGPU_FusionReductionSchedulerDimShmoo() {
.dtype((fp16 ? at::kHalf : at::kFloat))
.device(at::kCUDA, 0);
at::Tensor input =
(axis ? at::rand({odim, rdim}, options)
: at::rand({rdim, odim}, options));
(axis ? at::randn({odim, rdim}, options)
: at::randn({rdim, odim}, options));

const at::ArrayRef<c10::IValue> inputs({input});

@@ -5236,7 +5236,7 @@ void testGPU_FusionReductionSchedulerDimShmoo() {
auto aten_output = input.sum({axis});

TORCH_CHECK(
aten_output.allclose(cg_output[0]),
aten_output.allclose(cg_output[0], 1e-03, 1e-03),
"Error of: ",
aten_output.sub(cg_output[0]).abs().max());
}
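The test changes above swap at::rand (uniform on [0,1)) for at::randn (standard normal) and loosen the allclose tolerances, since normally distributed values summed in a different order by the scheduled kernel accumulate more rounding error than the defaults (rtol=1e-05, atol=1e-08) allow. As a minimal sketch of what the relaxed check means, assuming the usual elementwise definition |self - other| <= atol + rtol * |other| and a hypothetical helper name (not PR code):

#include <ATen/ATen.h>

// Hand-rolled equivalent of a.allclose(b, 1e-04, 1e-04); the second argument
// of allclose is rtol and the third is atol.
bool withinTolerance(
    const at::Tensor& a, // corresponds to `self`, e.g. aten_output
    const at::Tensor& b, // corresponds to `other`, e.g. outputs[0]
    double rtol = 1e-4,
    double atol = 1e-4) {
  return (a - b).abs().le(b.abs().mul(rtol).add(atol)).all().item<bool>();
}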
25 changes: 25 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor.cpp
@@ -36,6 +36,31 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) {
return code;
}

void FusionExecutor::compileFusionFromStr(
Reviewer comment (Collaborator): Are these only used for debugging? I don't see any real code using them. Pointing it out in case this is accidentally checked in.

Fusion* fusion,
const std::string& code,
const std::string& name,
int id,
CompileOptions options) {
fusion_ = *fusion;
FusionGuard fg(&fusion_);
options_ = options;

const char* debug_env = getenv("PYTORCH_CUDA_FUSER_DEBUG");
if (debug_env && atoi(debug_env)) {
std::cout << "\n==== codegen output for kernel: " << kernelName()
<< " ====" << std::endl
<< code << std::endl
<< "=====*===============================" << std::endl;
}

fusion_id_ = id;
has_random_ = fusion->hasRNG();
lowered_ = GpuLower(&fusion_);
compiled_kernel_ = executor_utils::nvrtcCompile(code, name, fusion_id_);
compiled_ = true;
}

void FusionExecutor::compileFusion(Fusion* fusion, CompileOptions options) {
TORCH_INTERNAL_ASSERT(
!fusion->outputs().empty(), "No output found for this kernel, aborting.");
6 changes: 6 additions & 0 deletions torch/csrc/jit/codegen/cuda/executor.h
@@ -23,6 +23,12 @@ struct TORCH_CUDA_API CompileOptions {

class TORCH_CUDA_API FusionExecutor : public NonCopyable {
public:
void compileFusionFromStr(
Fusion* fusion,
const std::string& code,
const std::string& name,
int id,
CompileOptions options = CompileOptions());
void compileFusion(Fusion* fusion, CompileOptions options = CompileOptions());

std::vector<at::Tensor> runFusion(
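compileFusionFromStr above bypasses code generation and hands an externally supplied kernel string straight to NVRTC; as the review comment notes, this PR adds no in-tree caller, so it is presumably a debugging hook for hand-edited kernels. A hypothetical usage sketch, not from the PR: the wrapper name, kernel name, and id are made up, and runFusion's full signature is outside this diff.

#include <torch/csrc/jit/codegen/cuda/executor.h>

// Hypothetical helper: compile a hand-written kernel source for an existing
// Fusion and run it on the given inputs.
std::vector<at::Tensor> runFromSource(
    torch::jit::fuser::cuda::Fusion* fusion,
    const std::string& kernel_src,
    const at::ArrayRef<c10::IValue>& inputs) {
  torch::jit::fuser::cuda::FusionExecutor fe;
  // name/id are arbitrary placeholders here.
  fe.compileFusionFromStr(fusion, kernel_src, /*name=*/"kernel1", /*id=*/1);
  return fe.runFusion(inputs);
}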
83 changes: 28 additions & 55 deletions torch/csrc/jit/codegen/cuda/scheduler.cpp
@@ -239,7 +239,10 @@ ReductionParams reductionHeuristic(

// Is fastest dimension a reduction dimension?
if (rparams.fastest_dim) {
bdimx = red_elems;
if (red_elems < rparams.loop_unroll) {
rparams.loop_unroll = 1;
}
bdimx = ceilDiv(red_elems, rparams.loop_unroll);
bdimy = red_outputs;
} else {
bdimx = red_outputs;
@@ -426,22 +429,12 @@ c10::optional<ReductionParams> scheduleReduction(
// Do multiple reductions per block
if (rparams.mul_reds_per_blk) {
// Reduction Splits
// [outputs, |rF-Leftover, rf-Unroll, X-Warp|]
// Idx: 0 | 1(-1) 2(-2) 3(-1) |
// [outputs, |rF-Leftover, X-Warp, rf-Unroll|]
// Idx: 0 | 1(-1) 2(-2) 3(-1) |
// --------------------------------
// Reduction Dimensions
red_tv->split(1, rparams.loop_unroll);
red_tv->split(1, rparams.lparams.bdimx());
red_tv->split(1, kLoopUnrollSplit);

// Reordering the Unroll dimension eases applying computeAt()
// for preceeding operations and the rFactored Tensor.
// |- Reordered -|
// V V
// [outputs, |rF-Leftover, X-Warp, rF-Unroll|]
// Idx: 0 | 1(-3) 2(-2) 3(-1) |
// --------------------------------
// Reduction Dimensions
red_tv->reorder({{-1, -2}, {-2, -1}});

// Output Splits
// [|Out-Leftover, Out-PerBlock|, <Reduction Dims>]
@@ -454,8 +447,8 @@ c10::optional<ReductionParams> scheduleReduction(

// WARNING: computeAt will coalesce the rFactored dimensions
// rFactored Reduction Tensor after computeAt():
// [<output dims>, |X-Warp, rF-Leftover, rF-Unroll|]
// Idx: 0 -- 1 | 2(-3) 3(-2) 4(-1) |
// [<output dims>, | rF-Leftover, X-Warp, rF-Unroll|]
// Idx: 0 -- 1 | 2(-3) 3(-2) 4(-1) |
// ---------------------------------
// Reduction Dimensions
red_tv_rf->computeAt(red_tv, -1);
@@ -481,47 +474,37 @@ c10::optional<ReductionParams> scheduleReduction(
} else {
if (rparams.cross_grid) {
// Reduction Splits
// [outputs, |rF-Leftover, rf-Unroll, X-Grid, X-Block, X-Warp|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// [outputs, |rF-Leftover, X-Grid, X-Block, X-Warp, rf-Unroll|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// -------------------------------------------------
// Reduction Dimensions
red_tv->split(1, rparams.loop_unroll);
red_tv->split(1, rparams.lparams.bdimx());
red_tv->split(1, rparams.lparams.bdimy());
red_tv->split(1, rparams.lparams.gdimy());
red_tv->split(1, kLoopUnrollSplit);

// Reordering the Unroll dimension eases applying computeAt()
// for preceeding operations and the rFactored Tensor.
// |------ Reordered --------|
// V V
// [outputs, |rF-Leftover, X-Warp, X-Grid, X-Block, rf-Unroll|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// -------------------------------------------------
// Reduction Dimensions
red_tv->reorder({{-1, -4}, {-4, -1}});

auto red_tv_rf = red_tv->rFactor(
{-5, -1}); // NOLINT(cppcoreguidelines-avoid-magic-numbers)

// WARNING: computeAt will coalesce the rFactored dimensions
// rFactored Reduction Tensor after computeAt():
// [Outputs, |X-Warp, X-Grid, X-Block, rF-Leftover, rF-Unroll|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// [Outputs, |X-Grid, X-Block, X-Warp, rF-Leftover, rF-Unroll|]
// Idx: 0 | 1(-5) 2(-4) 3(-3) 4(-2) 5(-1) |
// -------------------------------------------------
// Reduction Dimensions
red_tv_rf->computeAt(red_tv, -1);

// After the Reduction Tensor has rFactoring applied
// Reduction Output Tensor:
// [Outputs, X-Warp, X-Grid, X-Block]
// Idx: 0 1(-3) 2(-2) 3(-1)
// [Outputs, X-Grid, X-Block, X-Warp]
// Idx: 0 1(-3) 2(-2) 3(-1)

red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll);

red_tv->axis(0)->parallelize(ParallelType::BIDx);
red_tv->axis(-3)->parallelize(ParallelType::TIDx);
red_tv->axis(-1)->parallelize(ParallelType::TIDx);
red_tv->axis(-2)->parallelize(ParallelType::BIDy);
red_tv->axis(-1)->parallelize(ParallelType::TIDy);
red_tv->axis(-3)->parallelize(ParallelType::TIDy);

// Bind Inputs to Reduction
for (auto input : fusion->inputsOf(red_tv_rf)) {
@@ -531,44 +514,34 @@ c10::optional<ReductionParams> scheduleReduction(
}
} else {
// Reduction Splits
// [outputs, |rF-Leftover, rf-Unroll, X-Block, X-Warp|]
// Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) |
// [outputs, |rF-Leftover, X-Block, X-Warp, rf-Unroll|]
// Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) |
// -----------------------------------------
// Reduction Dimensions
red_tv->split(1, rparams.loop_unroll);
red_tv->split(1, rparams.lparams.bdimx());
red_tv->split(1, rparams.lparams.bdimy());
red_tv->split(1, kLoopUnrollSplit);

// Reordering the Unroll dimension eases applying computeAt()
// for preceeding operations and the rFactored Tensor.
// |--- Reordered ----|
// V V
// [outputs, |rF-Leftover, X-Warp, X-Block, rF-Unroll|]
// Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) |
// -----------------------------------------
// Reduction Dimensions
red_tv->reorder({{-1, -3}, {-3, -1}});

auto red_tv_rf = red_tv->rFactor({-4, -1});

// WARNING: computeAt will coalesce the rFactored dimensions
// rFactored Reduction Tensor after computeAt():
// [Outputs, |X-Warp, X-Block, rF-Leftover, rF-Unroll|]
// [Outputs, |X-Block, X-Warp, rF-Leftover, rF-Unroll|]
// Idx: 0 | 1(-4) 2(-3) 3(-2) 4(-1) |
// -----------------------------------------
// Reduction Dimensions
red_tv_rf->computeAt(red_tv, -1);

// After the Reduction Tensor has rFactoring applied
// Reduction Output Tensor:
// [Outputs, X-Warp, X-Block]
// Idx: 0 1(-2) 2(-1)
// [Outputs, X-Block, X-Warp]
// Idx: 0 1(-2) 2(-1)

red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll);

red_tv->axis(0)->parallelize(ParallelType::BIDx);
red_tv->axis(-2)->parallelize(ParallelType::TIDx);
red_tv->axis(-1)->parallelize(ParallelType::TIDy);
red_tv->axis(-1)->parallelize(ParallelType::TIDx);
red_tv->axis(-2)->parallelize(ParallelType::TIDy);

// Bind Inputs to Reduction
for (auto input : fusion->inputsOf(red_tv_rf)) {
@@ -625,7 +598,7 @@ c10::optional<ReductionParams> scheduleReduction(
red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll);

red_tv->axis(0)->parallelize(ParallelType::BIDx);
red_tv->axis(1)->parallelize(ParallelType::TIDx);
red_tv->axis(-3)->parallelize(ParallelType::TIDx);
red_tv->axis(-2)->parallelize(ParallelType::TIDy);
red_tv->axis(-1)->parallelize(ParallelType::BIDy);

@@ -679,7 +652,7 @@ c10::optional<ReductionParams> scheduleReduction(
red_tv_rf->axis(-1)->parallelize(ParallelType::Unroll);

red_tv->axis(0)->parallelize(ParallelType::BIDx);
red_tv->axis(1)->parallelize(ParallelType::TIDx);
red_tv->axis(-2)->parallelize(ParallelType::TIDx);
red_tv->axis(-1)->parallelize(ParallelType::TIDy);

// Bind Inputs to Reduction
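The scheduler changes replace the fixed kLoopUnrollSplit with the new rparams.loop_unroll, apply the unroll split first so the unroll factor already sits innermost (making the old reorder() calls and their comment blocks unnecessary), and divide the reduction extent by the unroll factor when sizing bdimx. A small worked sketch of that arithmetic with illustrative numbers, assuming bdimx is not clamped further elsewhere in the heuristic (not PR code):

#include <iostream>

// ceilDiv as used by the heuristic.
constexpr int ceilDiv(int a, int b) { return (a + b - 1) / b; }

int main() {
  const int red_elems = 4096; // reduction extent, hypothetical
  int loop_unroll = 4;        // ReductionParams::loop_unroll default
  if (red_elems < loop_unroll) {
    loop_unroll = 1;          // same clamp as in reductionHeuristic
  }
  const int bdimx = ceilDiv(red_elems, loop_unroll); // 1024 threads in x

  // red_tv->split(1, loop_unroll); red_tv->split(1, bdimx); then yields
  // [outputs, rF-Leftover, X-Warp, rf-Unroll] with the unroll factor innermost,
  // so no reorder() is required before rFactor()/computeAt().
  std::cout << "[outputs, " << ceilDiv(red_elems, bdimx * loop_unroll) << ", "
            << bdimx << ", " << loop_unroll << "]\n"; // [outputs, 1, 1024, 4]
  return 0;
}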
7 changes: 5 additions & 2 deletions torch/csrc/jit/codegen/cuda/scheduler.h
@@ -24,12 +24,15 @@ struct ReductionParams {
bool cross_grid = false;
bool mul_reds_per_blk = false;

int loop_unroll = 4;

LaunchParams lparams;

bool operator==(const ReductionParams& other) const {
bool attr_equal = other.fastest_dim == fastest_dim &&
other.cross_block == cross_block && other.cross_grid == cross_grid &&
other.mul_reds_per_blk == mul_reds_per_blk;
other.mul_reds_per_blk == mul_reds_per_blk &&
other.loop_unroll == loop_unroll;
return attr_equal && lparams == other.lparams;
}
};
@@ -38,7 +41,7 @@ class ReductionParamsHash {
public:
size_t operator()(const ReductionParams& rp) const {
size_t lp_hash = rp.lparams.gdimx() ^ rp.lparams.gdimy() ^
rp.lparams.bdimx() ^ rp.lparams.bdimy();
rp.lparams.bdimx() ^ rp.lparams.bdimy() ^ rp.loop_unroll;
constexpr size_t bits = sizeof(std::size_t) * 8;
size_t attr_hash = static_cast<size_t>(rp.fastest_dim) << (bits - 1) |
static_cast<size_t>(rp.cross_block) << (bits - 2) |
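Because the new loop_unroll field changes how the kernel is scheduled, it also has to participate in ReductionParams equality and hashing; otherwise two parameter sets differing only in unroll factor would look identical to any container keyed on ReductionParams. A minimal sketch of the distinction, assuming ReductionParams lives in the torch::jit::fuser::cuda namespace and is default-constructible as declared above (not PR code):

#include <torch/csrc/jit/codegen/cuda/scheduler.h>

// Two parameter sets that differ only in unroll factor must compare unequal
// (and, via ReductionParamsHash, will usually hash differently), so a cache
// keyed on ReductionParams cannot hand back a kernel scheduled with the wrong
// unroll factor.
bool unrollDistinguishesParams() {
  torch::jit::fuser::cuda::ReductionParams a;
  torch::jit::fuser::cuda::ReductionParams b;
  b.loop_unroll = 1;  // a keeps the default of 4
  return !(a == b);   // true with this change
}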