17 changes: 15 additions & 2 deletions benchmarks/cpp/nvfuser/utils.cpp
@@ -6,7 +6,7 @@

using namespace torch::jit::fuser::cuda;

std::string toString(ReductionParams rparams) {
std::string toString(const ReductionParams& rparams) {
std::stringstream ss;
ss << (rparams.fastest_dim ? "Red On Fastest Dim // " : "Red On Slow Dim // ")
<< (rparams.persistent_kernel ? "Persistent Kernel // " : "")
@@ -65,7 +65,7 @@ std::string toString(ReductionParams rparams) {
return ss.str();
}

std::string toString(PointwiseParams params) {
std::string toString(const PointwiseParams& params) {
std::stringstream ss;
if (params.break_point) {
ss << "2D Schedule at " << params.break_point << "/";
@@ -89,6 +89,15 @@ std::string toString(PointwiseParams params) {
return ss.str();
}

std::string toString(const TransposeParams& params) {
std::stringstream ss;
ss << "Tile size: (" << params.tile_size1 << "," << params.tile_size2
<< ")/";
ss << "Vectorize size: (" << params.vectorize_factor1 << ","
<< params.vectorize_factor2 << ")";
return ss.str();
}

std::string toString(const std::shared_ptr<HeuristicParams>& params) {
auto rparams = std::dynamic_pointer_cast<ReductionParams>(params);
if (rparams) {
@@ -98,6 +107,10 @@ std::string toString(const std::shared_ptr<HeuristicParams>& params) {
if (pparams) {
return toString(*pparams);
}
auto tparams = std::dynamic_pointer_cast<TransposeParams>(params);
if (tparams) {
return toString(*tparams);
}
TORCH_INTERNAL_ASSERT(
false,
"Unknown heuristic parameter type. Did you just added a new heuristic parameter type but forget to update here?");
5 changes: 3 additions & 2 deletions benchmarks/cpp/nvfuser/utils.h
@@ -36,8 +36,9 @@ TensorView* makeContigConcreteTensor(
std::vector<int64_t> shape,
DataType dtype = DataType::Float);

std::string toString(ReductionParams rparams);
std::string toString(PointwiseParams params);
std::string toString(const ReductionParams& rparams);
std::string toString(const PointwiseParams& params);
std::string toString(const TransposeParams& params);
std::string toString(const std::shared_ptr<HeuristicParams>& params);
std::string toString(LaunchParams lparams);

8 changes: 6 additions & 2 deletions test/test_jit_cuda_fuser.py
@@ -41,8 +41,12 @@
if RUN_NVFUSER and torch.version.cuda is not None:
CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')[:2])

os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition'
os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma'
if 'PYTORCH_NVFUSER_ENABLE' not in os.environ:
os.environ['PYTORCH_NVFUSER_ENABLE'] = ""
os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition,' + os.environ['PYTORCH_NVFUSER_ENABLE']
if 'PYTORCH_NVFUSER_DISABLE' not in os.environ:
os.environ['PYTORCH_NVFUSER_DISABLE'] = ""
os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma,' + os.environ['PYTORCH_NVFUSER_DISABLE']
os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0'
# TODO: enable complex when we fixes the extremal cases in OpInfo
# see issue https://github.com/csarofeen/pytorch/issues/1730"
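A note on the Python change above: the test defaults are now prepended to PYTORCH_NVFUSER_ENABLE / PYTORCH_NVFUSER_DISABLE instead of overwriting whatever the user exported. A minimal standalone sketch of the same idiom (the helper name is illustrative, not part of the patch):

import os

def prepend_env_options(var: str, defaults: str) -> None:
    # Put the test defaults in front of any comma-separated options the user exported.
    existing = os.environ.get(var, '')
    os.environ[var] = defaults + ',' + existing

# With PYTORCH_NVFUSER_ENABLE=transpose_scheduler exported beforehand, the test process
# sees 'linear_decomposition,conv_decomposition,transpose_scheduler'.
prepend_env_options('PYTORCH_NVFUSER_ENABLE', 'linear_decomposition,conv_decomposition')
prepend_env_options('PYTORCH_NVFUSER_DISABLE', 'fallback,fma')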
6 changes: 4 additions & 2 deletions torch/csrc/jit/codegen/cuda/executor.cpp
@@ -974,14 +974,16 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
const auto& input_tensor = input.toTensor();
std::cout << " " << input_tensor.scalar_type() << " "
<< input.toTensor().sizes()
<< " (strides = " << input.toTensor().strides() << ")"
<< " (strides = " << input.toTensor().strides()
<< ", address = " << input.toTensor().data_ptr() << ")"
<< std::endl;
}
}
std::cout << "Outputs:" << std::endl;
for (const auto& output : allocated_outputs) {
std::cout << " " << output.scalar_type() << " " << output.sizes()
<< " (strides = " << output.strides() << ")" << std::endl;
<< " (strides = " << output.strides()
<< ", address = " << output.data_ptr() << ")" << std::endl;
}
std::cout << "Reduction and semaphore buffers:" << std::endl;
TORCH_INTERNAL_ASSERT(
12 changes: 7 additions & 5 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -1256,15 +1256,18 @@ class TransposeScheduler : public SchedulerEntry {
}

static bool canScheduleCompileTime(Fusion* fusion) {
// Not enabling this yet. Needs more validation.
return false;
#if 0
if (!isOptionEnabled(EnableOption::TransposeScheduler)) {
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::Transpose, "not enabled");
return false;
}

// Temporarily disallow view in transpose scheduler
// TODO Add more testing before enabling
auto view_tvs = scheduler_utils::getViewTVs(fusion);
if (view_tvs.size() > 0) {
scheduler_debug_utils::canScheduleRejectReason(
ScheduleHeuristic::Reduction, "No support for view op");
ScheduleHeuristic::Transpose, "No support for view op");
return false;
}

@@ -1293,7 +1296,6 @@
}

return true;
#endif
}

static bool canScheduleRunTime(
44 changes: 29 additions & 15 deletions torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -126,13 +126,15 @@ class DomainMap : public pointwise_utils::DomainMap {
auto group =
scheduler_utils::getInputsOutputsWithInnerDim(tv, true, false);
for (auto member_tv : group) {
TORCH_INTERNAL_ASSERT(
grouped.count(member_tv) == 0 || member_tv == tv,
"The group of ",
member_tv->toString(),
" is ambiguous. This is likely a bug.");
grouped.emplace(member_tv);
groups.back().emplace_back(member_tv);
if (grouped.count(member_tv) == 0) {
grouped.emplace(member_tv);
groups.back().emplace_back(member_tv);
} else if (member_tv != tv) {
(inline review thread on this change)

Owner: Are there cases where you expect this to happen or you're just deciding not to error in case it does but just return not supported?

zasdfgbnm (author, Aug 23, 2022): The transpose scheduler is currently on the top of hlist, so this check will run on all fusions. And this does happen for some fusion that should be scheduled by the persistent scheduler. I don't know why though.

Owner: Is this from aliasing inputs to outputs on BN kernels?

zasdfgbnm (author): There are many of them:

[  PASSED  ] 685 tests.
[  FAILED  ] 14 tests, listed below:
[  FAILED  ] NVFuserTest.FusionMagicSchedulerLayerNormBackward_CUDA
[  FAILED  ] NVFuserTest.FusionMagicSchedulerRMSNormBackward_CUDA
[  FAILED  ] NVFuserTest.FusionMagicSchedulerBatchNormalization_CUDA
[  FAILED  ] NVFuserTest.FusionMagicSchedulerInstanceNormalization_CUDA
[  FAILED  ] NVFuserTest.FusionSegmentReduceSoftmax_CUDA
[  FAILED  ] NVFuserTest.FusionBNBackwardRepro_CUDA
[  FAILED  ] NVFuserTest.FusionBNRepro_CUDA
[  FAILED  ] NVFuserTest.FusionBNRepro2_CUDA
[  FAILED  ] NVFuserTest.FusionTranslate2Welford_CUDA
[  FAILED  ] NVFuserTest.FusionForceFp16NotAllCast_CUDA
[  FAILED  ] NVFuserTest.FusionForceBf16NotAllCast_CUDA
[  FAILED  ] NVFuserTest.FusionViewPersistentShmoo_CUDA
[  FAILED  ] NVFuserTest.FusionViewConcreteDomain2_CUDA

(end of review thread; the diff continues below)
// Ambiguous grouping. This should only happen at `canSchedule`, so
// we just return a null result which will tell the scheduler to
// reject the fusion
return {};
}
}
}
}
@@ -229,15 +231,26 @@ void maybeBuildVirtualInnerDims(
(merged_size2 < params.tile_size2)) {
return; // no need to split
}
// If one of them are not satisfied, this usually means that the satisfied one
// just merged in a large dim. We split this large dim, so that now we have
// two available dims to satisfy both virtual innermost dim.
// If one of them are not satisfied, there might be two cases:
// 1. The satisfied one just merged in a large dim. If this is the case, We
// split this large dim, so that now we have two available dims to satisfy
// both virtual innermost dim.
// 2. The satisfied one did not merge in anything. For example,
// T0[I0{1024*1024}, I1{2}]
int64_t large_dim;
int64_t split_factor;
if (merged_size1 < params.tile_size1) {
if (params.dims_merged_with_2.empty()) {
// case 2
return;
}
large_dim = params.dims_merged_with_2.back();
split_factor = ceilDiv(params.tile_size1, merged_size1);
} else {
if (params.dims_merged_with_1.empty()) {
// case 2
return;
}
large_dim = params.dims_merged_with_1.back();
split_factor = ceilDiv(params.tile_size2, merged_size2);
}
@@ -435,7 +448,6 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
auto max_unroll_factor_block =
ceilDiv(params->tile_size1 * params->tile_size2, 32);
max_unroll_factor = std::min(max_unroll_factor, max_unroll_factor_block);
max_unroll_factor = scheduler_utils::lastPow2(max_unroll_factor);

// Compute maximum vectorize factor that can be used
size_t vectorize_factor1 = max_unroll_factor;
@@ -456,15 +468,17 @@
vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor);
}

params->vectorize_factor1 =
std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor1);
params->vectorize_factor2 =
std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor2);
params->vectorize_factor1 = scheduler_utils::lastPow2(
std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor1));
params->vectorize_factor2 = scheduler_utils::lastPow2(
std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor2));

params->lparams.bind(params->getThreadsPerBlock(), ParallelType::TIDx);

if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) {
std::cerr << "\n===== Transpose Stats ========\n"
<< "inputs: " << ir_utils::toString(fusion->inputs()) << "\n"
<< "outputs: " << ir_utils::toString(fusion->outputs()) << "\n"
<< "num_elems: " << n_elems << "\n"
<< "n_input_tensors: " << n_input_tensors << "\n"
<< "max_input_dtype_size: " << max_input_dtype_size << "\n"
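On the getTransposeHeuristics hunk above: the power-of-two rounding moves from max_unroll_factor to the final vectorize factors, so each factor is clamped first and then rounded. A small Python sketch of the effect, assuming scheduler_utils::lastPow2 rounds its argument down to the nearest power of two (as the name suggests); helper names are illustrative, not nvfuser code:

def last_pow2(n: int) -> int:
    # Largest power of two <= n, for n >= 1.
    p = 1
    while p * 2 <= n:
        p *= 2
    return p

def old_vectorize_factor(max_unroll: int, tv_limit: int) -> int:
    return min(last_pow2(max_unroll), tv_limit)   # round first; result may not be a power of two

def new_vectorize_factor(max_unroll: int, tv_limit: int) -> int:
    return last_pow2(min(max_unroll, tv_limit))   # clamp first, then round; always a power of two

# Example: max_unroll = 6 with a per-tensor limit of 3 gave 3 before and gives 2 now.
print(old_vectorize_factor(6, 3), new_vectorize_factor(6, 3))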
16 changes: 9 additions & 7 deletions torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
@@ -1390,20 +1390,22 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(

std::vector<TensorView*> vectorizable_tensors;

for (auto input_tv :
ir_utils::filterByType<TensorView>(reference_tv->fusion()->inputs())) {
if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
vectorizable_tensors.push_back(input_tv);
}
}

// We put outputs in front of inputs because this would make the transpose
// scheduler prefer to use output instead of input as reference tensor.
for (auto output_tv :
ir_utils::filterByType<TensorView>(reference_tv->fusion()->outputs())) {
if (hasInnerDim(output_tv, vectorizable_dims, vectorize_pass)) {
vectorizable_tensors.push_back(output_tv);
}
}

for (auto input_tv :
ir_utils::filterByType<TensorView>(reference_tv->fusion()->inputs())) {
if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
vectorizable_tensors.push_back(input_tv);
}
}

return vectorizable_tensors;
}

10 changes: 7 additions & 3 deletions torch/csrc/jit/codegen/cuda/utils.cpp
@@ -169,7 +169,8 @@ auto parseEnableOptions() {
{EnableOption::Complex, false},
{EnableOption::KernelProfile, false},
{EnableOption::LinearDecomposition, false},
{EnableOption::ConvDecomposition, false}};
{EnableOption::ConvDecomposition, false},
{EnableOption::TransposeScheduler, false}};

if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE")) {
c10::string_view options_view(dump_options);
@@ -184,13 +185,16 @@
options_map[EnableOption::LinearDecomposition] = true;
} else if (token == "conv_decomposition") {
options_map[EnableOption::ConvDecomposition] = true;
} else if (token == "transpose_scheduler") {
options_map[EnableOption::TransposeScheduler] = true;
} else {
TORCH_CHECK(
false,
"Invalid disable option: '",
"Invalid enable option: '",
token,
"'\nAvailable options:\n",
"\tcomplex, kernel_profile");
"\tcomplex, kernel_profile, linear_decomposition,",
"conv_decomposition, transpose_scheduler");
}
options_view = (end_pos != c10::string_view::npos)
? options_view.substr(end_pos + 1)
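With the parser change above, the experimental transpose scheduler stays off by default and is opted into through PYTORCH_NVFUSER_ENABLE, the same mechanism as the other options (the registry.cpp change earlier gates canScheduleCompileTime on isOptionEnabled(EnableOption::TransposeScheduler)). A hedged usage sketch; setting the variable before torch is imported is the safe way to make sure the C++ side's getenv sees it:

import os

# Opt in to the experimental transpose scheduler, keeping any options already set.
existing = os.environ.get('PYTORCH_NVFUSER_ENABLE', '')
os.environ['PYTORCH_NVFUSER_ENABLE'] = 'transpose_scheduler' + (',' + existing if existing else '')

import torch  # noqa: E402

# Equivalent from a shell:
#   PYTORCH_NVFUSER_ENABLE=transpose_scheduler python test/test_jit_cuda_fuser.py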
3 changes: 2 additions & 1 deletion torch/csrc/jit/codegen/cuda/utils.h
@@ -79,7 +79,8 @@ enum class EnableOption {
Complex, //! Enable complex support on python
KernelProfile, //! Enable intra-kernel performance profiling
LinearDecomposition, //! Enable linear-bias decomposition
ConvDecomposition //! Enable conv-bias decomposition
ConvDecomposition, //! Enable conv-bias decomposition
TransposeScheduler //! Enable the experimental transpose scheduler
};

TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
Expand Down