diff --git a/benchmarks/cpp/nvfuser/utils.cpp b/benchmarks/cpp/nvfuser/utils.cpp
index 3915f7d65298..227195740d56 100644
--- a/benchmarks/cpp/nvfuser/utils.cpp
+++ b/benchmarks/cpp/nvfuser/utils.cpp
@@ -6,7 +6,7 @@
 
 using namespace torch::jit::fuser::cuda;
 
-std::string toString(ReductionParams rparams) {
+std::string toString(const ReductionParams& rparams) {
   std::stringstream ss;
   ss << (rparams.fastest_dim ? "Red On Fastest Dim // " : "Red On Slow Dim // ")
      << (rparams.persistent_kernel ? "Persistent Kernel // " : "")
@@ -65,7 +65,7 @@ std::string toString(ReductionParams rparams) {
   return ss.str();
 }
 
-std::string toString(PointwiseParams params) {
+std::string toString(const PointwiseParams& params) {
   std::stringstream ss;
   if (params.break_point) {
     ss << "2D Schedule at " << params.break_point << "/";
@@ -89,6 +89,15 @@ std::string toString(PointwiseParams params) {
   return ss.str();
 }
 
+std::string toString(const TransposeParams& params) {
+  std::stringstream ss;
+  ss << "Tile size: (" << params.tile_size1 << "," << params.tile_size2
+     << ")/";
+  ss << "Vectorize size: (" << params.vectorize_factor1 << ","
+     << params.vectorize_factor2 << ")";
+  return ss.str();
+}
+
 std::string toString(const std::shared_ptr<HeuristicParams>& params) {
   auto rparams = std::dynamic_pointer_cast<ReductionParams>(params);
   if (rparams) {
@@ -98,6 +107,10 @@ std::string toString(const std::shared_ptr<HeuristicParams>& params) {
   if (pparams) {
     return toString(*pparams);
   }
+  auto tparams = std::dynamic_pointer_cast<TransposeParams>(params);
+  if (tparams) {
+    return toString(*tparams);
+  }
   TORCH_INTERNAL_ASSERT(
       false,
       "Unknown heuristic parameter type. Did you just added a new heuristic parameter type but forget to update here?");
diff --git a/benchmarks/cpp/nvfuser/utils.h b/benchmarks/cpp/nvfuser/utils.h
index e24fdfb127da..67c5329d9668 100644
--- a/benchmarks/cpp/nvfuser/utils.h
+++ b/benchmarks/cpp/nvfuser/utils.h
@@ -36,8 +36,9 @@ TensorView* makeContigConcreteTensor(
     std::vector<int64_t> shape,
     DataType dtype = DataType::Float);
 
-std::string toString(ReductionParams rparams);
-std::string toString(PointwiseParams params);
+std::string toString(const ReductionParams& rparams);
+std::string toString(const PointwiseParams& params);
+std::string toString(const TransposeParams& params);
 std::string toString(const std::shared_ptr<HeuristicParams>& params);
 std::string toString(LaunchParams lparams);
 
diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index 5df55ee45dfd..d8f2589f81c2 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -41,8 +41,12 @@
 if RUN_NVFUSER and torch.version.cuda is not None:
     CUDA_MAJOR, CUDA_MINOR = (int(x) for x in torch.version.cuda.split('.')[:2])
 
-os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition'
-os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma'
+if 'PYTORCH_NVFUSER_ENABLE' not in os.environ:
+    os.environ['PYTORCH_NVFUSER_ENABLE'] = ""
+os.environ['PYTORCH_NVFUSER_ENABLE'] = 'linear_decomposition,conv_decomposition,' + os.environ['PYTORCH_NVFUSER_ENABLE']
+if 'PYTORCH_NVFUSER_DISABLE' not in os.environ:
+    os.environ['PYTORCH_NVFUSER_DISABLE'] = ""
+os.environ['PYTORCH_NVFUSER_DISABLE'] = 'fallback,fma,' + os.environ['PYTORCH_NVFUSER_DISABLE']
 os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0'
 # TODO: enable complex when we fixes the extremal cases in OpInfo
 # see issue https://github.com/csarofeen/pytorch/issues/1730"
diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
index 2fd96e1313c7..86834a853e60 100644
--- a/torch/csrc/jit/codegen/cuda/executor.cpp
+++ b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -974,14 +974,16 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
       const auto& input_tensor = input.toTensor();
       std::cout << " " << input_tensor.scalar_type() << " "
                 << input.toTensor().sizes()
-                << " (strides = " << input.toTensor().strides() << ")"
+                << " (strides = " << input.toTensor().strides()
+                << ", address = " << input.toTensor().data_ptr() << ")"
                 << std::endl;
     }
   }
   std::cout << "Outputs:" << std::endl;
   for (const auto& output : allocated_outputs) {
     std::cout << " " << output.scalar_type() << " " << output.sizes()
-              << " (strides = " << output.strides() << ")" << std::endl;
+              << " (strides = " << output.strides()
+              << ", address = " << output.data_ptr() << ")" << std::endl;
   }
   std::cout << "Reduction and semaphore buffers:" << std::endl;
   TORCH_INTERNAL_ASSERT(
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
index 55f08882c1f0..7b3edfd74cdb 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -1256,15 +1256,18 @@ class TransposeScheduler : public SchedulerEntry {
   }
 
   static bool canScheduleCompileTime(Fusion* fusion) {
-    // Not enabling this yet. Needs more validation.
-    return false;
-#if 0
+    if (!isOptionEnabled(EnableOption::TransposeScheduler)) {
+      scheduler_debug_utils::canScheduleRejectReason(
+          ScheduleHeuristic::Transpose, "not enabled");
+      return false;
+    }
+
     // Temporarily disallow view in transpose scheduler
     // TODO Add more testing before enabling
     auto view_tvs = scheduler_utils::getViewTVs(fusion);
     if (view_tvs.size() > 0) {
       scheduler_debug_utils::canScheduleRejectReason(
-          ScheduleHeuristic::Reduction, "No support for view op");
+          ScheduleHeuristic::Transpose, "No support for view op");
       return false;
     }
@@ -1293,7 +1296,6 @@ class TransposeScheduler : public SchedulerEntry {
     }
 
     return true;
-#endif
   }
 
   static bool canScheduleRunTime(
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
index 6937685e7fb7..afb5f09f2ec6 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/transpose.cpp
@@ -126,13 +126,15 @@ class DomainMap : public pointwise_utils::DomainMap {
         auto group =
             scheduler_utils::getInputsOutputsWithInnerDim(tv, true, false);
         for (auto member_tv : group) {
-          TORCH_INTERNAL_ASSERT(
-              grouped.count(member_tv) == 0 || member_tv == tv,
-              "The group of ",
-              member_tv->toString(),
-              " is ambiguous. This is likely a bug.");
-          grouped.emplace(member_tv);
-          groups.back().emplace_back(member_tv);
+          if (grouped.count(member_tv) == 0) {
+            grouped.emplace(member_tv);
+            groups.back().emplace_back(member_tv);
+          } else if (member_tv != tv) {
+            // Ambiguous grouping. This should only happen at `canSchedule`, so
+            // we just return a null result which will tell the scheduler to
+            // reject the fusion
+            return {};
+          }
         }
       }
     }
@@ -229,15 +231,26 @@ void maybeBuildVirtualInnerDims(
       (merged_size2 < params.tile_size2)) {
     return; // no need to split
   }
-  // If one of them are not satisfied, this usually means that the satisfied one
-  // just merged in a large dim. We split this large dim, so that now we have
-  // two available dims to satisfy both virtual innermost dim.
+  // If one of them are not satisfied, there might be two cases:
+  // 1. The satisfied one just merged in a large dim. If this is the case, We
+  //    split this large dim, so that now we have two available dims to satisfy
+  //    both virtual innermost dim.
+  // 2. The satisfied one did not merge in anything. For example,
+  //    T0[I0{1024*1024}, I1{2}]
   int64_t large_dim;
   int64_t split_factor;
   if (merged_size1 < params.tile_size1) {
+    if (params.dims_merged_with_2.empty()) {
+      // case 2
+      return;
+    }
     large_dim = params.dims_merged_with_2.back();
     split_factor = ceilDiv(params.tile_size1, merged_size1);
   } else {
+    if (params.dims_merged_with_1.empty()) {
+      // case 2
+      return;
+    }
     large_dim = params.dims_merged_with_1.back();
     split_factor = ceilDiv(params.tile_size2, merged_size2);
   }
@@ -435,7 +448,6 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
   auto max_unroll_factor_block =
       ceilDiv(params->tile_size1 * params->tile_size2, 32);
   max_unroll_factor = std::min(max_unroll_factor, max_unroll_factor_block);
-  max_unroll_factor = scheduler_utils::lastPow2(max_unroll_factor);
 
   // Compute maximum vectorize factor that can be used
   size_t vectorize_factor1 = max_unroll_factor;
@@ -456,15 +468,17 @@ std::shared_ptr<TransposeParams> getTransposeHeuristics(
     vectorize_factor2 = std::min(vectorize_factor2, tv_vectorize_factor);
   }
 
-  params->vectorize_factor1 =
-      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor1);
-  params->vectorize_factor2 =
-      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor2);
+  params->vectorize_factor1 = scheduler_utils::lastPow2(
+      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor1));
+  params->vectorize_factor2 = scheduler_utils::lastPow2(
+      std::min(static_cast<size_t>(max_unroll_factor), vectorize_factor2));
 
   params->lparams.bind(params->getThreadsPerBlock(), ParallelType::TIDx);
 
   if (isDebugDumpEnabled(DebugDumpOption::SchedulerDebug)) {
     std::cerr << "\n===== Transpose Stats ========\n"
+              << "inputs: " << ir_utils::toString(fusion->inputs()) << "\n"
+              << "outputs: " << ir_utils::toString(fusion->outputs()) << "\n"
               << "num_elems: " << n_elems << "\n"
               << "n_input_tensors: " << n_input_tensors << "\n"
               << "max_input_dtype_size: " << max_input_dtype_size << "\n"
diff --git a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
index 1fa0f16a7d78..dea4917fc34d 100644
--- a/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/scheduler/utils.cpp
@@ -1390,13 +1390,8 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(
 
   std::vector<TensorView*> vectorizable_tensors;
 
-  for (auto input_tv :
-       ir_utils::filterByType<TensorView>(reference_tv->fusion()->inputs())) {
-    if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
-      vectorizable_tensors.push_back(input_tv);
-    }
-  }
-
+  // We put outputs in front of inputs because this would make the transpose
+  // scheduler prefer to use output instead of input as reference tensor.
   for (auto output_tv :
        ir_utils::filterByType<TensorView>(reference_tv->fusion()->outputs())) {
     if (hasInnerDim(output_tv, vectorizable_dims, vectorize_pass)) {
@@ -1404,6 +1399,13 @@ std::vector<TensorView*> getInputsOutputsWithInnerDim(
     }
   }
 
+  for (auto input_tv :
+       ir_utils::filterByType<TensorView>(reference_tv->fusion()->inputs())) {
+    if (hasInnerDim(input_tv, vectorizable_dims, vectorize_pass)) {
+      vectorizable_tensors.push_back(input_tv);
+    }
+  }
+
   return vectorizable_tensors;
 }
 
diff --git a/torch/csrc/jit/codegen/cuda/utils.cpp b/torch/csrc/jit/codegen/cuda/utils.cpp
index 91afac7270d7..5e82014c0c38 100644
--- a/torch/csrc/jit/codegen/cuda/utils.cpp
+++ b/torch/csrc/jit/codegen/cuda/utils.cpp
@@ -169,7 +169,8 @@ auto parseEnableOptions() {
       {EnableOption::Complex, false},
       {EnableOption::KernelProfile, false},
       {EnableOption::LinearDecomposition, false},
-      {EnableOption::ConvDecomposition, false}};
+      {EnableOption::ConvDecomposition, false},
+      {EnableOption::TransposeScheduler, false}};
 
   if (const char* dump_options = std::getenv("PYTORCH_NVFUSER_ENABLE")) {
     c10::string_view options_view(dump_options);
@@ -184,13 +185,16 @@ auto parseEnableOptions() {
         options_map[EnableOption::LinearDecomposition] = true;
       } else if (token == "conv_decomposition") {
         options_map[EnableOption::ConvDecomposition] = true;
+      } else if (token == "transpose_scheduler") {
+        options_map[EnableOption::TransposeScheduler] = true;
       } else {
         TORCH_CHECK(
             false,
-            "Invalid disable option: '",
+            "Invalid enable option: '",
             token,
             "'\nAvailable options:\n",
-            "\tcomplex, kernel_profile");
+            "\tcomplex, kernel_profile, linear_decomposition,",
+            "conv_decomposition, transpose_scheduler");
       }
       options_view = (end_pos != c10::string_view::npos)
           ? options_view.substr(end_pos + 1)
diff --git a/torch/csrc/jit/codegen/cuda/utils.h b/torch/csrc/jit/codegen/cuda/utils.h
index 69e3aa4a4d68..679776b383af 100644
--- a/torch/csrc/jit/codegen/cuda/utils.h
+++ b/torch/csrc/jit/codegen/cuda/utils.h
@@ -79,7 +79,8 @@ enum class EnableOption {
  Complex, //! Enable complex support on python
  KernelProfile, //! Enable intra-kernel performance profiling
  LinearDecomposition, //! Enable linear-bias decomposition
-  ConvDecomposition //! Enable conv-bias decomposition
+  ConvDecomposition, //! Enable conv-bias decomposition
+  TransposeScheduler //! Enable the experimental transpose scheduler
 };
 
 TORCH_CUDA_CU_API bool isOptionEnabled(EnableOption option);
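
Usage note: the last three files above wire a new "transpose_scheduler" token into the PYTORCH_NVFUSER_ENABLE parser. As an illustration only (the surrounding script is hypothetical and not part of this change), the experimental scheduler could be switched on from Python the same way test/test_jit_cuda_fuser.py handles the variable, appending to whatever the caller already set rather than overwriting it:

    import os

    # Keep any options already set in the environment, then add the new token.
    # nvfuser reads PYTORCH_NVFUSER_ENABLE once, the first time an option is
    # queried, so this must run before the first fused kernel is compiled.
    existing = os.environ.get('PYTORCH_NVFUSER_ENABLE', '')
    os.environ['PYTORCH_NVFUSER_ENABLE'] = 'transpose_scheduler,' + existing

    import torch  # then script and run the model as usual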