Skip to content

Commit 20a36c1

Browse files
authored
Improve Nsight Compute support (#1855)
1 parent 4059103 commit 20a36c1

File tree

4 files changed

+18
-7
lines changed

4 files changed

+18
-7
lines changed

torch/csrc/jit/codegen/cuda/executor.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,9 @@ std::string FusionExecutor::getStructuredCode(const std::string& kernel) {
9393
std::cout << "\n======= Codegen output for kernel: " << kernelName()
9494
<< " =======\n\n"
9595
<< code << "\n======================================\n\n";
96-
} else if (isDebugDumpEnabled(DebugDumpOption::CudaToFile)) {
96+
}
97+
if (isDebugDumpEnabled(DebugDumpOption::CudaToFile) ||
98+
isDebugDumpEnabled(DebugDumpOption::DebugInfo)) {
9799
std::stringstream file_name;
98100
file_name << "__tmp_kernel" << fusion_id_ << ".cu";
99101
std::cout << "PRINTING: " << file_name.str() << std::endl;

torch/csrc/jit/codegen/cuda/executor_utils.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -915,9 +915,11 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
915915
nvrtcProgram program; // NOLINT(cppcoreguidelines-init-variables)
916916

917917
{
918+
std::stringstream ss;
919+
ss << "__tmp_kernel" << id << ".cu";
918920
FUSER_PERF_SCOPE("executor_utils::NvrtcCreateProgram");
919921
AT_CUDA_NVRTC_CHECK(at::globalContext().getNVRTC().nvrtcCreateProgram(
920-
&program, code.c_str(), nullptr, 0, nullptr, nullptr));
922+
&program, code.c_str(), ss.str().c_str(), 0, nullptr, nullptr));
921923
}
922924

923925
ResourceGuard holdProgram([&] {
@@ -964,11 +966,13 @@ std::pair<NvrtcFunction, std::string> nvrtcCompile(
964966
args.push_back("--fmad=true");
965967
}
966968
#endif
967-
968-
#ifndef NDEBUG
969969
// Add line info to generated kernels
970-
args.push_back("-lineinfo");
971-
#else
970+
if (isDebugDumpEnabled(DebugDumpOption::DebugInfo)) {
971+
args.push_back("-lineinfo");
972+
args.push_back("-G");
973+
args.push_back("--dopt=on");
974+
}
975+
#ifdef NDEBUG
972976
// Avoid excessive register usage from assertion
973977
args.push_back("-DNDEBUG");
974978
#endif

torch/csrc/jit/codegen/cuda/utils.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ auto parseDebugDumpOptions() {
2323
{DebugDumpOption::CudaKernel, false},
2424
{DebugDumpOption::CudaFull, false},
2525
{DebugDumpOption::CudaToFile, false},
26+
{DebugDumpOption::DebugInfo, false},
2627
{DebugDumpOption::LaunchParam, false},
2728
{DebugDumpOption::FusionSegments, false},
2829
{DebugDumpOption::FusionSegmenterLog, false},
@@ -58,6 +59,8 @@ auto parseDebugDumpOptions() {
5859
options_map[DebugDumpOption::CudaFull] = true;
5960
} else if (token == "cuda_to_file") {
6061
options_map[DebugDumpOption::CudaToFile] = true;
62+
} else if (token == "debug_info") {
63+
options_map[DebugDumpOption::DebugInfo] = true;
6164
} else if (token == "launch_param") {
6265
options_map[DebugDumpOption::LaunchParam] = true;
6366
} else if (token == "segmented_fusion") {
@@ -95,7 +98,7 @@ auto parseDebugDumpOptions() {
9598
token,
9699
"'\nAvailable options:\n",
97100
"\tfusion_ir, fusion_ir_math, kernel_ir, ca_map, cuda_kernel, cuda_full,\n",
98-
"\tcuda_to_file, launch_param, segmented_fusion, fusion_args,\n",
101+
"\tcuda_to_file, debug_info, launch_param, segmented_fusion, fusion_args,\n",
99102
"\tkernel_args, dump_eff_bandwidth, draw_segmented_fusion,\n",
100103
"\tscheduler_params, parallel_dimensions, buffer_reuse_verbose,\n",
101104
"\tptxas_verbose, halo, segmenter_logging, perf_debug_verbose\n",

torch/csrc/jit/codegen/cuda/utils.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ enum class DebugDumpOption {
2929
CudaKernel, //!< Dump the generated CUDA C++ kernel code
3030
CudaFull, //!< Dump the complete CUDA C++ code
3131
CudaToFile, //!< Dump CUDA Strings to File
32+
DebugInfo, //!< Embed line info and debug info into the compiled kernel, and dump
33+
//!< the full CUDA C++ code
3234
LaunchParam, //!< Dump the Launch parameters of kernel
3335
FusionSegments, //!< Dump Segmented Fusion Graph
3436
FusionSegmenterLog, //!< Dump Detailed Segmenter Logging

0 commit comments

Comments
 (0)