Kernel args patch to show zero_init buffer (#1809)

jjsjann123 · web-flow · commit 3ed8330f881f · 2022-07-07T16:25:56.000-07:00
Updated the kernel args printout to indicate zero_init buffers, which explains the elementwise kernels that run before the fusion kernel.

Changes from this

```
Reduction and semaphore buffers:
  Float [16]
  Long [1]
```

To

```
Reduction and semaphore buffers:
  Float [16] is_zero_initialized: 0
  Long [1] is_zero_initialized: 1
```

`is_zero_initialized: 1` on a given buffer means that an extra init kernel is needed to zero-fill that buffer.
diff --git a/torch/csrc/jit/codegen/cuda/README.md b/torch/csrc/jit/codegen/cuda/README.md
@@ -187,7 +187,7 @@ There're a few debug dump that could be turned on via environment variables. Loo
 1. `dump_eff_bandwidth`: print out effective bandwidth of each generated kernel. This naively measure the kernel time divided by I/O buffer size and is a good/simple metric of performance for bandwidth bound kernels
 2. `cuda_kernel`: print out generated cuda kernels
 3. `launch_param`: print out launch config of generated kernels
-4. `print_args`: print out input output tensors of executed codegen kernels
+4. `kernel_args`: print out input/output/buffer tensors of all executed codegen kernels. For buffers, we also indicate whether they are zero-initialized, which hints at an extra kernel that fills the tensor before the codegen kernels run.
 
 ### FAQs
 
diff --git a/torch/csrc/jit/codegen/cuda/executor.cpp b/torch/csrc/jit/codegen/cuda/executor.cpp
@@ -790,13 +790,15 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
                 at::TensorOptions()
                     .dtype(executor_entry->buffer_types[i])
                     .device(options_.device)));
+            global_buffers.zero_init.push_back(true);
           } else {
             global_buffers.buffers.push_back(at::native::empty_cuda(
                 executor_entry->buffer_sizes[i],
                 executor_entry->buffer_types[i],
                 c10::nullopt,
                 options_.device,
                 c10::nullopt));
+            global_buffers.zero_init.push_back(false);
           }
         }
       }
@@ -984,9 +986,14 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
                 << " (strides = " << output.strides() << ")" << std::endl;
     }
     std::cout << "Reduction and semaphore buffers:" << std::endl;
-    for (const auto& buffer : global_buffers.buffers) {
+    TORCH_INTERNAL_ASSERT(
+        global_buffers.buffers.size() == global_buffers.zero_init.size(),
+        "global_buffer buffer & zero_init container should have identical sizes");
+    for (const auto i : c10::irange(global_buffers.buffers.size())) {
+      const auto& buffer = global_buffers.buffers[i];
+      const auto& zero_init = global_buffers.zero_init[i];
       std::cout << "  " << buffer.scalar_type() << " " << buffer.sizes()
-                << std::endl;
+                << " is_zero_initialized: " << zero_init << std::endl;
     }
   }
 

Original file line number	Diff line number	Diff line change
`@@ -790,13 +790,15 @@ std::vector<at::Tensor> FusionExecutor::runFusion(`
`790`	`790`	`at::TensorOptions()`
`791`	`791`	`.dtype(executor_entry->buffer_types[i])`
`792`	`792`	`.device(options_.device)));`
	`793`	`+ global_buffers.zero_init.push_back(true);`
`793`	`794`	`} else {`
`794`	`795`	`global_buffers.buffers.push_back(at::native::empty_cuda(`
`795`	`796`	`executor_entry->buffer_sizes[i],`
`796`	`797`	`executor_entry->buffer_types[i],`
`797`	`798`	`c10::nullopt,`
`798`	`799`	`options_.device,`
`799`	`800`	`c10::nullopt));`
	`801`	`+ global_buffers.zero_init.push_back(false);`
`800`	`802`	`}`
`801`	`803`	`}`
`802`	`804`	`}`
`@@ -984,9 +986,14 @@ std::vector<at::Tensor> FusionExecutor::runFusion(`
`984`	`986`	`<< " (strides = " << output.strides() << ")" << std::endl;`
`985`	`987`	`}`
`986`	`988`	`std::cout << "Reduction and semaphore buffers:" << std::endl;`
`987`		`- for (const auto& buffer : global_buffers.buffers) {`
	`989`	`+ TORCH_INTERNAL_ASSERT(`
	`990`	`+ global_buffers.buffers.size() == global_buffers.zero_init.size(),`
	`991`	`+ "global_buffer buffer & zero_init container should have identical sizes");`
	`992`	`+ for (const auto i : c10::irange(global_buffers.buffers.size())) {`
	`993`	`+ const auto& buffer = global_buffers.buffers[i];`
	`994`	`+ const auto& zero_init = global_buffers.zero_init[i];`
`988`	`995`	`std::cout << " " << buffer.scalar_type() << " " << buffer.sizes()`
`989`		`- << std::endl;`
	`996`	`+ << " is_zero_initialized: " << zero_init << std::endl;`
`990`	`997`	`}`
`991`	`998`	`}`
`992`	`999`