Merged
26 commits
ea115b1
added trivial forward support
jjsjann123 Sep 26, 2022
0ea8c19
fixing build
jjsjann123 Sep 26, 2022
fcb128e
added tests
jjsjann123 Sep 26, 2022
fc463c8
fixing tests
jjsjann123 Sep 26, 2022
7fb63e1
WAR added for segmented kernel with trivial input forward
jjsjann123 Sep 26, 2022
9436c2b
removing redundant code
jjsjann123 Sep 27, 2022
e839f7e
trivial forwarding: remove allocation
jjsjann123 Sep 27, 2022
20d1566
patching cache id on kernel argument holder
jjsjann123 Sep 27, 2022
1ca3058
debug
jjsjann123 Sep 27, 2022
4466651
group cache id reused for segmented fusion
jjsjann123 Sep 27, 2022
4411118
removing debug prints
jjsjann123 Sep 27, 2022
d44d0dd
patching output mismatch
jjsjann123 Sep 27, 2022
a7c4bf9
Disable cache
jjsjann123 Sep 27, 2022
f1bf406
removing commented code. fixing trivial forwarding for non-segmented …
jjsjann123 Sep 27, 2022
6aa9c90
putting comment: WIP
jjsjann123 Sep 27, 2022
a3e16a2
adding trivial forwarding documentation
jjsjann123 Sep 27, 2022
219b948
lintrunner
jjsjann123 Sep 27, 2022
9063a87
Merge remote-tracking branch 'origin/devel' into HEAD
jjsjann123 Sep 27, 2022
3e6be64
lintrunner
jjsjann123 Sep 28, 2022
64e5736
code cleaning per review comments
jjsjann123 Sep 28, 2022
8e4841e
more review comments addressed
jjsjann123 Sep 28, 2022
4fa5fbf
code cleaning; test cleaning
jjsjann123 Sep 28, 2022
ff6c29e
Merge remote-tracking branch 'csarofeen/devel' into HEAD
jjsjann123 Sep 28, 2022
4b6583e
lintrunner
jjsjann123 Sep 28, 2022
0517d0a
Merge remote-tracking branch 'csarofeen/devel' into HEAD
jjsjann123 Sep 30, 2022
7a3ddb0
removing set-copy as a WAR of lack of trivial forwarding support
jjsjann123 Sep 30, 2022
30 changes: 13 additions & 17 deletions torch/csrc/jit/codegen/cuda/executor.cpp
@@ -741,29 +741,24 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
}

std::vector<at::Tensor> FusionExecutor::allocOutputs(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices) {
FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs");
const auto kernel = lowered_->kernel();
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
std::vector<at::Tensor> outputs;
TORCH_INTERNAL_ASSERT(
args.size() == kernel->inputs().size(),
"kernel arguments length does not match runtime arguments.");
for (const auto out_i : c10::irange(kernel->outputs().size())) {
// TODO: FIX this short-cut where we trivially forward inputs to outputs
if (kernel->outputs()[out_i]->isFusionInput()) {
TORCH_INTERNAL_ASSERT(false, "trivial input forwarding NOT IMPLEMENTED");
// for (auto inp_i : c10::irange(kernel->inputs().size())) {
// if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
// TORCH_INTERNAL_ASSERT(
// inp_i < inputs.size(),
// "Issue with an input showing up as output, couldn't find
// input.");
// TORCH_INTERNAL_ASSERT(
// inputs[inp_i].isTensor(),
// "Cannot register a scalar as an output in a fusion.");
// outputs.push_back(inputs[inp_i].toTensor());
// break;
// }
// }
// Push an empty tensor for trivial forwarding; the actual forwarding is
// handled in integration, see step 1 - note [trivial forwarding]
c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex());
const auto tensor_options =
at::TensorOptions().dtype(at::kFloat).device(device);
outputs.emplace_back(at::empty({0}, tensor_options));
} else {
TORCH_INTERNAL_ASSERT(
kernel->outputs()[out_i]->isA<TensorView>(),
@@ -803,7 +798,8 @@ KernelArgumentHolder FusionExecutor::evaluateOutputSizes(
meta_options.device = c10::Device(DeviceType::Meta, 0);

for (const auto out_i : c10::irange(kernel->outputs().size())) {
// If the output is just trivially the input, just "copy" it over.
// If the output is just trivially the input, just "copy" it over, see note
// [trivial forwarding]
if (kernel->outputs()[out_i]->isFusionInput()) {
for (auto inp_i : c10::irange(kernel->inputs().size())) {
if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
@@ -1124,7 +1120,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(

auto& output_alias_indices = output_alias_indices_entry.get();

allocated_outputs = allocOutputs(expr_eval, output_alias_indices);
allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

for (const auto& entry : alias_indices) {
auto aliased_output_index = entry.first;
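Taken together with note [ trivial forwarding ] in kernel_cache.cpp further down, the allocOutputs change boils down to the control flow below. This is only a condensed sketch of the hunk above, not new behavior; the real-output branch is elided since it is unchanged by this PR.

```cpp
// Condensed sketch of FusionExecutor::allocOutputs after this change.
std::vector<at::Tensor> outputs;
for (const auto out_i : c10::irange(kernel->outputs().size())) {
  if (kernel->outputs()[out_i]->isFusionInput()) {
    // Step 1 of note [ trivial forwarding ]: keep the output slot so the
    // kernel arguments still line up, but back it with an empty tensor,
    // since the kernel never writes to a trivially forwarded output.
    c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex());
    const auto options = at::TensorOptions().dtype(at::kFloat).device(device);
    outputs.emplace_back(at::empty({0}, options));
  } else {
    // Real outputs are allocated from the expression evaluator as before
    // (unchanged by this PR, elided here).
  }
}
```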
1 change: 1 addition & 0 deletions torch/csrc/jit/codegen/cuda/executor.h
@@ -220,6 +220,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
// skip allocating real storage for those, but still maintain its spot to
// maintain the indexing from output aliases to inputs
std::vector<at::Tensor> allocOutputs(
const KernelArgumentHolder& args,
kir::ExpressionEvaluator& expr_eval,
const std::unordered_set<int>& alias_indices = {});

33 changes: 29 additions & 4 deletions torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -649,11 +649,16 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
group_outputs.size() == group_runtime_outputs.size(),
"output size does not match");
for (const size_t group_out_i : c10::irange(group_outputs.size())) {
output_holder[group_outputs[group_out_i]] =
group_runtime_outputs[group_out_i];
// Trivial forwarding outputs an empty tensor to save bandwidth; skip the
// tensor_map update for those, since all future uses should refer to the
// original input tensor. See note [trivial forwarding]
if (!group_outputs[group_out_i]->isFusionInput()) {
output_holder[group_outputs[group_out_i]] =
group_runtime_outputs[group_out_i];

args.push(group_runtime_outputs[group_out_i]);
tensor_map.emplace(group_outputs[group_out_i], args.back());
args.push(group_runtime_outputs[group_out_i]);
tensor_map.emplace(group_outputs[group_out_i], args.back());
}
}
}

@@ -669,10 +674,30 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
if (iter != output_holder.end()) {
fusion_outputs.push_back(iter->second);
} else if (output->isFusionInput()) {
// Note [ trivial forwarding ]
//
// Background:
// nvfuser codegen doesn't handle aliases at all. When a fusion forwards
// an input to an output without any operation on it, this is a no-op for
// codegen and the output tensor is never written to. However, codegen
// cannot "forward" an input to an output either, since all outputs are
// allocated in integration. If we do not special case it, we end up with
// a "fresh" tensor allocated for the forwarded input.
//
// Approach:
// There are two aspects to the support:
// step 1. Codegen handles forwarding implicitly. Forwarded inputs don't
// have any producer in the IR, hence the output argument is not used in
// the generated code. The kernel still requires an argument as a
// place-holder so that all arguments map correctly.
// step 2. Integration handles the trivial forwarding of inputs. When we
// put together `fusion_outputs` for a given fusion, outputs that are just
// fusion inputs directly return the corresponding input tensor.
const auto iter = tensor_map.find(output);
TORCH_INTERNAL_ASSERT(
iter != tensor_map.end(), "Cannot find output as aliased input");
auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
// See step 2 - note [ trivial forwarding ]
fusion_outputs.push_back(arg->getTensor());
} else {
bool empty_type_check = output->getDataType().has_value() &&
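For reference, step 2 of the note reduces to the pattern below when integration assembles the fusion outputs. A minimal sketch of the hunk above, assuming a loop over the fusion's outputs with `tensor_map` and `fusion_outputs` as in the surrounding code:

```cpp
// Sketch of the integration-side handling (step 2 of note [ trivial forwarding ]).
if (output->isFusionInput()) {
  // The kernel only produced an empty place-holder for this output, so
  // return the original input tensor mapped for this Val instead.
  const auto iter = tensor_map.find(output);
  TORCH_INTERNAL_ASSERT(
      iter != tensor_map.end(), "Cannot find output as aliased input");
  auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
  fusion_outputs.push_back(arg->getTensor());
}
```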
4 changes: 2 additions & 2 deletions torch/csrc/jit/codegen/cuda/ops/normalization.cpp
@@ -589,7 +589,7 @@ ForwardNormResult batch_norm(
// During inference, mean/invstd output are empty tensors
// on CPU, but not on CUDA. We need to make sure we have the same
// behavior as with eager mode on CUDA.
mean = set(running_mean);
mean = running_mean;
invstd = unbiased_invstd;
y = mul(x_sub_mean, invstd_bcast);
}
@@ -844,7 +844,7 @@ ForwardNormResult instance_norm(
// During inference, mean/invstd output are empty tensors
// on CPU, but not on CUDA. We need to make sure we have the same
// behavior as with eager mode on CUDA.
mean = set(running_mean);
mean = running_mean;
invstd = unbiased_invstd;
y = mul(x_sub_mean, invstd_bcast);
}
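The two normalization changes above drop the `set()` copy that only existed as a workaround for the missing trivial-forwarding support (see the last commit in the list); with this PR the running stats can be used as fusion outputs directly. A minimal sketch of the pattern this relies on, using a hypothetical standalone fusion rather than the actual batch_norm code; the new test below exercises the same pattern end to end:

```cpp
// Minimal sketch, assuming the usual nvFuser namespaces and test helpers
// (Fusion, FusionGuard, makeSymbolicTensor) are in scope; not code from this PR.
Fusion fusion;
FusionGuard fg(&fusion);
TensorView* running_mean = makeSymbolicTensor(1);
fusion.addInput(running_mean);
// Before: fusion.addOutput(set(running_mean));  // explicit copy as a WAR
fusion.addOutput(running_mean);  // now trivially forwarded, no copy needed
```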
28 changes: 28 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -26026,6 +26026,34 @@ TEST_F(NVFuserTest, FusionInlineAt_CUDA) {
testValidate(fusion, {out}, {t0}, {t0.sin().cos()}, __LINE__, __FILE__);
}

TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

TensorView* tv0 = makeConcreteTensor({-1, -1});
TensorView* tv1 = makeConcreteTensor({-1, -1});
fusion->addInput(tv0);
fusion->addInput(tv1);
// Note: tv2 is not needed. Kept it here since previously there was an
// assertion from sorting in codegen.
auto tv2 = add(tv1, IrBuilder::create<Double>(3.141));
fusion->addOutput(tv0);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({10, 4}, options);
at::Tensor t1 = at::randn({10, 4}, options);

FusionExecutorCache fec(std::move(fusion_ptr));
auto cg_outputs = fec.runFusionWithInputs({t0, t1});

testValidate(fusion, cg_outputs, {t0, t1}, {t0}, __LINE__, __FILE__);

// Second run to ensure cache hit handles trivial forwarding properly
auto cg_outputs2 = fec.runFusionWithInputs({t0, t1});
testValidate(fusion, cg_outputs2, {t0, t1}, {t0}, __LINE__, __FILE__);
}

} // namespace jit
} // namespace torch
#endif // #if defined(USE_CUDA)