Add nvFuser support for torch.native_batch_norm (#85562)

IvanYashchuk · pytorchmergebot · commit 68a6113248ac · 2022-10-03T15:03:08.000Z
This PR adds an nvFuser implementation of batch_norm, exposed as `torch.ops.nvprims.native_batch_norm`, because there is no reference implementation for it yet (#81191) and in-place copy is not supported (#84545). Pull Request resolved: #85562. Approved by: https://github.com/kevinstephano, https://github.com/ngimel
diff --git a/functorch/test/test_ops.py b/functorch/test/test_ops.py
@@ -395,6 +395,7 @@ def wrapped_fn(*args, **kwargs):
         skip('nn.functional.max_unpool1d'),  # fails everywhere except on mac
         skip('nn.functional.max_unpool2d'),  # fails everywhere except on windows
         skip('nn.functional.max_unpool3d'),  # fails everywhere except on mac
+        xfail("native_batch_norm"),
 
         xfail('nn.functional.rrelu')  # in-place test errors out with no formula implemented
     }))
@@ -643,6 +644,7 @@ def fn(inp, *args, **kwargs):
         xfail("nn.functional.batch_norm", 'without_cudnn'),
         # view doesn't work on sparse
         xfail("to_sparse"),
+        xfail("native_batch_norm"),
     }))
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
@@ -725,6 +727,7 @@ def vjp_of_vjp(*args_and_cotangents):
         # ---------------------------- BUGS ------------------------------------
         # All of the following are bugs and need to be fixed
         skip('linalg.svdvals'),  # # really annoying thing where it passes correctness check but not has_batch_rule
+        skip("native_batch_norm"),
         xfail('__getitem__', ''),  # dynamic error
         xfail('linalg.eig'),  # Uses aten::allclose
         xfail('linalg.householder_product'),  # needs select_scatter
@@ -833,6 +836,7 @@ def test_vmapvjp(self, device, dtype, op):
         # erroring because running_mean and running_var aren't differentiable
         xfail('nn.functional.batch_norm'),
         xfail('nn.functional.batch_norm', 'without_cudnn'),
+        xfail("native_batch_norm"),
         # ----------------------------------------------------------------------
     }
 
@@ -1030,6 +1034,7 @@ def test():
         xfail('linalg.vecdot', ''),
         xfail('segment_reduce', 'lengths'),
         xfail('sparse.sampled_addmm', ''),
+        xfail("native_batch_norm"),
     }))
     def test_vmapvjp_has_batch_rule(self, device, dtype, op):
         if not op.supports_autograd:
@@ -1095,6 +1100,7 @@ def test():
         xfail('nn.functional.dropout3d', ''),
         xfail('as_strided_scatter', ''),
         xfail('sparse.sampled_addmm', ''),
+        xfail("native_batch_norm"),
     }))
     def test_vjpvmap(self, device, dtype, op):
         # NB: there is no vjpvmap_has_batch_rule test because that is almost
@@ -1338,6 +1344,10 @@ def reference(primals, cotangents, primals_tangents, cotangents_tangents):
         xfail('to'),  # RuntimeError: required rank 4 tensor to use channels_last format
         xfail('to_sparse'),  # Forward AD not implemented and no decomposition
         xfail('view_as_complex'),  # RuntimeError: Tensor must have a last dimension with stride 1
+        # RuntimeError: Batch norm got a batched tensor as
+        # input while the running_mean or running_var, which will be updated in
+        # place, were not batched.
+        xfail("native_batch_norm"),
     }))
     @ops(op_db + additional_op_db, allowed_dtypes=(torch.float,))
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
diff --git a/functorch/test/test_vmap.py b/functorch/test/test_vmap.py
@@ -3287,6 +3287,7 @@ def test():
     @toleranceOverride({torch.float32: tol(atol=1e-04, rtol=1e-04)})
     @skipOps('TestVmapOperatorsOpInfo', 'test_vmap_exhaustive', vmap_fail.union({
         xfail('cat'),
+        xfail('native_batch_norm'),
     }))
     def test_vmap_exhaustive(self, device, dtype, op):
         # needs to be fixed
@@ -3306,6 +3307,7 @@ def test_vmap_exhaustive(self, device, dtype, op):
         xfail('cat'),
         xfail('complex'),
         xfail('copysign'),
+        xfail('native_batch_norm'),
         xfail('histogram'),
         xfail('index_fill'),
         xfail('nansum'),
diff --git a/test/test_prims.py b/test/test_prims.py
@@ -548,6 +548,69 @@ def func(a):
                 self.assertFalse(node.target == torch.ops.prims.add.default)
                 self.assertFalse(node.target == torch.ops.aten.add.default)
 
+    @onlyCUDA
+    @skipCUDAIfRocm
+    @dtypes(torch.float32, torch.float64)
+    def test_native_batch_norm_nvprims(self, device, dtype):
+        from torch._prims.context import TorchRefsNvfuserCapabilityMode
+        from torch._prims.executor import execute
+
+        # This test verifies that native_batch_norm is translated into nvprims
+        # and can be executed with nvFuser
+        from torch.fx.experimental.proxy_tensor import make_fx
+        from torch.testing._internal.common_methods_invocations import (
+            sample_inputs_native_batch_norm,
+        )
+
+        samples = sample_inputs_native_batch_norm(
+            None, device, dtype, requires_grad=False
+        )
+        batch_norms = [
+            torch.native_batch_norm,
+            torch.ops.aten.native_batch_norm,
+            torch.ops.aten.native_batch_norm.default,
+            torch.ops.nvprims.native_batch_norm.default,
+        ]
+        for sample, batch_norm in product(samples, batch_norms):
+            if sample.input.numel() == 0:
+                continue
+
+            def func(
+                input, weight, bias, running_mean, running_var, training, momentum, eps
+            ):
+                return batch_norm(
+                    input,
+                    weight,
+                    bias,
+                    running_mean,
+                    running_var,
+                    training,
+                    momentum,
+                    eps,
+                )
+
+            with TorchRefsNvfuserCapabilityMode():
+                gm = make_fx(func)(sample.input, *sample.args)
+
+            call_function_nodes = list(
+                filter(lambda n: n.op == "call_function", gm.graph.nodes)
+            )
+            includes_aten_batch_norm = any(
+                torch.ops.aten.native_batch_norm.default == node.target
+                for node in call_function_nodes
+            )
+            self.assertFalse(includes_aten_batch_norm)
+
+            includes_nvprims_batch_norm = any(
+                torch.ops.nvprims.native_batch_norm.default == node.target
+                for node in call_function_nodes
+            )
+            self.assertTrue(includes_nvprims_batch_norm)
+
+            # Check that the graph can be executed with nvFuser
+            out = execute(gm, sample.input, *sample.args, executor="strictly_nvfuser")
+            self.assertEqual(out, gm(sample.input, *sample.args))
+
     # decomposition of native_batch_norm_backward uses a casting, which prevents nvprim lowering on CPU build
     @onlyCUDA
     @dtypes(torch.float32, torch.float16)
diff --git a/torch/_prims/context.py b/torch/_prims/context.py
@@ -265,6 +265,12 @@ def _is_var_mean(self, func):
             and "aten.var_mean" in str(func)
         )
 
+    def _is_native_batch_norm(self, func):
+        return "torch.native_batch_norm" == torch.overrides.resolve_name(func) or (
+            func == torch.ops.aten.native_batch_norm.default
+            or func == torch.ops.aten.native_batch_norm
+        )
+
     def _is_rand_like(self, func):
         result = "torch.rand_like" == torch.overrides.resolve_name(func) or (
             func == torch.ops.aten.rand_like or func == torch.ops.aten.rand_like.default
@@ -283,9 +289,14 @@ def __torch_function__(
         # First we intercept calls for nvfuser-specific prims bypassing generic torch._refs
         if self._is_var_mean(orig_func):
             return torch.ops.nvprims.var_mean(*args, **kwargs)
+
+        if self._is_native_batch_norm(orig_func):
+            return torch.ops.nvprims.native_batch_norm(*args, **kwargs)
+
         if self._is_rand_like(orig_func):
             if len(kwargs) > 0:
                 warn("rand_like has ignored kwars!")
             return torch.ops.nvprims.rand_like(*args)
+
         # Then we use TorchRefsMode to interpret the rest
         return super().__torch_function__(orig_func, types, args, kwargs)
diff --git a/torch/_prims/nvfuser_executor.py b/torch/_prims/nvfuser_executor.py
@@ -136,6 +136,18 @@ def run_node(self, node):
                     args, kwargs = self.fetch_args_kwargs_from_env(node)
                     args = [args[0], original_shape, args[1]]
                     return self.call_function(node.target, args, node.kwargs)
+
+                if node.target in [
+                    torch.ops.nvprims.native_batch_norm,
+                    torch.ops.nvprims.native_batch_norm.default,
+                ]:
+                    args, kwargs = self.fetch_args_kwargs_from_env(node)
+                    assert len(args) == 8
+                    training = args[5]
+                    args6_end = tuple(map(_to_nvfuser_constant, args[6:]))
+                    args = args[:5] + (training,) + args6_end
+                    return node.target.impl_nvfuser(fd, *args, **kwargs)
+
                 return super().run_node(node)
 
             def call_function(self, target, args, kwargs):
diff --git a/torch/_prims/nvfuser_prims.py b/torch/_prims/nvfuser_prims.py
@@ -210,6 +210,29 @@ def _{fname}_nvfuser(fd, a, b, c):
     )
 
 
+def _native_batch_norm_nvfuser(
+    fd, input, weight, bias, running_mean, running_var, training, momentum, eps
+):
+    if weight is None:
+        weight = fd.define_null_tensor()
+    if bias is None:
+        bias = fd.define_null_tensor()
+    if running_mean is None:
+        running_mean = fd.define_null_tensor()
+    if running_var is None:
+        running_var = fd.define_null_tensor()
+    return fd.ops.batch_norm(
+        input,
+        weight,
+        bias,
+        running_mean,
+        running_var,
+        training,
+        momentum,
+        eps,
+    )
+
+
 def _broadcast_in_dim_nvfuser(
     fd: Any,
     a: TensorLikeType,
@@ -299,6 +322,7 @@ def _amin_nvfuser(
     return fd.ops.min(a, dims, keep_dims)
 
 
+_nvfuser_impls["native_batch_norm"] = _native_batch_norm_nvfuser
 _nvfuser_impls["broadcast_in_dim"] = _broadcast_in_dim_nvfuser
 _nvfuser_impls["convert_element_type"] = _convert_element_type_nvfuser
 _nvfuser_impls["transpose"] = _transpose_nvfuser
@@ -312,6 +336,36 @@ def _amin_nvfuser(
 _nvfuser_impls["amin"] = _amin_nvfuser
 
 
+def register_native_batch_norm():
+    """This function is used to register the native_batch_norm function in torch.ops.nvprims module."""
+    name = "native_batch_norm"
+
+    nvprim.define(
+        f"{name}(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, "
+        + "bool training, float momentum, float eps)"
+        + " -> (Tensor, Tensor, Tensor)"
+    )
+
+    def _prim_impl(
+        input, weight, bias, running_mean, running_var, training, momentum, eps
+    ):
+        return torch.native_batch_norm(
+            input, weight, bias, running_mean, running_var, training, momentum, eps
+        )
+
+    nvprim_impl.impl(name, _prim_impl)
+    nvprim_autograd_impl.impl(
+        name, backwards_not_supported(torch.ops.nvprims.native_batch_norm.default)
+    )
+
+    prim_packet = torch.ops.nvprims.native_batch_norm
+    prim = prim_packet.default
+    for p in (prim_packet, prim):
+        p.__doc__ = "Computes batch normalization."
+        p.impl_nvfuser = _nvfuser_impls["native_batch_norm"]
+        p.return_type = torch._prims_common.RETURN_TYPE.NEW  # type: ignore[attr-defined]
+
+
 def register_rand_like():
     name = "rand_like"
 
@@ -471,6 +525,7 @@ def _var_mean_autograd(
 def register_nvprims():
     """Registers all nvFuser primitives in the torch.ops.nvprims module."""
     register_var_mean()
+    register_native_batch_norm()
     register_rand_like()
 
     for name in nvprim_names:
diff --git a/torch/csrc/jit/codegen/cuda/ops/normalization.cpp b/torch/csrc/jit/codegen/cuda/ops/normalization.cpp
@@ -587,8 +587,11 @@ ForwardNormResult batch_norm(
     auto invstd_bcast = broadcast(unbiased_invstd, broadcast_mask);
 
     // During inference, mean/invstd output are empty tensors
-    mean = TensorViewBuilder().shape(std::vector<int64_t>{0}).build();
-    invstd = TensorViewBuilder().shape(std::vector<int64_t>{0}).build();
+    // on CPU, but not on CUDA. We need to make sure we have the same
+    // behavior as with eager mode on CUDA.
+    mean = set(running_mean); // use set to avoid "trivial input forwarding NOT
+                              // IMPLEMENTED" error
+    invstd = unbiased_invstd;
     y = mul(x_sub_mean, invstd_bcast);
   }
 
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_interface.cpp
@@ -32,7 +32,12 @@ void FusionInterface::addOutput(Nvf::Val* output) const {
 
 std::vector<at::Tensor> FusionInterface::execute(
     const at::ArrayRef<c10::IValue>& inputs) const {
-  return fusionExecutorCachePtr()->runFusionWithInputs(inputs);
+  // aliasOutputToInput always adds Tensors as outputs that we don't want
+  // to return to the user. We need to remove them.
+  auto count_output_aliases = fusionPtr()->getOutputAliasIndices().size();
+  auto result = fusionExecutorCachePtr()->runFusionWithInputs(inputs);
+  result.erase(result.begin(), result.begin() + count_output_aliases);
+  return result;
 }
 
 Nvf::FusionGuard FusionInterface::guard() const {
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h b/torch/csrc/jit/codegen/cuda/python_frontend/fusion_record.h
diff --git a/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp b/torch/csrc/jit/codegen/cuda/python_frontend/python_bindings.cpp
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py