
Commit 40184b2

jataylo authored and pytorchmergebot committed
[ROCm] enabling miopen_batch_norm lowering in inductor (pytorch#105740)
Enable the miopen_batch_norm lowering for inductor only. This avoids errors observed in some models, and the performance difference from the existing fallback is negligible in initial benchmarks:

```
LoweringException: RuntimeError: Expected contiguous tensor, but got non-contiguous tensor for argument #1 'input' (while checking arguments for miopen_batch_norm)
  target: aten.miopen_batch_norm.default
```

Pull Request resolved: pytorch#105740
Approved by: https://github.com/jithunnair-amd, https://github.com/malfet
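For context, a minimal, hypothetical repro sketch of the kind of compiled batch-norm call that previously hit this exception on ROCm builds (the module, shapes, and device below are illustrative, not taken from the PR):

```python
import torch

# Hypothetical repro: compile a batch-norm module with the inductor backend.
# On ROCm builds, aten.miopen_batch_norm previously hit an explicit inductor
# fallback, and the opaque MIOpen kernel could raise the contiguity error
# quoted above for some inputs.
model = torch.nn.BatchNorm2d(16).cuda()  # ROCm also uses the "cuda" device type
compiled = torch.compile(model, backend="inductor")
x = torch.randn(8, 16, 32, 32, device="cuda")
out = compiled(x)
```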
1 parent 7a3503d commit 40184b2

5 files changed (+33 −11 lines)

test/inductor/test_torchinductor.py

Lines changed: 0 additions & 1 deletion

```diff
@@ -2831,7 +2831,6 @@ def test_batch_norm_2d(self):
         )
 
     # From yolov3
-    @skipIfRocm
     def test_batch_norm_2d_2(self):
         if self.device == "cpu":
             raise unittest.SkipTest("requires CUDA")
```

test/inductor/test_torchinductor_codegen_dynamic_shapes.py

Lines changed: 0 additions & 4 deletions

```diff
@@ -11,7 +11,6 @@
     IS_CI,
     IS_WINDOWS,
     TEST_WITH_ASAN,
-    TEST_WITH_ROCM,
     TestCase,
 )
 from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA
@@ -286,9 +285,6 @@ def run(*ex, **kwargs):
     "test_aliased_buffer_reuse_dynamic_shapes": TestFailure(("cpu",)),
 }
 
-if TEST_WITH_ROCM:
-    # aten.miopen_batch_norm is not registered for lowering
-    test_failures["test_batch_norm_2d_dynamic_shapes"] = TestFailure("cuda")
 
 DynamicShapesCodegenCommonTemplate = make_dynamic_cls(
     CommonTemplate, xfail_prop="_expected_failure_codegen_dynamic"
```

test/inductor/test_torchinductor_dynamic_shapes.py

Lines changed: 0 additions & 3 deletions

```diff
@@ -59,9 +59,6 @@
     test_failures["test_expanded_reduction_dynamic_shapes"] = TestFailure(
         ("cuda"), is_skip=True
     )
-    test_failures["test_batch_norm_2d_dynamic_shapes"] = TestFailure(
-        ("cuda"), is_skip=True
-    )
 
 
 def make_dynamic_cls(cls, xfail_prop="_expected_failure_dynamic"):
```

torch/_inductor/decomposition.py

Lines changed: 33 additions & 0 deletions

```diff
@@ -2,6 +2,7 @@
 import logging
 import math
 import numbers
+import typing
 
 import torch
 import torch._decomp as decomp
@@ -400,6 +401,38 @@ def _foreach_lerp_scalar(start_tensors, end_tensors, weight):
     )
 
 
+@aten.miopen_batch_norm.default.py_impl(torch._C.DispatchKey.Autograd)
+@register_decomposition(aten.miopen_batch_norm)
+def miopen_batch_norm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: typing.Optional[torch.Tensor],
+    running_mean: typing.Optional[torch.Tensor],
+    running_var: typing.Optional[torch.Tensor],
+    training: bool,
+    exponential_average_factor: float,
+    epsilon: float,
+):
+    a, b, c = aten.native_batch_norm(
+        input,
+        weight,
+        bias,
+        running_mean,
+        running_var,
+        training,
+        exponential_average_factor,
+        epsilon,
+    )
+
+    if training:
+        return (a, b, c)
+    return (
+        a,
+        weight.new_zeros((0,)),
+        weight.new_zeros((0,)),
+    )
+
+
 @functools.lru_cache(None)
 def fast_random_decomps():
     return {**decompositions, **extra_random_decomps}
```
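The decomposition re-expresses aten.miopen_batch_norm in terms of aten.native_batch_norm, which inductor already lowers, so the MIOpen kernel (and its contiguity requirement) is bypassed. As a rough sketch of the argument and return-value mapping the decomposition relies on (the tensor values below are illustrative):

```python
import torch

# native_batch_norm takes the same argument list the decomposition forwards:
# (input, weight, bias, running_mean, running_var, training, momentum, eps)
# and returns (output, save_mean, save_invstd).
x = torch.randn(4, 3, 8, 8)
weight, bias = torch.ones(3), torch.zeros(3)
running_mean, running_var = torch.zeros(3), torch.ones(3)

out, save_mean, save_invstd = torch.ops.aten.native_batch_norm(
    x, weight, bias, running_mean, running_var,
    False,  # training=False: saved statistics only matter in training mode
    0.1,    # exponential_average_factor (momentum)
    1e-5,   # epsilon
)
# In eval mode the decomposition discards save_mean/save_invstd and returns
# empty tensors (weight.new_zeros((0,))) in their place, matching the output
# convention of miopen_batch_norm's second and third results.
```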

torch/_inductor/lowering.py

Lines changed: 0 additions & 3 deletions

```diff
@@ -1872,9 +1872,6 @@ def apply_constraint(arg, fx_arg):
 # fails accuracy on test_torch.py, and explicit fallback required to avoid warn=True on implicit
 make_fallback(aten.exponential.default, warn=False)
 
-# ROCm specific fallback, perf issues are observed when registered
-make_fallback(aten.miopen_batch_norm, warn=False)
-
 
 # Register with type_promotion_kind None.
 # For example, fp16.copy_(fp32) should **not** promote the first input's dtype.
```
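Taken together with the decomposition added above: removing this explicit fallback is what lets the new `register_decomposition` entry take effect, so on ROCm inductor now decomposes `aten.miopen_batch_norm` into `native_batch_norm` instead of calling the opaque MIOpen kernel, which is also why the ROCm-specific test expectations in the two dynamic-shapes test files could be dropped.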
