
Commit 18bcf62

XiaobingSuper authored and pytorchmergebot committed
inductor: promote half/bfloat16 constant to float for cpu vectorization path (pytorch#105440)
As in the scalar path, we should also promote half/bfloat16 constants to float for better accuracy; after this PR, the TIMM `dm_nfnet` model's amp path passes. Pull Request resolved: pytorch#105440 Approved by: https://github.com/jgong5, https://github.com/jansel
1 parent 7ddb66e commit 18bcf62
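
For context, a minimal eager-mode sketch of the accuracy gap this addresses (illustrative only, not part of the commit): the constant below is the one used in the new test; rounding it to bfloat16 before the multiply corresponds to what the vectorized path did before, while keeping it in float32 matches the scalar path's behavior.

import torch

# Illustrative sketch, not from this PR: compare multiplying by the constant
# pre-rounded to bfloat16 with multiplying by the constant kept in float32.
c = 1.7015043497085571
x = torch.randn(4, 5, dtype=torch.bfloat16)

# Constant pre-rounded to bfloat16 (lower accuracy).
low = (x * torch.tensor(c, dtype=torch.bfloat16)).float()

# Constant kept in float32; inputs promoted to float for the computation.
high = (x.float() * c).to(torch.bfloat16).float()

print((low - high).abs().max())  # typically non-zero: the rounded constant changes results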

File tree: 2 files changed, +13 −0 lines changed


test/inductor/test_cpu_repro.py

Lines changed: 9 additions & 0 deletions
@@ -1977,6 +1977,15 @@ def f(a):
         x = torch.rand(4, 5)
         self.common(f, (x,))
 
+    def test_scalar_mul_bfloat16(self):
+        def f(x):
+            return torch.ops.aten.mul.Tensor(x, 1.7015043497085571)
+
+        metrics.reset()
+        x = torch.randn(4, 5, dtype=torch.bfloat16)
+        self.common(f, (x,))
+        assert metrics.generated_cpp_vec_kernel_count == 1
+
     def test_to_channels_last_bfloat16(self):
         def f(a):
             return a.to(memory_format=torch.channels_last)
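
The new test can be run on its own (assuming a source checkout with pytest available) with:

pytest test/inductor/test_cpu_repro.py -k test_scalar_mul_bfloat16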

torch/_inductor/codegen/cpp.py

Lines changed: 4 additions & 0 deletions
@@ -2094,6 +2094,10 @@ def store_reduction(name, index, value):
 
     @staticmethod
     def constant(val, dtype):
+        if dtype == torch.bfloat16:
+            # Since load promotes all bfloat16-precision inputs to float, constants
+            # must be promoted as well
+            dtype = torch.float32
         with RecordOptimizationContext(__name__) as node_ctx:
             opt_ctx: OptimizationContext = node_ctx.get_opt_ctx()
             assert opt_ctx
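
A quick way to exercise the vectorized path outside the test suite (an illustrative sketch, not from this PR): running it with TORCH_COMPILE_DEBUG=1 set in the environment makes Inductor dump the generated C++ kernel, where the promoted float constant can be inspected.

import torch

def f(x):
    # same constant as the new test above
    return x * 1.7015043497085571

x = torch.randn(4, 5, dtype=torch.bfloat16)
compiled = torch.compile(f)
print(compiled(x))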
