
Commit b740a99

eqy authored and pytorchmergebot committed
[cuDNN][TF32] Threshold adjustments for TF32 on >=sm80 (pytorch#78437)
CC @ptrblck @mcarilli

Can the change to the transformer multilayer test be swapped in favor of an rtol change instead? (See also: pytorch#75612.)

Pull Request resolved: pytorch#78437
Approved by: https://github.com/ngimel
1 parent 416f581 commit b740a99
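
The recurring pattern in the diff below is to keep the original tight thresholds by default and only widen them when FP32 work can actually be demoted to TF32 (Ampere/sm80 and newer). A minimal sketch of that idea, reusing the tf32_is_not_fp32() helper the changed tests call and the tolerance values from the first hunk; the import path is PyTorch's internal test utility module and should be treated as an implementation detail rather than public API:

import torch
# tf32_is_not_fp32() reports whether the current CUDA device would run FP32
# matmuls/convolutions through TF32 tensor cores (compute capability >= 8.0),
# in which case results can legitimately drift from a true FP32 reference.
from torch.testing._internal.common_cuda import tf32_is_not_fp32

def pick_tolerances(dtype):
    # Widen tolerances only for dtypes that cuDNN may execute in TF32;
    # everything else keeps the strict FP32-level thresholds.
    if tf32_is_not_fp32() and dtype in (torch.float, torch.complex64):
        return dict(atol=0.05, rtol=0.05)
    return dict(rtol=2e-5, atol=5e-6)

# usage inside a test: self.assertEqual(actual, expected, **pick_tolerances(dtype))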

2 files changed: +13 -4 lines changed


test/test_nn.py

Lines changed: 12 additions & 3 deletions
@@ -14070,7 +14070,10 @@ def _test(t, weight, mode):
             if mode == 'same':
                 actual = actual[:5, :5, :10]
 
-            self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6)
+            if tf32_is_not_fp32() and (dtype == torch.float or dtype == torch.complex64):
+                self.assertEqual(actual, expected, atol=0.05, rtol=0.05)
+            else:
+                self.assertEqual(actual, expected, rtol=2e-5, atol=5e-6)
 
         # Global dtype for this test suite is torch.double
         # This leads to change in type-promotion
@@ -19505,7 +19508,10 @@ def test_cudnn_convolution_relu(self, device, dtype):
             w = w.to(memory_format=memory_format)
             cudnn_out = torch.cudnn_convolution_relu(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
             self.assertTrue(cudnn_out.is_contiguous(memory_format=memory_format))
-            self.assertEqual(conv2d_out.relu(), cudnn_out)
+            if tf32_is_not_fp32() and dtype == torch.float:
+                self.assertEqual(conv2d_out.relu(), cudnn_out, atol=2e-4, rtol=0.006)
+            else:
+                self.assertEqual(conv2d_out.relu(), cudnn_out)
 
     @onlyCUDA
     @skipCUDAIfRocm
@@ -19533,7 +19539,10 @@ def test_cudnn_convolution_add_relu(self, device, dtype):
             cudnn_out = torch.cudnn_convolution_add_relu(inp, w, z, alpha, None, (1, 1), (0, 0), (1, 1), 1)
 
             self.assertTrue(cudnn_out.is_contiguous(memory_format=memory_format))
-            self.assertEqual(F.relu(conv2d_out + alpha * z), cudnn_out)
+            if tf32_is_not_fp32() and dtype == torch.float:
+                self.assertEqual(F.relu(conv2d_out + alpha * z), cudnn_out, atol=3e-4, rtol=0.006)
+            else:
+                self.assertEqual(F.relu(conv2d_out + alpha * z), cudnn_out)
 
     @onlyCUDA
     @skipCUDAIfRocm
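
An alternative to loosening atol/rtol in these fused cuDNN kernel comparisons would be to pin both paths to true FP32 for the check. A rough standalone sketch using the cuDNN flags context manager follows; this is not what the PR does, and whether cudnn_convolution_relu honors allow_tf32 is an assumption that would need verifying:

import torch
import torch.nn.functional as F

def compare_fused_relu_strict(inp, w):
    # Sketch only: disable TF32 for cuDNN so the plain conv2d reference and the
    # fused kernel can be compared with the original tight thresholds.
    with torch.backends.cudnn.flags(enabled=True, allow_tf32=False):
        ref = F.conv2d(inp, w).relu()
        fused = torch.cudnn_convolution_relu(inp, w, None, (1, 1), (0, 0), (1, 1), 1)
    torch.testing.assert_close(fused, ref, rtol=2e-5, atol=5e-6)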

torch/testing/_internal/common_nn.py

Lines changed: 1 addition & 1 deletion
@@ -4243,7 +4243,7 @@ def unsqueeze_inp(inp):
         check_gradgrad=False,
         desc='multilayer_coder',
         with_tf32=True,
-        tf32_precision=0.01,
+        tf32_precision=0.02,
     ),
     dict(
         module_name='Linear',
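
For the common_nn.py change: tf32_precision is the looser per-test tolerance the module-test harness switches to when it re-runs a case with TF32 enabled, so this bumps the transformer 'multilayer_coder' case from 1% to 2%. To get a feel for the FP32-vs-TF32 drift such a threshold has to absorb, here is a standalone sketch (not part of the PR) that toggles the matmul TF32 switch directly:

import torch

def tf32_max_relative_error(n=1024):
    # Standalone sketch: measure how far a float32 matmul drifts once TF32 is
    # allowed; this is the kind of error a tf32_precision threshold must cover.
    a = torch.randn(n, n, device='cuda')
    b = torch.randn(n, n, device='cuda')

    torch.backends.cuda.matmul.allow_tf32 = False
    ref = a @ b        # true FP32 accumulation

    torch.backends.cuda.matmul.allow_tf32 = True
    tf32 = a @ b       # TF32 tensor cores on sm80 and newer

    return ((tf32 - ref).abs() / ref.abs().clamp_min(1e-6)).max().item()

Stacking several such matmuls and nonlinearities in a multilayer transformer compounds the drift, which is presumably why 0.01 proved too tight on sm80+ parts.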
