Commit 97c5663

Enable SDPA Unit Tests and Adjust Fudge Factors (ROCm#34)
* Adjust fudge factors for gfx950
* Enable SDPA UTs for gfx950

1 parent: f2faad5

2 files changed: +8 -6 lines
test/test_transformers.py (5 additions & 3 deletions)

@@ -3154,6 +3154,7 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
                 'grad_value': 8.5,
             }
             if TEST_WITH_ROCM:
+                fudge_factors['grad_value'] = 16.0
                 fudge_factors['grad_key'] = 45.0
                 fudge_factors['grad_query'] = 360.0
                 if seq_len_k >= 1024:
@@ -3273,6 +3274,7 @@ def _get_mem_eff_drop_mask(batch_size, n_heads, q_len, kv_len, p, seed, offset,
                 "grad_attn_mask": 45.0,
             }
             if TEST_WITH_ROCM:
+                fudge_factors['grad_value'] = 16.0
                 fudge_factors['grad_key'] = 45.0
                 fudge_factors['grad_query'] = 360.0
                 if seq_len_k >= 1024:
@@ -3528,7 +3530,7 @@ def get_dropout_mask(output, fused_kernel, batch_size, n_heads, q_len, kv_len, d
             g.replay()
             out = output_tuple[0]
             if dropout_p == 0.0:
-                self.assertEqual(out_first, out, atol=0, rtol=0)
+                self.assertEqual(out_first, out, atol=0, rtol=0, msg='Two passes of non-dropout graph mismatches')
             else:
                 # replays produce different results
                 self.assertNotEqual(out_first, out)
@@ -3569,8 +3571,8 @@ def get_dropout_mask(output, fused_kernel, batch_size, n_heads, q_len, kv_len, d
             fudge_factors={
                 'out': 3.0,
                 'grad_query': 100.0,
-                'grad_key': 8.0,
-                'grad_value': 3.0,
+                'grad_key': 8.0 if not TEST_WITH_ROCM else 16.0,
+                'grad_value': 3.0 if not TEST_WITH_ROCM else 6.0,
             }
         )

torch/testing/_internal/common_cuda.py (3 additions & 3 deletions)

@@ -39,7 +39,7 @@
 def CDNA2OrLater():
     if TEST_WITH_ROCM:
         gcn_arch_name = torch.cuda.get_device_properties('cuda').gcnArchName
-        return any(arch in gcn_arch_name for arch in {"gfx90a", "gfx940", "gfx941", "gfx942"})
+        return any(arch in gcn_arch_name for arch in {"gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"})
     return False

 def evaluate_gfx_arch_exact(matching_arch):
@@ -54,14 +54,14 @@ def evaluate_gfx_arch_exact(matching_arch):

 def evaluate_platform_supports_flash_attention():
     if TEST_WITH_ROCM:
-        return evaluate_gfx_arch_exact('gfx90a:sramecc+:xnack-') or evaluate_gfx_arch_exact('gfx942:sramecc+:xnack-')
+        return CDNA2OrLater()
     if TEST_CUDA:
         return not IS_WINDOWS and SM80OrLater
     return False

 def evaluate_platform_supports_efficient_attention():
     if TEST_WITH_ROCM:
-        return evaluate_gfx_arch_exact('gfx90a:sramecc+:xnack-') or evaluate_gfx_arch_exact('gfx942:sramecc+:xnack-')
+        return CDNA2OrLater()
     if TEST_CUDA:
         return True
     return False
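A quick way to see which branch these helpers take on a given machine (a sketch, assuming a ROCm build of PyTorch with at least one visible device; torch.version.hip is None on CUDA builds):

import torch

if torch.cuda.is_available() and torch.version.hip is not None:
    # gcnArchName can carry feature suffixes, e.g. 'gfx90a:sramecc+:xnack-';
    # CDNA2OrLater() does a substring match against base architecture names.
    arch = torch.cuda.get_device_properties('cuda').gcnArchName
    print(arch)
    print(any(a in arch for a in {"gfx90a", "gfx940", "gfx941", "gfx942", "gfx950"}))

This also shows why the old exact-match checks were brittle: they tied test enablement to one specific sramecc/xnack configuration, whereas the substring check in CDNA2OrLater() only cares about the base gfx architecture.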
