
Commit b6a1f48

athitten and hubertlu-tw authored
Add rocblas_alt_impl flag for bwd rocblas calls in MHA (#70)
* Add missing flags arg in gemm_switch_fp32accum call
* Add rocblas_alt_impl flag in MHA
* Add rocblas_alt_impl flag for all bwd gemms in MHA module
* Use ifdef for rocblas_gemm_flags_fp16_alt_impl to target various AMD hardware

Co-authored-by: hubertlu-tw <[email protected]>
1 parent 7bef81f commit b6a1f48

7 files changed: +125 −48 lines
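
The diffs below repeat one pattern: compute a rocBLAS flags value once per kernel entry point and forward it to the strided-batched GEMM helper gemm_switch_fp32accum as a new trailing argument, alongside the flags argument that rocblas_gemm_ex already takes. A minimal sketch of that pattern follows; USE_GEMM_FLAGS_FP16_ALT_IMPL, at::BackwardPassGuard::is_backward_pass(), and rocblas_gemm_flags_fp16_alt_impl come straight from the diff (ROCm PyTorch fork plus rocBLAS), while the helper name select_mha_gemm_flags and the exact include paths are illustrative assumptions, not part of the commit.

// Hypothetical helper sketching the flag-selection guard this commit inserts
// before the MHA GEMMs. Assumes the ROCm fork of PyTorch (which provides
// at::BackwardPassGuard) and rocBLAS headers that define
// rocblas_gemm_flags_fp16_alt_impl; the build is assumed to define
// USE_GEMM_FLAGS_FP16_ALT_IMPL only when that enum exists, so other AMD
// hardware/stacks fall back to flags = 0.
#include <cstdint>
#include <rocblas.h>    // include path may differ across ROCm versions
#include <ATen/ATen.h>  // assumed to expose at::BackwardPassGuard here

static uint32_t select_mha_gemm_flags() {
  uint32_t flags = 0;  // default: no special rocBLAS behavior
#if USE_GEMM_FLAGS_FP16_ALT_IMPL
  // Request the fp16 alternate implementation only while autograd is running
  // a backward pass, mirroring the guard added before each bwd GEMM below.
  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
#endif
  return flags;
}
// The selected value is then passed as the trailing argument of
// gemm_switch_fp32accum(...) and as the existing flags argument of
// rocblas_gemm_ex(...).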

apex/contrib/csrc/multihead_attn/encdec_multihead_attn_cuda.cu

Lines changed: 19 additions & 7 deletions
@@ -87,6 +87,10 @@ std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training,
   char b_layout_n{'n'};
 
   //TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+#if USE_GEMM_FLAGS_FP16_ALT_IMPL
+  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
+#endif
+
   // Input Linear Q Fwd
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
                        CUBLAS_OP_T,
@@ -159,7 +163,8 @@ std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training,
                        static_cast<half*>(softmax_results_ptr),
                        k_seq_len,
                        k_seq_len*q_seq_len,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Padded Softmax
   bool softmax_success = false;
@@ -212,7 +217,8 @@ std::vector<torch::Tensor> fwd_cuda(bool use_time_mask, bool is_training,
                        static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim*attn_batches,
                        head_dim,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Output Linear
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
@@ -315,7 +321,9 @@ std::vector<torch::Tensor> bwd_cuda(
   char b_layout_t{'t'};
 
   //TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
-
+#if USE_GEMM_FLAGS_FP16_ALT_IMPL
+  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
+#endif
   // Output Linear Dgrad
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
                        CUBLAS_OP_N,
@@ -388,7 +396,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        static_cast<half*>(matmul2_grads.data_ptr()),
                        k_seq_len,
                        k_seq_len*q_seq_len,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Matmul2 Dgrad2
   gemm_switch_fp32accum( a_layout_n,
@@ -410,7 +419,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        v_lin_grads_ptr,
                        lead_dim_kv,
                        batch_stride_kv,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Apply Dropout Mask and Scale by Dropout Probability
   apex_masked_scale_cuda<at::Half,float,uint32_t>(
@@ -449,7 +459,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        q_lin_grads_ptr,
                        lead_dim_q,
                        batch_stride_q,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Matmul1 Dgrad2
   gemm_switch_fp32accum( a_layout_n,
@@ -471,7 +482,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        k_lin_grads_ptr,
                        lead_dim_kv,
                        batch_stride_kv,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Input Linear Q Dgrad
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,

apex/contrib/csrc/multihead_attn/encdec_multihead_attn_norm_add_cuda.cu

Lines changed: 20 additions & 6 deletions
@@ -113,6 +113,10 @@ std::vector<torch::Tensor> fwd_cuda(
                      1.0e-5, static_cast<const at::Half *>(lyr_nrm_gamma_weights.data_ptr()),
                      static_cast<const at::Half *>(lyr_nrm_beta_weights.data_ptr()));
 
+#if USE_GEMM_FLAGS_FP16_ALT_IMPL
+  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
+#endif
+
   // Input Linear Q Fwd
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
                        CUBLAS_OP_T,
@@ -185,7 +189,8 @@ std::vector<torch::Tensor> fwd_cuda(
                        static_cast<half*>(softmax_results_ptr),
                        k_seq_len,
                        k_seq_len*q_seq_len,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Padded Softmax
   bool softmax_success = false;
@@ -239,7 +244,8 @@ std::vector<torch::Tensor> fwd_cuda(
                        static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim*attn_batches,
                        head_dim,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Output Linear
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
@@ -371,6 +377,10 @@ std::vector<torch::Tensor> bwd_cuda(
   char b_layout_t{'t'};
 
   //TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+#if USE_GEMM_FLAGS_FP16_ALT_IMPL
+  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
+#endif
+
 
   // Dropout Add Backward
   apex_masked_scale_cuda<at::Half,float,uint32_t>(
@@ -452,7 +462,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        static_cast<half*>(matmul2_grads.data_ptr()),
                        k_seq_len,
                        k_seq_len*q_seq_len,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Matmul2 Dgrad2
   gemm_switch_fp32accum( a_layout_n,
@@ -474,7 +485,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        v_lin_grads_ptr,
                        lead_dim_kv,
                        batch_stride_kv,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Apply Dropout Mask and Scale by Dropout Probability
   apex_masked_scale_cuda<at::Half,float,uint32_t>(
@@ -513,7 +525,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        q_lin_grads_ptr,
                        lead_dim_q,
                        batch_stride_q,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Matmul1 Dgrad2
   gemm_switch_fp32accum( a_layout_n,
@@ -535,7 +548,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        k_lin_grads_ptr,
                        lead_dim_kv,
                        batch_stride_kv,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Input Linear Q Dgrad
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,

apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_additive_mask_cuda.cu

Lines changed: 19 additions & 7 deletions
@@ -88,6 +88,9 @@ std::vector<torch::Tensor> fwd_cuda(
   char b_layout_n{'n'};
 
   //TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+#if USE_GEMM_FLAGS_FP16_ALT_IMPL
+  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
+#endif
   // Input Linear Fwd
   input_lin_results.copy_(input_biases);
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
@@ -135,7 +138,8 @@ std::vector<torch::Tensor> fwd_cuda(
                        static_cast<half*>(bmm1_results_ptr),
                        k_seq_len,
                        k_seq_len*q_seq_len,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Padded Softmax
   bool softmax_success = false;
@@ -180,7 +184,8 @@ std::vector<torch::Tensor> fwd_cuda(
                        static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim*attn_batches,
                        head_dim,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   outputs.copy_(output_biases);
 
@@ -270,6 +275,9 @@ std::vector<torch::Tensor> bwd_cuda(
   char b_layout_t{'t'};
 
   //TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+#if USE_GEMM_FLAGS_FP16_ALT_IMPL
+  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
+#endif
 
   // Output Linear Dgrad
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
@@ -321,7 +329,7 @@ std::vector<torch::Tensor> bwd_cuda(
                        rocblas_datatype_f32_r,
                        algo,
                        solution_index,
-                      flags));
+                       flags));
 
   auto output_bias_grads = output_grads.view({-1, embed_dim}) .sum(0, false);
   // MatMul2 Dgrad1
@@ -344,7 +352,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        static_cast<half*>(matmul2_grads.data_ptr()),
                        k_seq_len,
                        k_seq_len*q_seq_len,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Matmul2 Dgrad2
   gemm_switch_fp32accum( a_layout_n,
@@ -366,7 +375,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        v_lin_grads_ptr,
                        lead_dim,
                        batch_stride,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Apply Dropout Mask and Scale by Dropout Probability
   // Softmax Grad
@@ -403,7 +413,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        q_lin_grads_ptr,
                        lead_dim,
                        batch_stride,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Matmul1 Dgrad2
   gemm_switch_fp32accum( a_layout_n,
@@ -425,7 +436,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        k_lin_grads_ptr,
                        lead_dim,
                        batch_stride,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Input Linear Dgrad
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,

apex/contrib/csrc/multihead_attn/self_multihead_attn_bias_cuda.cu

Lines changed: 19 additions & 6 deletions
@@ -80,6 +80,10 @@ fwd_cuda(bool use_time_mask, bool is_training, int heads,
   char b_layout_n{'n'};
 
   //TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+#if USE_GEMM_FLAGS_FP16_ALT_IMPL
+  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
+#endif
+
   // Input Linear Fwd
   input_lin_results.copy_(input_biases);
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
@@ -127,7 +131,8 @@ fwd_cuda(bool use_time_mask, bool is_training, int heads,
                        static_cast<half*>(softmax_results_ptr),
                        k_seq_len,
                        k_seq_len*q_seq_len,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Padded Softmax
   bool softmax_success = false;
@@ -180,7 +185,8 @@ fwd_cuda(bool use_time_mask, bool is_training, int heads,
                        static_cast<half*>(matmul2_results.data_ptr()),
                        head_dim*attn_batches,
                        head_dim,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   outputs.copy_(output_biases);
 
@@ -270,6 +276,9 @@ std::vector<torch::Tensor> bwd_cuda(
   char b_layout_t{'t'};
 
   //TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+#if USE_GEMM_FLAGS_FP16_ALT_IMPL
+  flags = at::BackwardPassGuard::is_backward_pass() ? rocblas_gemm_flags_fp16_alt_impl : 0;
+#endif
 
   // Output Linear Dgrad
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
@@ -344,7 +353,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        static_cast<half*>(matmul2_grads.data_ptr()),
                        k_seq_len,
                        k_seq_len*q_seq_len,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Matmul2 Dgrad2
   gemm_switch_fp32accum( a_layout_n,
@@ -366,7 +376,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        v_lin_grads_ptr,
                        lead_dim,
                        batch_stride,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Apply Dropout Mask and Scale by Dropout Probability
   // Softmax Grad
@@ -398,7 +409,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        q_lin_grads_ptr,
                        lead_dim,
                        batch_stride,
-                       attn_batches);
+                       attn_batches,
+                       flags);
 
   // Matmul1 Dgrad2
   gemm_switch_fp32accum( a_layout_n,
@@ -420,7 +432,8 @@ std::vector<torch::Tensor> bwd_cuda(
                        k_lin_grads_ptr,
                        lead_dim,
                        batch_stride,
-                       attn_batches);
+                       attn_batches,
+                       flags);
   // Input Linear Dgrad
   TORCH_CUDABLAS_CHECK(rocblas_gemm_ex(handle,
                        CUBLAS_OP_N,
