
Commit 5c8c352 (committed Apr 9, 2025)
[Executorch][SDPA] Refactor + Make quantized sdpa handle sequence at dim 1 or 2
Pull Request resolved: #9943

For quantized SDPA we want to evaluate the performance impact of having the sequence dimension at dim 1 as well as at dim 2. This diff refactors the code to enable that. The same should also be done for float SDPA, but that is left for future work.

ghstack-source-id: 277160631
@exported-using-ghexport
Differential Revision: [D71833060](https://our.internmc.facebook.com/intern/diff/D71833060/)
Parent: 46ce593

File tree: 5 files changed, +189 −62 lines

- extension/llm/custom_ops/op_sdpa.cpp
- extension/llm/custom_ops/op_sdpa.h
- extension/llm/custom_ops/op_sdpa_aot.cpp
- extension/llm/custom_ops/op_sdpa_impl.h
- extension/llm/custom_ops/test_quantized_sdpa.py
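The refactor distinguishes two input layouts for quantized SDPA: sequence at dim 2, i.e. [Batch, Num_heads, Seq_len, Head_dim], and sequence at dim 1, i.e. [Batch, Seq_len, Num_heads, Head_dim]. A minimal PyTorch sketch (shapes illustrative, not from the commit) of how a seq-at-dim-1 result can be checked against the standard layout, mirroring the transpose trick the updated test's `_sdpa_ref` uses:

```python
import torch
import torch.nn.functional as F

B, H, S, D = 1, 8, 16, 64

# Sequence at dim 2: the usual [B, H, S, D] layout.
q2 = torch.randn(B, H, S, D)
k2 = torch.randn(B, H, S, D)
v2 = torch.randn(B, H, S, D)
out_dim2 = F.scaled_dot_product_attention(q2, k2, v2)

# Sequence at dim 1: [B, S, H, D]; same data, heads and sequence swapped.
q1, k1, v1 = (t.transpose(1, 2).contiguous() for t in (q2, k2, v2))

# Reference for the dim-1 layout: transpose to [B, H, S, D], run SDPA,
# then transpose the output back to [B, S, H, D].
out_dim1 = F.scaled_dot_product_attention(
    q1.transpose(1, 2).contiguous(),
    k1.transpose(1, 2).contiguous(),
    v1.transpose(1, 2).contiguous(),
).transpose(1, 2)

assert torch.allclose(out_dim1, out_dim2.transpose(1, 2), atol=1e-6)
```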
 

‎extension/llm/custom_ops/op_sdpa.cpp

Lines changed: 20 additions & 15 deletions
```diff
@@ -264,14 +264,14 @@ Tensor& flash_attention_kernel_out(
       InvalidArgument,
       output);
 
-  auto q_seq_len = query.size(2);
+  auto seq_len = query.size(2);
 
   ET_SWITCH_FLOAT_TYPES(
       query.scalar_type(), ctx, "flash_attention", CTYPE, [&] {
         // TODO we need to re-evaluate this for ARM CPUs
         // And there can be many so instead of templatizing
         // we might consider another appraoch
-        if (q_seq_len >= 768) {
+        if (seq_len >= 768) {
           sdpa::impl::cpu_flash_attention<CTYPE, 256, 512>(
               output,
               query,
@@ -287,7 +287,7 @@ Tensor& flash_attention_kernel_out(
               nullopt,
               nullopt,
               nullopt);
-        } else if (q_seq_len >= 192) {
+        } else if (seq_len >= 192) {
           sdpa::impl::cpu_flash_attention<CTYPE, 64, 512>(
               output,
               query,
@@ -341,7 +341,8 @@ Tensor& custom_sdpa_out_impl(
     const optional<Tensor>& k_zero_points = nullopt,
     const optional<Tensor>& k_scales = nullopt,
     const optional<Tensor>& v_zero_points = nullopt,
-    const optional<Tensor>& v_scales = nullopt) {
+    const optional<Tensor>& v_scales = nullopt,
+    bool is_seq_at_dim_2 = false) {
   ET_KERNEL_CHECK_MSG(
       ctx,
       !attn_mask.has_value() || !is_causal,
@@ -357,13 +358,15 @@ Tensor& custom_sdpa_out_impl(
       "Invalid arguments");
 
   int64_t seq_len = q.size(1);
-  auto q_seq_len = q.size(1);
+  SeqDim seq_dim{SeqDim::TWO};
+  if (!is_seq_at_dim_2) {
+    seq_dim = SeqDim::ONE;
+  }
 
-  bool is_seq_at_dim_1{true};
   if (q.scalar_type() == ScalarType::Char) {
-    is_seq_at_dim_1 = false;
-    seq_len = q.size(2);
-    q_seq_len = q.size(2);
+    if (seq_dim == SeqDim::TWO) {
+      seq_len = q.size(2);
+    }
     ET_KERNEL_CHECK_MSG(
         ctx,
         q_scales.has_value() && q_zero_points.has_value() &&
@@ -412,7 +415,7 @@ Tensor& custom_sdpa_out_impl(
         // TODO we need to re-evaluate this for ARM CPUs
         // And there can be many so instead of templatizing
         // we might consider another appraoch
-        if (q_seq_len >= 768) {
+        if (seq_len >= 768) {
          sdpa::impl::cpu_flash_attention<CTYPE, 256, 512>(
              output,
              q,
@@ -428,10 +431,10 @@ Tensor& custom_sdpa_out_impl(
              k_scales, // k_scales
              v_zero_points, // v_zero_points
              v_scales, // v_scales
-             is_seq_at_dim_1, /* is_seq_at_dim_1 */
+             seq_dim, /* seq_dim */
              start_pos,
              num_keys_for_causal_attention);
-        } else if (q_seq_len >= 192) {
+        } else if (seq_len >= 192) {
          sdpa::impl::cpu_flash_attention<CTYPE, 64, 512>(
              output,
              q,
@@ -447,7 +450,7 @@ Tensor& custom_sdpa_out_impl(
              k_scales, // k_scales
              v_zero_points, // v_zero_points
              v_scales, // v_scales
-             is_seq_at_dim_1, /* is_seq_at_dim_1 */
+             seq_dim, /* seq_dim */
              start_pos,
              num_keys_for_causal_attention);
        } else {
@@ -466,7 +469,7 @@ Tensor& custom_sdpa_out_impl(
              k_scales, // k_scales
              v_zero_points, // v_zero_points
              v_scales, // v_scales
-             is_seq_at_dim_1, /* is_seq_at_dim_1 */
+             seq_dim, /* seq_dim */
              start_pos,
              num_keys_for_causal_attention);
        }
@@ -492,6 +495,7 @@ Tensor& custom_quantized_sdpa_out(
     const optional<Tensor>& k_scales,
     const optional<Tensor>& v_zero_points,
     const optional<Tensor>& v_scales,
+    const bool is_seq_at_dim_2,
     Tensor& output) {
   return custom_sdpa_out_impl(
       ctx,
@@ -509,7 +513,8 @@ Tensor& custom_quantized_sdpa_out(
       k_zero_points,
       k_scales,
       v_zero_points,
-      v_scales);
+      v_scales,
+      is_seq_at_dim_2);
 }
 #endif // ENABLE_CUSTOM_QUANTIZED_SDPA
```
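In the quantized (`ScalarType::Char`) path, `custom_sdpa_out_impl` now derives the sequence dimension from the new `is_seq_at_dim_2` flag instead of hard-coding dim 2, and the tiled kernel's block sizes are still chosen from that sequence length. A rough Python restatement of the selection logic, for illustration only (the helper names below are not part of the source):

```python
import torch

def select_seq_len(q: torch.Tensor, is_seq_at_dim_2: bool = False) -> int:
    """Mirrors the seq_len selection in custom_sdpa_out_impl."""
    seq_len = q.shape[1]  # default: sequence at dim 1
    if q.dtype == torch.int8 and is_seq_at_dim_2:
        seq_len = q.shape[2]  # quantized inputs may keep sequence at dim 2
    return seq_len

def select_split_sizes(seq_len: int):
    """Thresholds used to pick (q_split_size, kv_split_size) for the kernel."""
    if seq_len >= 768:
        return 256, 512
    if seq_len >= 192:
        return 64, 512
    # A smaller configuration is used below this point; its split sizes are
    # not visible in the hunks above, so they are left unspecified here.
    return None
```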

‎extension/llm/custom_ops/op_sdpa.h

Lines changed: 1 addition & 0 deletions
```diff
@@ -74,6 +74,7 @@ Tensor& custom_quantized_sdpa_out(
     const optional<Tensor>& k_scales,
     const optional<Tensor>& v_zero_points,
     const optional<Tensor>& v_scales,
+    const bool is_seq_at_dim_1,
     Tensor& output);
 #endif // ENABLE_CUSTOM_QUANTIZED_SDPA
 } // namespace native
```

‎extension/llm/custom_ops/op_sdpa_aot.cpp

Lines changed: 12 additions & 6 deletions
```diff
@@ -96,6 +96,7 @@ Tensor& custom_quantized_sdpa_out_no_context(
     const optional<Tensor> k_scales,
     const optional<Tensor> v_zero_points,
     const optional<Tensor> v_scales,
+    const bool is_seq_at_dim_2,
     Tensor& output);
 
 at::Tensor custom_quantized_sdpa_aten(
@@ -115,7 +116,8 @@ at::Tensor custom_quantized_sdpa_aten(
     const std::optional<at::Tensor>& k_zero_points,
     const std::optional<at::Tensor>& k_scales,
     const std::optional<at::Tensor>& v_zero_points,
-    const std::optional<at::Tensor>& v_scales);
+    const std::optional<at::Tensor>& v_scales,
+    const bool is_seq_at_dim_2);
 #endif // ENABLE_CUSTOM_QUANTIZED_SDPA
 
 Tensor& update_cache_out_no_context(
@@ -258,6 +260,7 @@ Tensor& custom_quantized_sdpa_out_no_context(
     const optional<Tensor> k_scales,
     const optional<Tensor> v_zero_points,
     const optional<Tensor> v_scales,
+    const bool is_seq_at_dim_2,
     Tensor& output) {
   executorch::aten::RuntimeContext context{};
   return torch::executor::native::custom_quantized_sdpa_out(
@@ -276,6 +279,7 @@ Tensor& custom_quantized_sdpa_out_no_context(
       k_scales,
       v_zero_points,
       v_scales,
+      is_seq_at_dim_2,
       output);
 }
 
@@ -296,9 +300,10 @@ at::Tensor custom_quantized_sdpa_aten(
     const std::optional<at::Tensor>& k_zero_points,
     const std::optional<at::Tensor>& k_scales,
     const std::optional<at::Tensor>& v_zero_points,
-    const std::optional<at::Tensor>& v_scales) {
+    const std::optional<at::Tensor>& v_scales,
+    const bool is_seq_at_dim_2) {
   auto output = at::empty(q.sizes());
-  WRAP_TO_ATEN(custom_quantized_sdpa_out_no_context, 14)
+  WRAP_TO_ATEN(custom_quantized_sdpa_out_no_context, 15)
   (q,
    k,
    v,
@@ -313,6 +318,7 @@ at::Tensor custom_quantized_sdpa_aten(
    k_scales,
    v_zero_points,
    v_scales,
+   is_seq_at_dim_2,
    output);
   return output;
 }
@@ -371,13 +377,13 @@ TORCH_LIBRARY_FRAGMENT(llama, m) {
       "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, "
      "float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, "
      "Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, "
-      "Tensor? v_scales=None) -> Tensor");
+      "Tensor? v_scales=None, bool is_seq_at_dim_2=False) -> Tensor");
   m.def(
      "custom_quantized_sdpa.out(Tensor query, Tensor key, Tensor value, SymInt start_pos, "
      "Tensor? attn_mask=None, float drpout_p=0.0, bool is_causal=False, "
      "float? scale=None, Tensor? q_zero_points=None, Tensor? q_scales=None, "
      "Tensor? k_zero_points=None, Tensor? k_scales=None, Tensor? v_zero_points=None, "
-      "Tensor? v_scales=None, *, Tensor(a!) out) -> Tensor(a!)");
+      "Tensor? v_scales=None, bool is_seq_at_dim_2=False, *, Tensor(a!) out) -> Tensor(a!)");
 #endif // ENABLE_CUSTOM_QUANTIZED_SDPA
 }
 
@@ -404,6 +410,6 @@ TORCH_LIBRARY_IMPL(llama, CompositeExplicitAutograd, m) {
   m.impl(
       "custom_quantized_sdpa.out",
       WRAP_TO_ATEN(
-          torch::executor::native::custom_quantized_sdpa_out_no_context, 14));
+          torch::executor::native::custom_quantized_sdpa_out_no_context, 15));
 #endif // ENABLE_CUSTOM_QUANTIZED_SDPA
 }
```
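With the schema change, `llama::custom_quantized_sdpa` now accepts a trailing `bool is_seq_at_dim_2=False`. A hedged sketch of calling the functional variant from Python, assuming the ExecuTorch llama custom-op library has already been built and loaded into the process; the shapes, dtypes, and dummy quantization parameters below are illustrative only:

```python
import torch

# Assumes torch.ops.llama.custom_quantized_sdpa is available, i.e. the
# ExecuTorch custom SDPA ops have been built and loaded into this process.
B, S, H, D = 1, 16, 8, 64

# int8 activations with per-token (zero_point, scale) pairs, sequence at dim 1.
q = torch.randint(-128, 127, (B, S, H, D), dtype=torch.int8)
k = torch.randint(-128, 127, (B, S, H, D), dtype=torch.int8)
v = torch.randint(-128, 127, (B, S, H, D), dtype=torch.int8)
zp = torch.zeros(B, S, H, 1, dtype=torch.int8)    # dummy zero points
sc = torch.ones(B, S, H, 1, dtype=torch.float32)  # dummy scales

out = torch.ops.llama.custom_quantized_sdpa(
    q, k, v, 0,              # query, key, value, start_pos
    None, 0.0, True,         # attn_mask, dropout_p, is_causal
    None,                    # scale (None -> 1/sqrt(head_dim))
    zp, sc, zp, sc, zp, sc,  # q/k/v zero_points and scales
    is_seq_at_dim_2=False,   # flag added by this commit; False means seq at dim 1
)
```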

‎extension/llm/custom_ops/op_sdpa_impl.h

Lines changed: 84 additions & 35 deletions
```diff
@@ -32,13 +32,17 @@ namespace executor {
 
 namespace native {
 
+enum class SeqDim { ONE = 1, TWO };
+
 namespace sdpa::impl {
 
 struct MaybeQuantizedMatrixData {
   const void* data{nullptr};
   const int8_t* zero_points{nullptr};
   const float* scales{nullptr};
   int64_t m = 0, n = 0;
+  const int64_t zero_points_stride{1};
+  const int64_t scales_stride{1};
   ScalarType dtype{ScalarType::Float};
   MaybeQuantizedMatrixData() = default;
   MaybeQuantizedMatrixData(
@@ -47,12 +51,15 @@ struct MaybeQuantizedMatrixData {
       const float* scales_,
       int64_t m_,
       int64_t n_,
+      int64_t qparams_stride,
       ScalarType dtype_)
       : data(data_),
         zero_points(zero_points_),
         scales(scales_),
         m(m_),
         n(n_),
+        zero_points_stride(qparams_stride),
+        scales_stride(qparams_stride),
         dtype(dtype_) {}
 };
 
@@ -91,8 +98,9 @@ void _q_at_k_gemm(
         static_cast<const int8_t*>(k_data.zero_points),
         static_cast<const float*>(q_data.scales),
         static_cast<const float*>(k_data.scales),
-        1,
-        1);
+        // LHS and RHS are assumed to have same stride for qparams
+        q_data.zero_points_stride,
+        k_data.zero_points_stride);
   } else {
     ET_CHECK_MSG(
         false, "Accumulation in dtype other than float not supported yet");
@@ -152,7 +160,7 @@ void _qk_at_v_gemm(
         static_cast<const int8_t*>(v_data.zero_points),
         static_cast<const float*>(v_data.scales),
         beta,
-        1);
+        v_data.zero_points_stride);
   } else {
     ET_CHECK_MSG(
         false, "Accumulation in dtype other than float not supported yet");
@@ -351,6 +359,40 @@ sdpa_with_kv_cache does not use attn_mask.
 
 TODO: Just handle conversion of bool mask to float
 */
+/**
+ * @brief Implements Flash Attention algorithm on CPU
+ *
+ * This function computes scaled dot-product attention with optimizations for
+ CPU.
+ * It supports both regular and quantized attention computation.
+ *
+ * @tparam scalar_t The data type for computation (e.g., float)
+ * @tparam q_split_size Block size for query matrix in tiling algorithm
+ * @tparam kv_split_size Block size for key/value matrices in tiling algorithm
+ *
+ * @param output Output tensor to store attention results
+ * @param query Query tensor [Batch x Num_heads x Q_seq_len x Dim_per_head]
+ * @param key Key tensor [Batch x Num_heads_kv x KV_seq_len x Dim_per_head]
+ * @param value Value tensor [Batch x Num_heads_kv x KV_seq_len x Dim_per_head]
+ * @param dropout_p Dropout probability (not used in current implementation)
+ * @param is_causal Whether to apply causal mask (lower triangular)
+ * @param attn_mask Optional explicit attention mask
+ * @param scale Optional custom scaling factor (default: 1/sqrt(head_dim))
+ * @param q_zero_points Optional zero points for quantized query
+ * @param q_scales Optional scales for quantized query
+ * @param k_zero_points Optional zero points for quantized key
+ * @param k_scales Optional scales for quantized key
+ * @param v_zero_points Optional zero points for quantized value
+ * @param v_scales Optional scales for quantized value
+ * @param seq_dim Which dimension is sequence dimension.
+ If SeqDim::One, then query, key, value are
+ expected to be in shape [Batch x Q_seq_len x Dim_per_head x Num_heads] and
+ output is expected to be in shape [Batch x Q_seq_len x Dim_per_head x
+ Num_heads]
+ * @param start_pos Starting position for causal masking in generation
+ * @param num_keys_for_causal_attention Number of keys to consider for causal
+ attention (-1 for all)
+ */
 template <typename scalar_t, int64_t q_split_size, int64_t kv_split_size>
 void cpu_flash_attention(
     Tensor& output,
@@ -367,22 +409,10 @@ void cpu_flash_attention(
     const optional<Tensor>& k_scales,
     const optional<Tensor>& v_zero_points,
     const optional<Tensor>& v_scales,
-    bool is_seq_at_dim_1 = false,
+    const SeqDim seq_dim = SeqDim::TWO,
     const int64_t start_pos = 0,
     const int64_t num_keys_for_causal_attention = -1) {
   (void)dropout_p;
-  // Query (Batch x Num_heads x Q_seq_len x Dim_per_head)
-  // Key (Batch x Num_heads x KV_seq_len x Dim_per_head)
-  // Value (Batch x Num_heads x KV_seq_len x Dim_per_head)
-
-  /*
-  // -> (Batch x Q_seq_len x Num_heads x Dim_per_head)
-  at::Tensor query = q.transpose(1, 2);
-  // -> (Batch x KV_seq_len x Num_heads x Dim_per_head)
-  at::Tensor key = k.transpose(1, 2);
-  // -> (Batch x KV_seq_len x Num_heads x Dim_per_head)
-  at::Tensor value = v.transpose(1, 2);
-  */
 
   // Without this we have out-of-bounds writes for
   // causal masking
@@ -408,7 +438,7 @@ void cpu_flash_attention(
   int64_t kvSize = value.size(2);
   int64_t num_heads_kv = key.size(1);
 
-  if (is_seq_at_dim_1) {
+  if (seq_dim == SeqDim::ONE) {
     num_head = query.size(2);
     num_heads_kv = key.size(2);
     qSize = query.size(1);
@@ -466,7 +496,7 @@ void cpu_flash_attention(
   int64_t qStrideH = strides[1];
   int64_t qStrideM = strides[2];
 
-  if (is_seq_at_dim_1) {
+  if (seq_dim == SeqDim::ONE) {
     qStrideH = strides[2];
     qStrideM = strides[1];
   }
@@ -476,7 +506,7 @@ void cpu_flash_attention(
   int64_t kStrideH = strides[1];
   int64_t kStrideN = strides[2];
 
-  if (is_seq_at_dim_1) {
+  if (seq_dim == SeqDim::ONE) {
     kStrideH = strides[2];
     kStrideN = strides[1];
   }
@@ -486,7 +516,7 @@ void cpu_flash_attention(
   int64_t vStrideH = strides[1];
   int64_t vStrideN = strides[2];
 
-  if (is_seq_at_dim_1) {
+  if (seq_dim == SeqDim::ONE) {
     vStrideH = strides[2];
     vStrideN = strides[1];
   }
@@ -502,28 +532,44 @@ void cpu_flash_attention(
   int64_t v_quant_params_StrideN = 0;
 
   if (is_quantized_sdpa) {
-    strides = q_zero_points.value().strides();
-    q_quant_params_StrideB = strides[0];
-    q_quant_params_StrideH = strides[1];
-    q_quant_params_StrideM = strides[2];
-
-    strides = k_zero_points.value().strides();
-    k_quant_params_StrideB = strides[0];
-    k_quant_params_StrideH = strides[1];
-    k_quant_params_StrideN = strides[2];
-
-    strides = v_zero_points.value().strides();
-    v_quant_params_StrideB = strides[0];
-    v_quant_params_StrideH = strides[1];
-    v_quant_params_StrideN = strides[2];
+    auto q_strides = q_zero_points.value().strides();
+    q_quant_params_StrideB = q_strides[0];
+    q_quant_params_StrideH = q_strides[1];
+    q_quant_params_StrideM = q_strides[2];
+
+    auto k_strides = k_zero_points.value().strides();
+    k_quant_params_StrideB = k_strides[0];
+    k_quant_params_StrideH = k_strides[1];
+    k_quant_params_StrideN = k_strides[2];
+
+    auto v_strides = v_zero_points.value().strides();
+    v_quant_params_StrideB = v_strides[0];
+    v_quant_params_StrideH = v_strides[1];
+    v_quant_params_StrideN = v_strides[2];
+
+    ET_CHECK_MSG(
+        (v_quant_params_StrideN == k_quant_params_StrideN) &&
+            (v_quant_params_StrideN == q_quant_params_StrideM),
+        "Quant params strides must be same for seq dim");
+
+    if (seq_dim == SeqDim::ONE) {
+      q_quant_params_StrideH = q_strides[2];
+      q_quant_params_StrideM = q_strides[1];
+
+      k_quant_params_StrideH = k_strides[2];
+      k_quant_params_StrideN = k_strides[1];
+
+      v_quant_params_StrideH = v_strides[2];
+      v_quant_params_StrideN = v_strides[1];
+    }
   }
 
   strides = output.strides();
   int64_t oStrideB = strides[0];
   int64_t oStrideH = strides[1];
   int64_t oStrideM = strides[2];
 
-  if (is_seq_at_dim_1) {
+  if (seq_dim == SeqDim::ONE) {
     oStrideH = strides[2];
     oStrideM = strides[1];
   }
@@ -679,13 +725,15 @@ void cpu_flash_attention(
             q_scales_ptr,
             qBlockSize,
             headSize,
+            q_quant_params_StrideM,
             query.scalar_type());
         MaybeQuantizedMatrixData k_sub_matrix_data = MaybeQuantizedMatrixData(
             static_cast<const void*>(k_sub_matrix_data_ptr),
             k_zero_points_ptr,
             k_scales_ptr,
             kvBlockSize,
             headSize,
+            k_quant_params_StrideN,
             key.scalar_type());
         _q_at_k_gemm<accum_t>(
             qBlockSize,
@@ -835,6 +883,7 @@ void cpu_flash_attention(
             v_scales_ptr,
             kvBlockSize,
             headSize,
+            v_quant_params_StrideN,
             value.scalar_type());
         // Calculate Softmax(q @ k.T) @ v
         _qk_at_v_gemm<accum_t>(
```
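`MaybeQuantizedMatrixData` now records a per-row stride for the zero points and scales, and `cpu_flash_attention` swaps the head and sequence strides of the quant-param tensors when the sequence sits at dim 1. A small PyTorch sketch of that stride bookkeeping (shapes illustrative, not from the commit):

```python
import torch

B, H, S = 1, 8, 16

def qparam_strides(zero_points: torch.Tensor, seq_at_dim_1: bool):
    """Pick (stride_B, stride_H, stride_seq) for a [B, H, S, 1] or [B, S, H, 1]
    quant-param tensor, mirroring the seq_dim == SeqDim::ONE remapping."""
    s = zero_points.stride()
    stride_b, stride_h, stride_seq = s[0], s[1], s[2]
    if seq_at_dim_1:
        stride_h, stride_seq = s[2], s[1]
    return stride_b, stride_h, stride_seq

zp_dim2 = torch.zeros(B, H, S, 1, dtype=torch.int8)  # sequence at dim 2
zp_dim1 = torch.zeros(B, S, H, 1, dtype=torch.int8)  # sequence at dim 1

print(qparam_strides(zp_dim2, seq_at_dim_1=False))  # seq stride is 1: rows contiguous
print(qparam_strides(zp_dim1, seq_at_dim_1=True))   # seq stride is H: rows H elements apart
```

The resulting per-row stride is what gets passed into `MaybeQuantizedMatrixData` and forwarded to `_q_at_k_gemm` / `_qk_at_v_gemm` in place of the previously hard-coded `1`.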

‎extension/llm/custom_ops/test_quantized_sdpa.py

Lines changed: 72 additions & 6 deletions
```diff
@@ -35,6 +35,7 @@ def setUp(self):
         self.float_dtype = torch.float32
         self.q_shape = None
         self.kv_shape = None
+        self.is_seq_at_dim_2 = True
 
     def _scale_tensor(self, tensor, min_value, max_value, scale=True):
         normalized_tensor = (tensor - tensor.min()) / (tensor.max() - tensor.min())
@@ -105,6 +106,10 @@ def _sdpa_ref(
             self.float_dtype,
         )
 
+        if not self.is_seq_at_dim_2:
+            q = q.transpose(1, 2).contiguous()
+            k = k.transpose(1, 2).contiguous()
+            v = v.transpose(1, 2).contiguous()
         num_heads_q = q.size(1)
         num_heads_kv = k.size(1)
         seq_len = q.size(2)
@@ -119,6 +124,8 @@ def _sdpa_ref(
             k = k.repeat_interleave(n_reps, dim=1)
             v = v.repeat_interleave(n_reps, dim=1)
         out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+        if not self.is_seq_at_dim_2:
+            out = out.transpose(1, 2).contiguous()
         return out
 
     def _int_matmul(
@@ -212,7 +219,7 @@ def _test_sdpa_common(
         seq_len,
         scale_tensors=False,
         atol=1e-5,
-        is_seq_at_dim_2=True,
+        is_seq_at_dim_2=False,
     ):
         # Range arbitrarily chosen to reproduce a numerical error on x86 in some of the long context tests
         tensor_scale_max = 15
@@ -221,9 +228,10 @@ def _test_sdpa_common(
         self.n_heads_q = n_heads_q
         self.head_dim = head_dim
         self.max_seq_len = max_seq_len
+        self.is_seq_at_dim_2 = is_seq_at_dim_2
         seq_dim = 2
         self.q_shape = (self.n_batch, self.n_heads_q, seq_len, self.head_dim)
-        self.kv_shape = (self.n_batch, self.n_heads_q, self.max_seq_len, self.head_dim)
+        self.kv_shape = (self.n_batch, self.n_heads_kv, self.max_seq_len, self.head_dim)
         if not is_seq_at_dim_2:
             seq_dim = 1
             self.q_shape = (self.n_batch, seq_len, self.n_heads_q, self.head_dim)
@@ -286,7 +294,6 @@ def _test_sdpa_common(
             quantized_dtype,
         )
 
-        start_pos = 0
         seq_len = q.size(seq_dim)
         attn_mask = self.mask[start_pos : start_pos + seq_len, :]
         attn_mask = attn_mask[:, : start_pos + seq_len]
@@ -334,6 +341,7 @@ def _test_sdpa_common(
             k_scale_fp32,
             v_zero_point_int8,
             v_scale_fp32,
+            is_seq_at_dim_2,
         )
         self.assertTrue(torch.allclose(ref_output, op_output, atol=atol))
         # Following line crashes due to some weird issues in mkldnn with crash in mkl_sgemm with `wild jump`
@@ -374,6 +382,7 @@ def _test_sdpa_common(
             k_scale_fp32,
             v_zero_point_int8,
             v_scale_fp32,
+            is_seq_at_dim_2,
         )
         self.assertTrue(torch.allclose(ref_output, op_output, atol=atol))
 
@@ -393,6 +402,18 @@ def test_sdpa_with_custom_quantized(self):
             seq_len,
             True,
             atol=1e-4,
+            is_seq_at_dim_2=True,
+        )
+        self._test_sdpa_common(
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            start_pos,
+            seq_len,
+            True,
+            atol=1e-4,
+            is_seq_at_dim_2=False,
         )
 
     def test_sdpa_with_custom_quantized_seq_len_1(self):
@@ -403,7 +424,22 @@ def test_sdpa_with_custom_quantized_seq_len_1(self):
         seq_len = 1
         start_pos = 0
         self._test_sdpa_common(
-            n_heads_kv, n_heads_q, head_dim, max_seq_len, start_pos, seq_len
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            start_pos,
+            seq_len,
+            is_seq_at_dim_2=True,
+        )
+        self._test_sdpa_common(
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            start_pos,
+            seq_len,
+            is_seq_at_dim_2=False,
         )
 
     def test_sdpa_with_custom_quantized_seq_len_small(self):
@@ -414,7 +450,22 @@ def test_sdpa_with_custom_quantized_seq_len_small(self):
         seq_len = 4
         start_pos = 0
         self._test_sdpa_common(
-            n_heads_kv, n_heads_q, head_dim, max_seq_len, start_pos, seq_len
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            start_pos,
+            seq_len,
+            is_seq_at_dim_2=True,
+        )
+        self._test_sdpa_common(
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            start_pos,
+            seq_len,
+            is_seq_at_dim_2=False,
        )
 
     def test_sdpa_with_custom_quantized_seq_len_llava_example(self):
@@ -466,5 +517,20 @@ def test_sdpa_with_cache_mqa(self):
         seq_len = 24
         start_pos = 0
         self._test_sdpa_common(
-            n_heads_kv, n_heads_q, head_dim, max_seq_len, start_pos, seq_len
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            start_pos,
+            seq_len,
+            is_seq_at_dim_2=True,
+        )
+        self._test_sdpa_common(
+            n_heads_kv,
+            n_heads_q,
+            head_dim,
+            max_seq_len,
+            start_pos,
+            seq_len,
+            is_seq_at_dim_2=False,
         )
```
