From b834118ea35c98ca117ec0f1890bd675847fea40 Mon Sep 17 00:00:00 2001
From: jianyizh
Date: Thu, 17 Jul 2025 07:57:06 +0000
Subject: [PATCH 1/4] add a simple embedding gather kernel

---
 src/ATen/native/xpu/sycl/Indexing.cpp | 49 ++++++++++++++++++++++-----
 src/ATen/native/xpu/sycl/Indexing.h   | 33 ++++++++++++++++++
 2 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp
index 47a7f76e5b..0e2b7409c6 100644
--- a/src/ATen/native/xpu/sycl/Indexing.cpp
+++ b/src/ATen/native/xpu/sycl/Indexing.cpp
@@ -77,6 +77,28 @@ class IndexSelectScalarFunctor {
   }
 };
 
+template <typename scalar_t, typename index_t>
+static inline void _embedding(
+    scalar_t* output,
+    const scalar_t* weight,
+    const index_t* index,
+    int64_t num_embeddings,
+    int64_t embedding_dim,
+    int64_t indices_length) {
+  using KernelClass = EmbeddingKernelFunctor<scalar_t, index_t>;
+  int64_t work_group_size = syclDeviceMaxWorkGroupSize();
+  int64_t num_work_group = ceil_div(
+      static_cast<int64_t>(indices_length * embedding_dim),
+      static_cast<int64_t>(work_group_size));
+  auto kfn = KernelClass(
+      output, weight, index, num_embeddings, embedding_dim, indices_length);
+  sycl_kernel_submit(
+      num_work_group * work_group_size,
+      work_group_size,
+      getCurrentSYCLQueue(),
+      kfn);
+}
+
 template <
     class SrcInfo,
     class DstInfo,
@@ -202,14 +224,25 @@ void index_select_kernel(
 
   // Improve efficiency of generated native instructions for contiguous.
   // See comm/TensorInfo.h
-  if (dst.is_contiguous() && indices.is_contiguous())
-    _index_select_kernel<
-        SrcInfo,
-        DstInfo,
-        IdxInfo,
-        /* TrivialOffCal */ true>(
-        src_info, dst_info, index_info, new_indexing_dim);
-  else
+  if (dst.is_contiguous() && indices.is_contiguous()) {
+    if (src.dim() == 2 && indices.dim() == 1 &&
+        src.is_contiguous()) {
+      _embedding(
+          dst.mutable_data_ptr<scalar_t>(),
+          src.const_data_ptr<scalar_t>(),
+          indices.const_data_ptr<index_t>(),
+          src.size(0),
+          src.size(1),
+          indices.size(0));
+    } else {
+      _index_select_kernel<
+          SrcInfo,
+          DstInfo,
+          IdxInfo,
+          /* TrivialOffCal */ true>(
+          src_info, dst_info, index_info, new_indexing_dim);
+    }
+  } else
     _index_select_kernel<
         SrcInfo,
         DstInfo,
diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h
index acc277ec16..346d0e9f2f 100644
--- a/src/ATen/native/xpu/sycl/Indexing.h
+++ b/src/ATen/native/xpu/sycl/Indexing.h
@@ -27,6 +27,39 @@ TensorInfo<T, IndexType> tensorInfoIfScalar(TensorInfo<T, IndexType> ti) {
   return ti;
 }
 
+template <typename scalar_t, typename index_t>
+struct EmbeddingKernelFunctor {
+  void operator()(sycl::nd_item<1> item) const {
+    auto thread_id = item.get_global_linear_id();
+    if (thread_id < indices_length_ * embedding_dim_) {
+      output_[thread_id] = weight_
+          [index_[thread_id / embedding_dim_] * embedding_dim_ +
+           thread_id % embedding_dim_];
+    }
+  }
+  EmbeddingKernelFunctor(
+      scalar_t* output,
+      const scalar_t* weight,
+      const index_t* index,
+      int64_t num_embeddings,
+      int64_t embedding_dim,
+      int64_t indices_length)
+      : output_(output),
+        weight_(weight),
+        index_(index),
+        num_embeddings_(num_embeddings),
+        embedding_dim_(embedding_dim),
+        indices_length_(indices_length) {}
+
+ private:
+  scalar_t* output_;
+  const scalar_t* weight_;
+  const index_t* index_;
+  int64_t num_embeddings_;
+  int64_t embedding_dim_;
+  int64_t indices_length_;
+};
+
 template <class ValInfo, class IdxInfo, class FuncType>
 class IndexKernelConfig : public BatchKernelConfig {
  public:
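Note (patch 1): the kernel assigns one work item per output element. With the
(indices_length x embedding_dim) output viewed as a flat array, work item t
copies row index[t / embedding_dim], column t % embedding_dim of the weight
table. A minimal host-side sketch of the same gather (plain C++; the function
name is hypothetical):

  #include <cstdint>
  #include <vector>

  // Reference semantics of EmbeddingKernelFunctor: out[i][j] = weight[index[i]][j],
  // with both matrices stored row-major and flattened into 1-D vectors.
  template <typename scalar_t, typename index_t>
  void embedding_gather_ref(
      std::vector<scalar_t>& out,          // indices_length * embedding_dim
      const std::vector<scalar_t>& weight, // num_embeddings * embedding_dim
      const std::vector<index_t>& index,   // indices_length
      int64_t embedding_dim) {
    for (int64_t t = 0; t < static_cast<int64_t>(out.size()); ++t) {
      out[t] = weight[index[t / embedding_dim] * embedding_dim +
                      t % embedding_dim];
    }
  }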
From d04b03ce5b35e0dd67b64093ebba0602730e6558 Mon Sep 17 00:00:00 2001
From: jianyizh
Date: Thu, 17 Jul 2025 08:07:06 +0000
Subject: [PATCH 2/4] add a grid-stride loop to avoid launching too many work
 groups

---
 src/ATen/native/xpu/sycl/Indexing.cpp | 6 ++++--
 src/ATen/native/xpu/sycl/Indexing.h   | 3 +--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp
index 0e2b7409c6..688c1a9219 100644
--- a/src/ATen/native/xpu/sycl/Indexing.cpp
+++ b/src/ATen/native/xpu/sycl/Indexing.cpp
@@ -87,9 +87,11 @@ static inline void _embedding(
     int64_t indices_length) {
   using KernelClass = EmbeddingKernelFunctor<scalar_t, index_t>;
   int64_t work_group_size = syclDeviceMaxWorkGroupSize();
-  int64_t num_work_group = ceil_div(
+  int64_t num_xe_core = syclGpuEuCount() / syclGpuEUCountPerSubslice();
+  // 2 work groups per Xe core to reach 100% occupancy
+  int64_t num_work_group = std::min(num_xe_core * 2, ceil_div(
       static_cast<int64_t>(indices_length * embedding_dim),
-      static_cast<int64_t>(work_group_size));
+      static_cast<int64_t>(work_group_size)));
   auto kfn = KernelClass(
       output, weight, index, num_embeddings, embedding_dim, indices_length);
   sycl_kernel_submit(
diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h
index 346d0e9f2f..b89504bf6f 100644
--- a/src/ATen/native/xpu/sycl/Indexing.h
+++ b/src/ATen/native/xpu/sycl/Indexing.h
@@ -30,8 +30,7 @@ TensorInfo<T, IndexType> tensorInfoIfScalar(TensorInfo<T, IndexType> ti) {
 template <typename scalar_t, typename index_t>
 struct EmbeddingKernelFunctor {
   void operator()(sycl::nd_item<1> item) const {
-    auto thread_id = item.get_global_linear_id();
-    if (thread_id < indices_length_ * embedding_dim_) {
+    for (auto thread_id = item.get_global_linear_id(); thread_id < indices_length_ * embedding_dim_; thread_id += item.get_local_range(0) * item.get_group_range(0)) {
       output_[thread_id] = weight_
           [index_[thread_id / embedding_dim_] * embedding_dim_ +
            thread_id % embedding_dim_];
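Note (patch 2): capping the launch at 2 work groups per Xe core only works
because each work item now loops. Consecutive work items cover consecutive
elements, and the whole launch strides over the rest of the range. A
plain-C++ sketch of the traversal (names are stand-ins for the SYCL item
queries, not real API):

  #include <cstdint>
  #include <vector>

  // Emulates the kernel's grid-stride loop on the host. global_size plays the
  // role of item.get_local_range(0) * item.get_group_range(0).
  void grid_stride_sweep(std::vector<float>& out, int64_t global_size) {
    int64_t total = static_cast<int64_t>(out.size());
    for (int64_t gid = 0; gid < global_size; ++gid) {  // each gid = one work item
      for (int64_t t = gid; t < total; t += global_size) {
        out[t] = static_cast<float>(t);  // placeholder for the embedding gather
      }
    }
  }

Every element is visited exactly once regardless of how total compares to
global_size, so the same launch shape handles both tiny and huge index lists.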
From d64f67870249c51a9d8a9ef26d97cb7ce94e31f7 Mon Sep 17 00:00:00 2001
From: jianyizh
Date: Thu, 17 Jul 2025 09:01:18 +0000
Subject: [PATCH 3/4] cache the weight table in SLM when it fits

---
 src/ATen/native/xpu/sycl/Indexing.cpp | 34 ++++++++++++------
 src/ATen/native/xpu/sycl/Indexing.h   | 52 ++++++++++++++++++++++++++-
 2 files changed, 75 insertions(+), 11 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp
index 688c1a9219..059f694cee 100644
--- a/src/ATen/native/xpu/sycl/Indexing.cpp
+++ b/src/ATen/native/xpu/sycl/Indexing.cpp
@@ -86,19 +86,34 @@ static inline void _embedding(
     int64_t embedding_dim,
     int64_t indices_length) {
   using KernelClass = EmbeddingKernelFunctor<scalar_t, index_t>;
+  using SLMKernelClass = EmbeddingKernelSLMFunctor<scalar_t, index_t>;
   int64_t work_group_size = syclDeviceMaxWorkGroupSize();
   int64_t num_xe_core = syclGpuEuCount() / syclGpuEUCountPerSubslice();
+
   // 2 work groups per Xe core to reach 100% occupancy
-  int64_t num_work_group = std::min(num_xe_core * 2, ceil_div(
-      static_cast<int64_t>(indices_length * embedding_dim),
-      static_cast<int64_t>(work_group_size)));
+  int64_t num_work_group = std::min(
+      num_xe_core * 2,
+      ceil_div(
+          static_cast<int64_t>(indices_length * embedding_dim),
+          static_cast<int64_t>(work_group_size)));
   auto kfn = KernelClass(
       output, weight, index, num_embeddings, embedding_dim, indices_length);
-  sycl_kernel_submit(
-      num_work_group * work_group_size,
-      work_group_size,
-      getCurrentSYCLQueue(),
-      kfn);
+  auto slmkfn = SLMKernelClass(
+      output, weight, index, num_embeddings, embedding_dim, indices_length);
+  // 2 work groups share 1 Xe core, so the SLM budget is 64KB
+  if (num_embeddings * embedding_dim * sizeof(scalar_t) <=
+      syclLocalMemSize() / 2)
+    sycl_kernel_submit(
+        num_work_group * work_group_size,
+        work_group_size,
+        getCurrentSYCLQueue(),
+        slmkfn);
+  else
+    sycl_kernel_submit(
+        num_work_group * work_group_size,
+        work_group_size,
+        getCurrentSYCLQueue(),
+        kfn);
 }
 
 template <
     class SrcInfo,
     class DstInfo,
@@ -227,8 +242,7 @@ void index_select_kernel(
   // Improve efficiency of generated native instructions for contiguous.
   // See comm/TensorInfo.h
   if (dst.is_contiguous() && indices.is_contiguous()) {
-    if (src.dim() == 2 && indices.dim() == 1 &&
-        src.is_contiguous()) {
+    if (src.dim() == 2 && indices.dim() == 1 && src.is_contiguous()) {
       _embedding(
           dst.mutable_data_ptr<scalar_t>(),
           src.const_data_ptr<scalar_t>(),
diff --git a/src/ATen/native/xpu/sycl/Indexing.h b/src/ATen/native/xpu/sycl/Indexing.h
index b89504bf6f..89d644b68b 100644
--- a/src/ATen/native/xpu/sycl/Indexing.h
+++ b/src/ATen/native/xpu/sycl/Indexing.h
@@ -30,7 +30,10 @@ TensorInfo<T, IndexType> tensorInfoIfScalar(TensorInfo<T, IndexType> ti) {
 template <typename scalar_t, typename index_t>
 struct EmbeddingKernelFunctor {
   void operator()(sycl::nd_item<1> item) const {
-    for (auto thread_id = item.get_global_linear_id(); thread_id < indices_length_ * embedding_dim_; thread_id += item.get_local_range(0) * item.get_group_range(0)) {
+    for (auto thread_id = item.get_global_linear_id();
+         thread_id < indices_length_ * embedding_dim_;
+         thread_id += item.get_local_range(0) * item.get_group_range(0)) {
+      SYCL_KERNEL_ASSERT(index_[thread_id / embedding_dim_] < num_embeddings_);
       output_[thread_id] = weight_
           [index_[thread_id / embedding_dim_] * embedding_dim_ +
            thread_id % embedding_dim_];
@@ -59,6 +62,53 @@ struct EmbeddingKernelFunctor {
   int64_t indices_length_;
 };
 
+template <typename scalar_t, typename index_t>
+struct EmbeddingKernelSLMFunctor : public __SYCL_KER_CONFIG_CONVENTION__ {
+  void operator()(sycl::nd_item<1> item) const {
+    for (auto local_id = item.get_local_id(0);
+         local_id < num_embeddings_ * embedding_dim_;
+         local_id += item.get_local_range(0)) {
+      cached_weight_[local_id] = weight_[local_id];
+    }
+    item.barrier(sycl_local_fence);
+    for (auto thread_id = item.get_global_linear_id();
+         thread_id < indices_length_ * embedding_dim_;
+         thread_id += item.get_local_range(0) * item.get_group_range(0)) {
+      SYCL_KERNEL_ASSERT(index_[thread_id / embedding_dim_] < num_embeddings_);
+      output_[thread_id] = cached_weight_
+          [index_[thread_id / embedding_dim_] * embedding_dim_ +
+           thread_id % embedding_dim_];
+    }
+  }
+  void sycl_ker_config_convention(sycl::handler& cgh) {
+    cached_weight_ =
+        sycl_local_acc_t<scalar_t>(num_embeddings_ * embedding_dim_, cgh);
+  }
+  EmbeddingKernelSLMFunctor(
+      scalar_t* output,
+      const scalar_t* weight,
+      const index_t* index,
+      int64_t num_embeddings,
+      int64_t embedding_dim,
+      int64_t indices_length)
+      : output_(output),
+        weight_(weight),
+        index_(index),
+        num_embeddings_(num_embeddings),
+        embedding_dim_(embedding_dim),
+        indices_length_(indices_length),
+        cached_weight_() {}
+
+ private:
+  scalar_t* output_;
+  const scalar_t* weight_;
+  const index_t* index_;
+  int64_t num_embeddings_;
+  int64_t embedding_dim_;
+  int64_t indices_length_;
+  sycl_local_acc_t<scalar_t> cached_weight_;
+};
+
 template <class ValInfo, class IdxInfo, class FuncType>
 class IndexKernelConfig : public BatchKernelConfig {
  public:
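Note (patch 3): the dispatch takes the SLM variant only when the entire weight
table fits in half the device's shared local memory, since two resident work
groups split one Xe core's SLM. A host-side sketch of the rule (128 KiB of
local memory is an assumed figure; syclLocalMemSize() reports the real one,
and the function name here is hypothetical):

  #include <cstdint>
  #include <cstdio>

  // Mirrors the patch's dispatch condition: local_mem_size stands in for
  // syclLocalMemSize(), and each work group may claim half of it.
  bool use_slm_path(int64_t num_embeddings, int64_t embedding_dim,
                    int64_t elem_size, int64_t local_mem_size = 128 * 1024) {
    return num_embeddings * embedding_dim * elem_size <= local_mem_size / 2;
  }

  int main() {
    // A 1024 x 32 fp16 table is exactly 64 KiB: cached in SLM.
    std::printf("%d\n", use_slm_path(1024, 32, 2));   // prints 1
    // A 50000 x 768 fp32 table (~154 MB) falls back to the global-memory kernel.
    std::printf("%d\n", use_slm_path(50000, 768, 4)); // prints 0
  }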
From 38ff50ca08a812fc2083fdcd81c42c4a254fddfd Mon Sep 17 00:00:00 2001
From: jianyizh
Date: Thu, 31 Jul 2025 16:22:39 +0800
Subject: [PATCH 4/4] Apply suggestions from Copilot

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 src/ATen/native/xpu/sycl/Indexing.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ATen/native/xpu/sycl/Indexing.cpp b/src/ATen/native/xpu/sycl/Indexing.cpp
index 059f694cee..0a5a31cbd5 100644
--- a/src/ATen/native/xpu/sycl/Indexing.cpp
+++ b/src/ATen/native/xpu/sycl/Indexing.cpp
@@ -101,8 +101,10 @@ static inline void _embedding(
   auto slmkfn = SLMKernelClass(
       output, weight, index, num_embeddings, embedding_dim, indices_length);
   // 2 work groups share 1 Xe core, so the SLM budget is 64KB
-  if (num_embeddings * embedding_dim * sizeof(scalar_t) <=
-      syclLocalMemSize() / 2)
+  if (static_cast<uint64_t>(num_embeddings) *
+          static_cast<uint64_t>(embedding_dim) *
+          static_cast<uint64_t>(sizeof(scalar_t)) <=
+      static_cast<uint64_t>(syclLocalMemSize() / 2))
     sycl_kernel_submit(
         num_work_group * work_group_size,
         work_group_size,
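Note (patch 4): the casts widen every operand to 64-bit unsigned before the
multiplications, so the byte count is computed in uint64_t on every target and
the final comparison never mixes signednesses. A small self-contained
illustration (names and values are made up):

  #include <cstdint>
  #include <cstdio>

  // Overflow-safe "fits in budget" check in the style of the patch: widen
  // first, then multiply, then compare like against like.
  static bool fits_in_budget(
      int64_t rows, int64_t cols, size_t elem_size, uint64_t budget) {
    return static_cast<uint64_t>(rows) * static_cast<uint64_t>(cols) *
               static_cast<uint64_t>(elem_size) <=
           budget;
  }

  int main() {
    std::printf("%d\n", fits_in_budget(1024, 32, 2, 64 * 1024));         // 1
    // 2^20 * 2^20 * 4 = 2^42 bytes: far over budget, and a product that
    // would wrap in 32-bit arithmetic; the widened product stays exact.
    std::printf("%d\n", fits_in_budget(1 << 20, 1 << 20, 4, 64 * 1024)); // 0
  }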