From 8aef4c5aad6821d6b2d082774d939c6f4be8dd60 Mon Sep 17 00:00:00 2001
From: "Liu, Kaixuan" <kaixuan.liu@intel.com>
Date: Tue, 13 May 2025 11:21:01 -0400
Subject: [PATCH 1/2] Upgrade PyTorch and IPEX to version 2.7

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>
---
 Dockerfile-intel                                       | 10 +++++-----
 .../server/text_embeddings_server/utils/flash_attn.py  |  1 +
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/Dockerfile-intel b/Dockerfile-intel
index 67a5938f..9b9d46e5 100644
--- a/Dockerfile-intel
+++ b/Dockerfile-intel
@@ -59,7 +59,7 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
     --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
     cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s
 
-FROM intel/intel-extension-for-pytorch:2.6.0-pip-base AS cpu
+FROM intel/intel-extension-for-pytorch:2.7.0-pip-base AS cpu
 ENV HUGGINGFACE_HUB_CACHE=/data \
     PORT=80
 
@@ -77,7 +77,7 @@ COPY backends/python/server/text_embeddings_server/models/__init__.py backends/p
 COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml
 COPY backends/python/server/requirements-intel.txt backends/python/server/requirements.txt
 
-RUN python -m pip install torch==2.6.0 torchvision torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
+RUN python -m pip install torch==2.7.0 torchvision torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
 
 RUN cd backends/python/server && \
     make install
@@ -103,7 +103,7 @@ COPY backends/python/server/requirements-hpu.txt backends/python/server/requirem
 RUN cd backends/python/server && \
     make install
 
-FROM intel/intel-extension-for-pytorch:2.6.10-xpu AS xpu
+FROM intel/intel-extension-for-pytorch:2.7.10-xpu AS xpu
 
 ENV HUGGINGFACE_HUB_CACHE=/data \
     PORT=80
@@ -117,8 +117,8 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
 
 RUN apt-get update && apt install -y intel-basekit cmake vim python3-dev ninja-build pciutils
 WORKDIR /usr/src
-RUN pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
-RUN pip install intel-extension-for-pytorch==2.6.10+xpu oneccl_bind_pt==2.6.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --no-cache-dir
+RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
+RUN pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --no-cache-dir
 
 ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
diff --git a/backends/python/server/text_embeddings_server/utils/flash_attn.py b/backends/python/server/text_embeddings_server/utils/flash_attn.py
index 5c4d67f0..5d39e6d0 100644
--- a/backends/python/server/text_embeddings_server/utils/flash_attn.py
+++ b/backends/python/server/text_embeddings_server/utils/flash_attn.py
@@ -100,6 +100,7 @@ def attention(
                 out,
                 cu_seqlens,
                 cu_seqlens,
+                None,
                 max_s,
                 max_s,
                 0,

From e55f48535e0125386c39b08ae9ec3dbe1d42894b Mon Sep 17 00:00:00 2001
From: "Liu, Kaixuan" <kaixuan.liu@intel.com>
Date: Tue, 13 May 2025 13:25:38 -0400
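Subject: [PATCH 2/2] Add a workaround to support both CPU and XPU

Split the ipex varlen_attention call by device type: the XPU call passes
an extra positional `None` after the two cu_seqlens arguments, while the
CPU call omits it.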

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>
---
 .../utils/flash_attn.py                       | 53 +++++++++++++------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/backends/python/server/text_embeddings_server/utils/flash_attn.py b/backends/python/server/text_embeddings_server/utils/flash_attn.py
index 5d39e6d0..3fb6b06d 100644
--- a/backends/python/server/text_embeddings_server/utils/flash_attn.py
+++ b/backends/python/server/text_embeddings_server/utils/flash_attn.py
@@ -93,23 +93,42 @@ def attention(
         if use_ipex:
             import intel_extension_for_pytorch as ipex
 
-            return ipex.llm.functional.varlen_attention(
-                q.contiguous() if q.device.type == "xpu" else q,
-                k.contiguous() if k.device.type == "xpu" else k,
-                v.contiguous() if v.device.type == "xpu" else v,
-                out,
-                cu_seqlens,
-                cu_seqlens,
-                None,
-                max_s,
-                max_s,
-                0,
-                softmax_scale,
-                zero_tensors=False,
-                is_causal=False,
-                return_softmax=False,
-                gen_=None,
-            )
+            if q.device.type == "xpu":
+                return ipex.llm.functional.varlen_attention(
+                    q.contiguous(),
+                    k.contiguous(),
+                    v.contiguous(),
+                    out,
+                    cu_seqlens,
+                    cu_seqlens,
+                    None,
+                    max_s,
+                    max_s,
+                    0,
+                    softmax_scale,
+                    zero_tensors=False,
+                    is_causal=False,
+                    return_softmax=False,
+                    gen_=None,
+                )
+            elif q.device.type == "cpu":
+                return ipex.llm.functional.varlen_attention(
+                    q,
+                    k,
+                    v,
+                    out,
+                    cu_seqlens,
+                    cu_seqlens,
+                    max_s,
+                    max_s,
+                    0,
+                    softmax_scale,
+                    zero_tensors=False,
+                    is_causal=False,
+                    return_softmax=False,
+                    gen_=None,
+                )
+
         elif is_hpu:
             return hpu_attn(
                 q,