From 8aef4c5aad6821d6b2d082774d939c6f4be8dd60 Mon Sep 17 00:00:00 2001 From: "Liu, Kaixuan" Date: Tue, 13 May 2025 11:21:01 -0400 Subject: [PATCH 1/2] upgrade pytorch and ipex to 2.7 version Signed-off-by: Liu, Kaixuan --- Dockerfile-intel | 10 +++++----- .../server/text_embeddings_server/utils/flash_attn.py | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Dockerfile-intel b/Dockerfile-intel index 67a5938f..9b9d46e5 100644 --- a/Dockerfile-intel +++ b/Dockerfile-intel @@ -59,7 +59,7 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \ --mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \ cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s -FROM intel/intel-extension-for-pytorch:2.6.0-pip-base AS cpu +FROM intel/intel-extension-for-pytorch:2.7.0-pip-base AS cpu ENV HUGGINGFACE_HUB_CACHE=/data \ PORT=80 @@ -77,7 +77,7 @@ COPY backends/python/server/text_embeddings_server/models/__init__.py backends/p COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml COPY backends/python/server/requirements-intel.txt backends/python/server/requirements.txt -RUN python -m pip install torch==2.6.0 torchvision torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu --no-cache-dir +RUN python -m pip install torch==2.7.0 torchvision torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cpu --no-cache-dir RUN cd backends/python/server && \ make install @@ -103,7 +103,7 @@ COPY backends/python/server/requirements-hpu.txt backends/python/server/requirem RUN cd backends/python/server && \ make install -FROM intel/intel-extension-for-pytorch:2.6.10-xpu AS xpu +FROM intel/intel-extension-for-pytorch:2.7.10-xpu AS xpu ENV HUGGINGFACE_HUB_CACHE=/data \ PORT=80 @@ -117,8 +117,8 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO RUN apt-get update && apt install -y intel-basekit cmake vim python3-dev ninja-build pciutils WORKDIR /usr/src -RUN pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu --no-cache-dir -RUN pip install intel-extension-for-pytorch==2.6.10+xpu oneccl_bind_pt==2.6.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --no-cache-dir +RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir +RUN pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --no-cache-dir ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest diff --git a/backends/python/server/text_embeddings_server/utils/flash_attn.py b/backends/python/server/text_embeddings_server/utils/flash_attn.py index 5c4d67f0..5d39e6d0 100644 --- a/backends/python/server/text_embeddings_server/utils/flash_attn.py +++ b/backends/python/server/text_embeddings_server/utils/flash_attn.py @@ -100,6 +100,7 @@ def attention( out, cu_seqlens, cu_seqlens, + None, max_s, max_s, 0, From e55f48535e0125386c39b08ae9ec3dbe1d42894b Mon Sep 17 00:00:00 2001 From: "Liu, Kaixuan" Date: Tue, 13 May 2025 13:25:38 -0400 Subject: [PATCH 2/2] make a WA to support both CPU and XPU Signed-off-by: Liu, Kaixuan --- .../utils/flash_attn.py | 53 +++++++++++++------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/backends/python/server/text_embeddings_server/utils/flash_attn.py b/backends/python/server/text_embeddings_server/utils/flash_attn.py index 5d39e6d0..3fb6b06d 100644 --- a/backends/python/server/text_embeddings_server/utils/flash_attn.py +++ b/backends/python/server/text_embeddings_server/utils/flash_attn.py @@ -93,23 +93,42 @@ def attention( if use_ipex: import intel_extension_for_pytorch as ipex - return ipex.llm.functional.varlen_attention( - q.contiguous() if q.device.type == "xpu" else q, - k.contiguous() if k.device.type == "xpu" else k, - v.contiguous() if v.device.type == "xpu" else v, - out, - cu_seqlens, - cu_seqlens, - None, - max_s, - max_s, - 0, - softmax_scale, - zero_tensors=False, - is_causal=False, - return_softmax=False, - gen_=None, - ) + if q.device.type == "xpu": + return ipex.llm.functional.varlen_attention( + q.contiguous(), + k.contiguous(), + v.contiguous(), + out, + cu_seqlens, + cu_seqlens, + None, + max_s, + max_s, + 0, + softmax_scale, + zero_tensors=False, + is_causal=False, + return_softmax=False, + gen_=None, + ) + elif q.device.type == "cpu": + return ipex.llm.functional.varlen_attention( + q, + k, + v, + out, + cu_seqlens, + cu_seqlens, + max_s, + max_s, + 0, + softmax_scale, + zero_tensors=False, + is_causal=False, + return_softmax=False, + gen_=None, + ) + elif is_hpu: return hpu_attn( q,