Upgrade PyTorch and IPEX to version 2.7 #607

Merged: 2 commits, May 16, 2025
10 changes: 5 additions & 5 deletions Dockerfile-intel
@@ -59,7 +59,7 @@ RUN --mount=type=secret,id=actions_results_url,env=ACTIONS_RESULTS_URL \
--mount=type=secret,id=actions_runtime_token,env=ACTIONS_RUNTIME_TOKEN \
cargo build --release --bin text-embeddings-router -F grpc -F python --no-default-features && sccache -s

-FROM intel/intel-extension-for-pytorch:2.6.0-pip-base AS cpu
+FROM intel/intel-extension-for-pytorch:2.7.0-pip-base AS cpu
ENV HUGGINGFACE_HUB_CACHE=/data \
PORT=80

@@ -77,7 +77,7 @@ COPY backends/python/server/text_embeddings_server/models/__init__.py backends/p
COPY backends/python/server/pyproject.toml backends/python/server/pyproject.toml
COPY backends/python/server/requirements-intel.txt backends/python/server/requirements.txt

-RUN python -m pip install torch==2.6.0 torchvision torchaudio==2.6.0 --index-url https://download.pytorch.org/whl/cpu --no-cache-dir
+RUN python -m pip install torch==2.7.0 torchvision torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cpu --no-cache-dir

RUN cd backends/python/server && \
make install
@@ -103,7 +103,7 @@ COPY backends/python/server/requirements-hpu.txt backends/python/server/requirem
RUN cd backends/python/server && \
make install

-FROM intel/intel-extension-for-pytorch:2.6.10-xpu AS xpu
+FROM intel/intel-extension-for-pytorch:2.7.10-xpu AS xpu

ENV HUGGINGFACE_HUB_CACHE=/data \
PORT=80
@@ -117,8 +117,8 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO

RUN apt-get update && apt install -y intel-basekit cmake vim python3-dev ninja-build pciutils
WORKDIR /usr/src
-RUN pip install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
-RUN pip install intel-extension-for-pytorch==2.6.10+xpu oneccl_bind_pt==2.6.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --no-cache-dir
+RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
+RUN pip install intel-extension-for-pytorch==2.7.10+xpu oneccl_bind_pt==2.7.0+xpu --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ --no-cache-dir

ENV CCL_ROOT=/opt/intel/oneapi/ccl/latest
ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest
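
A quick aside, not part of the diff: after building either stage, a short sanity check can confirm that the wheels actually resolved to the pinned 2.7 versions. This is a minimal sketch assuming the pins above, using only stock torch/ipex attributes:

# Hypothetical post-build check (run inside the container; not from the PR).
import torch
import intel_extension_for_pytorch as ipex

print("torch:", torch.__version__)  # expect 2.7.0 (with a +cpu or +xpu suffix)
print("ipex:", ipex.__version__)    # expect 2.7.x on CPU, 2.7.10+xpu on XPU
if hasattr(torch, "xpu"):           # the XPU backend ships with recent PyTorch builds
    print("xpu available:", torch.xpu.is_available())  # False on the CPU image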
52 changes: 36 additions & 16 deletions backends/python/server/text_embeddings_server/utils/flash_attn.py
@@ -93,22 +93,42 @@ def attention(
    if use_ipex:
        import intel_extension_for_pytorch as ipex

-        return ipex.llm.functional.varlen_attention(
-            q.contiguous() if q.device.type == "xpu" else q,
-            k.contiguous() if k.device.type == "xpu" else k,
-            v.contiguous() if v.device.type == "xpu" else v,
-            out,
-            cu_seqlens,
-            cu_seqlens,
-            max_s,
-            max_s,
-            0,
-            softmax_scale,
-            zero_tensors=False,
-            is_causal=False,
-            return_softmax=False,
-            gen_=None,
-        )
+        if q.device.type == "xpu":
+            return ipex.llm.functional.varlen_attention(
+                q.contiguous(),
+                k.contiguous(),
+                v.contiguous(),
+                out,
+                cu_seqlens,
+                cu_seqlens,
+                None,
+                max_s,
+                max_s,
+                0,
+                softmax_scale,
+                zero_tensors=False,
+                is_causal=False,
+                return_softmax=False,
+                gen_=None,
+            )
+        elif q.device.type == "cpu":
+            return ipex.llm.functional.varlen_attention(
+                q,
+                k,
+                v,
+                out,
+                cu_seqlens,
+                cu_seqlens,
+                max_s,
+                max_s,
+                0,
+                softmax_scale,
+                zero_tensors=False,
+                is_causal=False,
+                return_softmax=False,
+                gen_=None,
+            )

    elif is_hpu:
        return hpu_attn(
            q,
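
The functional change above is confined to the XPU branch: it passes one extra positional argument (None) between the second cu_seqlens and the first max_s, which appears to correspond to a parameter added to varlen_attention's XPU signature in IPEX 2.7 (likely alibi_slopes, though that name is an assumption here), and it keeps the .contiguous() copies the XPU kernels need. The CPU branch retains the 2.6-style call, so the old per-argument device conditionals become unnecessary. For context on the shared arguments, the illustrative sketch below (made-up lengths, not from the PR) shows how the cu_seqlens boundaries and max_s are typically derived for a packed variable-length batch:

# Illustrative only: deriving cu_seqlens / max_s for packed sequences.
import torch

seq_lens = torch.tensor([5, 3, 7], dtype=torch.int32)  # hypothetical lengths
# Cumulative boundaries with a leading 0 -> tensor([0, 5, 8, 15], dtype=torch.int32)
cu_seqlens = torch.nn.functional.pad(
    torch.cumsum(seq_lens, dim=0, dtype=torch.int32), (1, 0)
)
max_s = int(seq_lens.max())  # 7, the longest sequence in the packed batch

The same cu_seqlens tensor is passed twice in the calls above because the query and key boundaries coincide when attending within a single packed batch.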