
Commit dceff94

[Hardware][Intel GPU] Add Intel GPU (XPU) inference backend (vllm-project#3814)

jikunshang, bigPYJ1151, and abhilash1910 authored and committed

Co-authored-by: Jiang Li <[email protected]>
Co-authored-by: Abhilash Majumder <[email protected]>
Co-authored-by: Abhilash Majumder <[email protected]>

1 parent: 61f421b

31 files changed: +1998, -24 lines

.buildkite/run-xpu-test.sh

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# This script builds the XPU docker image and runs offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t xpu-test -f Dockerfile.xpu .
+
+# Set up cleanup
+remove_docker_container() { docker rm -f xpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py

.buildkite/test-template.j2

Lines changed: 5 additions & 0 deletions
@@ -45,6 +45,11 @@ steps:
     queue: intel
   command: bash .buildkite/run-cpu-test.sh
 
+- label: "XPU Test"
+  agents:
+    queue: intel
+  command: bash .buildkite/run-xpu-test.sh
+
 {% for step in steps %}
 - label: "{{ step.label }}"
   agents:

Dockerfile.xpu

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+FROM intel/oneapi-basekit:2024.1.0-devel-ubuntu22.04
+
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
+RUN apt-get update -y \
+    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-xpu.txt
+
+RUN VLLM_TARGET_DEVICE=xpu python3 setup.py install
+
+CMD ["/bin/bash"]

benchmarks/benchmark_latency.py

Lines changed: 1 addition & 1 deletion
@@ -191,7 +191,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
         "--device",
         type=str,
         default="cuda",
-        choices=["cuda", "cpu", "tpu"],
+        choices=["cuda", "cpu", "tpu", "xpu"],
         help='device type for vLLM execution, supporting CUDA and CPU.')
     parser.add_argument('--block-size',
                         type=int,

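With "xpu" accepted as a device choice, the latency benchmark can target an Intel GPU directly. An illustrative invocation (the model name is a placeholder):

    $ python3 benchmarks/benchmark_latency.py --model facebook/opt-125m --device xpu
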
benchmarks/benchmark_throughput.py

Lines changed: 1 addition & 1 deletion
@@ -349,7 +349,7 @@ def main(args: argparse.Namespace):
         "--device",
         type=str,
         default="cuda",
-        choices=["cuda", "cpu", "tpu"],
+        choices=["cuda", "cpu", "tpu", "xpu"],
         help='device type for vLLM execution, supporting CUDA and CPU.')
     parser.add_argument(
         "--enable-prefix-caching",

docs/source/getting_started/xpu-installation.rst

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+.. _installation_xpu:
+
+Installation with XPU
+========================
+
+vLLM initially supports basic model inference and serving on the Intel GPU platform.
+
+Table of contents:
+
+#. :ref:`Requirements <xpu_backend_requirements>`
+#. :ref:`Quick start using Dockerfile <xpu_backend_quick_start_dockerfile>`
+#. :ref:`Build from source <build_xpu_backend_from_source>`
+
+.. _xpu_backend_requirements:
+
+Requirements
+------------
+
+* OS: Linux
+* Supported Hardware: Intel Data Center GPU (Intel Arc GPU support is a work in progress)
+* oneAPI requirements: oneAPI 2024.1
+
+.. _xpu_backend_quick_start_dockerfile:
+
+Quick start using Dockerfile
+----------------------------
+
+.. code-block:: console
+
+   $ docker build -f Dockerfile.xpu -t vllm-xpu-env --shm-size=4g .
+   $ docker run -it \
+                --rm \
+                --network=host \
+                --device /dev/dri \
+                -v /dev/dri/by-path:/dev/dri/by-path \
+                vllm-xpu-env
+
+.. _build_xpu_backend_from_source:
+
+Build from source
+-----------------
+
+- First, install the required driver and Intel oneAPI 2024.1.
+
+- Second, install the Python packages needed to build the vLLM XPU backend:
+
+  .. code-block:: console
+
+     $ pip install --upgrade pip
+     $ pip install -v -r requirements-xpu.txt
+
+- Finally, build and install the vLLM XPU backend:
+
+  .. code-block:: console
+
+     $ VLLM_TARGET_DEVICE=xpu python setup.py install
+
+.. note::
+   - FP16 is the default data type in the current XPU backend. The BF16 data
+     type will be supported in the future.

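Once installed, inference follows the usual vLLM offline API. A minimal sketch mirroring examples/offline_inference.py (the script the CI container runs); the model name is an illustrative placeholder, and an XPU-only build selects the XPU backend automatically:

    from vllm import LLM, SamplingParams

    prompts = ["Hello, my name is", "The future of AI is"]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Model name is a placeholder; any supported Hugging Face model works.
    llm = LLM(model="facebook/opt-125m")
    outputs = llm.generate(prompts, sampling_params)

    for output in outputs:
        print(f"Prompt: {output.prompt!r} -> {output.outputs[0].text!r}")
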
docs/source/index.rst

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ Documentation
    getting_started/cpu-installation
    getting_started/neuron-installation
    getting_started/tpu-installation
+   getting_started/xpu-installation
    getting_started/quickstart
    getting_started/debugging
    getting_started/examples/examples_index

requirements-xpu.txt

Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
+# Common dependencies
+-r requirements-common.txt
+
+setuptools < 70.0.0  # IPEX's torch has a dependency on this pin; to be removed.
+
+torch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/torch-2.1.0.post1%2Bcxx11.abi-cp310-cp310-linux_x86_64.whl
+intel_extension_for_pytorch @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.1.30a0-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_stable/xpu/oneccl_bind_pt-2.1.200%2Bxpu-cp310-cp310-linux_x86_64.whl
+
+triton @ https://github.com/intel/intel-xpu-backend-for-triton/releases/download/v2.1.0/triton-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

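To verify the pinned stack after installation, a quick sanity check (assuming the torch.xpu namespace that the IPEX XPU build registers):

    import torch
    # Importing IPEX registers the 'xpu' device type with PyTorch.
    import intel_extension_for_pytorch as ipex

    print("torch:", torch.__version__, "ipex:", ipex.__version__)
    print("XPU available:", torch.xpu.is_available())
    print("XPU device count:", torch.xpu.device_count())
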
setup.py

Lines changed: 8 additions & 0 deletions
@@ -234,6 +234,10 @@ def _is_cpu() -> bool:
     return VLLM_TARGET_DEVICE == "cpu"
 
 
+def _is_xpu() -> bool:
+    return VLLM_TARGET_DEVICE == "xpu"
+
+
 def _build_custom_ops() -> bool:
     return _is_cuda() or _is_hip() or _is_cpu()
 
@@ -357,6 +361,8 @@ def get_vllm_version() -> str:
         version += "+tpu"
     elif _is_cpu():
         version += "+cpu"
+    elif _is_xpu():
+        version += "+xpu"
     else:
         raise RuntimeError("Unknown runtime environment")
 
@@ -406,6 +412,8 @@ def _read_requirements(filename: str) -> List[str]:
         requirements = _read_requirements("requirements-tpu.txt")
     elif _is_cpu():
         requirements = _read_requirements("requirements-cpu.txt")
+    elif _is_xpu():
+        requirements = _read_requirements("requirements-xpu.txt")
     else:
         raise ValueError(
             "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")

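With the version suffix wired in, an XPU build identifies itself at runtime. An illustrative check (the base version number is a placeholder):

    $ python3 -c "import vllm; print(vllm.__version__)"
    0.x.y+xpu
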
vllm/_custom_ops.py

Lines changed: 2 additions & 1 deletion
@@ -373,7 +373,8 @@ def reshape_and_cache_flash(
                                                    kv_cache_dtype)
 
 
-def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor,
+def copy_blocks(key_caches: List[torch.Tensor],
+                value_caches: List[torch.Tensor],
                 block_mapping: torch.Tensor) -> None:
     torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)

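The annotation fix reflects that the cache op takes one tensor per layer, not a single tensor. A shape-only sketch of the arguments (sizes are illustrative, and actually dispatching copy_blocks requires vLLM's compiled _C_cache_ops extension):

    import torch

    num_layers, num_blocks, block_numel = 2, 8, 16 * 128  # illustrative sizes
    key_caches = [torch.empty(num_blocks, block_numel) for _ in range(num_layers)]
    value_caches = [torch.empty(num_blocks, block_numel) for _ in range(num_layers)]
    # Each row pairs a source block index with a destination block index.
    block_mapping = torch.tensor([[0, 4], [1, 5]], dtype=torch.int64)
    # copy_blocks(key_caches, value_caches, block_mapping) would then copy those
    # blocks in every layer's KV cache.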