diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index a12d5ba5fd8f..9aa3a974e704 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,5 +1,5 @@
-# Default ROCm 6.1 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging"
+# Default ROCm 6.2 base image
+ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
 
 # Default ROCm ARCHes to build vLLM for.
 ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
@@ -7,18 +7,12 @@ ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
 # Whether to install CK-based flash-attention
 # If 0, will not install flash-attention
 ARG BUILD_FA="1"
-# If `TRY_FA_WHEEL=1`, we will try installing flash-attention from `FA_WHEEL_URL`
-# If this succeeds, we use the downloaded wheel and skip building flash-attention.
-# Otherwise, ROCm flash-attention from `FA_BRANCH` will be built for the
-# architectures specified in `FA_GFX_ARCHS`
-ARG TRY_FA_WHEEL="1"
-ARG FA_WHEEL_URL="https://github.com/ROCm/flash-attention/releases/download/v2.5.9post1-cktile-vllm/flash_attn-2.5.9.post1-cp39-cp39-linux_x86_64.whl"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="23a2b1c2"
+ARG FA_BRANCH="3cea2fb"
 
 # Whether to build triton on rocm
 ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="e0fc12c"
+ARG TRITON_BRANCH="e192dba"
 
 ### Base image build stage
 FROM $BASE_IMAGE AS base
@@ -50,16 +44,17 @@ RUN python3 -m pip install --upgrade pip
 # Remove sccache so it doesn't interfere with ccache
 # TODO: implement sccache support across components
 RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
-# Install torch == 2.5.0 on ROCm
+
+# Install torch == 2.6.0 on ROCm
 RUN --mount=type=cache,target=/root/.cache/pip \
     case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.1"*) \
+        *"rocm-6.2"*) \
             python3 -m pip uninstall -y torch torchvision \
             && python3 -m pip install --pre \
-                torch==2.5.0.dev20240726 \
-                cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-                torchvision==0.20.0.dev20240726 \
-                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.1 ;; \
+                torch==2.6.0.dev20240918 \
+                setuptools-scm>=8 \
+                torchvision==0.20.0.dev20240918 \
+                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
         *) ;; esac
 
 ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
@@ -81,25 +76,18 @@ RUN cd /opt/rocm/share/amd_smi \
 ### Flash-Attention wheel build stage
 FROM base AS build_fa
 ARG BUILD_FA
-ARG TRY_FA_WHEEL
-ARG FA_WHEEL_URL
 ARG FA_GFX_ARCHS
 ARG FA_BRANCH
 # Build ROCm flash-attention wheel if `BUILD_FA = 1`
 RUN --mount=type=cache,target=${CCACHE_DIR} \
     if [ "$BUILD_FA" = "1" ]; then \
-        if [ "${TRY_FA_WHEEL}" = "1" ] && python3 -m pip install "${FA_WHEEL_URL}"; then \
-            # If a suitable wheel exists, we download it instead of building FA
-            mkdir -p /install && wget -N "${FA_WHEEL_URL}" -P /install; \
-        else \
-            mkdir -p libs \
-            && cd libs \
-            && git clone https://github.com/ROCm/flash-attention.git \
-            && cd flash-attention \
-            && git checkout "${FA_BRANCH}" \
-            && git submodule update --init \
-            && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
-        fi; \
+        mkdir -p libs \
+        && cd libs \
+        && git clone https://github.com/ROCm/flash-attention.git \
+        && cd flash-attention \
+        && git checkout "${FA_BRANCH}" \
+        && git submodule update --init \
+        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
     # Create an empty directory otherwise as later build stages expect one
     else mkdir -p /install; \
     fi
@@ -114,6 +102,7 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
     if [ "$BUILD_TRITON" = "1" ]; then \
     mkdir -p libs \
     && cd libs \
+    && python3 -m pip install ninja cmake wheel pybind11 \
     && git clone https://github.com/OpenAI/triton.git \
     && cd triton \
     && git checkout "${TRITON_BRANCH}" \
@@ -143,13 +132,6 @@ RUN --mount=type=cache,target=${CCACHE_DIR} \
     --mount=type=bind,source=.git,target=.git \
     --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -Ur requirements-rocm.txt \
-    && case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.1"*) \
-            # Bring in upgrades to HIP graph earlier than ROCm 6.2 for vLLM
-            wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib \
-            # Prevent interference if torch bundles its own HIP runtime
-            && rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so* || true;; \
-        *) ;; esac \
     && python3 setup.py clean --all \
     && python3 setup.py develop
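For context, the arguments changed above (``BASE_IMAGE``, ``FA_BRANCH``, ``TRITON_BRANCH``) are among the values intended to be overridden with ``--build-arg`` at build time. A sketch of such an invocation, using the defaults introduced in this diff (the ``vllm-rocm`` tag follows the docs below; it is not prescribed by the Dockerfile itself):

.. code-block:: console

    $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm \
        --build-arg BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0" \
        --build-arg FA_BRANCH="3cea2fb" \
        --build-arg TRITON_BRANCH="e192dba" \
        -t vllm-rocm .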
diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
index d169fe676dc9..4ed0bfe70071 100644
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -3,15 +3,17 @@
 Installation with ROCm
 ======================
 
-vLLM supports AMD GPUs with ROCm 6.1.
+vLLM supports AMD GPUs with ROCm 6.2.
 
 Requirements
 ------------
 
 * OS: Linux
-* Python: 3.8 -- 3.11
+* Python: 3.9 -- 3.12
 * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
-* ROCm 6.1
+* ROCm 6.2
+
+Note: PyTorch 2.5+/ROCm 6.2 dropped support for Python 3.8.
 
 Installation options:
@@ -27,7 +29,7 @@ You can build and install vLLM from source.
 
 First, build a docker image from `Dockerfile.rocm `_ and launch a docker container from the image.
-`Dockerfile.rocm `_ uses ROCm 6.1 by default, but also supports ROCm 5.7 and 6.0 in older vLLM branches.
+`Dockerfile.rocm `_ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
 It provides flexibility to customize the build of docker image using the following arguments:
 
 * `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
@@ -39,13 +41,13 @@ It provides flexibility to customize the build of docker image using the followi
 
 Their values can be passed in when running ``docker build`` with ``--build-arg`` options.
 
-To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default:
+To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
 
 .. code-block:: console
 
     $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
 
-To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
+To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
 
 .. code-block:: console
 
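The gfx1100 build command itself falls outside the context of the hunk above. Judging from ``Dockerfile.rocm``, where ``BUILD_FA`` defaults to ``1`` and the comment notes that ``0`` skips the flash-attention install (and ``FA_GFX_ARCHS`` only covers gfx90a/gfx942 by default), a plausible form is the default build with CK flash-attention disabled; the ``0`` value below is an assumption, not quoted from the patch:

.. code-block:: console

    $ DOCKER_BUILDKIT=1 docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .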
@@ -79,9 +81,8 @@ Option 2: Build from source
 
 - `ROCm `_
 - `PyTorch `_
-- `hipBLAS `_
 
-For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch-nightly`.
+For installing PyTorch, you can start from a fresh docker image, e.g., `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
 
 Alternatively, you can install PyTorch using PyTorch wheels.
 You can check PyTorch installation guide in PyTorch `Getting Started `_
@@ -90,26 +91,45 @@ Alternatively, you can install PyTorch using PyTorch wheels.
 You can check PyTorch installation guide in PyTorch `Getting Started `_
 
 
 Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton `_
 
+.. code-block:: console
+
+   $ python3 -m pip install ninja cmake wheel pybind11
+   $ pip uninstall -y triton
+   $ git clone https://github.com/OpenAI/triton.git
+   $ cd triton
+   $ git checkout e192dba
+   $ cd python
+   $ pip3 install .
+   $ cd ../..
+
+.. note::
+   - If you see an HTTP issue related to downloading packages while building triton, please try again, as the HTTP error is intermittent.
+
+
 2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm `_
 
 Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention `_
 Alternatively, wheels intended for vLLM use can be accessed under the releases.
 
-.. note::
-    - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
+For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`.
+Note: to get your gfx architecture, run `rocminfo | grep gfx`.
 
-3. Build vLLM.
-
-.. code-block:: console
+   .. code-block:: console
 
-   $ cd vllm
-   $ pip install -U -r requirements-rocm.txt
-   $ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation
+      $ git clone https://github.com/ROCm/flash-attention.git
+      $ cd flash-attention
+      $ git checkout 3cea2fb
+      $ git submodule update --init
+      $ GPU_ARCHS="gfx90a" python3 setup.py install
+      $ cd ..
 
+.. note::
+   - You might need to downgrade the "ninja" version to 1.10, as it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
-.. tip::
+3. Build vLLM.
 
-    For example, vLLM v0.5.3 on ROCM 6.1 can be built with the following steps:
+   For example, vLLM on ROCm 6.2 can be built with the following steps:
 
 .. code-block:: console
@@ -117,7 +137,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.
     $ # Install PyTorch
     $ pip uninstall torch -y
-    $ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1
+    $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
 
     $ # Build & install AMD SMI
     $ pip install /opt/rocm/share/amd_smi
@@ -127,15 +147,14 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.
     $ pip install "numpy<2"
     $ pip install -r requirements-rocm.txt
 
-    $ # Apply the patch to ROCM 6.1 (requires root permission)
-    $ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib
-    $ rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so*
-
     $ # Build vLLM for MI210/MI250/MI300.
     $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
     $ python3 setup.py develop
 
+   This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
+
+
 .. tip::
     - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
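As a quick post-build sanity check (not part of the patched document, and assuming the ROCm nightly torch wheel from the steps above), it is usually enough to confirm that the ROCm build of PyTorch and the freshly built vLLM both import:

.. code-block:: console

    $ # torch.version.hip is only set on ROCm builds of PyTorch
    $ python3 -c 'import torch; print(torch.__version__, torch.version.hip)'
    $ python3 -c 'import vllm; print(vllm.__version__)'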