diff --git a/.github/workflows/1xH100_tests.yml b/.github/workflows/1xH100_tests.yml
new file mode 100644
index 0000000000..18f1ff9cd4
--- /dev/null
+++ b/.github/workflows/1xH100_tests.yml
@@ -0,0 +1,55 @@
+# 1xH100 CI: integration tests, affine-quantized-float dtype test, single-GPU float8 script.
+name: Run 1xH100 Tests
+
+on:
+  push:
+    branches:
+      - main
+      - 'gh/**'
+  pull_request:
+    branches:
+      - main
+      - 'gh/**'
+
+concurrency:
+  group: 1xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: H100
+            runs-on: linux.aws.h100
+            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cuda"
+            # NOTE(review): 12.4 here vs. cu126 wheels in torch-spec — confirm intended.
+            gpu-arch-version: "12.4"
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      timeout: 60
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      submodules: recursive
+      script: |
+        conda create -n venv python=3.9 -y
+        conda activate venv
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        python -m pip install --upgrade pip
+        pip install uv
+        pip install ${{ matrix.torch-spec }}
+        uv pip install -r dev-requirements.txt
+        uv pip install vllm
+        pip install .
+        pytest test/integration --verbose -s
+        pytest test/dtypes/test_affine_quantized_float.py --verbose -s
+        ./test/float8/test_everything_single_gpu.sh
diff --git a/.github/workflows/float8_test.yml b/.github/workflows/1xL4_tests.yml
similarity index 62%
rename from .github/workflows/float8_test.yml
rename to .github/workflows/1xL4_tests.yml
index bf58f520c6..cf4bf22423 100644
--- a/.github/workflows/float8_test.yml
+++ b/.github/workflows/1xL4_tests.yml
@@ -1,4 +1,4 @@
-name: Run Float8 Tests
+name: Run 1xL4 Tests
 
 on:
   push:
@@ -11,7 +11,7 @@ on:
       - 'gh/**'
 
 concurrency:
-  group: float8_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  group: 1xL4_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
 env:
@@ -28,11 +28,6 @@ jobs:
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.6"
-          - name: H100
-            runs-on: linux.aws.h100.4
-            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
-            gpu-arch-type: "cuda"
-            gpu-arch-version: "12.4"
     permissions:
       id-token: write
       contents: read
@@ -53,14 +48,6 @@ jobs:
         uv pip install -r dev-requirements.txt
         uv pip install vllm
         pip install .
-        pytest test/float8 --verbose -s
         pytest test/integration --verbose -s
         pytest test/dtypes/test_affine_quantized_float.py --verbose -s
-        GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l)
-        if [ "$GPU_COUNT" -ge 4 ]; then
-          echo "Found $GPU_COUNT GPUs - running test_everything.sh"
-          ./test/float8/test_everything.sh
-        else
-          echo "Only $GPU_COUNT GPUs available. Need at least 4 GPUs to run test_everything.sh"
-          exit 0
-        fi
+        ./test/float8/test_everything_single_gpu.sh
diff --git a/.github/workflows/4xH100_tests.yml b/.github/workflows/4xH100_tests.yml
new file mode 100644
index 0000000000..21e82ca845
--- /dev/null
+++ b/.github/workflows/4xH100_tests.yml
@@ -0,0 +1,53 @@
+# 4xH100 CI: runs the multi-GPU float8 script on the linux.aws.h100.4 runner.
+name: Run 4xH100 Tests
+
+on:
+  push:
+    branches:
+      - main
+      - 'gh/**'
+  pull_request:
+    branches:
+      - main
+      - 'gh/**'
+
+concurrency:
+  group: 4xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: H100
+            runs-on: linux.aws.h100.4
+            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cuda"
+            # NOTE(review): 12.4 here vs. cu126 wheels in torch-spec — confirm intended.
+            gpu-arch-version: "12.4"
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      timeout: 60
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      submodules: recursive
+      script: |
+        conda create -n venv python=3.9 -y
+        conda activate venv
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        python -m pip install --upgrade pip
+        pip install uv
+        pip install ${{ matrix.torch-spec }}
+        uv pip install -r dev-requirements.txt
+        uv pip install vllm
+        pip install .
+        ./test/float8/test_everything_multi_gpu.sh
diff --git a/test/float8/test_everything_multi_gpu.sh b/test/float8/test_everything_multi_gpu.sh
new file mode 100755
index 0000000000..6f391f9699
--- /dev/null
+++ b/test/float8/test_everything_multi_gpu.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Runs the multi-GPU float8 test scripts (FSDP, FSDP + compile, DTensor, FSDP2).
+
+# terminate script on first error
+set -e
+# `|| true` keeps `set -e` from aborting when rocm-smi is missing; IS_ROCM is then empty
+IS_ROCM=$(rocm-smi --version || true)
+
+# These tests do not work on ROCm yet
+if [ -z "$IS_ROCM" ]
+then
+./test/float8/test_fsdp.sh
+./test/float8/test_fsdp_compile.sh
+./test/float8/test_dtensor.sh
+python test/float8/test_fsdp2/test_fsdp2.py
+fi
+
+echo "all multi gpu tests successful"
diff --git a/test/float8/test_everything_single_gpu.sh b/test/float8/test_everything_single_gpu.sh
new file mode 100755
index 0000000000..0b72951126
--- /dev/null
+++ b/test/float8/test_everything_single_gpu.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Runs the single-GPU float8 pytest files; invoked by the 1xH100 and 1xL4 CI workflows.
+
+# terminate script on first error
+set -e
+
+pytest test/float8/test_base.py --verbose -s
+pytest test/float8/test_compile.py --verbose -s
+pytest test/float8/test_numerics_integration.py --verbose -s
+pytest test/float8/test_auto_filter.py --verbose -s
+
+echo "all float8 single gpu tests successful"