From d2f63e95d27b150befeafb31431c38a7b5250c36 Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Wed, 16 Jul 2025 09:44:53 -0700 Subject: [PATCH 1/2] single and multi device float8 test workflows in ci --- .github/workflows/float8_multi_gpu_test.yml | 51 +++++++++++++++++++ ...t8_test.yml => float8_single_gpu_test.yml} | 18 ++----- test/float8/test_everything_multi_gpu.sh | 21 ++++++++ test/float8/test_everything_single_gpu.sh | 18 +++++++ 4 files changed, 94 insertions(+), 14 deletions(-) create mode 100644 .github/workflows/float8_multi_gpu_test.yml rename .github/workflows/{float8_test.yml => float8_single_gpu_test.yml} (67%) create mode 100755 test/float8/test_everything_multi_gpu.sh create mode 100755 test/float8/test_everything_single_gpu.sh diff --git a/.github/workflows/float8_multi_gpu_test.yml b/.github/workflows/float8_multi_gpu_test.yml new file mode 100644 index 0000000000..6b73812858 --- /dev/null +++ b/.github/workflows/float8_multi_gpu_test.yml @@ -0,0 +1,51 @@ +name: Run Float8 Multi-GPU Tests + +on: + push: + branches: + - main + - 'gh/**' + pull_request: + branches: + - main + - 'gh/**' + +concurrency: + group: float8_multi_gpu_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + +jobs: + test: + strategy: + fail-fast: false + matrix: + include: + - name: H100 + runs-on: linux.aws.h100.4 + torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126' + gpu-arch-type: "cuda" + gpu-arch-version: "12.4" + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + timeout: 60 + runner: ${{ matrix.runs-on }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + submodules: recursive + script: | + conda create -n venv python=3.9 -y + conda activate venv + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + python -m pip install --upgrade pip + pip install uv + pip install ${{ matrix.torch-spec }} + uv pip install -r dev-requirements.txt + uv pip install vllm + pip install . + ./test/float8/test_everything_multi_gpu.sh diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_single_gpu_test.yml similarity index 67% rename from .github/workflows/float8_test.yml rename to .github/workflows/float8_single_gpu_test.yml index bf58f520c6..a6b76abf6c 100644 --- a/.github/workflows/float8_test.yml +++ b/.github/workflows/float8_single_gpu_test.yml @@ -1,4 +1,4 @@ -name: Run Float8 Tests +name: Run Float8 Single-GPU Tests on: push: @@ -11,7 +11,7 @@ on: - 'gh/**' concurrency: - group: float8_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + group: float8_single_gpu_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true env: @@ -29,7 +29,7 @@ jobs: gpu-arch-type: "cuda" gpu-arch-version: "12.6" - name: H100 - runs-on: linux.aws.h100.4 + runs-on: linux.aws.h100 torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126' gpu-arch-type: "cuda" gpu-arch-version: "12.4" @@ -53,14 +53,4 @@ jobs: uv pip install -r dev-requirements.txt uv pip install vllm pip install . - pytest test/float8 --verbose -s - pytest test/integration --verbose -s - pytest test/dtypes/test_affine_quantized_float.py --verbose -s - GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l) - if [ "$GPU_COUNT" -ge 4 ]; then - echo "Found $GPU_COUNT GPUs - running test_everything.sh" - ./test/float8/test_everything.sh - else - echo "Only $GPU_COUNT GPUs available. Need at least 4 GPUs to run test_everything.sh" - exit 0 - fi + ./test/float8/test_everything_single_gpu.sh diff --git a/test/float8/test_everything_multi_gpu.sh b/test/float8/test_everything_multi_gpu.sh new file mode 100755 index 0000000000..6f391f9699 --- /dev/null +++ b/test/float8/test_everything_multi_gpu.sh @@ -0,0 +1,21 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +#!/bin/bash + +# terminate script on first error +set -e +IS_ROCM=$(rocm-smi --version || true) + +# These tests do not work on ROCm yet +if [ -z "$IS_ROCM" ] +then +./test/float8/test_fsdp.sh +./test/float8/test_fsdp_compile.sh +./test/float8/test_dtensor.sh +python test/float8/test_fsdp2/test_fsdp2.py +fi + +echo "all multi gpu tests successful" diff --git a/test/float8/test_everything_single_gpu.sh b/test/float8/test_everything_single_gpu.sh new file mode 100755 index 0000000000..96f52ff0d0 --- /dev/null +++ b/test/float8/test_everything_single_gpu.sh @@ -0,0 +1,18 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. +#!/bin/bash + +# terminate script on first error +set -e + +pytest test/float8/test_base.py --verbose -s +pytest test/float8/test_compile.py --verbose -s +pytest test/float8/test_numerics_integration.py --verbose -s +pytest test/float8/test_auto_filter.py --verbose -s +pytest test/integration --verbose -s +pytest test/dtypes/test_affine_quantized_float.py --verbose -s + +echo "all single gpu tests successful" From 99e42f0ef04cbb214b77e6e6b29804b72ba01dad Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Fri, 18 Jul 2025 09:13:18 -0700 Subject: [PATCH 2/2] split workflows by hardware type --- .github/workflows/1xH100_tests.yml | 53 +++++++++++++++++++ ...at8_single_gpu_test.yml => 1xL4_tests.yml} | 11 ++-- ...t8_multi_gpu_test.yml => 4xH100_tests.yml} | 4 +- test/float8/test_everything_single_gpu.sh | 4 +- 4 files changed, 60 insertions(+), 12 deletions(-) create mode 100644 .github/workflows/1xH100_tests.yml rename .github/workflows/{float8_single_gpu_test.yml => 1xL4_tests.yml} (75%) rename .github/workflows/{float8_multi_gpu_test.yml => 4xH100_tests.yml} (88%) diff --git a/.github/workflows/1xH100_tests.yml b/.github/workflows/1xH100_tests.yml new file mode 100644 index 0000000000..18f1ff9cd4 --- /dev/null +++ b/.github/workflows/1xH100_tests.yml @@ -0,0 +1,53 @@ +name: Run 1xH100 Tests + +on: + push: + branches: + - main + - 'gh/**' + pull_request: + branches: + - main + - 'gh/**' + +concurrency: + group: 1xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + +jobs: + test: + strategy: + fail-fast: false + matrix: + include: + - name: H100 + runs-on: linux.aws.h100 + torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126' + gpu-arch-type: "cuda" + gpu-arch-version: "12.4" + permissions: + id-token: write + contents: read + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + timeout: 60 + runner: ${{ matrix.runs-on }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + submodules: recursive + script: | + conda create -n venv python=3.9 -y + conda activate venv + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + python -m pip install --upgrade pip + pip install uv + pip install ${{ matrix.torch-spec }} + uv pip install -r dev-requirements.txt + uv pip install vllm + pip install . + pytest test/integration --verbose -s + pytest test/dtypes/test_affine_quantized_float.py --verbose -s + ./test/float8/test_everything_single_gpu.sh diff --git a/.github/workflows/float8_single_gpu_test.yml b/.github/workflows/1xL4_tests.yml similarity index 75% rename from .github/workflows/float8_single_gpu_test.yml rename to .github/workflows/1xL4_tests.yml index a6b76abf6c..cf4bf22423 100644 --- a/.github/workflows/float8_single_gpu_test.yml +++ b/.github/workflows/1xL4_tests.yml @@ -1,4 +1,4 @@ -name: Run Float8 Single-GPU Tests +name: Run 1xL4 Tests on: push: @@ -11,7 +11,7 @@ on: - 'gh/**' concurrency: - group: float8_single_gpu_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + group: 1xL4_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true env: @@ -28,11 +28,6 @@ jobs: torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126' gpu-arch-type: "cuda" gpu-arch-version: "12.6" - - name: H100 - runs-on: linux.aws.h100 - torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126' - gpu-arch-type: "cuda" - gpu-arch-version: "12.4" permissions: id-token: write contents: read @@ -53,4 +48,6 @@ jobs: uv pip install -r dev-requirements.txt uv pip install vllm pip install . + pytest test/integration --verbose -s + pytest test/dtypes/test_affine_quantized_float.py --verbose -s ./test/float8/test_everything_single_gpu.sh diff --git a/.github/workflows/float8_multi_gpu_test.yml b/.github/workflows/4xH100_tests.yml similarity index 88% rename from .github/workflows/float8_multi_gpu_test.yml rename to .github/workflows/4xH100_tests.yml index 6b73812858..21e82ca845 100644 --- a/.github/workflows/float8_multi_gpu_test.yml +++ b/.github/workflows/4xH100_tests.yml @@ -1,4 +1,4 @@ -name: Run Float8 Multi-GPU Tests +name: Run 4xH100 tests on: push: @@ -11,7 +11,7 @@ on: - 'gh/**' concurrency: - group: float8_multi_gpu_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + group: 4xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true env: diff --git a/test/float8/test_everything_single_gpu.sh b/test/float8/test_everything_single_gpu.sh index 96f52ff0d0..0b72951126 100755 --- a/test/float8/test_everything_single_gpu.sh +++ b/test/float8/test_everything_single_gpu.sh @@ -12,7 +12,5 @@ pytest test/float8/test_base.py --verbose -s pytest test/float8/test_compile.py --verbose -s pytest test/float8/test_numerics_integration.py --verbose -s pytest test/float8/test_auto_filter.py --verbose -s -pytest test/integration --verbose -s -pytest test/dtypes/test_affine_quantized_float.py --verbose -s -echo "all single gpu tests successful" +echo "all float8 single gpu tests successful"