Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions .github/workflows/1xH100_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Workflow: installs a PyTorch nightly build plus this project on the H100
# runner named in the matrix, then runs the integration tests, the
# affine-quantized float dtype tests and the float8 single-GPU test script.
name: Run 1xH100 Tests

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

# On main the group name includes github.run_number, so every run gets its own
# group and is never cancelled. On any other ref the group name is the ref
# itself, so a newer run on that ref cancels the one still in progress.
concurrency:
  group: 1xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
  test:
    strategy:
      # Keep the remaining matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        include:
          - name: H100
            runs-on: linux.aws.h100
            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
            gpu-arch-type: "cuda"
            # NOTE(review): torch-spec pulls wheels from the cu126 index while
            # gpu-arch-version is 12.4 - confirm the mismatch is intended.
            gpu-arch-version: "12.4"
    permissions:
      id-token: write
      contents: read
    # The job body is delegated to the reusable workflow below; everything
    # under `with:` is passed to it as inputs.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        python -m pip install --upgrade pip
        pip install uv
        pip install ${{ matrix.torch-spec }}
        uv pip install -r dev-requirements.txt
        uv pip install vllm
        pip install .
        pytest test/integration --verbose -s
        pytest test/dtypes/test_affine_quantized_float.py --verbose -s
        ./test/float8/test_everything_single_gpu.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
name: Run Float8 Tests
name: Run 1xL4 Tests

on:
push:
Expand All @@ -11,7 +11,7 @@ on:
- 'gh/**'

concurrency:
group: float8_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
group: 1xL4_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
cancel-in-progress: true

env:
Expand All @@ -28,11 +28,6 @@ jobs:
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
gpu-arch-type: "cuda"
gpu-arch-version: "12.6"
- name: H100
runs-on: linux.aws.h100.4
torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
gpu-arch-type: "cuda"
gpu-arch-version: "12.4"
permissions:
id-token: write
contents: read
Expand All @@ -53,14 +48,6 @@ jobs:
uv pip install -r dev-requirements.txt
uv pip install vllm
pip install .
pytest test/float8 --verbose -s
pytest test/integration --verbose -s
pytest test/dtypes/test_affine_quantized_float.py --verbose -s
GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l)
if [ "$GPU_COUNT" -ge 4 ]; then
echo "Found $GPU_COUNT GPUs - running test_everything.sh"
./test/float8/test_everything.sh
else
echo "Only $GPU_COUNT GPUs available. Need at least 4 GPUs to run test_everything.sh"
exit 0
fi
./test/float8/test_everything_single_gpu.sh
51 changes: 51 additions & 0 deletions .github/workflows/4xH100_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Workflow: installs a PyTorch nightly build plus this project on the H100
# runner named in the matrix, then runs the float8 multi-GPU test script.
name: Run 4xH100 tests

on:
  push:
    branches:
      - main
      - 'gh/**'
  pull_request:
    branches:
      - main
      - 'gh/**'

# On main the group name includes github.run_number, so every run gets its own
# group and is never cancelled. On any other ref the group name is the ref
# itself, so a newer run on that ref cancels the one still in progress.
concurrency:
  group: 4xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true

env:
  HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
  test:
    strategy:
      # Keep the remaining matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        include:
          - name: H100
            runs-on: linux.aws.h100.4
            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
            gpu-arch-type: "cuda"
            # NOTE(review): torch-spec pulls wheels from the cu126 index while
            # gpu-arch-version is 12.4 - confirm the mismatch is intended.
            gpu-arch-version: "12.4"
    permissions:
      id-token: write
      contents: read
    # The job body is delegated to the reusable workflow below; everything
    # under `with:` is passed to it as inputs.
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      timeout: 60
      runner: ${{ matrix.runs-on }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      submodules: recursive
      script: |
        conda create -n venv python=3.9 -y
        conda activate venv
        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
        python -m pip install --upgrade pip
        pip install uv
        pip install ${{ matrix.torch-spec }}
        uv pip install -r dev-requirements.txt
        uv pip install vllm
        pip install .
        ./test/float8/test_everything_multi_gpu.sh
21 changes: 21 additions & 0 deletions test/float8/test_everything_multi_gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

# Runs the float8 FSDP, FSDP-compile, DTensor and FSDP2 test entry points.
# The shebang must stay on the very first line: anywhere else it is an
# ordinary comment and the interpreter is left to the caller's fallback.

# terminate script on first error
set -e

# Non-empty only when rocm-smi prints a version string. `|| true` keeps
# `set -e` from aborting the script when rocm-smi is missing or fails.
IS_ROCM=$(rocm-smi --version || true)

# These tests do not work on ROCm yet
if [ -z "$IS_ROCM" ]
then
    ./test/float8/test_fsdp.sh
    ./test/float8/test_fsdp_compile.sh
    ./test/float8/test_dtensor.sh
    python test/float8/test_fsdp2/test_fsdp2.py
fi

# Printed on the ROCm path as well, where every test above was skipped.
echo "all multi gpu tests successful"
16 changes: 16 additions & 0 deletions test/float8/test_everything_single_gpu.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

# Runs the float8 test modules listed below, one pytest invocation per file.
# The shebang must stay on the very first line: anywhere else it is an
# ordinary comment and the interpreter is left to the caller's fallback.

# terminate script on first error
set -e

pytest test/float8/test_base.py --verbose -s
pytest test/float8/test_compile.py --verbose -s
pytest test/float8/test_numerics_integration.py --verbose -s
pytest test/float8/test_auto_filter.py --verbose -s

echo "all float8 single gpu tests successful"
Loading