From d2f63e95d27b150befeafb31431c38a7b5250c36 Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre <danvm@meta.com>
Date: Wed, 16 Jul 2025 09:44:53 -0700
Subject: [PATCH 1/2] Add single- and multi-GPU float8 test workflows to CI

---
 .github/workflows/float8_multi_gpu_test.yml   | 51 +++++++++++++++++++
 ...t8_test.yml => float8_single_gpu_test.yml} | 18 ++-----
 test/float8/test_everything_multi_gpu.sh      | 21 ++++++++
 test/float8/test_everything_single_gpu.sh     | 18 +++++++
 4 files changed, 94 insertions(+), 14 deletions(-)
 create mode 100644 .github/workflows/float8_multi_gpu_test.yml
 rename .github/workflows/{float8_test.yml => float8_single_gpu_test.yml} (67%)
 create mode 100755 test/float8/test_everything_multi_gpu.sh
 create mode 100755 test/float8/test_everything_single_gpu.sh

diff --git a/.github/workflows/float8_multi_gpu_test.yml b/.github/workflows/float8_multi_gpu_test.yml
new file mode 100644
index 0000000000..6b73812858
--- /dev/null
+++ b/.github/workflows/float8_multi_gpu_test.yml
@@ -0,0 +1,51 @@
+name: Run Float8 Multi-GPU Tests
+
+on:
+  push:
+    branches:
+      - main
+      - 'gh/**'
+  pull_request:
+    branches:
+      - main
+      - 'gh/**'
+
+concurrency:
+  group: float8_multi_gpu_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: H100
+            runs-on: linux.aws.h100.4
+            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cuda"
+            gpu-arch-version: "12.4"  # NOTE(review): torch-spec pulls cu126 wheels; confirm "12.4" (not "12.6") is intended
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      timeout: 60
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      submodules: recursive
+      script: |
+        conda create -n venv python=3.9 -y
+        conda activate venv
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        python -m pip install --upgrade pip
+        pip install uv
+        pip install ${{ matrix.torch-spec }}
+        uv pip install -r dev-requirements.txt
+        uv pip install vllm
+        pip install .
+        ./test/float8/test_everything_multi_gpu.sh
diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_single_gpu_test.yml
similarity index 67%
rename from .github/workflows/float8_test.yml
rename to .github/workflows/float8_single_gpu_test.yml
index bf58f520c6..a6b76abf6c 100644
--- a/.github/workflows/float8_test.yml
+++ b/.github/workflows/float8_single_gpu_test.yml
@@ -1,4 +1,4 @@
-name: Run Float8 Tests
+name: Run Float8 Single-GPU Tests
 
 on:
   push:
@@ -11,7 +11,7 @@ on:
       - 'gh/**'
 
 concurrency:
-  group: float8_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  group: float8_single_gpu_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
 env:
@@ -29,7 +29,7 @@ jobs:
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.6"
           - name: H100
-            runs-on: linux.aws.h100.4
+            runs-on: linux.aws.h100
             torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.4"
@@ -53,14 +53,4 @@ jobs:
         uv pip install -r dev-requirements.txt
         uv pip install vllm
         pip install .
-        pytest test/float8 --verbose -s
-        pytest test/integration --verbose -s
-        pytest test/dtypes/test_affine_quantized_float.py --verbose -s
-        GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l)
-        if [ "$GPU_COUNT" -ge 4 ]; then
-            echo "Found $GPU_COUNT GPUs - running test_everything.sh"
-            ./test/float8/test_everything.sh
-        else
-            echo "Only $GPU_COUNT GPUs available. Need at least 4 GPUs to run test_everything.sh"
-            exit 0
-        fi
+        ./test/float8/test_everything_single_gpu.sh
diff --git a/test/float8/test_everything_multi_gpu.sh b/test/float8/test_everything_multi_gpu.sh
new file mode 100755
index 0000000000..6f391f9699
--- /dev/null
+++ b/test/float8/test_everything_multi_gpu.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+# terminate script on first error
+set -e
+IS_ROCM=$(rocm-smi --version || true)
+
+# These tests do not work on ROCm yet
+if [ -z "$IS_ROCM" ]
+then
+./test/float8/test_fsdp.sh
+./test/float8/test_fsdp_compile.sh
+./test/float8/test_dtensor.sh
+python test/float8/test_fsdp2/test_fsdp2.py
+fi
+
+echo "all multi gpu tests successful"
diff --git a/test/float8/test_everything_single_gpu.sh b/test/float8/test_everything_single_gpu.sh
new file mode 100755
index 0000000000..96f52ff0d0
--- /dev/null
+++ b/test/float8/test_everything_single_gpu.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+# terminate script on first error
+set -e
+
+pytest test/float8/test_base.py --verbose -s
+pytest test/float8/test_compile.py --verbose -s
+pytest test/float8/test_numerics_integration.py --verbose -s
+pytest test/float8/test_auto_filter.py --verbose -s
+pytest test/integration --verbose -s
+pytest test/dtypes/test_affine_quantized_float.py --verbose -s
+
+echo "all single gpu tests successful"

From 99e42f0ef04cbb214b77e6e6b29804b72ba01dad Mon Sep 17 00:00:00 2001
From: Daniel Vega-Myhre <danvm@meta.com>
Date: Fri, 18 Jul 2025 09:13:18 -0700
Subject: [PATCH 2/2] split workflows by hardware type

---
 .github/workflows/1xH100_tests.yml            | 53 +++++++++++++++++++
 ...at8_single_gpu_test.yml => 1xL4_tests.yml} | 11 ++--
 ...t8_multi_gpu_test.yml => 4xH100_tests.yml} |  4 +-
 test/float8/test_everything_single_gpu.sh     |  4 +-
 4 files changed, 60 insertions(+), 12 deletions(-)
 create mode 100644 .github/workflows/1xH100_tests.yml
 rename .github/workflows/{float8_single_gpu_test.yml => 1xL4_tests.yml} (75%)
 rename .github/workflows/{float8_multi_gpu_test.yml => 4xH100_tests.yml} (88%)

diff --git a/.github/workflows/1xH100_tests.yml b/.github/workflows/1xH100_tests.yml
new file mode 100644
index 0000000000..18f1ff9cd4
--- /dev/null
+++ b/.github/workflows/1xH100_tests.yml
@@ -0,0 +1,53 @@
+name: Run 1xH100 Tests
+
+on:
+  push:
+    branches:
+      - main
+      - 'gh/**'
+  pull_request:
+    branches:
+      - main
+      - 'gh/**'
+
+concurrency:
+  group: 1xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  cancel-in-progress: true
+
+env:
+  HF_TOKEN: ${{ secrets.HF_TOKEN }}
+
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: H100
+            runs-on: linux.aws.h100
+            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
+            gpu-arch-type: "cuda"
+            gpu-arch-version: "12.4"  # NOTE(review): torch-spec pulls cu126 wheels; confirm "12.4" (not "12.6") is intended
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      timeout: 60
+      runner: ${{ matrix.runs-on }}
+      gpu-arch-type: ${{ matrix.gpu-arch-type }}
+      gpu-arch-version: ${{ matrix.gpu-arch-version }}
+      submodules: recursive
+      script: |
+        conda create -n venv python=3.9 -y
+        conda activate venv
+        export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
+        python -m pip install --upgrade pip
+        pip install uv
+        pip install ${{ matrix.torch-spec }}
+        uv pip install -r dev-requirements.txt
+        uv pip install vllm
+        pip install .
+        pytest test/integration --verbose -s
+        pytest test/dtypes/test_affine_quantized_float.py --verbose -s
+        ./test/float8/test_everything_single_gpu.sh
diff --git a/.github/workflows/float8_single_gpu_test.yml b/.github/workflows/1xL4_tests.yml
similarity index 75%
rename from .github/workflows/float8_single_gpu_test.yml
rename to .github/workflows/1xL4_tests.yml
index a6b76abf6c..cf4bf22423 100644
--- a/.github/workflows/float8_single_gpu_test.yml
+++ b/.github/workflows/1xL4_tests.yml
@@ -1,4 +1,4 @@
-name: Run Float8 Single-GPU Tests
+name: Run 1xL4 Tests
 
 on:
   push:
@@ -11,7 +11,7 @@ on:
       - 'gh/**'
 
 concurrency:
-  group: float8_single_gpu_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  group: 1xL4_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
 env:
@@ -28,11 +28,6 @@ jobs:
             torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu126'
             gpu-arch-type: "cuda"
             gpu-arch-version: "12.6"
-          - name: H100
-            runs-on: linux.aws.h100
-            torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu126'
-            gpu-arch-type: "cuda"
-            gpu-arch-version: "12.4"
     permissions:
       id-token: write
       contents: read
@@ -53,4 +48,6 @@ jobs:
         uv pip install -r dev-requirements.txt
         uv pip install vllm
         pip install .
+        pytest test/integration --verbose -s
+        pytest test/dtypes/test_affine_quantized_float.py --verbose -s
         ./test/float8/test_everything_single_gpu.sh
diff --git a/.github/workflows/float8_multi_gpu_test.yml b/.github/workflows/4xH100_tests.yml
similarity index 88%
rename from .github/workflows/float8_multi_gpu_test.yml
rename to .github/workflows/4xH100_tests.yml
index 6b73812858..21e82ca845 100644
--- a/.github/workflows/float8_multi_gpu_test.yml
+++ b/.github/workflows/4xH100_tests.yml
@@ -1,4 +1,4 @@
-name: Run Float8 Multi-GPU Tests
+name: Run 4xH100 Tests
 
 on:
   push:
@@ -11,7 +11,7 @@ on:
       - 'gh/**'
 
 concurrency:
-  group: float8_multi_gpu_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
+  group: 4xH100_tests-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
   cancel-in-progress: true
 
 env:
diff --git a/test/float8/test_everything_single_gpu.sh b/test/float8/test_everything_single_gpu.sh
index 96f52ff0d0..0b72951126 100755
--- a/test/float8/test_everything_single_gpu.sh
+++ b/test/float8/test_everything_single_gpu.sh
@@ -12,7 +12,5 @@ pytest test/float8/test_base.py --verbose -s
 pytest test/float8/test_compile.py --verbose -s
 pytest test/float8/test_numerics_integration.py --verbose -s
 pytest test/float8/test_auto_filter.py --verbose -s
-pytest test/integration --verbose -s
-pytest test/dtypes/test_affine_quantized_float.py --verbose -s
 
-echo "all single gpu tests successful"
+echo "all float8 single gpu tests successful"