From 1b64e74b1f3935d74f394e7f8b3ada91a86fa5c3 Mon Sep 17 00:00:00 2001
From: Omkar Salpekar
Date: Thu, 20 Oct 2022 14:37:30 -0400
Subject: [PATCH 1/4] [Nova] GHA Linux GPU Job

---
 .github/workflows/test-linux-cpu.yml |  4 +-
 .github/workflows/test-linux-gpu.yml | 60 ++++++++++++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/test-linux-gpu.yml

diff --git a/.github/workflows/test-linux-cpu.yml b/.github/workflows/test-linux-cpu.yml
index f78dd323da7..234ad97f4d6 100644
--- a/.github/workflows/test-linux-cpu.yml
+++ b/.github/workflows/test-linux-cpu.yml
@@ -16,7 +16,7 @@ jobs:
   tests:
     strategy:
       matrix:
-        py_vers: ["3.7", "3.8", "3.9", "3.10"]
+        python_version: ["3.7", "3.8", "3.9", "3.10"]
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
@@ -27,7 +27,7 @@ jobs:
         git config --global --add safe.directory /__w/vision/vision
 
         # Set up Environment Variables
-        export PYTHON_VERSION="${{ matrix.py_vers }}"
+        export PYTHON_VERSION="${{ matrix.python_version }}"
         export VERSION="cpu"
         export CUDATOOLKIT="cpuonly"
 
diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml
new file mode 100644
index 00000000000..8e6f562715e
--- /dev/null
+++ b/.github/workflows/test-linux-gpu.yml
@@ -0,0 +1,60 @@
+name: Unit-tests on Linux GPU
+
+on:
+  pull_request:
+  push:
+    branches:
+      - nightly
+      - main
+      - release/*
+  workflow_dispatch:
+
+env:
+  CHANNEL: "nightly"
+
+jobs:
+  tests:
+    strategy:
+      matrix:
+        python_version: ["3.7", "3.8", "3.9", "3.10"]
+        cuda_arch_version: ["11.6", "11.7"]
+      fail-fast: false
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.4xlarge.nvidia.gpu
+      repository: pytorch/vision
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda_arch_version }}
+      script: |
+        # Mark Build Directory Safe
+        git config --global --add safe.directory /__w/vision/vision
+
+        # Set up Environment Variables
+        export PYTHON_VERSION="${{ matrix.python_version }}"
+        export VERSION="${{ matrix.cuda_arch_version }}"
+        export CUDATOOLKIT="pytorch-cuda=${VERSION}"
+
+        # Set CHANNEL
+        if [[ (${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then
+          export CHANNEL=test
+        else
+          export CHANNEL=nightly
+        fi
+
+        # Create Conda Env
+        conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy
+        conda activate /work/ci_env
+
+        # Install PyTorch, Torchvision, and testing libraries
+        set -ex
+        conda install \
+          --yes \
+          -c "pytorch-${CHANNEL}" \
+          -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \
+          "${CUDATOOLKIT}"
+        python3 setup.py develop
+        python3 -m pip install pytest pytest-mock 'av<10'
+
+        # Run Tests
+        python3 -m torch.utils.collect_env
+        python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20

From 667f7b9e480f35469faf61c41c3473e6d203e839 Mon Sep 17 00:00:00 2001
From: Omkar Salpekar
Date: Thu, 20 Oct 2022 15:20:48 -0400
Subject: [PATCH 2/4] increase timeout since jobs timeout and cancel after 30 mins

---
 .github/workflows/test-linux-gpu.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml
index 8e6f562715e..dc453ca2ee2 100644
--- a/.github/workflows/test-linux-gpu.yml
+++ b/.github/workflows/test-linux-gpu.yml
@@ -25,6 +25,7 @@ jobs:
       repository: pytorch/vision
       gpu-arch-type: cuda
       gpu-arch-version: ${{ matrix.cuda_arch_version }}
+      timeout: 60
       script: |
         # Mark Build Directory Safe
         git config --global --add safe.directory /__w/vision/vision

From 31315f12c55f537c00a73a9773a276f9dd587289 Mon Sep 17 00:00:00 2001
From: Omkar Salpekar
Date: Fri, 21 Oct 2022 12:59:20 -0400
Subject: [PATCH 3/4] bigger instance and longer timeout

---
 .github/workflows/test-linux-gpu.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml
index dc453ca2ee2..bc97a07c969 100644
--- a/.github/workflows/test-linux-gpu.yml
+++ b/.github/workflows/test-linux-gpu.yml
@@ -21,11 +21,11 @@ jobs:
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
-      runner: linux.4xlarge.nvidia.gpu
+      runner: linux.8xlarge.nvidia.gpu
       repository: pytorch/vision
       gpu-arch-type: cuda
       gpu-arch-version: ${{ matrix.cuda_arch_version }}
-      timeout: 60
+      timeout: 120
       script: |
         # Mark Build Directory Safe
         git config --global --add safe.directory /__w/vision/vision

From b907257f18fd3c1b4c289205404ff87943de02c3 Mon Sep 17 00:00:00 2001
From: Omkar Salpekar
Date: Fri, 21 Oct 2022 15:08:45 -0400
Subject: [PATCH 4/4] use instance with more gpu memory and only run py38 and cu116 on PR CI

---
 .github/workflows/test-linux-gpu.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml
index bc97a07c969..a4d938f23ed 100644
--- a/.github/workflows/test-linux-gpu.yml
+++ b/.github/workflows/test-linux-gpu.yml
@@ -16,12 +16,12 @@ jobs:
   tests:
     strategy:
       matrix:
-        python_version: ["3.7", "3.8", "3.9", "3.10"]
-        cuda_arch_version: ["11.6", "11.7"]
+        python_version: ["3.8"]
+        cuda_arch_version: ["11.6"]
       fail-fast: false
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     with:
-      runner: linux.8xlarge.nvidia.gpu
+      runner: linux.g5.4xlarge.nvidia.gpu
       repository: pytorch/vision
       gpu-arch-type: cuda
       gpu-arch-version: ${{ matrix.cuda_arch_version }}