From b93a21ca3c80ac36c0aa1d143fc718391b11cc0a Mon Sep 17 00:00:00 2001 From: Priya Kasimbeg Date: Wed, 26 Feb 2025 01:08:14 +0000 Subject: [PATCH] switch to mlcommons-algoperf gcp project and artifact repo --- .github/workflows/regression_tests.yml | 32 +++---- .../workflows/regression_tests_variants.yml | 85 ------------------- docker/build_docker_images.sh | 9 +- docker/scripts/cloud-init.cfg | 4 +- docker/scripts/startup.sh | 18 +++- docs/CONTRIBUTING.md | 4 +- scoring/run_workloads.py | 2 +- 7 files changed, 41 insertions(+), 113 deletions(-) delete mode 100644 .github/workflows/regression_tests_variants.yml diff --git a/.github/workflows/regression_tests.yml b/.github/workflows/regression_tests.yml index cb8595f58..172cf6102 100644 --- a/.github/workflows/regression_tests.yml +++ b/.github/workflows/regression_tests.yml @@ -44,7 +44,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d fastmri -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w fastmri -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d fastmri -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w fastmri -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs imagenet_resnet_jax: runs-on: self-hosted needs: build_and_push_jax_docker_image @@ -53,7 +53,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d imagenet -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w imagenet_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d imagenet -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w imagenet_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs imagenet_vit_jax: runs-on: self-hosted needs: build_and_push_jax_docker_image @@ -62,7 +62,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d imagenet -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w imagenet_vit -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d imagenet -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w imagenet_vit -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs ogbg_jax: runs-on: self-hosted needs: build_and_push_jax_docker_image @@ -71,7 +71,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d ogbg -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w ogbg -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d ogbg -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w ogbg -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs criteo_jax: runs-on: self-hosted needs: build_and_push_jax_docker_image @@ -80,7 +80,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w criteo1tb -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w criteo1tb -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs librispeech_conformer_jax: runs-on: self-hosted needs: build_and_push_jax_docker_image @@ -89,7 +89,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d librispeech -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w librispeech_conformer -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d librispeech -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w librispeech_conformer -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs librispeech_deepspeech_jax: runs-on: self-hosted needs: build_and_push_jax_docker_image @@ -98,7 +98,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d librispeech -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w librispeech_deepspeech -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d librispeech -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w librispeech_deepspeech -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs wmt_jax: runs-on: self-hosted needs: build_and_push_jax_docker_image @@ -107,7 +107,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d wmt -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w wmt -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d wmt -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w wmt -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs fastmri_pytorch: runs-on: self-hosted needs: build_and_push_pytorch_docker_image @@ -116,7 +116,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d fastmri -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w fastmri -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d fastmri -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w fastmri -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs imagenet_resnet_pytorch: runs-on: self-hosted needs: build_and_push_pytorch_docker_image @@ -125,7 +125,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d imagenet -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w imagenet_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d imagenet -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w imagenet_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs imagenet_vit_pytorch: runs-on: self-hosted needs: build_and_push_pytorch_docker_image @@ -134,7 +134,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d imagenet -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w imagenet_vit -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d imagenet -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w imagenet_vit -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs ogbg_pytorch: runs-on: self-hosted needs: build_and_push_pytorch_docker_image @@ -143,7 +143,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d ogbg -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w ogbg -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d ogbg -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w ogbg -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs criteo_pytorch: runs-on: self-hosted needs: build_and_push_pytorch_docker_image @@ -152,7 +152,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs exit $? librispeech_conformer_pytorch: runs-on: self-hosted @@ -162,7 +162,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d librispeech -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w librispeech_conformer -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d librispeech -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w librispeech_conformer -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs librispeech_deepspeech_pytorch: runs-on: self-hosted needs: build_and_push_pytorch_docker_image @@ -171,7 +171,7 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d librispeech -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w librispeech_deepspeech -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d librispeech -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w librispeech_deepspeech -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs wmt_pytorch: runs-on: self-hosted needs: build_and_push_pytorch_docker_image @@ -180,4 +180,4 @@ jobs: - name: Run containerized workload run: | docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d wmt -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w wmt -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false + docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d wmt -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w wmt -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false --data_bucket mlcommons-data --logs_bucket mlcommons-runs --data_bucket mlcommons-data --logs_bucket mlcommons-runs diff --git a/.github/workflows/regression_tests_variants.yml b/.github/workflows/regression_tests_variants.yml deleted file mode 100644 index b234575b7..000000000 --- a/.github/workflows/regression_tests_variants.yml +++ /dev/null @@ -1,85 +0,0 @@ -name: Containerized Regression Tests for Workload Variants - -on: - pull_request: - branches: - - 'main' - -jobs: - build_and_push_jax_docker_image: - runs-on: self-hosted - steps: - - uses: actions/checkout@v2 - - name: Build and push docker images - run: | - GIT_BRANCH=${{ github.head_ref || github.ref_name }} - FRAMEWORK=jax - IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}" - cd $HOME/algorithmic-efficiency/docker - docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH - BUILD_RETURN=$? - if [[ ${BUILD_RETURN} != 0 ]]; then exit ${BUILD_RETURN}; fi - docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME - docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME - build_and_push_pytorch_docker_image: - runs-on: self-hosted - steps: - - uses: actions/checkout@v2 - - name: Build and push docker images - run: | - GIT_BRANCH=${{ github.head_ref || github.ref_name }} - FRAMEWORK=pytorch - IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}" - cd $HOME/algorithmic-efficiency/docker - docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH - BUILD_RETURN=$? - if [[ ${BUILD_RETURN} != 0 ]]; then exit ${BUILD_RETURN}; fi - docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME - docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME - criteo_layernorm_jax: - runs-on: self-hosted - needs: build_and_push_jax_docker_image - steps: - - uses: actions/checkout@v2 - - name: Run containerized workload - run: | - docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w criteo1tb_layernorm -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false - criteo_resnet_jax: - runs-on: self-hosted - needs: build_and_push_jax_docker_image - steps: - - uses: actions/checkout@v2 - - name: Run containerized workload - run: | - docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_${{ github.head_ref || github.ref_name }} -d criteo1tb -f jax -s reference_algorithms/paper_baselines/adamw/jax/submission.py -w criteo1tb_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false - criteo_layernorm_pytorch: - runs-on: self-hosted - needs: build_and_push_pytorch_docker_image - steps: - - uses: actions/checkout@v2 - - name: Run containerized workload - run: | - docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_layernorm -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false - criteo_resnet_pytorch: - runs-on: self-hosted - needs: build_and_push_pytorch_docker_image - steps: - - uses: actions/checkout@v2 - - name: Run containerized workload - run: | - docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_resnet -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false - criteo_embed_init_pytorch: - runs-on: self-hosted - needs: build_and_push_pytorch_docker_image - steps: - - uses: actions/checkout@v2 - - name: Run containerized workload - run: | - docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} - docker run -v $HOME/data/:/data/ -v $HOME/experiment_runs/:/experiment_runs -v $HOME/experiment_runs/logs:/logs --gpus all --ipc=host us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_pytorch_${{ github.head_ref || github.ref_name }} -d criteo1tb -f pytorch -s reference_algorithms/paper_baselines/adamw/pytorch/submission.py -w criteo1tb_embed_init -t reference_algorithms/paper_baselines/adamw/tuning_search_space.json -e tests/regression_tests/adamw -m 10 -c False -o True -r false - - diff --git a/docker/build_docker_images.sh b/docker/build_docker_images.sh index 9e0e68ca9..ff0c6dc21 100644 --- a/docker/build_docker_images.sh +++ b/docker/build_docker_images.sh @@ -13,6 +13,9 @@ do esac done +# Artifact repostiory +ARTIFACT_REPO="europe-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo" + if [[ -z ${GIT_BRANCH+x} ]] then GIT_BRANCH='main' # Set default argument @@ -22,9 +25,9 @@ for FRAMEWORK in "jax" "pytorch" "both" do IMAGE_NAME="algoperf_${FRAMEWORK}_${GIT_BRANCH}" DOCKER_BUILD_COMMAND="docker build --no-cache -t $IMAGE_NAME . --build-arg framework=$FRAMEWORK --build-arg branch=$GIT_BRANCH" - DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME" - DOCKER_PUSH_COMMAND="docker push us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME" - DOCKER_PULL_COMMAND="docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/$IMAGE_NAME" + DOCKER_TAG_COMMAND="docker tag $IMAGE_NAME $ARTIFACT_REPO/$IMAGE_NAME" + DOCKER_PUSH_COMMAND="docker push $ARTIFACT_REPO/$IMAGE_NAME" + DOCKER_PULL_COMMAND="docker pull $ARTIFACT_REPO/$IMAGE_NAME" echo "On branch: ${GIT_BRANCH}" echo $DOCKER_BUILD_COMMAND diff --git a/docker/scripts/cloud-init.cfg b/docker/scripts/cloud-init.cfg index 78ea2e97d..6010463f6 100644 --- a/docker/scripts/cloud-init.cfg +++ b/docker/scripts/cloud-init.cfg @@ -40,8 +40,8 @@ write_files: ExecStartPre=mount --bind /var/lib/nvidia /var/lib/nvidia ExecStartPre=mount -o remountexec /var/lib/nvidia ExecStartPre=/usr/bin/docker-credential-gcr configure-docker --registries us-central1-docker.pkg.dev - ExecStartPre=/usr/bin/docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest - ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/base_image:latest -b true + ExecStartPre=/usr/bin/docker pull europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/base_image:latest + ExecStart=/usr/bin/docker run --rm --name=mlcommons --volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 --volume /var/lib/nvidia/bin:/usr/local/nvidia/bin --device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 --device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 --device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 --device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 --device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/base_image:latest -b true StandardOutput=journal+console StandardError=journal+console diff --git a/docker/scripts/startup.sh b/docker/scripts/startup.sh index 1dbba9565..39a58d7ad 100644 --- a/docker/scripts/startup.sh +++ b/docker/scripts/startup.sh @@ -51,6 +51,8 @@ RSYNC_DATA="true" OVERWRITE="false" SAVE_CHECKPOINTS="true" TUNING_RULESET="external" +ROOT_DATA_BUCKET="algoperf-data" +LOGS_BUCKET="algoperf-runs" # Pass flag while [ "$1" != "" ]; do @@ -136,6 +138,14 @@ while [ "$1" != "" ]; do shift ADDITIONAL_REQUIREMENTS_PATH=$1 ;; + --data_bucket) + shift + ROOT_DATA_BUCKET=$1 + ;; + --logs_bucket) + shift + LOGS_BUCKET=$1 + ;; *) usage exit 1 @@ -179,11 +189,11 @@ VALID_WORKLOADS=("criteo1tb" "imagenet_resnet" "imagenet_resnet_silu" "imagenet_ VALID_RULESETS=("self" "external") # Set data and experiment paths -ROOT_DATA_BUCKET="gs://mlcommons-data" ROOT_DATA_DIR="${HOME_DIR}/data" +ROOT_DATA_BUCKET="gs://${ROOT_DATA_BUCKET}" -EXPERIMENT_BUCKET="gs://mlcommons-runs" EXPERIMENT_DIR="${HOME_DIR}/experiment_runs" +EXPERIMENT_LOGS_BUCKET="gs://${LOGS_BUCKET}" if [[ -n ${DATASET+x} ]]; then if [[ ! " ${VALID_DATASETS[@]} " =~ " $DATASET " ]]; then @@ -313,8 +323,8 @@ if [[ ! -z ${SUBMISSION_PATH+x} ]]; then RETURN_CODE=$? if [[ $INTERNAL_CONTRIBUTOR_MODE == "true" ]]; then - /google-cloud-sdk/bin/gsutil -m cp -r ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK} ${EXPERIMENT_BUCKET}/${EXPERIMENT_NAME}/ - /google-cloud-sdk/bin/gsutil -m cp ${LOG_FILE} ${EXPERIMENT_BUCKET}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK}/ + /google-cloud-sdk/bin/gsutil -m cp -r ${EXPERIMENT_DIR}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK} ${EXPERIMENT_LOGS_BUCKET}/${EXPERIMENT_NAME}/ + /google-cloud-sdk/bin/gsutil -m cp ${LOG_FILE} ${EXPERIMENT_LOGS_BUCKET}/${EXPERIMENT_NAME}/${WORKLOAD}_${FRAMEWORK}/ fi fi diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index c98a5009e..7778030dc 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -88,7 +88,7 @@ gcloud auth configure-docker $ARTIFACT_REGISTRY_URL To pull the latest prebuilt image: ```bash -docker pull us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/ +docker pull europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/ ``` The naming convention for `image_name` is `algoperf__`. @@ -102,7 +102,7 @@ Currently maintained images on the repository are: - `algoperf_both_dev` To reference the pulled image you will have to use the full `image_path`, e.g. -`us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_main`. +`europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_jax_main`. ### Trigger Rebuild and Push of Maintained Images diff --git a/scoring/run_workloads.py b/scoring/run_workloads.py index 99c6e810e..683fb3c63 100644 --- a/scoring/run_workloads.py +++ b/scoring/run_workloads.py @@ -26,7 +26,7 @@ flags.DEFINE_string( 'docker_image_url', - 'us-central1-docker.pkg.dev/training-algorithms-external/mlcommons-docker-repo/algoperf_jax_dev', + 'europe-west4-docker.pkg.dev/mlcommons-algoperf/algoperf-docker-repo/algoperf_jax_dev', 'URL to docker image') flags.DEFINE_integer( 'run_percentage',