diff --git a/buildspec-dlc-cpu-tests.yml b/buildspec-dlc-cpu-tests.yml index 7bf062de..9f3f596f 100644 --- a/buildspec-dlc-cpu-tests.yml +++ b/buildspec-dlc-cpu-tests.yml @@ -2,7 +2,7 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '2.7.1' + FRAMEWORK_VERSION: '2.9.1' CPU_INSTANCE_TYPE: 'ml.c4.xlarge' ECR_REPO: 'sagemaker-test' diff --git a/buildspec-dlc-gpu-tests.yml b/buildspec-dlc-gpu-tests.yml index 6266877e..3ad2cf65 100644 --- a/buildspec-dlc-gpu-tests.yml +++ b/buildspec-dlc-gpu-tests.yml @@ -2,7 +2,7 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '2.7.1' + FRAMEWORK_VERSION: '2.9.1' GPU_INSTANCE_TYPE: 'ml.p3.2xlarge' ECR_REPO: 'sagemaker-test' GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit' diff --git a/buildspec-gen-cpu-tests.yml b/buildspec-gen-cpu-tests.yml index f1f88b3e..4433deb4 100644 --- a/buildspec-gen-cpu-tests.yml +++ b/buildspec-gen-cpu-tests.yml @@ -2,7 +2,7 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '2.7.1' + FRAMEWORK_VERSION: '2.9.1' CPU_INSTANCE_TYPE: 'ml.c4.xlarge' ECR_REPO: 'sagemaker-test' diff --git a/buildspec-gen-gpu-tests.yml b/buildspec-gen-gpu-tests.yml index 441dd269..e9cd04ff 100644 --- a/buildspec-gen-gpu-tests.yml +++ b/buildspec-gen-gpu-tests.yml @@ -2,7 +2,7 @@ version: 0.2 env: variables: - FRAMEWORK_VERSION: '2.7.1' + FRAMEWORK_VERSION: '2.9.1' GPU_INSTANCE_TYPE: 'ml.p3.16xlarge' ECR_REPO: 'sagemaker-test' GITHUB_REPO: 'sagemaker-tensorflow-training-toolkit' diff --git a/setup.py b/setup.py index 439cfbc5..279e7ad9 100644 --- a/setup.py +++ b/setup.py @@ -38,13 +38,13 @@ def read_version(): "pytest-rerunfailures", "mock", "sagemaker[local]>=2", - "tensorflow<2.4", + "tensorflow>=2.9", "docker-compose", - "boto3==1.16.34", + "boto3", "python-dateutil>=2.1,<2.8.1", - "botocore==1.19.34", + "botocore", "requests-mock", - "awscli==1.18.194", + "awscli", "protobuf>=3.20,<3.21" ] diff --git a/src/sagemaker_tensorflow_container/training.py b/src/sagemaker_tensorflow_container/training.py index 
090d6d66..4d1965f4 100644 --- a/src/sagemaker_tensorflow_container/training.py +++ b/src/sagemaker_tensorflow_container/training.py @@ -103,7 +103,7 @@ def _build_tf_config_for_mwms(hosts, current_host): """ workers = hosts - def host_addresses(hosts, port=8890): + def host_addresses(hosts, port=2222): return ["{}:{}".format(host, port) for host in hosts] tf_config = {"cluster": {}, "environment": "cloud"} diff --git a/test/container/2.9.1/Dockerfile.dlc.cpu b/test/container/2.9.1/Dockerfile.dlc.cpu new file mode 100644 index 00000000..855e2458 --- /dev/null +++ b/test/container/2.9.1/Dockerfile.dlc.cpu @@ -0,0 +1,6 @@ +ARG region +FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.9.1-cpu-py39-ubuntu20.04-sagemaker + +COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz +RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ + rm /sagemaker_tensorflow_training.tar.gz diff --git a/test/container/2.9.1/Dockerfile.dlc.gpu b/test/container/2.9.1/Dockerfile.dlc.gpu new file mode 100644 index 00000000..b468d9f5 --- /dev/null +++ b/test/container/2.9.1/Dockerfile.dlc.gpu @@ -0,0 +1,6 @@ +ARG region +FROM 763104351884.dkr.ecr.$region.amazonaws.com/tensorflow-training:2.9.1-gpu-py39-cu112-ubuntu20.04-sagemaker + +COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz +RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ + rm /sagemaker_tensorflow_training.tar.gz diff --git a/test/container/2.9.1/Dockerfile.tf.cpu b/test/container/2.9.1/Dockerfile.tf.cpu new file mode 100644 index 00000000..b18e3cf5 --- /dev/null +++ b/test/container/2.9.1/Dockerfile.tf.cpu @@ -0,0 +1,9 @@ +FROM tensorflow/tensorflow:2.9.1 + +ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main + +COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz +RUN pip install --upgrade --no-cache-dir 
/sagemaker_tensorflow_training.tar.gz && \ + rm /sagemaker_tensorflow_training.tar.gz +RUN pip install --no-cache-dir tensorflow-io +RUN apt-get update && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd \ No newline at end of file diff --git a/test/container/2.9.1/Dockerfile.tf.gpu b/test/container/2.9.1/Dockerfile.tf.gpu new file mode 100644 index 00000000..3adb62f0 --- /dev/null +++ b/test/container/2.9.1/Dockerfile.tf.gpu @@ -0,0 +1,13 @@ +FROM tensorflow/tensorflow:2.9.1-gpu + +ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main + +COPY dist/sagemaker_tensorflow_training-*.tar.gz /sagemaker_tensorflow_training.tar.gz +RUN pip install --upgrade --no-cache-dir /sagemaker_tensorflow_training.tar.gz && \ + rm /sagemaker_tensorflow_training.tar.gz +RUN pip install --no-cache-dir tensorflow-io +RUN apt-key del 7fa2af80 \ + && rm /etc/apt/sources.list.d/nvidia-ml.list \ + && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub \ + && apt-get update \ + && apt-get install -y --no-install-recommends openssh-server && mkdir -p /var/run/sshd diff --git a/test/integration/sagemaker/test_multi_worker_mirrored.py b/test/integration/sagemaker/test_multi_worker_mirrored.py index eeb980fb..0472fe32 100644 --- a/test/integration/sagemaker/test_multi_worker_mirrored.py +++ b/test/integration/sagemaker/test_multi_worker_mirrored.py @@ -14,6 +14,7 @@ import os +import pytest from sagemaker.tensorflow import TensorFlow from sagemaker.utils import unique_name_from_base @@ -21,7 +22,9 @@ RESOURCE_PATH = os.path.join(os.path.dirname(__file__), "..", "..", "resources") -def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys): +def test_keras_example( + sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys +): estimator = TensorFlow( entry_point=os.path.join(RESOURCE_PATH, "multi_worker_mirrored", 
"train_dummy.py"), role="SageMakerRole", @@ -40,3 +43,62 @@ def test_multi_node(sagemaker_session, instance_type, image_uri, tmpdir, framewo logs = captured.out + captured.err assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs assert "TF_CONFIG=" in logs + + +@pytest.mark.skip_cpu +def test_tf_model_garden( + sagemaker_session, instance_type, image_uri, tmpdir, framework_version, capsys +): + epochs = 1 + global_batch_size = 64 + train_steps = int(10**5 * epochs / global_batch_size) + steps_per_loop = train_steps // 100 + overrides = ( + f"runtime.enable_xla=False," + f"runtime.num_gpus=1," + f"runtime.distribution_strategy=multi_worker_mirrored," + f"runtime.mixed_precision_dtype=float16," + f"task.train_data.global_batch_size={global_batch_size}," + f"task.train_data.input_path=/opt/ml/input/data/training/train-000*," + f"task.train_data.cache=True," + f"trainer.train_steps={train_steps}," + f"trainer.steps_per_loop={steps_per_loop}," + f"trainer.summary_interval={steps_per_loop}," + f"trainer.checkpoint_interval={train_steps}," + f"task.model.backbone.type=resnet," + f"task.model.backbone.resnet.model_id=50" + ) + estimator = TensorFlow( + git_config={ + "repo": "https://github.com/tensorflow/models.git", + "branch": "v2.9.2", + }, + source_dir=".", + entry_point="official/vision/train.py", + model_dir=False, + instance_type=instance_type, + instance_count=2, + image_uri=image_uri, + hyperparameters={ + "sagemaker_multi_worker_mirrored_strategy_enabled": True, + "experiment": "resnet_imagenet", + "config_file": "official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml", + "mode": "train", + "model_dir": "/opt/ml/model", + "params_override": overrides, + }, + environment={ + 'NCCL_DEBUG': 'INFO', + }, + max_run=60 * 60 * 12, # 12 hours + role="SageMakerRole", + volume_size=400, + ) + estimator.fit( + inputs="s3://collection-of-ml-datasets/Imagenet/TFRecords/train", 
job_name=unique_name_from_base("test-tf-mwms"), + ) + captured = capsys.readouterr() + logs = captured.out + captured.err + assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs + assert "TF_CONFIG=" in logs diff --git a/test/resources/multi_worker_mirrored/train_dummy.py b/test/resources/multi_worker_mirrored/train_dummy.py index 7552e019..c347a761 100644 --- a/test/resources/multi_worker_mirrored/train_dummy.py +++ b/test/resources/multi_worker_mirrored/train_dummy.py @@ -45,4 +45,4 @@ def build_and_compile_cnn_model(): # Model building/compiling need to be within `strategy.scope()`. multi_worker_model = build_and_compile_cnn_model() -multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70) +multi_worker_model.fit(multi_worker_dataset, epochs=3, steps_per_epoch=70, verbose=2) diff --git a/test/unit/test_training.py b/test/unit/test_training.py index d955a6da..68b72757 100644 --- a/test/unit/test_training.py +++ b/test/unit/test_training.py @@ -35,7 +35,7 @@ "worker": ["{}:2222".format(HOST2)], "ps": ["{}:2223".format(HOST1), "{}:2223".format(HOST2)], } -CLUSTER_WITH_MWMS = {"worker": ["{}:8890".format(HOST) for HOST in HOST_LIST]} +CLUSTER_WITH_MWMS = {"worker": ["{}:2222".format(HOST) for HOST in HOST_LIST]} MASTER_TASK = {"index": 0, "type": "master"} WORKER_TASK = {"index": 0, "type": "worker"}