From 79ed242a3be5abb7f5e6771398b071ba81ce6b6b Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Tue, 27 Apr 2021 12:44:21 -0700 Subject: [PATCH 01/10] update pillow --- src/config/build_config.py | 4 +- tensorflow/buildspec.yml | 43 +++++++++++++++---- .../training/docker/2.3/py3/Dockerfile.cpu | 2 +- .../docker/2.3/py3/cu102/Dockerfile.gpu | 2 +- 4 files changed, 38 insertions(+), 13 deletions(-) diff --git a/src/config/build_config.py b/src/config/build_config.py index 0b7e2581a102..c9e90df52c36 100644 --- a/src/config/build_config.py +++ b/src/config/build_config.py @@ -5,9 +5,9 @@ # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR) ENABLE_NEURON_MODE = False # Frameworks for which you want to disable both builds and tests -DISABLE_FRAMEWORK_TESTS = [] +DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"] # Disable new builds or build without datetime tag -DISABLE_DATETIME_TAG = False +DISABLE_DATETIME_TAG = True # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True # before disabling new builds or tests will fail DISABLE_NEW_BUILDS = False diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml index 9cdc7d65ae38..f19c1682deab 100644 --- a/tensorflow/buildspec.yml +++ b/tensorflow/buildspec.yml @@ -1,8 +1,8 @@ account_id: &ACCOUNT_ID region: &REGION framework: &FRAMEWORK tensorflow -version: &VERSION 2.4.1 -short_version: &SHORT_VERSION 2.4 +version: &VERSION 2.3.2 +short_version: &SHORT_VERSION 2.3 repository_info: training_repository: &TRAINING_REPOSITORY @@ -23,9 +23,6 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py - deep_learning_container: - source: ../../src/deep_learning_container.py - target: deep_learning_container.py inference_context: &INFERENCE_CONTEXT sagemaker_package_name: source: docker/build_artifacts/sagemaker @@ -36,9 +33,6 @@ context: dockerd-entrypoint: 
source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py - deep_learning_container: - source: ../../src/deep_learning_container.py - target: deep_learning_container.py images: BuildTensorflowCpuPy37TrainingDockerImage: @@ -55,6 +49,37 @@ images: *DEVICE_TYPE ] context: <<: *TRAINING_CONTEXT + BuildTensorflowGpuPy37Cu102TrainingDockerImage: + <<: *TRAINING_REPOSITORY + build: &TENSORFLOW_GPU_TRAINING_PY3 false + image_size_baseline: &IMAGE_SIZE_BASELINE 7738 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py37 + cuda_version: &CUDA_VERSION cu102 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, + "-", *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, + /Dockerfile., *DEVICE_TYPE ] + context: + <<: *TRAINING_CONTEXT + BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage: + <<: *TRAINING_REPOSITORY + build: &TENSORFLOW_GPU_TRAINING_PY3 false + image_size_baseline: &IMAGE_SIZE_BASELINE 7738 + base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py37 + cuda_version: &CUDA_VERSION cu102 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, + "-", *OS_VERSION, "-example" ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example, + /Dockerfile., *DEVICE_TYPE ] + context: + <<: *TRAINING_CONTEXT BuildTensorflowGpuPy37Cu110TrainingDockerImage: <<: *TRAINING_REPOSITORY build: &TENSORFLOW_GPU_TRAINING_PY3 false @@ -105,7 +130,7 @@ images: device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu110 + cuda_version: &CUDA_VERSION 
cu102 os_version: &OS_VERSION ubuntu18.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] diff --git a/tensorflow/training/docker/2.3/py3/Dockerfile.cpu b/tensorflow/training/docker/2.3/py3/Dockerfile.cpu index cf9dbe147ec4..8c5206e9fcd2 100644 --- a/tensorflow/training/docker/2.3/py3/Dockerfile.cpu +++ b/tensorflow/training/docker/2.3/py3/Dockerfile.cpu @@ -125,7 +125,7 @@ RUN ${PIP} install --no-cache-dir -U \ scipy==1.5.2 \ scikit-learn==0.23 \ pandas==1.1 \ - Pillow==7.2.0 \ + Pillow \ python-dateutil==2.8.1 \ # install PyYAML>=5.4.1 to avoid conflict with latest awscli "pyYAML>=5.4.1,<5.5" \ diff --git a/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu b/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu index e739491dfdc9..10473d8b2656 100644 --- a/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu +++ b/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu @@ -174,7 +174,7 @@ RUN ${PIP} install --no-cache-dir -U \ scipy==1.5.2 \ scikit-learn==0.23 \ pandas==1.1 \ - Pillow==7.2.0 \ + Pillow \ python-dateutil==2.8.1 \ # install PyYAML>=5.4.1 to avoid conflict with latest awscli "pyYAML>=5.4.1,<5.5" \ From 5f2946fe0b6312dd8db95cb0427769c91e4ef5b1 Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Tue, 27 Apr 2021 14:39:52 -0700 Subject: [PATCH 02/10] update pillow --- tensorflow/training/docker/2.3/py3/Dockerfile.cpu | 2 +- tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/training/docker/2.3/py3/Dockerfile.cpu b/tensorflow/training/docker/2.3/py3/Dockerfile.cpu index 8c5206e9fcd2..340dc39d57b5 100644 --- a/tensorflow/training/docker/2.3/py3/Dockerfile.cpu +++ b/tensorflow/training/docker/2.3/py3/Dockerfile.cpu @@ -125,7 +125,7 @@ RUN ${PIP} install --no-cache-dir -U \ 
scipy==1.5.2 \ scikit-learn==0.23 \ pandas==1.1 \ - Pillow \ + Pillow==8.2.0 \ python-dateutil==2.8.1 \ # install PyYAML>=5.4.1 to avoid conflict with latest awscli "pyYAML>=5.4.1,<5.5" \ diff --git a/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu b/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu index 10473d8b2656..3ece0ccfd9f4 100644 --- a/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu +++ b/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu @@ -174,7 +174,7 @@ RUN ${PIP} install --no-cache-dir -U \ scipy==1.5.2 \ scikit-learn==0.23 \ pandas==1.1 \ - Pillow \ + Pillow==8.2.0 \ python-dateutil==2.8.1 \ # install PyYAML>=5.4.1 to avoid conflict with latest awscli "pyYAML>=5.4.1,<5.5" \ From 854602b869a0c5cbc6cd9990f967a1212c2494c0 Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Tue, 27 Apr 2021 14:53:51 -0700 Subject: [PATCH 03/10] revert --- src/config/build_config.py | 4 ++-- tensorflow/buildspec.yml | 43 ++++++++------------------------------ 2 files changed, 11 insertions(+), 36 deletions(-) diff --git a/src/config/build_config.py b/src/config/build_config.py index c9e90df52c36..0b7e2581a102 100644 --- a/src/config/build_config.py +++ b/src/config/build_config.py @@ -5,9 +5,9 @@ # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR) ENABLE_NEURON_MODE = False # Frameworks for which you want to disable both builds and tests -DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"] +DISABLE_FRAMEWORK_TESTS = [] # Disable new builds or build without datetime tag -DISABLE_DATETIME_TAG = True +DISABLE_DATETIME_TAG = False # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True # before disabling new builds or tests will fail DISABLE_NEW_BUILDS = False diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml index f19c1682deab..9cdc7d65ae38 100644 --- a/tensorflow/buildspec.yml +++ b/tensorflow/buildspec.yml @@ -1,8 +1,8 
@@ account_id: &ACCOUNT_ID region: &REGION framework: &FRAMEWORK tensorflow -version: &VERSION 2.3.2 -short_version: &SHORT_VERSION 2.3 +version: &VERSION 2.4.1 +short_version: &SHORT_VERSION 2.4 repository_info: training_repository: &TRAINING_REPOSITORY @@ -23,6 +23,9 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py inference_context: &INFERENCE_CONTEXT sagemaker_package_name: source: docker/build_artifacts/sagemaker @@ -33,6 +36,9 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py images: BuildTensorflowCpuPy37TrainingDockerImage: @@ -49,37 +55,6 @@ images: *DEVICE_TYPE ] context: <<: *TRAINING_CONTEXT - BuildTensorflowGpuPy37Cu102TrainingDockerImage: - <<: *TRAINING_REPOSITORY - build: &TENSORFLOW_GPU_TRAINING_PY3 false - image_size_baseline: &IMAGE_SIZE_BASELINE 7738 - device_type: &DEVICE_TYPE gpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu102 - os_version: &OS_VERSION ubuntu18.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, - "-", *OS_VERSION ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, - /Dockerfile., *DEVICE_TYPE ] - context: - <<: *TRAINING_CONTEXT - BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage: - <<: *TRAINING_REPOSITORY - build: &TENSORFLOW_GPU_TRAINING_PY3 false - image_size_baseline: &IMAGE_SIZE_BASELINE 7738 - base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage - device_type: &DEVICE_TYPE gpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION 
cu102 - os_version: &OS_VERSION ubuntu18.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, - "-", *OS_VERSION, "-example" ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example, - /Dockerfile., *DEVICE_TYPE ] - context: - <<: *TRAINING_CONTEXT BuildTensorflowGpuPy37Cu110TrainingDockerImage: <<: *TRAINING_REPOSITORY build: &TENSORFLOW_GPU_TRAINING_PY3 false @@ -130,7 +105,7 @@ images: device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu102 + cuda_version: &CUDA_VERSION cu110 os_version: &OS_VERSION ubuntu18.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] From 00c3cc5cf138a96e1dd91dc705a72319e0b8711b Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Tue, 27 Apr 2021 15:01:36 -0700 Subject: [PATCH 04/10] update pillow --- tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu b/tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu index fc708bd8ce49..fbd719df3e61 100644 --- a/tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu +++ b/tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu @@ -182,7 +182,7 @@ RUN ${PIP} install --no-cache-dir -U \ scipy==1.5.2 \ scikit-learn==0.23 \ pandas==1.1 \ - Pillow==7.2.0 \ + Pillow==8.2.0 \ python-dateutil==2.8.1 \ # install PyYAML>=5.4.1 to avoid conflict with latest awscli "pyYAML>=5.4.1,<5.5" \ From 4fe456295cbab56974692ac309cab6679aee7595 Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Tue, 27 Apr 2021 15:57:36 -0700 Subject: [PATCH 05/10] test cuda110 --- tensorflow/buildspec.yml | 66 ++-------------------------------------- 1 file changed, 3 
insertions(+), 63 deletions(-) diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml index 9cdc7d65ae38..63f979e73e52 100644 --- a/tensorflow/buildspec.yml +++ b/tensorflow/buildspec.yml @@ -1,8 +1,8 @@ account_id: &ACCOUNT_ID region: &REGION framework: &FRAMEWORK tensorflow -version: &VERSION 2.4.1 -short_version: &SHORT_VERSION 2.4 +version: &VERSION 2.3.2 +short_version: &SHORT_VERSION 2.3 repository_info: training_repository: &TRAINING_REPOSITORY @@ -23,9 +23,6 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py - deep_learning_container: - source: ../../src/deep_learning_container.py - target: deep_learning_container.py inference_context: &INFERENCE_CONTEXT sagemaker_package_name: source: docker/build_artifacts/sagemaker @@ -36,25 +33,8 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py - deep_learning_container: - source: ../../src/deep_learning_container.py - target: deep_learning_container.py images: - BuildTensorflowCpuPy37TrainingDockerImage: - <<: *TRAINING_REPOSITORY - build: &TENSORFLOW_CPU_TRAINING_PY3 false - image_size_baseline: &IMAGE_SIZE_BASELINE 4489 - device_type: &DEVICE_TYPE cpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - os_version: &OS_VERSION ubuntu18.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION - ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., - *DEVICE_TYPE ] - context: - <<: *TRAINING_CONTEXT BuildTensorflowGpuPy37Cu110TrainingDockerImage: <<: *TRAINING_REPOSITORY build: &TENSORFLOW_GPU_TRAINING_PY3 false @@ -70,44 +50,4 @@ images: /Dockerfile., *DEVICE_TYPE ] context: <<: *TRAINING_CONTEXT - BuildTensorflowExampleGpuPy37Cu110TrainingDockerImage: - <<: *TRAINING_REPOSITORY - build: &TENSORFLOW_GPU_TRAINING_PY3 false - image_size_baseline: &IMAGE_SIZE_BASELINE 7738 - 
base_image_name: BuildTensorflowGpuPy37Cu110TrainingDockerImage - device_type: &DEVICE_TYPE gpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu110 - os_version: &OS_VERSION ubuntu18.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, - "-", *OS_VERSION, "-example" ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example, - /Dockerfile., *DEVICE_TYPE ] - context: - <<: *TRAINING_CONTEXT - BuildTensorflowCPUInferencePy3DockerImage: - <<: *INFERENCE_REPOSITORY - build: &TENSORFLOW_CPU_INFERENCE_PY3 false - image_size_baseline: 4899 - device_type: &DEVICE_TYPE cpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - os_version: &OS_VERSION ubuntu18.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] - context: - <<: *INFERENCE_CONTEXT - BuildTensorflowGPUInferencePy3DockerImage: - <<: *INFERENCE_REPOSITORY - build: &TENSORFLOW_GPU_INFERENCE_PY3 false - image_size_baseline: 7738 - device_type: &DEVICE_TYPE gpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu110 - os_version: &OS_VERSION ubuntu18.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] - context: - <<: *INFERENCE_CONTEXT + From d36dd1350a43eadfd07a05454cfe70dacfed7bfb Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Tue, 27 Apr 2021 15:58:13 -0700 Subject: [PATCH 06/10] test cuda110 --- src/config/build_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/config/build_config.py 
b/src/config/build_config.py index 0b7e2581a102..c9e90df52c36 100644 --- a/src/config/build_config.py +++ b/src/config/build_config.py @@ -5,9 +5,9 @@ # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR) ENABLE_NEURON_MODE = False # Frameworks for which you want to disable both builds and tests -DISABLE_FRAMEWORK_TESTS = [] +DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"] # Disable new builds or build without datetime tag -DISABLE_DATETIME_TAG = False +DISABLE_DATETIME_TAG = True # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True # before disabling new builds or tests will fail DISABLE_NEW_BUILDS = False From 9f5d54f048df71bcb514c59c77e0aebeed86f4c1 Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Wed, 28 Apr 2021 17:38:45 -0700 Subject: [PATCH 07/10] use p3.16 in virginia region --- .../integration/sagemaker/test_mnist.py | 12 ++++++------ .../integration/sagemaker/test_smdataparallel.py | 8 ++++---- .../sagemaker/test_smmodelparallel.py | 16 ++++++++-------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py index 4a954d529337..fe371c9eeea7 100755 --- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py +++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py @@ -191,15 +191,15 @@ def test_smdebug(sagemaker_session, ecr_image, instance_type, framework_version) @pytest.mark.model("mnist") @pytest.mark.skip_cpu @pytest.mark.skip_py2_containers -def test_smdataparallel_smmodelparallel_mnist(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version): +def test_smdataparallel_smmodelparallel_mnist(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, 
framework_version): """ Tests SM Distributed DataParallel and ModelParallel single-node via script mode This test has been added for SM DataParallelism and ModelParallelism tests for re:invent. TODO: Consider reworking these tests after re:Invent releases are done """ instance_type = "ml.p3.16xlarge" - _, image_framework_version = get_framework_and_version_from_tag(ecr_image) - image_cuda_version = get_cuda_version_from_tag(ecr_image) + _, image_framework_version = get_framework_and_version_from_tag(n_virginia_ecr_image) + image_cuda_version = get_cuda_version_from_tag(n_virginia_ecr_image) if Version(image_framework_version) < Version("2.3.1") or image_cuda_version != "cu110": pytest.skip("SMD Model and Data Parallelism are only supported on CUDA 11, and on TensorFlow 2.3.1 or higher") smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel') @@ -209,12 +209,12 @@ def test_smdataparallel_smmodelparallel_mnist(sagemaker_session, instance_type, instance_count=1, instance_type=instance_type, source_dir=smmodelparallel_path, - sagemaker_session=sagemaker_session, - image_uri=ecr_image, + sagemaker_session=n_virginia_sagemaker_session, + image_uri=n_virginia_ecr_image, framework_version=framework_version, py_version='py3') - estimator = _disable_sm_profiler(sagemaker_session.boto_region_name, estimator) + estimator = _disable_sm_profiler(n_virginia_sagemaker_session.boto_region_name, estimator) estimator.fit() diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py index 404e9674f1c9..71c97acd110d 100644 --- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py +++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py @@ -58,12 +58,12 @@ def can_run_smdataparallel_efa(ecr_image): @pytest.mark.skip_cpu 
@pytest.mark.skip_py2_containers def test_distributed_training_smdataparallel_script_mode( - sagemaker_session, instance_type, ecr_image, tmpdir, framework_version + n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version ): """ Tests SMDataParallel single-node command via script mode """ - validate_or_skip_smdataparallel(ecr_image) + validate_or_skip_smdataparallel(n_virginia_ecr_image) instance_type = "ml.p3.16xlarge" distribution = {"smdistributed": {"dataparallel": {"enabled": True}}} estimator = TensorFlow( @@ -72,10 +72,10 @@ def test_distributed_training_smdataparallel_script_mode( role='SageMakerRole', instance_type=instance_type, instance_count=1, - image_uri=ecr_image, + image_uri=n_virginia_ecr_image, framework_version=framework_version, py_version='py3', - sagemaker_session=sagemaker_session, + sagemaker_session=n_virginia_sagemaker_session, distribution=distribution) estimator.fit(job_name=unique_name_from_base('test-tf-smdataparallel')) diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py index fe275168091e..f089684a50d6 100644 --- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py +++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py @@ -122,12 +122,12 @@ def test_smmodelparallel_multinode_efa(n_virginia_sagemaker_session, efa_instanc @pytest.mark.skip_cpu @pytest.mark.skip_py2_containers @pytest.mark.parametrize("test_script, num_processes", [("tf2_conv.py", 2), ("tf2_conv_xla.py", 2), ("smmodelparallel_hvd2_conv.py", 4), ("send_receive_checkpoint.py", 2), ("tf2_checkpoint_test.py", 2)]) -def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, test_script, num_processes): +def 
test_smmodelparallel(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version, test_script, num_processes): """ Tests SM Modelparallel in sagemaker """ instance_type = "ml.p3.16xlarge" - validate_or_skip_smmodelparallel(ecr_image) + validate_or_skip_smmodelparallel(n_virginia_ecr_image) smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel') estimator = TensorFlow(entry_point=test_script, role='SageMakerRole', @@ -141,8 +141,8 @@ def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, fr "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ", } }, - sagemaker_session=sagemaker_session, - image_uri=ecr_image, + sagemaker_session=n_virginia_sagemaker_session, + image_uri=n_virginia_ecr_image, framework_version=framework_version, py_version='py3', base_job_name='smp-test1') @@ -156,12 +156,12 @@ def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, fr @pytest.mark.skip_cpu @pytest.mark.skip_py2_containers @pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_hvd2_conv_multinode.py", 2)]) -def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, test_script, num_processes): +def test_smmodelparallel_multinode(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version, test_script, num_processes): """ Tests SM Modelparallel in sagemaker """ instance_type = "ml.p3.16xlarge" - validate_or_skip_smmodelparallel(ecr_image) + validate_or_skip_smmodelparallel(n_virginia_ecr_image) smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel') estimator = TensorFlow(entry_point=test_script, role='SageMakerRole', @@ -175,8 +175,8 @@ def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image, "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ", } }, - sagemaker_session=sagemaker_session, - 
image_uri=ecr_image, + sagemaker_session=n_virginia_sagemaker_session, + image_uri=n_virginia_ecr_image, framework_version=framework_version, py_version='py3', base_job_name='smp-test2') From 3afaa72ebf33bf45e01cc9f81ee68d90c098dc7c Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Wed, 28 Apr 2021 17:45:23 -0700 Subject: [PATCH 08/10] DISABLE_NEW_BUILDS true --- src/config/build_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/config/build_config.py b/src/config/build_config.py index c9e90df52c36..dde89d175ae2 100644 --- a/src/config/build_config.py +++ b/src/config/build_config.py @@ -10,4 +10,4 @@ DISABLE_DATETIME_TAG = True # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True # before disabling new builds or tests will fail -DISABLE_NEW_BUILDS = False +DISABLE_NEW_BUILDS = True From 8d45b835d111ccb2d6a51d91e720de3c376ba767 Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Wed, 28 Apr 2021 17:47:21 -0700 Subject: [PATCH 09/10] DISABLE_NEW_BUILDS true --- src/config/test_config.py | 10 +++++----- test/testrunner.py | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/config/test_config.py b/src/config/test_config.py index 3f8ce5f60f0b..931264220edc 100644 --- a/src/config/test_config.py +++ b/src/config/test_config.py @@ -5,11 +5,11 @@ # Disable the test codebuild jobs to be run # It is recommended to set DISABLE_EFA_TESTS to True to disable EFA tests if there is no change to EFA installer version or Frameworks. 
-DISABLE_EFA_TESTS = False +DISABLE_EFA_TESTS = True -DISABLE_SANITY_TESTS = False +DISABLE_SANITY_TESTS = True DISABLE_SAGEMAKER_TESTS = False -DISABLE_ECS_TESTS = False -DISABLE_EKS_TESTS = False -DISABLE_EC2_TESTS = False +DISABLE_ECS_TESTS = True +DISABLE_EKS_TESTS = True +DISABLE_EC2_TESTS = True USE_SCHEDULER = False diff --git a/test/testrunner.py b/test/testrunner.py index 07cf6bda795b..0d95ca950090 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -47,9 +47,9 @@ def run_sagemaker_local_tests(images): sm_tests_tar_name = "sagemaker_tests.tar.gz" run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}") - pool_number = len(images) - with Pool(pool_number) as p: - p.map(sm_utils.execute_local_tests, images) + # pool_number = len(images) + # with Pool(pool_number) as p: + # p.map(sm_utils.execute_local_tests, images) def run_sagemaker_test_in_executor(image, num_of_instances, instance_type): From 4de74cb22519c17fd5caa92a20a5bf17d3d162cb Mon Sep 17 00:00:00 2001 From: Jeetendra Patil Date: Thu, 29 Apr 2021 10:34:22 -0700 Subject: [PATCH 10/10] revert code --- src/config/build_config.py | 6 ++-- src/config/test_config.py | 10 +++--- tensorflow/buildspec.yml | 66 ++++++++++++++++++++++++++++++++++++-- test/testrunner.py | 6 ++-- 4 files changed, 74 insertions(+), 14 deletions(-) diff --git a/src/config/build_config.py b/src/config/build_config.py index dde89d175ae2..0b7e2581a102 100644 --- a/src/config/build_config.py +++ b/src/config/build_config.py @@ -5,9 +5,9 @@ # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR) ENABLE_NEURON_MODE = False # Frameworks for which you want to disable both builds and tests -DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"] +DISABLE_FRAMEWORK_TESTS = [] # Disable new builds or build without datetime tag -DISABLE_DATETIME_TAG = True +DISABLE_DATETIME_TAG = False # Note: Need to build 
the images at least once with DISABLE_DATETIME_TAG = True # before disabling new builds or tests will fail -DISABLE_NEW_BUILDS = True +DISABLE_NEW_BUILDS = False diff --git a/src/config/test_config.py b/src/config/test_config.py index 931264220edc..3f8ce5f60f0b 100644 --- a/src/config/test_config.py +++ b/src/config/test_config.py @@ -5,11 +5,11 @@ # Disable the test codebuild jobs to be run # It is recommended to set DISABLE_EFA_TESTS to True to disable EFA tests if there is no change to EFA installer version or Frameworks. -DISABLE_EFA_TESTS = True +DISABLE_EFA_TESTS = False -DISABLE_SANITY_TESTS = True +DISABLE_SANITY_TESTS = False DISABLE_SAGEMAKER_TESTS = False -DISABLE_ECS_TESTS = True -DISABLE_EKS_TESTS = True -DISABLE_EC2_TESTS = True +DISABLE_ECS_TESTS = False +DISABLE_EKS_TESTS = False +DISABLE_EC2_TESTS = False USE_SCHEDULER = False diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml index 63f979e73e52..9cdc7d65ae38 100644 --- a/tensorflow/buildspec.yml +++ b/tensorflow/buildspec.yml @@ -1,8 +1,8 @@ account_id: &ACCOUNT_ID region: &REGION framework: &FRAMEWORK tensorflow -version: &VERSION 2.3.2 -short_version: &SHORT_VERSION 2.3 +version: &VERSION 2.4.1 +short_version: &SHORT_VERSION 2.4 repository_info: training_repository: &TRAINING_REPOSITORY @@ -23,6 +23,9 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py inference_context: &INFERENCE_CONTEXT sagemaker_package_name: source: docker/build_artifacts/sagemaker @@ -33,8 +36,25 @@ context: dockerd-entrypoint: source: docker/build_artifacts/dockerd-entrypoint.py target: dockerd-entrypoint.py + deep_learning_container: + source: ../../src/deep_learning_container.py + target: deep_learning_container.py images: + BuildTensorflowCpuPy37TrainingDockerImage: + <<: *TRAINING_REPOSITORY + build: &TENSORFLOW_CPU_TRAINING_PY3 false + 
image_size_baseline: &IMAGE_SIZE_BASELINE 4489 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py37 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION + ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., + *DEVICE_TYPE ] + context: + <<: *TRAINING_CONTEXT BuildTensorflowGpuPy37Cu110TrainingDockerImage: <<: *TRAINING_REPOSITORY build: &TENSORFLOW_GPU_TRAINING_PY3 false @@ -50,4 +70,44 @@ images: /Dockerfile., *DEVICE_TYPE ] context: <<: *TRAINING_CONTEXT - + BuildTensorflowExampleGpuPy37Cu110TrainingDockerImage: + <<: *TRAINING_REPOSITORY + build: &TENSORFLOW_GPU_TRAINING_PY3 false + image_size_baseline: &IMAGE_SIZE_BASELINE 7738 + base_image_name: BuildTensorflowGpuPy37Cu110TrainingDockerImage + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py37 + cuda_version: &CUDA_VERSION cu110 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, + "-", *OS_VERSION, "-example" ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example, + /Dockerfile., *DEVICE_TYPE ] + context: + <<: *TRAINING_CONTEXT + BuildTensorflowCPUInferencePy3DockerImage: + <<: *INFERENCE_REPOSITORY + build: &TENSORFLOW_CPU_INFERENCE_PY3 false + image_size_baseline: 4899 + device_type: &DEVICE_TYPE cpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py37 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ] + context: + <<: *INFERENCE_CONTEXT + BuildTensorflowGPUInferencePy3DockerImage: + <<: *INFERENCE_REPOSITORY + build: 
&TENSORFLOW_GPU_INFERENCE_PY3 false + image_size_baseline: 7738 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py37 + cuda_version: &CUDA_VERSION cu110 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] + context: + <<: *INFERENCE_CONTEXT diff --git a/test/testrunner.py b/test/testrunner.py index 0d95ca950090..07cf6bda795b 100644 --- a/test/testrunner.py +++ b/test/testrunner.py @@ -47,9 +47,9 @@ def run_sagemaker_local_tests(images): sm_tests_tar_name = "sagemaker_tests.tar.gz" run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}") - # pool_number = len(images) - # with Pool(pool_number) as p: - # p.map(sm_utils.execute_local_tests, images) + pool_number = len(images) + with Pool(pool_number) as p: + p.map(sm_utils.execute_local_tests, images) def run_sagemaker_test_in_executor(image, num_of_instances, instance_type):