Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/config/build_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
# Remember to revert it to False before merging any PR (including NEURON-dedicated PRs)
ENABLE_NEURON_MODE = False
# Frameworks for which you want to disable both builds and tests
DISABLE_FRAMEWORK_TESTS = []
DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"]
# Disable new builds or build without datetime tag
DISABLE_DATETIME_TAG = False
DISABLE_DATETIME_TAG = True
# Note: Build the images at least once with DISABLE_DATETIME_TAG = True
# before disabling new builds; otherwise, tests will fail.
DISABLE_NEW_BUILDS = False
DISABLE_NEW_BUILDS = True
10 changes: 5 additions & 5 deletions src/config/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
# Disable the test codebuild jobs to be run

# It is recommended to set DISABLE_EFA_TESTS to True (which disables the EFA tests) when there is no change to the EFA installer version or to the frameworks.
DISABLE_EFA_TESTS = False
DISABLE_EFA_TESTS = True

DISABLE_SANITY_TESTS = False
DISABLE_SANITY_TESTS = True
DISABLE_SAGEMAKER_TESTS = False
DISABLE_ECS_TESTS = False
DISABLE_EKS_TESTS = False
DISABLE_EC2_TESTS = False
DISABLE_ECS_TESTS = True
DISABLE_EKS_TESTS = True
DISABLE_EC2_TESTS = True
USE_SCHEDULER = False
66 changes: 3 additions & 63 deletions tensorflow/buildspec.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK tensorflow
version: &VERSION 2.4.1
short_version: &SHORT_VERSION 2.4
version: &VERSION 2.3.2
short_version: &SHORT_VERSION 2.3

repository_info:
training_repository: &TRAINING_REPOSITORY
Expand All @@ -23,9 +23,6 @@ context:
dockerd-entrypoint:
source: docker/build_artifacts/dockerd-entrypoint.py
target: dockerd-entrypoint.py
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py
inference_context: &INFERENCE_CONTEXT
sagemaker_package_name:
source: docker/build_artifacts/sagemaker
Expand All @@ -36,25 +33,8 @@ context:
dockerd-entrypoint:
source: docker/build_artifacts/dockerd-entrypoint.py
target: dockerd-entrypoint.py
deep_learning_container:
source: ../../src/deep_learning_container.py
target: deep_learning_container.py

images:
BuildTensorflowCpuPy37TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &TENSORFLOW_CPU_TRAINING_PY3 false
image_size_baseline: &IMAGE_SIZE_BASELINE 4489
device_type: &DEVICE_TYPE cpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py37
os_version: &OS_VERSION ubuntu18.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION
]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
context:
<<: *TRAINING_CONTEXT
BuildTensorflowGpuPy37Cu110TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &TENSORFLOW_GPU_TRAINING_PY3 false
Expand All @@ -70,44 +50,4 @@ images:
/Dockerfile., *DEVICE_TYPE ]
context:
<<: *TRAINING_CONTEXT
BuildTensorflowExampleGpuPy37Cu110TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &TENSORFLOW_GPU_TRAINING_PY3 false
image_size_baseline: &IMAGE_SIZE_BASELINE 7738
base_image_name: BuildTensorflowGpuPy37Cu110TrainingDockerImage
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py37
cuda_version: &CUDA_VERSION cu110
os_version: &OS_VERSION ubuntu18.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
"-", *OS_VERSION, "-example" ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
/Dockerfile., *DEVICE_TYPE ]
context:
<<: *TRAINING_CONTEXT
BuildTensorflowCPUInferencePy3DockerImage:
<<: *INFERENCE_REPOSITORY
build: &TENSORFLOW_CPU_INFERENCE_PY3 false
image_size_baseline: 4899
device_type: &DEVICE_TYPE cpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py37
os_version: &OS_VERSION ubuntu18.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
context:
<<: *INFERENCE_CONTEXT
BuildTensorflowGPUInferencePy3DockerImage:
<<: *INFERENCE_REPOSITORY
build: &TENSORFLOW_GPU_INFERENCE_PY3 false
image_size_baseline: 7738
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py37
cuda_version: &CUDA_VERSION cu110
os_version: &OS_VERSION ubuntu18.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
context:
<<: *INFERENCE_CONTEXT

2 changes: 1 addition & 1 deletion tensorflow/training/docker/2.3/py3/Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ RUN ${PIP} install --no-cache-dir -U \
scipy==1.5.2 \
scikit-learn==0.23 \
pandas==1.1 \
Pillow==7.2.0 \
Pillow==8.2.0 \
python-dateutil==2.8.1 \
# install PyYAML>=5.4.1 to avoid conflict with latest awscli
"pyYAML>=5.4.1,<5.5" \
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ RUN ${PIP} install --no-cache-dir -U \
scipy==1.5.2 \
scikit-learn==0.23 \
pandas==1.1 \
Pillow==7.2.0 \
Pillow==8.2.0 \
python-dateutil==2.8.1 \
# install PyYAML>=5.4.1 to avoid conflict with latest awscli
"pyYAML>=5.4.1,<5.5" \
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ RUN ${PIP} install --no-cache-dir -U \
scipy==1.5.2 \
scikit-learn==0.23 \
pandas==1.1 \
Pillow==7.2.0 \
Pillow==8.2.0 \
python-dateutil==2.8.1 \
# install PyYAML>=5.4.1 to avoid conflict with latest awscli
"pyYAML>=5.4.1,<5.5" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,15 +191,15 @@ def test_smdebug(sagemaker_session, ecr_image, instance_type, framework_version)
@pytest.mark.model("mnist")
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
def test_smdataparallel_smmodelparallel_mnist(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version):
def test_smdataparallel_smmodelparallel_mnist(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version):
"""
Tests SM Distributed DataParallel and ModelParallel single-node via script mode
This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
TODO: Consider reworking these tests after re:Invent releases are done
"""
instance_type = "ml.p3.16xlarge"
_, image_framework_version = get_framework_and_version_from_tag(ecr_image)
image_cuda_version = get_cuda_version_from_tag(ecr_image)
_, image_framework_version = get_framework_and_version_from_tag(n_virginia_ecr_image)
image_cuda_version = get_cuda_version_from_tag(n_virginia_ecr_image)
if Version(image_framework_version) < Version("2.3.1") or image_cuda_version != "cu110":
pytest.skip("SMD Model and Data Parallelism are only supported on CUDA 11, and on TensorFlow 2.3.1 or higher")
smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
Expand All @@ -209,12 +209,12 @@ def test_smdataparallel_smmodelparallel_mnist(sagemaker_session, instance_type,
instance_count=1,
instance_type=instance_type,
source_dir=smmodelparallel_path,
sagemaker_session=sagemaker_session,
image_uri=ecr_image,
sagemaker_session=n_virginia_sagemaker_session,
image_uri=n_virginia_ecr_image,
framework_version=framework_version,
py_version='py3')

estimator = _disable_sm_profiler(sagemaker_session.boto_region_name, estimator)
estimator = _disable_sm_profiler(n_virginia_sagemaker_session.boto_region_name, estimator)

estimator.fit()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,12 @@ def can_run_smdataparallel_efa(ecr_image):
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
def test_distributed_training_smdataparallel_script_mode(
sagemaker_session, instance_type, ecr_image, tmpdir, framework_version
n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version
):
"""
Tests SMDataParallel single-node command via script mode
"""
validate_or_skip_smdataparallel(ecr_image)
validate_or_skip_smdataparallel(n_virginia_ecr_image)
instance_type = "ml.p3.16xlarge"
distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
estimator = TensorFlow(
Expand All @@ -72,10 +72,10 @@ def test_distributed_training_smdataparallel_script_mode(
role='SageMakerRole',
instance_type=instance_type,
instance_count=1,
image_uri=ecr_image,
image_uri=n_virginia_ecr_image,
framework_version=framework_version,
py_version='py3',
sagemaker_session=sagemaker_session,
sagemaker_session=n_virginia_sagemaker_session,
distribution=distribution)

estimator.fit(job_name=unique_name_from_base('test-tf-smdataparallel'))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,12 +122,12 @@ def test_smmodelparallel_multinode_efa(n_virginia_sagemaker_session, efa_instanc
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.parametrize("test_script, num_processes", [("tf2_conv.py", 2), ("tf2_conv_xla.py", 2), ("smmodelparallel_hvd2_conv.py", 4), ("send_receive_checkpoint.py", 2), ("tf2_checkpoint_test.py", 2)])
def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, test_script, num_processes):
def test_smmodelparallel(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version, test_script, num_processes):
"""
Tests SM Modelparallel in sagemaker
"""
instance_type = "ml.p3.16xlarge"
validate_or_skip_smmodelparallel(ecr_image)
validate_or_skip_smmodelparallel(n_virginia_ecr_image)
smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
estimator = TensorFlow(entry_point=test_script,
role='SageMakerRole',
Expand All @@ -141,8 +141,8 @@ def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, fr
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
}
},
sagemaker_session=sagemaker_session,
image_uri=ecr_image,
sagemaker_session=n_virginia_sagemaker_session,
image_uri=n_virginia_ecr_image,
framework_version=framework_version,
py_version='py3',
base_job_name='smp-test1')
Expand All @@ -156,12 +156,12 @@ def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, fr
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_hvd2_conv_multinode.py", 2)])
def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, test_script, num_processes):
def test_smmodelparallel_multinode(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version, test_script, num_processes):
"""
Tests SM Modelparallel in sagemaker
"""
instance_type = "ml.p3.16xlarge"
validate_or_skip_smmodelparallel(ecr_image)
validate_or_skip_smmodelparallel(n_virginia_ecr_image)
smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
estimator = TensorFlow(entry_point=test_script,
role='SageMakerRole',
Expand All @@ -175,8 +175,8 @@ def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image,
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
}
},
sagemaker_session=sagemaker_session,
image_uri=ecr_image,
sagemaker_session=n_virginia_sagemaker_session,
image_uri=n_virginia_ecr_image,
framework_version=framework_version,
py_version='py3',
base_job_name='smp-test2')
Expand Down
6 changes: 3 additions & 3 deletions test/testrunner.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,9 @@ def run_sagemaker_local_tests(images):
sm_tests_tar_name = "sagemaker_tests.tar.gz"
run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}")

pool_number = len(images)
with Pool(pool_number) as p:
p.map(sm_utils.execute_local_tests, images)
# pool_number = len(images)
# with Pool(pool_number) as p:
# p.map(sm_utils.execute_local_tests, images)


def run_sagemaker_test_in_executor(image, num_of_instances, instance_type):
Expand Down