Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pytorch/training/docker/1.8/py3/Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pytho
sagemaker-experiments==0.* \
"sagemaker-pytorch-training<3" \
psutil==5.6.7 \
Pillow==7.1.0 \
Pillow==8.2.0 \
&& pip uninstall -y torch \
&& pip install --no-cache-dir -U ${PT_TRAINING_URL} \
&& pip uninstall -y torchvision \
Expand Down
2 changes: 1 addition & 1 deletion pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ ENV MANUAL_BUILD=0
ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
ARG SMD_MODEL_PARALLEL_URL=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/pytorch-1.8/build-artifacts/2021-03-26-22-01/smdistributed_modelparallel-1.3.1-cp36-cp36m-linux_x86_64.whl
ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-16/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-28/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl

RUN apt-get update \
&& apt-get install -y --allow-change-held-packages --no-install-recommends \
Expand Down
2 changes: 1 addition & 1 deletion tensorflow/training/docker/2.4/py3/Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ RUN ${PIP} install --no-cache-dir -U \
scipy==1.5.2 \
scikit-learn==0.23 \
pandas==1.1 \
Pillow==7.2.0 \
Pillow==8.2.0 \
# python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
python-dateutil==2.8.1 \
# install PyYAML>=5.4 to avoid conflict with latest awscli
Expand Down
4 changes: 2 additions & 2 deletions tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ ARG ESTIMATOR_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/est
# the nightly builds. Therefore, while updating the smdebug version, please ensure that the format is not disturbed.
ARG SMDEBUG_VERSION=1.0.8

ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-12/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl
ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-28/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl

ARG SMMODELPARALLEL_BINARY=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/tensorflow-2.4/build-artifacts/2021-03-26-21-57/smdistributed_modelparallel-1.3.1-cp37-cp37m-linux_x86_64.whl

Expand Down Expand Up @@ -193,7 +193,7 @@ RUN ${PIP} install --no-cache-dir -U \
scipy==1.5.2 \
scikit-learn==0.23 \
pandas==1.1 \
Pillow==7.2.0 \
Pillow==8.2.0 \
# python-dateutil==2.8.1 to satisfy botocore associated with latest awscli
python-dateutil==2.8.1 \
# install PyYAML>=5.4 to avoid conflict with latest awscli
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,21 @@ def test_mnist_gpu(sagemaker_session, framework_version, ecr_image, dist_gpu_bac
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_pt_mnist.py", 8)])
def test_smmodelparallel_mnist_multigpu_multinode(ecr_image, instance_type, py_version, sagemaker_session, tmpdir, test_script, num_processes):
def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance_type, py_version, n_virginia_sagemaker_session, tmpdir, test_script, num_processes):
"""
Tests pt mnist command via script mode
"""
instance_type = "ml.p3.16xlarge"
validate_or_skip_smmodelparallel(ecr_image)
validate_or_skip_smmodelparallel(n_virginia_ecr_image)
with timeout(minutes=DEFAULT_TIMEOUT):
pytorch = PyTorch(
entry_point=test_script,
role='SageMakerRole',
image_uri=ecr_image,
image_uri=n_virginia_ecr_image,
source_dir=mnist_path,
instance_count=2,
instance_type=instance_type,
sagemaker_session=sagemaker_session,
sagemaker_session=n_virginia_sagemaker_session,
hyperparameters = {"assert-losses": 1, "amp": 1, "ddp": 1, "data-dir": "data/training", "epochs": 5},
distribution={
"smdistributed": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,14 +140,14 @@ def test_smdataparallel_mnist(n_virginia_sagemaker_session, framework_version, n
@pytest.mark.integration("smdataparallel_smmodelparallel")
@pytest.mark.model("mnist")
@pytest.mark.parametrize('instance_types', ["ml.p3.16xlarge"])
def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_version, sagemaker_session, tmpdir):
def test_smmodelparallel_smdataparallel_mnist(instance_types, n_virginia_ecr_image, py_version, n_virginia_sagemaker_session, tmpdir):
"""
Tests SM Distributed DataParallel and ModelParallel single-node via script mode
This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
TODO: Consider reworking these tests after re:Invent releases are done
"""
can_run_modelparallel = can_run_smmodelparallel(ecr_image)
can_run_dataparallel = can_run_smdataparallel(ecr_image)
can_run_modelparallel = can_run_smmodelparallel(n_virginia_ecr_image)
can_run_dataparallel = can_run_smdataparallel(n_virginia_ecr_image)
if can_run_dataparallel and can_run_modelparallel:
entry_point = 'smdataparallel_smmodelparallel_mnist_script_mode.sh'
elif can_run_dataparallel:
Expand All @@ -160,12 +160,12 @@ def test_smmodelparallel_smdataparallel_mnist(instance_types, ecr_image, py_vers
with timeout(minutes=DEFAULT_TIMEOUT):
pytorch = PyTorch(entry_point=entry_point,
role='SageMakerRole',
image_uri=ecr_image,
image_uri=n_virginia_ecr_image,
source_dir=mnist_path,
instance_count=1,
instance_type=instance_types,
sagemaker_session=sagemaker_session)
sagemaker_session=n_virginia_sagemaker_session)

pytorch = _disable_sm_profiler(sagemaker_session.boto_region_name, pytorch)
pytorch = _disable_sm_profiler(n_virginia_sagemaker_session.boto_region_name, pytorch)

pytorch.fit()