From d42991bc160f1e37f16ffdfb466dde603d0d8a9f Mon Sep 17 00:00:00 2001 From: Mike Schneider Date: Fri, 4 Aug 2023 08:29:54 -0700 Subject: [PATCH 1/4] disable dataparallel tests --- dlc_developer_config.toml | 16 ++++++++-------- .../integration/sagemaker/test_smdataparallel.py | 10 +++++++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index 157c060efbf9..aec1262730a3 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -31,11 +31,11 @@ benchmark_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = [] +build_frameworks = ["tensorflow"] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = true +build_inference = false # Set to false in order to remove datetime tag on PR builds datetime_tag = true @@ -45,12 +45,12 @@ do_build = true [test] ### On by default -sanity_tests = true +sanity_tests = false safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = true -eks_tests = true -ec2_tests = true +ecs_tests = false +eks_tests = false +ec2_tests = false ### Set ec2_efa_tests = true to be able to run any EC2 tests on any instance type that uses EFA-capable instances by ### default. If false, these types of tests will be skipped while other tests will run as usual. @@ -60,14 +60,14 @@ ec2_efa_tests = false ### SM specific tests ### Off by default -sagemaker_local_tests = false +sagemaker_local_tests = true # SM remote test valid values: # "off" --> do not trigger sagemaker remote tests (default) # "standard" --> run standard sagemaker remote tests from test/sagemaker_tests # "rc" --> run release_candidate_integration tests # "efa" --> run efa sagemaker tests -sagemaker_remote_tests = "off" +sagemaker_remote_tests = "efa" # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py index 6d3de16652e9..34766b9b04b1 100644 --- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py +++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py @@ -116,7 +116,9 @@ def _test_distributed_training_smdataparallel_script_mode_function( @pytest.mark.skip_py2_containers @pytest.mark.efa() @pytest.mark.parametrize("instance_types", ["ml.p3.16xlarge", "ml.p4d.24xlarge"]) -def test_smdataparallel_mnist(ecr_image, sagemaker_regions, instance_types, py_version, tmpdir): +def test_smdataparallel_mnist( + ecr_image, sagemaker_regions, instance_types, py_version, tmpdir, sm_below_tf213_only +): invoke_sm_helper_function( ecr_image, sagemaker_regions, _test_smdataparallel_mnist_function, instance_types ) @@ -152,7 +154,9 @@ def _test_smdataparallel_mnist_function(ecr_image, sagemaker_session, instance_t @pytest.mark.skip_py2_containers @pytest.mark.efa() @pytest.mark.parametrize("instance_types", ["ml.p3.16xlarge", "ml.p4d.24xlarge"]) -def test_hc_smdataparallel_mnist(ecr_image, sagemaker_regions, instance_types, py_version, tmpdir): +def test_hc_smdataparallel_mnist( + ecr_image, sagemaker_regions, instance_types, py_version, tmpdir, sm_below_tf213_only +): training_group = InstanceGroup("train_group", instance_types, 2) invoke_sm_helper_function( ecr_image, sagemaker_regions, _test_hc_smdataparallel_mnist_function, [training_group] @@ -192,7 +196,7 @@ def _test_hc_smdataparallel_mnist_function(ecr_image, sagemaker_session, instanc @pytest.mark.efa() @pytest.mark.parametrize("instance_types", ["ml.p4d.24xlarge"]) def test_smdataparallel_throughput( - ecr_image, sagemaker_regions, instance_types, py_version, tmpdir + ecr_image, sagemaker_regions, instance_types, py_version, tmpdir, sm_below_tf213_only ): invoke_sm_helper_function( ecr_image, sagemaker_regions, _test_smdataparallel_throughput_function, instance_types From 888db5c9ff920360af54aa849705a2ca97b39e1f Mon Sep 17 00:00:00 2001 From: Mike Schneider Date: Fri, 4 Aug 2023 08:50:45 -0700 Subject: [PATCH 2/4] disable sm_profiler for RC integration --- .../dlc_tests/release_candidate_integration/test_sm_profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/release_candidate_integration/test_sm_profiler.py b/test/dlc_tests/release_candidate_integration/test_sm_profiler.py index 371ce51876b3..9bc37770721e 100644 --- a/test/dlc_tests/release_candidate_integration/test_sm_profiler.py +++ b/test/dlc_tests/release_candidate_integration/test_sm_profiler.py @@ -82,7 +82,7 @@ def test_sm_profiler_pt(pytorch_training): @pytest.mark.skipif( not is_mainline_context() and not is_rc_test_context(), reason="Mainline only test" ) -def test_sm_profiler_tf(tensorflow_training): +def test_sm_profiler_tf(tensorflow_training, below_tf213_only): if is_tf_version("1", tensorflow_training): pytest.skip("Skipping test on TF1, since there are no smprofiler config files for TF1") processor = get_processor_from_image_uri(tensorflow_training) From 7024bd0d1d8fc41c5204b2c2322cc37261c3a7c1 Mon Sep 17 00:00:00 2001 From: Mike Schneider Date: Fri, 4 Aug 2023 10:35:25 -0700 Subject: [PATCH 3/4] run SM RC tests --- dlc_developer_config.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index aec1262730a3..b952c4a039fd 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -60,14 +60,14 @@ ec2_efa_tests = false ### SM specific tests ### Off by default -sagemaker_local_tests = true +sagemaker_local_tests = false # SM remote test valid values: # "off" --> do not trigger sagemaker remote tests (default) # "standard" --> run standard sagemaker remote tests from test/sagemaker_tests # "rc" --> run release_candidate_integration tests # "efa" --> run efa sagemaker tests -sagemaker_remote_tests = "efa" +sagemaker_remote_tests = "rc" # SM remote EFA test instance type sagemaker_remote_efa_instance_type = "" From d060668cc8ec34724f19b51984a885e6eadb26e7 Mon Sep 17 00:00:00 2001 From: Mike Schneider Date: Fri, 4 Aug 2023 11:24:36 -0700 Subject: [PATCH 4/4] revert toml --- dlc_developer_config.toml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dlc_developer_config.toml b/dlc_developer_config.toml index b952c4a039fd..157c060efbf9 100644 --- a/dlc_developer_config.toml +++ b/dlc_developer_config.toml @@ -31,11 +31,11 @@ benchmark_mode = false [build] # Add in frameworks you would like to build. By default, builds are disabled unless you specify building an image. # available frameworks - ["autogluon", "huggingface_tensorflow", "huggingface_pytorch", "huggingface_tensorflow_trcomp", "huggingface_pytorch_trcomp", "pytorch_trcomp", "tensorflow", "mxnet", "pytorch", "stabilityai_pytorch"] -build_frameworks = ["tensorflow"] +build_frameworks = [] # By default we build both training and inference containers. Set true/false values to determine which to build. build_training = true -build_inference = false +build_inference = true # Set to false in order to remove datetime tag on PR builds datetime_tag = true @@ -45,12 +45,12 @@ do_build = true [test] ### On by default -sanity_tests = false +sanity_tests = true safety_check_test = false ecr_scan_allowlist_feature = false -ecs_tests = false -eks_tests = false -ec2_tests = false +ecs_tests = true +eks_tests = true +ec2_tests = true ### Set ec2_efa_tests = true to be able to run any EC2 tests on any instance type that uses EFA-capable instances by ### default. If false, these types of tests will be skipped while other tests will run as usual. @@ -67,7 +67,7 @@ sagemaker_local_tests = false # "standard" --> run standard sagemaker remote tests from test/sagemaker_tests # "rc" --> run release_candidate_integration tests # "efa" --> run efa sagemaker tests -sagemaker_remote_tests = "rc" +sagemaker_remote_tests = "off" # SM remote EFA test instance type sagemaker_remote_efa_instance_type = ""