From 05be636ae87a6f038747921598819ff189a61413 Mon Sep 17 00:00:00 2001
From: Lauren Yu <6631887+laurenyu@users.noreply.github.com>
Date: Wed, 10 Jun 2020 16:54:56 -0700
Subject: [PATCH 1/5] infra: clean up pickle.load logic in integ tests

Because we no longer run our tests with Python 2, we no longer need the
branched logic for pickle.load args.
---
 tests/integ/test_airflow_config.py         | 16 +++++-----------
 tests/integ/test_byo_estimator.py          |  7 ++-----
 tests/integ/test_factorization_machines.py |  6 ++----
 tests/integ/test_kmeans.py                 |  7 ++-----
 tests/integ/test_knn.py                    |  7 ++-----
 tests/integ/test_linear_learner.py         | 10 +++-------
 tests/integ/test_pca.py                    |  7 ++-----
 tests/integ/test_record_set.py             |  4 +---
 tests/integ/test_transformer.py            |  9 +++------
 tests/integ/test_tuner.py                  |  8 +++-----
 tests/integ/test_tuner_multi_algo.py       |  6 ++----
 11 files changed, 27 insertions(+), 60 deletions(-)

diff --git a/tests/integ/test_airflow_config.py b/tests/integ/test_airflow_config.py
index 630e62124f..d2f5d8c9d4 100644
--- a/tests/integ/test_airflow_config.py
+++ b/tests/integ/test_airflow_config.py
@@ -15,7 +15,6 @@
 import gzip
 import os
 import pickle
-import sys
 
 import pytest
 import tests.integ
@@ -102,11 +101,10 @@ def test_byo_airflow_config_uploads_data_source_to_s3_when_inputs_provided(
 def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         kmeans = KMeans(
             role=ROLE,
@@ -141,11 +139,10 @@ def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_
 def test_fm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         fm = FactorizationMachines(
             role=ROLE,
@@ -207,11 +204,10 @@ def test_ipinsights_airflow_config_uploads_data_source_to_s3(sagemaker_session,
 def test_knn_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         knn = KNN(
             role=ROLE,
@@ -278,11 +274,10 @@ def test_linearlearner_airflow_config_uploads_data_source_to_s3(
 ):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         train_set[1][:100] = 1
         train_set[1][100:200] = 0
@@ -381,11 +376,10 @@ def test_ntm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_ins
 def test_pca_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         pca = PCA(
             role=ROLE,
diff --git a/tests/integ/test_byo_estimator.py b/tests/integ/test_byo_estimator.py
index d84ef1cc10..dfb25bc5c4 100644
--- a/tests/integ/test_byo_estimator.py
+++ b/tests/integ/test_byo_estimator.py
@@ -16,7 +16,6 @@
 import json
 import os
 import pickle
-import sys
 
 import pytest
 
@@ -58,10 +57,9 @@ def test_byo_estimator(sagemaker_session, region, cpu_instance_type):
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         prefix = "test_byo_estimator"
         key = "recordio-pb-data"
@@ -107,10 +105,9 @@ def test_async_byo_estimator(sagemaker_session, region, cpu_instance_type):
 
     with timeout(minutes=5):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         prefix = "test_byo_estimator"
         key = "recordio-pb-data"
diff --git a/tests/integ/test_factorization_machines.py b/tests/integ/test_factorization_machines.py
index 6aba98c5f6..153966906b 100644
--- a/tests/integ/test_factorization_machines.py
+++ b/tests/integ/test_factorization_machines.py
@@ -29,11 +29,10 @@ def test_factorization_machines(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         fm = FactorizationMachines(
             role="SageMakerRole",
@@ -71,11 +70,10 @@ def test_async_factorization_machines(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=5):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         fm = FactorizationMachines(
             role="SageMakerRole",
diff --git a/tests/integ/test_kmeans.py b/tests/integ/test_kmeans.py
index c6dd990576..33cfeeaee7 100644
--- a/tests/integ/test_kmeans.py
+++ b/tests/integ/test_kmeans.py
@@ -16,7 +16,6 @@
 import json
 import os
 import pickle
-import sys
 import time
 
 import pytest
@@ -31,11 +30,10 @@ def test_kmeans(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("kmeans")
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         kmeans = KMeans(
             role="SageMakerRole",
@@ -94,11 +92,10 @@ def test_async_kmeans(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=5):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         kmeans = KMeans(
             role="SageMakerRole",
diff --git a/tests/integ/test_knn.py b/tests/integ/test_knn.py
index 52d2e2e3e0..09ab761c4b 100644
--- a/tests/integ/test_knn.py
+++ b/tests/integ/test_knn.py
@@ -15,7 +15,6 @@
 import gzip
 import os
 import pickle
-import sys
 import time
 
 from sagemaker import KNN, KNNModel
@@ -29,11 +28,10 @@ def test_knn_regressor(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         knn = KNN(
             role="SageMakerRole",
@@ -66,11 +64,10 @@ def test_async_knn_classifier(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=5):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         knn = KNN(
             role="SageMakerRole",
diff --git a/tests/integ/test_linear_learner.py b/tests/integ/test_linear_learner.py
index 62bd35b8c1..062e1ca7b5 100644
--- a/tests/integ/test_linear_learner.py
+++ b/tests/integ/test_linear_learner.py
@@ -15,7 +15,6 @@
 import gzip
 import os
 import pickle
-import sys
 import time
 
 import numpy as np
@@ -33,11 +32,10 @@ def test_linear_learner(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         train_set[1][:100] = 1
         train_set[1][100:200] = 0
@@ -102,11 +100,10 @@ def test_linear_learner_multiclass(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         train_set = train_set[0], train_set[1].astype(np.dtype("float32"))
 
@@ -137,11 +134,10 @@ def test_async_linear_learner(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         train_set[1][:100] = 1
         train_set[1][100:200] = 0
diff --git a/tests/integ/test_pca.py b/tests/integ/test_pca.py
index 8babd0bcc7..ff8669e2c7 100644
--- a/tests/integ/test_pca.py
+++ b/tests/integ/test_pca.py
@@ -15,7 +15,6 @@
 import gzip
 import os
 import pickle
-import sys
 import time
 
 import sagemaker.amazon.pca
@@ -29,11 +28,10 @@ def test_pca(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         pca = sagemaker.amazon.pca.PCA(
             role="SageMakerRole",
@@ -72,11 +70,10 @@ def test_async_pca(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=5):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         # Load the data into memory as numpy arrays
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         pca = sagemaker.amazon.pca.PCA(
             role="SageMakerRole",
diff --git a/tests/integ/test_record_set.py b/tests/integ/test_record_set.py
index c92ba9e016..74664c3752 100644
--- a/tests/integ/test_record_set.py
+++ b/tests/integ/test_record_set.py
@@ -15,7 +15,6 @@
 import gzip
 import os
 import pickle
-import sys
 
 from six.moves.urllib.parse import urlparse
 
@@ -29,9 +28,8 @@ def test_record_set(sagemaker_session, cpu_instance_type):
     In particular, test that the objects uploaded to the S3 bucket are encrypted.
     """
     data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
    with gzip.open(data_path, "rb") as file_object:
-        train_set, _, _ = pickle.load(file_object, **pickle_args)
+        train_set, _, _ = pickle.load(file_object, encoding="latin1")
     kmeans = KMeans(
         role="SageMakerRole",
         train_instance_count=1,
diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py
index 4e52d4da14..78ea74e43e 100644
--- a/tests/integ/test_transformer.py
+++ b/tests/integ/test_transformer.py
@@ -16,7 +16,6 @@
 import json
 import os
 import pickle
-import sys
 import time
 
 import pytest
@@ -109,12 +108,11 @@ def test_transform_mxnet(
 @pytest.mark.canary_quick
 def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
     data_path = os.path.join(DATA_DIR, "one_p_mnist")
-    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
     # Load the data into memory as numpy arrays
     train_set_path = os.path.join(data_path, "mnist.pkl.gz")
     with gzip.open(train_set_path, "rb") as f:
-        train_set, _, _ = pickle.load(f, **pickle_args)
+        train_set, _, _ = pickle.load(f, encoding="latin1")
 
     kmeans = KMeans(
         role="SageMakerRole",
@@ -177,7 +175,7 @@ def test_transform_pytorch_vpc_custom_model_bucket(
         entry_point=os.path.join(data_dir, "mnist.py"),
         role="SageMakerRole",
         framework_version=pytorch_full_version,
-        py_version="py3",
+        py_version=PYTHON_VERSION,
         sagemaker_session=sagemaker_session,
         vpc_config={"Subnets": subnet_ids, "SecurityGroupIds": [security_group_id]},
         code_location="s3://{}".format(custom_bucket_name),
@@ -232,13 +230,12 @@ def test_transform_mxnet_tags(
 
 def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
     data_path = os.path.join(DATA_DIR, "one_p_mnist")
-    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
     tags = [{"Key": "some-tag", "Value": "value-for-tag"}]
 
     # Load the data into memory as numpy arrays
     train_set_path = os.path.join(data_path, "mnist.pkl.gz")
     with gzip.open(train_set_path, "rb") as f:
-        train_set, _, _ = pickle.load(f, **pickle_args)
+        train_set, _, _ = pickle.load(f, encoding="latin1")
 
     kmeans = KMeans(
         role="SageMakerRole",
diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
index fb7c4623a2..c0ae640038 100644
--- a/tests/integ/test_tuner.py
+++ b/tests/integ/test_tuner.py
@@ -16,7 +16,6 @@
 import json
 import os
 import pickle
-import sys
 import time
 
 import numpy as np
@@ -55,10 +54,10 @@
 @pytest.fixture(scope="module")
 def kmeans_train_set(sagemaker_session):
     data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
+
     # Load the data into memory as numpy arrays
     with gzip.open(data_path, "rb") as f:
-        train_set, _, _ = pickle.load(f, **pickle_args)
+        train_set, _, _ = pickle.load(f, encoding="latin1")
 
     return train_set
 
@@ -847,10 +846,9 @@ def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
 
     with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
         data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-        pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
 
         with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, **pickle_args)
+            train_set, _, _ = pickle.load(f, encoding="latin1")
 
         prefix = "test_byo_estimator"
         key = "recordio-pb-data"
diff --git a/tests/integ/test_tuner_multi_algo.py b/tests/integ/test_tuner_multi_algo.py
index 3f8a00a34a..3fc97d5055 100644
--- a/tests/integ/test_tuner_multi_algo.py
+++ b/tests/integ/test_tuner_multi_algo.py
@@ -16,9 +16,9 @@
 import json
 import os
 import pickle
-import sys
 
 import pytest
+
 from sagemaker import utils
 from sagemaker.amazon.amazon_estimator import get_image_uri
 from sagemaker.analytics import HyperparameterTuningJobAnalytics
@@ -26,7 +26,6 @@
 from sagemaker.estimator import Estimator
 from sagemaker.predictor import json_deserializer
 from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner
-
 from tests.integ import DATA_DIR, TUNING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
@@ -63,9 +62,8 @@
 
 @pytest.fixture(scope="module")
 def data_set():
-    pickle_args = {} if sys.version_info.major == 2 else {"encoding": "latin1"}
     with gzip.open(DATA_PATH, "rb") as f:
-        data_set, _, _ = pickle.load(f, **pickle_args)
+        data_set, _, _ = pickle.load(f, encoding="latin1")
     return data_set
From 64b88923838d40569944f55dbab89756410a43a3 Mon Sep 17 00:00:00 2001
From: Lauren Yu <6631887+laurenyu@users.noreply.github.com>
Date: Thu, 18 Jun 2020 15:31:35 -0700
Subject: [PATCH 2/5] remove stray import

---
 tests/integ/test_factorization_machines.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integ/test_factorization_machines.py b/tests/integ/test_factorization_machines.py
index 153966906b..4d430ce44a 100644
--- a/tests/integ/test_factorization_machines.py
+++ b/tests/integ/test_factorization_machines.py
@@ -15,7 +15,6 @@
 import gzip
 import os
 import pickle
-import sys
 import time
 
 from sagemaker import FactorizationMachines, FactorizationMachinesModel

From b16629f1651e1474f6e59dfff57b552ed233686c Mon Sep 17 00:00:00 2001
From: Lauren Yu <6631887+laurenyu@users.noreply.github.com>
Date: Fri, 19 Jun 2020 17:13:38 -0700
Subject: [PATCH 3/5] refactor to use helper method

---
 tests/integ/test_airflow_config.py         | 72 ++++++----------------
 tests/integ/test_byo_estimator.py          | 27 +++-----
 tests/integ/test_factorization_machines.py | 36 +++++------
 tests/integ/test_kmeans.py                 | 34 ++++------
 tests/integ/test_knn.py                    | 34 ++++------
 tests/integ/test_linear_learner.py         | 64 ++++++++-----------
 tests/integ/test_pca.py                    | 36 +++++------
 tests/integ/test_record_set.py             | 11 +---
 tests/integ/test_transformer.py            | 24 ++------
 tests/integ/test_tuner.py                  | 38 +++++-------
 tests/integ/test_tuner_multi_algo.py       | 10 +--
 11 files changed, 134 insertions(+), 252 deletions(-)

diff --git a/tests/integ/test_airflow_config.py b/tests/integ/test_airflow_config.py
index d2f5d8c9d4..425331dfd6 100644
--- a/tests/integ/test_airflow_config.py
+++ b/tests/integ/test_airflow_config.py
@@ -12,14 +12,17 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
 import os
-import pickle
 
-import pytest
-import tests.integ
+import airflow
+import pytest
 import numpy as np
+from airflow import DAG
+from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
+from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator
+from six.moves.urllib.parse import urlparse
 
+import tests.integ
 from sagemaker import (
     KMeans,
     FactorizationMachines,
@@ -39,21 +42,13 @@ from sagemaker.pytorch.estimator import PyTorch
 from sagemaker.sklearn import SKLearn
 from sagemaker.tensorflow import TensorFlow
-from sagemaker.workflow import airflow as sm_airflow
 from sagemaker.utils import sagemaker_timestamp
-
-import airflow
-from airflow import DAG
-from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator
-from airflow.contrib.operators.sagemaker_transform_operator import SageMakerTransformOperator
-
+from sagemaker.workflow import airflow as sm_airflow
 from sagemaker.xgboost import XGBoost
-from tests.integ import DATA_DIR, PYTHON_VERSION
+from tests.integ import datasets, DATA_DIR, PYTHON_VERSION
 from tests.integ.record_set import prepare_record_set_from_local_files
 from tests.integ.timeout import timeout
 
-from six.moves.urllib.parse import urlparse
-
 PYTORCH_MNIST_DIR = os.path.join(DATA_DIR, "pytorch_mnist")
 PYTORCH_MNIST_SCRIPT = os.path.join(PYTORCH_MNIST_DIR, "mnist.py")
 AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS = 10
@@ -100,12 +95,6 @@ def test_byo_airflow_config_uploads_data_source_to_s3_when_inputs_provided(
 @pytest.mark.canary_quick
 def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         kmeans = KMeans(
             role=ROLE,
             train_instance_count=SINGLE_INSTANCE_COUNT,
@@ -124,7 +113,7 @@ def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_
         kmeans.center_factor = 1
         kmeans.eval_metrics = ["ssd", "msd"]
 
-        records = kmeans.record_set(train_set[0][:100])
+        records = kmeans.record_set(datasets.one_p_mnist()[0][:100])
 
         training_config = _build_airflow_workflow(
             estimator=kmeans, instance_type=cpu_instance_type, inputs=records
@@ -138,12 +127,6 @@ def test_kmeans_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_
 
 def test_fm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         fm = FactorizationMachines(
             role=ROLE,
             train_instance_count=SINGLE_INSTANCE_COUNT,
@@ -157,7 +140,8 @@ def test_fm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_inst
             sagemaker_session=sagemaker_session,
         )
 
-        records = fm.record_set(train_set[0][:200], train_set[1][:200].astype("float32"))
+        training_set = datasets.one_p_mnist()
+        records = fm.record_set(training_set[0][:200], training_set[1][:200].astype("float32"))
 
         training_config = _build_airflow_workflow(
             estimator=fm, instance_type=cpu_instance_type, inputs=records
@@ -203,12 +187,6 @@ def test_ipinsights_airflow_config_uploads_data_source_to_s3(sagemaker_session,
 
 def test_knn_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         knn = KNN(
             role=ROLE,
             train_instance_count=SINGLE_INSTANCE_COUNT,
@@ -219,7 +197,8 @@ def test_knn_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_ins
             sagemaker_session=sagemaker_session,
         )
 
-        records = knn.record_set(train_set[0][:200], train_set[1][:200].astype("float32"))
+        training_set = datasets.one_p_mnist()
+        records = knn.record_set(training_set[0][:200], training_set[1][:200].astype("float32"))
 
         training_config = _build_airflow_workflow(
             estimator=knn, instance_type=cpu_instance_type, inputs=records
@@ -273,15 +252,10 @@ def test_linearlearner_airflow_config_uploads_data_source_to_s3(
     sagemaker_session, cpu_instance_type
 ):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
-        train_set[1][:100] = 1
-        train_set[1][100:200] = 0
-        train_set = train_set[0], train_set[1].astype(np.dtype("float32"))
+        training_set = datasets.one_p_mnist()
+        training_set[1][:100] = 1
+        training_set[1][100:200] = 0
+        training_set = training_set[0], training_set[1].astype(np.dtype("float32"))
 
         ll = LinearLearner(
             ROLE,
@@ -326,7 +300,7 @@ def test_linearlearner_airflow_config_uploads_data_source_to_s3(
         ll.early_stopping_tolerance = 0.0001
         ll.early_stopping_patience = 3
 
-        records = ll.record_set(train_set[0][:200], train_set[1][:200])
+        records = ll.record_set(training_set[0][:200], training_set[1][:200])
 
         training_config = _build_airflow_workflow(
             estimator=ll, instance_type=cpu_instance_type, inputs=records
@@ -375,12 +349,6 @@ def test_ntm_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_ins
 @pytest.mark.canary_quick
 def test_pca_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_instance_type):
     with timeout(seconds=AIRFLOW_CONFIG_TIMEOUT_IN_SECONDS):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         pca = PCA(
             role=ROLE,
             train_instance_count=SINGLE_INSTANCE_COUNT,
@@ -393,7 +361,7 @@ def test_pca_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_ins
         pca.subtract_mean = True
         pca.extra_components = 5
 
-        records = pca.record_set(train_set[0][:100])
+        records = pca.record_set(datasets.one_p_mnist()[0][:100])
 
         training_config = _build_airflow_workflow(
             estimator=pca, instance_type=cpu_instance_type, inputs=records
diff --git a/tests/integ/test_byo_estimator.py b/tests/integ/test_byo_estimator.py
index dfb25bc5c4..738af4c57a 100644
--- a/tests/integ/test_byo_estimator.py
+++ b/tests/integ/test_byo_estimator.py
@@ -12,10 +12,8 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
 import json
 import os
-import pickle
 
 import pytest
 
@@ -23,7 +21,7 @@
 from sagemaker.amazon.amazon_estimator import get_image_uri
 from sagemaker.estimator import Estimator
 from sagemaker.utils import unique_name_from_base
-from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
+from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES, datasets
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
@@ -32,6 +30,11 @@ def region(sagemaker_session):
     return sagemaker_session.boto_session.region_name
 
 
+@pytest.fixture
+def training_set():
+    return datasets.one_p_mnist()
+
+
 def fm_serializer(data):
     js = {"instances": []}
     for row in data:
@@ -40,7 +43,7 @@ def fm_serializer(data):
 
 
 @pytest.mark.canary_quick
-def test_byo_estimator(sagemaker_session, region, cpu_instance_type):
+def test_byo_estimator(sagemaker_session, region, cpu_instance_type, training_set):
     """Use Factorization Machines algorithm as an example here.
 
     First we need to prepare data for training. We take standard data set, convert it to the
@@ -56,11 +59,6 @@ def test_byo_estimator(sagemaker_session, region, cpu_instance_type):
     job_name = unique_name_from_base("byo")
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         prefix = "test_byo_estimator"
         key = "recordio-pb-data"
 
@@ -90,25 +88,20 @@ def test_byo_estimator(sagemaker_session, region, cpu_instance_type):
         predictor.content_type = "application/json"
         predictor.deserializer = sagemaker.predictor.json_deserializer
 
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(training_set[0][:10])
         assert len(result["predictions"]) == 10
 
         for prediction in result["predictions"]:
             assert prediction["score"] is not None
 
 
-def test_async_byo_estimator(sagemaker_session, region, cpu_instance_type):
+def test_async_byo_estimator(sagemaker_session, region, cpu_instance_type, training_set):
     image_name = get_image_uri(region, "factorization-machines")
     endpoint_name = unique_name_from_base("byo")
     training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
     job_name = unique_name_from_base("byo")
 
     with timeout(minutes=5):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         prefix = "test_byo_estimator"
         key = "recordio-pb-data"
 
@@ -141,7 +134,7 @@ def test_async_byo_estimator(sagemaker_session, region, cpu_instance_type):
         predictor.content_type = "application/json"
         predictor.deserializer = sagemaker.predictor.json_deserializer
 
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(training_set[0][:10])
 
         assert len(result["predictions"]) == 10
         for prediction in result["predictions"]:
diff --git a/tests/integ/test_factorization_machines.py b/tests/integ/test_factorization_machines.py
index 4d430ce44a..5e4b37d1a0 100644
--- a/tests/integ/test_factorization_machines.py
+++ b/tests/integ/test_factorization_machines.py
@@ -12,27 +12,25 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
-import os
-import pickle
 import time
 
+import pytest
+
 from sagemaker import FactorizationMachines, FactorizationMachinesModel
 from sagemaker.utils import unique_name_from_base
-from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
+from tests.integ import datasets, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
-def test_factorization_machines(sagemaker_session, cpu_instance_type):
-    job_name = unique_name_from_base("fm")
+@pytest.fixture
+def training_set():
+    return datasets.one_p_mnist()
 
-    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
 
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
+def test_factorization_machines(sagemaker_session, cpu_instance_type, training_set):
+    job_name = unique_name_from_base("fm")
 
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         fm = FactorizationMachines(
             role="SageMakerRole",
             train_instance_count=1,
@@ -48,7 +46,7 @@ def test_factorization_machines(sagemaker_session, cpu_instance_type):
 
         # training labels must be 'float32'
         fm.fit(
-            fm.record_set(train_set[0][:200], train_set[1][:200].astype("float32")),
+            fm.record_set(training_set[0][:200], training_set[1][:200].astype("float32")),
             job_name=job_name,
         )
 
@@ -57,23 +55,17 @@ def test_factorization_machines(sagemaker_session, cpu_instance_type):
             fm.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
         predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(training_set[0][:10])
 
         assert len(result) == 10
         for record in result:
             assert record.label["score"] is not None
 
 
-def test_async_factorization_machines(sagemaker_session, cpu_instance_type):
+def test_async_factorization_machines(sagemaker_session, cpu_instance_type, training_set):
     job_name = unique_name_from_base("fm")
 
     with timeout(minutes=5):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         fm = FactorizationMachines(
             role="SageMakerRole",
             train_instance_count=1,
@@ -89,7 +81,7 @@ def test_async_factorization_machines(sagemaker_session, cpu_instance_type):
 
         # training labels must be 'float32'
         fm.fit(
-            fm.record_set(train_set[0][:200], train_set[1][:200].astype("float32")),
+            fm.record_set(training_set[0][:200], training_set[1][:200].astype("float32")),
             job_name=job_name,
             wait=False,
         )
@@ -106,7 +98,7 @@ def test_async_factorization_machines(sagemaker_session, cpu_instance_type):
             estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
         predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(training_set[0][:10])
 
         assert len(result) == 10
         for record in result:
diff --git a/tests/integ/test_kmeans.py b/tests/integ/test_kmeans.py
index 33cfeeaee7..e4f071efd6 100644
--- a/tests/integ/test_kmeans.py
+++ b/tests/integ/test_kmeans.py
@@ -12,29 +12,25 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
 import json
-import os
-import pickle
 import time
 
 import pytest
 
 from sagemaker import KMeans, KMeansModel
 from sagemaker.utils import unique_name_from_base
-from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
+from tests.integ import datasets, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
-def test_kmeans(sagemaker_session, cpu_instance_type):
-    job_name = unique_name_from_base("kmeans")
-    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
+@pytest.fixture
+def training_set():
+    return datasets.one_p_mnist()
 
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
 
+def test_kmeans(sagemaker_session, cpu_instance_type, training_set):
+    job_name = unique_name_from_base("kmeans")
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         kmeans = KMeans(
             role="SageMakerRole",
             train_instance_count=1,
@@ -67,14 +63,14 @@ def test_kmeans(sagemaker_session, cpu_instance_type):
             force_dense="True",
         )
 
-        kmeans.fit(kmeans.record_set(train_set[0][:100]), job_name=job_name)
+        kmeans.fit(kmeans.record_set(training_set[0][:100]), job_name=job_name)
 
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         model = KMeansModel(
             kmeans.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
         predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(training_set[0][:10])
 
         assert len(result) == 10
         for record in result:
@@ -87,16 +83,10 @@ def test_kmeans(sagemaker_session, cpu_instance_type):
     assert "Could not find model" in str(exception.value)
 
 
-def test_async_kmeans(sagemaker_session, cpu_instance_type):
+def test_async_kmeans(sagemaker_session, cpu_instance_type, training_set):
     job_name = unique_name_from_base("kmeans")
 
     with timeout(minutes=5):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         kmeans = KMeans(
             role="SageMakerRole",
             train_instance_count=1,
@@ -127,7 +117,7 @@ def test_async_kmeans(sagemaker_session, cpu_instance_type):
             force_dense="True",
         )
 
-        kmeans.fit(kmeans.record_set(train_set[0][:100]), wait=False, job_name=job_name)
+        kmeans.fit(kmeans.record_set(training_set[0][:100]), wait=False, job_name=job_name)
 
         print("Detached from training job. Will re-attach in 20 seconds")
         time.sleep(20)
@@ -139,7 +129,7 @@ def test_async_kmeans(sagemaker_session, cpu_instance_type):
             estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
         predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(training_set[0][:10])
 
         assert len(result) == 10
         for record in result:
diff --git a/tests/integ/test_knn.py b/tests/integ/test_knn.py
index 09ab761c4b..ec05f9fecd 100644
--- a/tests/integ/test_knn.py
+++ b/tests/integ/test_knn.py
@@ -12,27 +12,25 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
-import os
-import pickle
 import time
 
+import pytest
+
 from sagemaker import KNN, KNNModel
 from sagemaker.utils import unique_name_from_base
-from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
+from tests.integ import datasets, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
-def test_knn_regressor(sagemaker_session, cpu_instance_type):
-    job_name = unique_name_from_base("knn")
+@pytest.fixture
+def training_set():
+    return datasets.one_p_mnist()
 
-    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
 
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
+def test_knn_regressor(sagemaker_session, cpu_instance_type, training_set):
+    job_name = unique_name_from_base("knn")
 
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         knn = KNN(
             role="SageMakerRole",
             train_instance_count=1,
@@ -45,14 +43,14 @@ def test_knn_regressor(sagemaker_session, cpu_instance_type):
 
         # training labels must be 'float32'
         knn.fit(
-            knn.record_set(train_set[0][:200], train_set[1][:200].astype("float32")),
+            knn.record_set(training_set[0][:200], training_set[1][:200].astype("float32")),
             job_name=job_name,
         )
 
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         model = KNNModel(knn.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session)
         predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(training_set[0][:10])
 
         assert len(result) == 10
         for record in result:
@@ -63,12 +61,6 @@ def test_async_knn_classifier(sagemaker_session, cpu_instance_type):
     job_name = unique_name_from_base("knn")
 
     with timeout(minutes=5):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         knn = KNN(
             role="SageMakerRole",
             train_instance_count=1,
@@ -83,7 +75,7 @@ def test_async_knn_classifier(sagemaker_session, cpu_instance_type):
 
         # training labels must be 'float32'
         knn.fit(
-            knn.record_set(train_set[0][:200], train_set[1][:200].astype("float32")),
+            knn.record_set(training_set[0][:200], training_set[1][:200].astype("float32")),
             wait=False,
             job_name=job_name,
         )
@@ -98,7 +90,7 @@ def test_async_knn_classifier(sagemaker_session, cpu_instance_type):
             estimator.model_data, role="SageMakerRole", sagemaker_session=sagemaker_session
         )
         predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(training_set[0][:10])
 
         assert len(result) == 10
         for record in result:
diff --git a/tests/integ/test_linear_learner.py b/tests/integ/test_linear_learner.py
index 062e1ca7b5..5d49f77306 100644
--- a/tests/integ/test_linear_learner.py
+++ b/tests/integ/test_linear_learner.py
@@ -12,9 +12,6 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
-import os
-import pickle
 import time
 
 import numpy as np
@@ -22,24 +19,23 @@
 
 from sagemaker.amazon.linear_learner import LinearLearner, LinearLearnerModel
 from sagemaker.utils import unique_name_from_base
-from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
+from tests.integ import datasets, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
+@pytest.fixture
+def training_set():
+    return datasets.one_p_mnist()
+
+
 @pytest.mark.canary_quick
-def test_linear_learner(sagemaker_session, cpu_instance_type):
+def test_linear_learner(sagemaker_session, cpu_instance_type, training_set):
     job_name = unique_name_from_base("linear-learner")
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
-        train_set[1][:100] = 1
-        train_set[1][100:200] = 0
-        train_set = train_set[0], train_set[1].astype(np.dtype("float32"))
+        training_set[1][:100] = 1
+        training_set[1][100:200] = 0
+        training_set = training_set[0], training_set[1].astype(np.dtype("float32"))
 
         ll = LinearLearner(
             "SageMakerRole",
@@ -83,29 +79,23 @@ def test_linear_learner(sagemaker_session, cpu_instance_type):
         ll.huber_delta = 0.1
         ll.early_stopping_tolerance = 0.0001
         ll.early_stopping_patience = 3
-        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]), job_name=job_name)
+        ll.fit(ll.record_set(training_set[0][:200], training_set[1][:200]), job_name=job_name)
 
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         predictor = ll.deploy(1, cpu_instance_type, endpoint_name=job_name)
-        result = predictor.predict(train_set[0][0:100])
+        result = predictor.predict(training_set[0][0:100])
 
         assert len(result) == 100
         for record in result:
             assert record.label["predicted_label"] is not None
             assert record.label["score"] is not None
 
 
-def test_linear_learner_multiclass(sagemaker_session, cpu_instance_type):
+def test_linear_learner_multiclass(sagemaker_session, cpu_instance_type, training_set):
     job_name = unique_name_from_base("linear-learner")
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
-        train_set = train_set[0], train_set[1].astype(np.dtype("float32"))
+        training_set = training_set[0], training_set[1].astype(np.dtype("float32"))
 
         ll = LinearLearner(
             "SageMakerRole",
@@ -117,31 +107,25 @@ def test_linear_learner_multiclass(sagemaker_session, cpu_instance_type):
         )
 
         ll.epochs = 1
-        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]), job_name=job_name)
+        ll.fit(ll.record_set(training_set[0][:200], training_set[1][:200]), job_name=job_name)
 
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         predictor = ll.deploy(1, cpu_instance_type, endpoint_name=job_name)
-        result = predictor.predict(train_set[0][0:100])
+        result = predictor.predict(training_set[0][0:100])
 
         assert len(result) == 100
         for record in result:
             assert record.label["predicted_label"] is not None
             assert record.label["score"] is not None
 
 
-def test_async_linear_learner(sagemaker_session, cpu_instance_type):
+def test_async_linear_learner(sagemaker_session, cpu_instance_type, training_set):
     job_name = unique_name_from_base("linear-learner")
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
-        train_set[1][:100] = 1
-        train_set[1][100:200] = 0
-        train_set = train_set[0], train_set[1].astype(np.dtype("float32"))
+        training_set[1][:100] = 1
+        training_set[1][100:200] = 0
+        training_set = training_set[0], training_set[1].astype(np.dtype("float32"))
 
         ll = LinearLearner(
             "SageMakerRole",
@@ -185,7 +169,11 @@ def test_async_linear_learner(sagemaker_session, cpu_instance_type):
         ll.huber_delta = 0.1
         ll.early_stopping_tolerance = 0.0001
         ll.early_stopping_patience = 3
-        ll.fit(ll.record_set(train_set[0][:200], train_set[1][:200]), wait=False, job_name=job_name)
+        ll.fit(
+            ll.record_set(training_set[0][:200], training_set[1][:200]),
+            wait=False,
+            job_name=job_name,
+        )
 
         print("Waiting to re-attach to the training job: %s" % job_name)
         time.sleep(20)
@@ -199,7 +187,7 @@ def test_async_linear_learner(sagemaker_session, cpu_instance_type):
         )
         predictor = model.deploy(1, cpu_instance_type, endpoint_name=job_name)
 
-        result = predictor.predict(train_set[0][0:100])
+        result = predictor.predict(training_set[0][0:100])
         assert len(result) == 100
         for record in result:
             assert record.label["predicted_label"] is not None
diff --git a/tests/integ/test_pca.py b/tests/integ/test_pca.py
index ff8669e2c7..0787b9f2cd 100644
--- a/tests/integ/test_pca.py
+++ b/tests/integ/test_pca.py
@@ -12,27 +12,25 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
-import os
-import pickle
 import time
 
+import pytest
+
 import sagemaker.amazon.pca
 from sagemaker.utils import unique_name_from_base
-from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
+from tests.integ import datasets, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 
-def test_pca(sagemaker_session, cpu_instance_type):
-    job_name = unique_name_from_base("pca")
+@pytest.fixture
+def training_set():
+    return datasets.one_p_mnist()
 
-    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
 
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
+def test_pca(sagemaker_session, cpu_instance_type, training_set):
+    job_name = unique_name_from_base("pca")
 
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         pca = sagemaker.amazon.pca.PCA(
             role="SageMakerRole",
             train_instance_count=1,
@@ -45,7 +43,7 @@ def test_pca(sagemaker_session, cpu_instance_type):
         pca.algorithm_mode = "randomized"
         pca.subtract_mean = True
         pca.extra_components = 5
-        pca.fit(pca.record_set(train_set[0][:100]), job_name=job_name)
+        pca.fit(pca.record_set(training_set[0][:100]), job_name=job_name)
 
     with timeout_and_delete_endpoint_by_name(job_name, sagemaker_session):
         pca_model = sagemaker.amazon.pca.PCAModel(
@@ -58,23 +56,17 @@ def test_pca(sagemaker_session, cpu_instance_type):
             initial_instance_count=1, instance_type=cpu_instance_type, endpoint_name=job_name
         )
 
-        result = predictor.predict(train_set[0][:5])
+        result = predictor.predict(training_set[0][:5])
 
         assert len(result) == 5
         for record in result:
            assert record.label["projection"] is not None
 
 
-def test_async_pca(sagemaker_session, cpu_instance_type):
+def test_async_pca(sagemaker_session, cpu_instance_type, training_set):
     job_name = unique_name_from_base("pca")
 
     with timeout(minutes=5):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        # Load the data into memory as numpy arrays
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         pca = sagemaker.amazon.pca.PCA(
             role="SageMakerRole",
             train_instance_count=1,
@@ -87,7 +79,7 @@ def test_async_pca(sagemaker_session, cpu_instance_type):
         pca.algorithm_mode = "randomized"
         pca.subtract_mean = True
         pca.extra_components = 5
-        pca.fit(pca.record_set(train_set[0][:100]), wait=False, job_name=job_name)
+        pca.fit(pca.record_set(training_set[0][:100]), wait=False, job_name=job_name)
 
         print("Detached from training job. Will re-attach in 20 seconds")
         time.sleep(20)
@@ -104,7 +96,7 @@ def test_async_pca(sagemaker_session, cpu_instance_type):
             initial_instance_count=1, instance_type=cpu_instance_type, endpoint_name=job_name
         )
 
-        result = predictor.predict(train_set[0][:5])
+        result = predictor.predict(training_set[0][:5])
 
         assert len(result) == 5
         for record in result:
diff --git a/tests/integ/test_record_set.py b/tests/integ/test_record_set.py
index 74664c3752..4dfe59ed6a 100644
--- a/tests/integ/test_record_set.py
+++ b/tests/integ/test_record_set.py
@@ -12,14 +12,10 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
-import os
-import pickle
-
 from six.moves.urllib.parse import urlparse
 
 from sagemaker import KMeans
-from tests.integ import DATA_DIR
+from tests.integ import datasets
 
 
 def test_record_set(sagemaker_session, cpu_instance_type):
@@ -27,9 +23,6 @@ def test_record_set(sagemaker_session, cpu_instance_type):
 
     In particular, test that the objects uploaded to the S3 bucket are encrypted.
     """
-    data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-    with gzip.open(data_path, "rb") as file_object:
-        train_set, _, _ = pickle.load(file_object, encoding="latin1")
     kmeans = KMeans(
         role="SageMakerRole",
         train_instance_count=1,
@@ -37,7 +30,7 @@ def test_record_set(sagemaker_session, cpu_instance_type):
         k=10,
         sagemaker_session=sagemaker_session,
     )
-    record_set = kmeans.record_set(train_set[0][:100], encrypt=True)
+    record_set = kmeans.record_set(datasets.one_p_mnist()[0][:100], encrypt=True)
     parsed_url = urlparse(record_set.s3_data)
     s3_client = sagemaker_session.boto_session.client("s3")
     head = s3_client.head_object(Bucket=parsed_url.netloc, Key=parsed_url.path.lstrip("/"))
diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py
index 78ea74e43e..8711268346 100644
--- a/tests/integ/test_transformer.py
+++ b/tests/integ/test_transformer.py
@@ -12,10 +12,8 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
 import json
 import os
-import pickle
 import time
 
 import pytest
@@ -28,6 +26,7 @@
 from sagemaker.estimator import Estimator
 from sagemaker.utils import unique_name_from_base
 from tests.integ import (
+    datasets,
     DATA_DIR,
     PYTHON_VERSION,
     TRAINING_DEFAULT_TIMEOUT_MINUTES,
@@ -107,13 +106,6 @@ def test_transform_mxnet(
 
 @pytest.mark.canary_quick
 def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
-    data_path = os.path.join(DATA_DIR, "one_p_mnist")
-
-    # Load the data into memory as numpy arrays
-    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
-    with gzip.open(train_set_path, "rb") as f:
-        train_set, _, _ = pickle.load(f, encoding="latin1")
-
     kmeans = KMeans(
         role="SageMakerRole",
         train_instance_count=1,
@@ -132,14 +124,14 @@ def test_attach_transform_kmeans(sagemaker_session, cpu_instance_type):
     kmeans.half_life_time_size = 1
     kmeans.epochs = 1
 
-    records = kmeans.record_set(train_set[0][:100])
+    records = kmeans.record_set(datasets.one_p_mnist()[0][:100])
 
     job_name = unique_name_from_base("test-kmeans-attach")
 
     with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
         kmeans.fit(records, job_name=job_name)
 
-    transform_input_path = os.path.join(data_path, "transform_input.csv")
+    transform_input_path = os.path.join(DATA_DIR, "one_p_mnist", "transform_input.csv")
     transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
     transform_input = kmeans.sagemaker_session.upload_data(
         path=transform_input_path, key_prefix=transform_input_key_prefix
@@ -229,14 +221,8 @@ def test_transform_mxnet_tags(
 
 
 def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
-    data_path = os.path.join(DATA_DIR, "one_p_mnist")
     tags = [{"Key": "some-tag", "Value": "value-for-tag"}]
 
-    # Load the data into memory as numpy arrays
-    train_set_path = os.path.join(data_path, "mnist.pkl.gz")
-    with gzip.open(train_set_path, "rb") as f:
-        train_set, _, _ = pickle.load(f, encoding="latin1")
-
     kmeans = KMeans(
         role="SageMakerRole",
         train_instance_count=1,
@@ -255,7 +241,7 @@ def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
     kmeans.half_life_time_size = 1
     kmeans.epochs = 1
 
-    records = kmeans.record_set(train_set[0][:100])
+    records = kmeans.record_set(datasets.one_p_mnist()[0][:100])
 
     job_name = unique_name_from_base("test-kmeans-attach")
 
@@ -265,7 +251,7 @@ def test_transform_byo_estimator(sagemaker_session, cpu_instance_type):
     estimator = Estimator.attach(training_job_name=job_name, sagemaker_session=sagemaker_session)
     estimator._enable_network_isolation = True
 
-    transform_input_path = os.path.join(data_path, "transform_input.csv")
+    transform_input_path = os.path.join(DATA_DIR, "one_p_mnist", "transform_input.csv")
     transform_input_key_prefix = "integ-test-data/one_p_mnist/transform"
     transform_input = kmeans.sagemaker_session.upload_data(
         path=transform_input_path, key_prefix=transform_input_key_prefix
diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py
index c0ae640038..ee00f5cdf4 100644
--- a/tests/integ/test_tuner.py
+++ b/tests/integ/test_tuner.py
@@ -12,21 +12,15 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
 import json
 import os
-import pickle
 import time
 
 import numpy as np
 import pytest
-import tests.integ
 from botocore.exceptions import ClientError
 
-from tests.integ import DATA_DIR, PYTHON_VERSION, TUNING_DEFAULT_TIMEOUT_MINUTES
-from tests.integ.record_set import prepare_record_set_from_local_files
-from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
-from tests.integ import vpc_test_utils
-
+import tests.integ
 from sagemaker import KMeans, LDA, RandomCutForest
 from sagemaker.amazon.amazon_estimator import get_image_uri
 from sagemaker.amazon.common import read_records
@@ -47,19 +41,22 @@
     create_identical_dataset_and_algorithm_tuner,
 )
 from sagemaker.utils import unique_name_from_base
+from tests.integ import (
+    datasets,
+    vpc_test_utils,
+    DATA_DIR,
+    PYTHON_VERSION,
+    TUNING_DEFAULT_TIMEOUT_MINUTES,
+)
+from tests.integ.record_set import prepare_record_set_from_local_files
+from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 DATA_PATH = os.path.join(DATA_DIR, "iris", "data")
 
 
 @pytest.fixture(scope="module")
 def kmeans_train_set(sagemaker_session):
-    data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-    # Load the data into memory as numpy arrays
-    with gzip.open(data_path, "rb") as f:
-        train_set, _, _ = pickle.load(f, encoding="latin1")
-
-    return train_set
+    return datasets.one_p_mnist()
 
 
 @pytest.fixture(scope="module")
@@ -382,8 +379,10 @@ def test_create_transfer_learning_tuner(
 def test_tuning_kmeans_identical_dataset_algorithm_tuner_from_non_terminal_parent(
     sagemaker_session, kmeans_train_set, kmeans_estimator, hyperparameter_ranges
 ):
-    """Tests Identical dataset and algorithm use case with one non terminal parent and child job launched with
-    .identical_dataset_and_algorithm_tuner() """
+    """Tests Identical dataset and algorithm use case with
+    one non terminal parent and child job launched with
+    .identical_dataset_and_algorithm_tuner()
+    """
     parent_tuning_job_name = unique_name_from_base("km-non-term", max_length=32)
     child_tuning_job_name = unique_name_from_base("km-non-term-child", max_length=32)
 
@@ -845,11 +844,6 @@ def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
     training_data_path = os.path.join(DATA_DIR, "dummy_tensor")
 
     with timeout(minutes=TUNING_DEFAULT_TIMEOUT_MINUTES):
-        data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
-        with gzip.open(data_path, "rb") as f:
-            train_set, _, _ = pickle.load(f, encoding="latin1")
-
         prefix = "test_byo_estimator"
         key = "recordio-pb-data"
         s3_train_data = sagemaker_session.upload_data(
@@ -896,7 +890,7 @@ def test_tuning_byo_estimator(sagemaker_session, cpu_instance_type):
         predictor.content_type = "application/json"
         predictor.deserializer = json_deserializer
 
-        result = predictor.predict(train_set[0][:10])
+        result = predictor.predict(datasets.one_p_mnist()[0][:10])
 
         assert len(result["predictions"]) == 10
         for prediction in result["predictions"]:
diff --git a/tests/integ/test_tuner_multi_algo.py b/tests/integ/test_tuner_multi_algo.py
index 3fc97d5055..ad933075e1 100644
--- a/tests/integ/test_tuner_multi_algo.py
+++ b/tests/integ/test_tuner_multi_algo.py
@@ -12,10 +12,8 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
-import gzip
 import json
 import os
-import pickle
 
 import pytest
 
@@ -26,13 +24,11 @@
 from sagemaker.estimator import Estimator
 from sagemaker.predictor import json_deserializer
 from sagemaker.tuner import ContinuousParameter, IntegerParameter, HyperparameterTuner
-from tests.integ import DATA_DIR, TUNING_DEFAULT_TIMEOUT_MINUTES
+from tests.integ import datasets, DATA_DIR, TUNING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout, timeout_and_delete_endpoint_by_name
 
 BASE_TUNING_JOB_NAME = "multi-algo-pysdk"
 
-DATA_PATH = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
-
 EXECUTION_ROLE = "SageMakerRole"
 
 STRATEGY = "Bayesian"
@@ -62,9 +58,7 @@
 
 @pytest.fixture(scope="module")
 def data_set():
-    with gzip.open(DATA_PATH, "rb") as f:
-        data_set, _, _ = pickle.load(f, encoding="latin1")
-    return data_set
+    return datasets.one_p_mnist()
 
 
 @pytest.fixture(scope="function")
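A design note on the fixtures introduced in PATCH 3/5: most test modules declare training_set with pytest's default function scope, while test_tuner.py keeps its kmeans_train_set fixture module-scoped. The distinction matters because some tests mutate the loaded arrays in place (test_linear_learner.py reassigns label slices before training), and a function-scoped fixture reloads the data for every test, so those mutations cannot leak between tests. A sketch of the two variants as they appear in the patch:

    import pytest

    from tests.integ import datasets

    # Function scope (the default): reloaded for each test, so in-place
    # edits such as training_set[1][:100] = 1 stay local to one test.
    @pytest.fixture
    def training_set():
        return datasets.one_p_mnist()

    # Module scope: loaded once per module. Cheaper, but only safe while
    # every test in the module treats the shared data as read-only.
    @pytest.fixture(scope="module")
    def kmeans_train_set(sagemaker_session):
        return datasets.one_p_mnist()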
From 3dacff65b87943828cc347956ba503a8347ffb91 Mon Sep 17 00:00:00 2001
From: Lauren Yu <6631887+laurenyu@users.noreply.github.com>
Date: Fri, 19 Jun 2020 17:14:22 -0700
Subject: [PATCH 4/5] add file

---
 tests/integ/datasets.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)
 create mode 100644 tests/integ/datasets.py

diff --git a/tests/integ/datasets.py b/tests/integ/datasets.py
new file mode 100644
index 0000000000..a68b1ee648
--- /dev/null
+++ b/tests/integ/datasets.py
@@ -0,0 +1,27 @@
+# Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import gzip
+import os
+import pickle
+
+from tests.integ import DATA_DIR
+
+
+def one_p_mnist():
+    data_path = os.path.join(DATA_DIR, "one_p_mnist", "mnist.pkl.gz")
+    with gzip.open(data_path, "rb") as f:
+        training_set, _, _ = pickle.load(f, encoding="latin1")
+
+    return training_set

From 9d9408f7f214f8fbdb1ddd2773ba65ca44bfb683 Mon Sep 17 00:00:00 2001
From: Lauren Yu <6631887+laurenyu@users.noreply.github.com>
Date: Fri, 19 Jun 2020 17:55:34 -0700
Subject: [PATCH 5/5] fix fixture

---
 tests/integ/test_knn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integ/test_knn.py b/tests/integ/test_knn.py
index ec05f9fecd..7f9b9854e9 100644
--- a/tests/integ/test_knn.py
+++ b/tests/integ/test_knn.py
@@ -57,7 +57,7 @@ def test_knn_regressor(sagemaker_session, cpu_instance_type, training_set):
         assert record.label["score"] is not None
 
 
-def test_async_knn_classifier(sagemaker_session, cpu_instance_type):
+def test_async_knn_classifier(sagemaker_session, cpu_instance_type, training_set):
     job_name = unique_name_from_base("knn")
 
     with timeout(minutes=5):
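For reference, the helper added in PATCH 4/5 returns only the training split of the pickle's standard (training, validation, test) triple, as a (features, labels) pair of arrays, and the tests index it accordingly. A short usage sketch; the slice size mirrors the tests, and nothing beyond what the patch shows is assumed:

    from tests.integ import datasets

    training_set = datasets.one_p_mnist()
    features, labels = training_set[0], training_set[1]

    # The integ tests train on small slices, e.g. the first 100 records:
    records = features[:100]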