diff --git a/src/sagemaker/tensorflow/estimator.py b/src/sagemaker/tensorflow/estimator.py index 71b4138875..da5bf886fa 100644 --- a/src/sagemaker/tensorflow/estimator.py +++ b/src/sagemaker/tensorflow/estimator.py @@ -29,27 +29,18 @@ logger = logging.getLogger("sagemaker") -# TODO: consider creating a function for generating this command before removing this constant -_SCRIPT_MODE_TENSORBOARD_WARNING = ( - "Tensorboard is not supported with script mode. You can run the following " - "command: tensorboard --logdir %s --host localhost --port 6006 This can be " - "run from anywhere with access to the S3 URI used as the logdir." -) - class TensorFlow(Framework): """Handle end-to-end training and deployment of user-provided TensorFlow code.""" __framework_name__ = "tensorflow" - _SCRIPT_MODE_REPO_NAME = "tensorflow-scriptmode" + _ECR_REPO_NAME = "tensorflow-scriptmode" LATEST_VERSION = defaults.LATEST_VERSION _LATEST_1X_VERSION = "1.15.2" _HIGHEST_LEGACY_MODE_ONLY_VERSION = version.Version("1.10.0") - _LOWEST_SCRIPT_MODE_ONLY_VERSION = version.Version("1.13.1") - _HIGHEST_PYTHON_2_VERSION = version.Version("2.1.0") def __init__( @@ -59,7 +50,6 @@ def __init__( model_dir=None, image_name=None, distributions=None, - script_mode=True, **kwargs ): """Initialize a ``TensorFlow`` estimator. @@ -82,6 +72,8 @@ def __init__( * *Local Mode with local sources (file:// instead of s3://)* - \ ``/opt/ml/shared/model`` + To disable having ``model_dir`` passed to your training script, + set ``model_dir=False``. image_name (str): If specified, the estimator will use this image for training and hosting, instead of selecting the appropriate SageMaker official image based on framework_version and py_version. It can be an ECR url or dockerhub image and tag. @@ -114,8 +106,6 @@ def __init__( } } - script_mode (bool): Whether or not to use the Script Mode TensorFlow images - (default: True). **kwargs: Additional kwargs passed to the Framework constructor. .. tip:: @@ -154,7 +144,6 @@ def __init__( self.model_dir = model_dir self.distributions = distributions or {} - self._script_mode_enabled = script_mode self._validate_args(py_version=py_version, framework_version=self.framework_version) def _validate_args(self, py_version, framework_version): @@ -171,30 +160,29 @@ def _validate_args(self, py_version, framework_version): ) raise AttributeError(msg) - if (not self._script_mode_enabled) and self._only_script_mode_supported(): - logger.warning( - "Legacy mode is deprecated in versions 1.13 and higher. Using script mode instead." + if self._only_legacy_mode_supported() and self.image_name is None: + legacy_image_uri = fw.create_image_uri( + self.sagemaker_session.boto_region_name, + "tensorflow", + self.train_instance_type, + self.framework_version, + self.py_version, ) - self._script_mode_enabled = True - if self._only_legacy_mode_supported(): # TODO: add link to docs to explain how to use legacy mode with v2 - logger.warning( - "TF %s supports only legacy mode. If you were using any legacy mode parameters " + msg = ( + "TF {} supports only legacy mode. Please supply the image URI directly with " + "'image_name={}' and set 'model_dir=False'. If you are using any legacy parameters " "(training_steps, evaluation_steps, checkpoint_path, requirements_file), " - "make sure to pass them directly as hyperparameters instead.", - self.framework_version, - ) - self._script_mode_enabled = False + "make sure to pass them directly as hyperparameters instead." + ).format(self.framework_version, legacy_image_uri) + + raise ValueError(msg) def _only_legacy_mode_supported(self): """Placeholder docstring""" return version.Version(self.framework_version) <= self._HIGHEST_LEGACY_MODE_ONLY_VERSION - def _only_script_mode_supported(self): - """Placeholder docstring""" - return version.Version(self.framework_version) >= self._LOWEST_SCRIPT_MODE_ONLY_VERSION - def _only_python_3_supported(self): """Placeholder docstring""" return version.Version(self.framework_version) > self._HIGHEST_PYTHON_2_VERSION @@ -214,10 +202,6 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na job_details, model_channel_name ) - model_dir = init_params["hyperparameters"].pop("model_dir", None) - if model_dir is not None: - init_params["model_dir"] = model_dir - image_name = init_params.pop("image") framework, py_version, tag, script_mode = fw.framework_name_from_image(image_name) if not framework: @@ -226,8 +210,11 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na init_params["image_name"] = image_name return init_params - if script_mode is None: - init_params["script_mode"] = False + model_dir = init_params["hyperparameters"].pop("model_dir", None) + if model_dir: + init_params["model_dir"] = model_dir + elif script_mode is None: + init_params["model_dir"] = False init_params["py_version"] = py_version @@ -239,6 +226,10 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na "1.4" if tag == "1.0" else fw.framework_version_from_tag(tag) ) + # Legacy images are required to be passed in explicitly. + if not script_mode: + init_params["image_name"] = image_name + training_job_name = init_params["base_job_name"] if framework != cls.__framework_name__: raise ValueError( @@ -309,27 +300,26 @@ def hyperparameters(self): hyperparameters = super(TensorFlow, self).hyperparameters() additional_hyperparameters = {} - if self._script_mode_enabled: - mpi_enabled = False - - if "parameter_server" in self.distributions: - ps_enabled = self.distributions["parameter_server"].get("enabled", False) - additional_hyperparameters[self.LAUNCH_PS_ENV_NAME] = ps_enabled + if "parameter_server" in self.distributions: + ps_enabled = self.distributions["parameter_server"].get("enabled", False) + additional_hyperparameters[self.LAUNCH_PS_ENV_NAME] = ps_enabled - if "mpi" in self.distributions: - mpi_dict = self.distributions["mpi"] - mpi_enabled = mpi_dict.get("enabled", False) - additional_hyperparameters[self.LAUNCH_MPI_ENV_NAME] = mpi_enabled + mpi_enabled = False + if "mpi" in self.distributions: + mpi_dict = self.distributions["mpi"] + mpi_enabled = mpi_dict.get("enabled", False) + additional_hyperparameters[self.LAUNCH_MPI_ENV_NAME] = mpi_enabled - if mpi_dict.get("processes_per_host"): - additional_hyperparameters[self.MPI_NUM_PROCESSES_PER_HOST] = mpi_dict.get( - "processes_per_host" - ) - - additional_hyperparameters[self.MPI_CUSTOM_MPI_OPTIONS] = mpi_dict.get( - "custom_mpi_options", "" + if mpi_dict.get("processes_per_host"): + additional_hyperparameters[self.MPI_NUM_PROCESSES_PER_HOST] = mpi_dict.get( + "processes_per_host" ) + additional_hyperparameters[self.MPI_CUSTOM_MPI_OPTIONS] = mpi_dict.get( + "custom_mpi_options", "" + ) + + if self.model_dir is not False: self.model_dir = self.model_dir or self._default_s3_path("model", mpi=mpi_enabled) additional_hyperparameters["model_dir"] = self.model_dir @@ -375,16 +365,13 @@ def train_image(self): if self.image_name: return self.image_name - if self._script_mode_enabled: - return fw.create_image_uri( - self.sagemaker_session.boto_region_name, - self._SCRIPT_MODE_REPO_NAME, - self.train_instance_type, - self.framework_version, - self.py_version, - ) - - return super(TensorFlow, self).train_image() + return fw.create_image_uri( + self.sagemaker_session.boto_region_name, + self._ECR_REPO_NAME, + self.train_instance_type, + self.framework_version, + self.py_version, + ) def transformer( self, diff --git a/tests/conftest.py b/tests/conftest.py index 22bbac9603..6c9db6d5d8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -194,6 +194,21 @@ def xgboost_version(request): "1.9.0", "1.10", "1.10.0", + "1.11", + "1.11.0", + "1.12", + "1.12.0", + "1.13", + "1.14", + "1.14.0", + "1.15", + "1.15.0", + "1.15.2", + "2.0", + "2.0.0", + "2.0.1", + "2.1", + "2.1.0", ], ) def tf_version(request): diff --git a/tests/integ/test_airflow_config.py b/tests/integ/test_airflow_config.py index 25439f67af..7ce685b336 100644 --- a/tests/integ/test_airflow_config.py +++ b/tests/integ/test_airflow_config.py @@ -561,7 +561,6 @@ def test_tf_airflow_config_uploads_data_source_to_s3(sagemaker_session, cpu_inst train_instance_count=SINGLE_INSTANCE_COUNT, train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, - script_mode=True, framework_version=TensorFlow.LATEST_VERSION, py_version=PYTHON_VERSION, metric_definitions=[ diff --git a/tests/integ/test_horovod.py b/tests/integ/test_horovod.py index ede3d12f2d..cc37532ef3 100644 --- a/tests/integ/test_horovod.py +++ b/tests/integ/test_horovod.py @@ -58,7 +58,6 @@ def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdi train_instance_type="local", sagemaker_session=sagemaker_local_session, py_version=integ.PYTHON_VERSION, - script_mode=True, output_path=output_path, framework_version="1.12", distributions={"mpi": {"enabled": True, "processes_per_host": processes}}, @@ -106,7 +105,6 @@ def _create_and_fit_estimator(sagemaker_session, instance_type, tmpdir): train_instance_type=instance_type, sagemaker_session=sagemaker_session, py_version=integ.PYTHON_VERSION, - script_mode=True, framework_version="1.12", distributions={"mpi": {"enabled": True}}, ) diff --git a/tests/integ/test_tf_script_mode.py b/tests/integ/test_tf.py similarity index 98% rename from tests/integ/test_tf_script_mode.py rename to tests/integ/test_tf.py index a6326bccd9..82d88e748b 100644 --- a/tests/integ/test_tf_script_mode.py +++ b/tests/integ/test_tf.py @@ -59,7 +59,6 @@ def test_mnist_with_checkpoint_config( train_instance_count=1, train_instance_type=instance_type, sagemaker_session=sagemaker_session, - script_mode=True, framework_version=tf_full_version, py_version=py_version, metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}], @@ -104,7 +103,6 @@ def test_server_side_encryption(sagemaker_session, tf_full_version, py_version): train_instance_count=1, train_instance_type="ml.c5.xlarge", sagemaker_session=sagemaker_session, - script_mode=True, framework_version=tf_full_version, py_version=py_version, code_location=output_path, @@ -141,7 +139,6 @@ def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version, py train_instance_type=instance_type, sagemaker_session=sagemaker_session, py_version=py_version, - script_mode=True, framework_version=tf_full_version, distributions=PARAMETER_SERVER_DISTRIBUTION, ) @@ -166,7 +163,6 @@ def test_mnist_async(sagemaker_session, cpu_instance_type, tf_full_version, py_v train_instance_type="ml.c5.4xlarge", py_version=tests.integ.PYTHON_VERSION, sagemaker_session=sagemaker_session, - script_mode=True, # testing py-sdk functionality, no need to run against all TF versions framework_version=TensorFlow.LATEST_VERSION, tags=TAGS, @@ -209,7 +205,6 @@ def test_deploy_with_input_handlers(sagemaker_session, instance_type, tf_full_ve train_instance_type=instance_type, py_version=py_version, sagemaker_session=sagemaker_session, - script_mode=True, framework_version=tf_full_version, tags=TAGS, ) diff --git a/tests/integ/test_tf_efs_fsx.py b/tests/integ/test_tf_efs_fsx.py index a3a4e098cf..fb085cfe1f 100644 --- a/tests/integ/test_tf_efs_fsx.py +++ b/tests/integ/test_tf_efs_fsx.py @@ -65,7 +65,6 @@ def test_mnist_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): train_instance_count=1, train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, - script_mode=True, framework_version=TensorFlow.LATEST_VERSION, py_version=PY_VERSION, subnets=subnets, @@ -105,7 +104,6 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type): train_instance_count=1, train_instance_type=cpu_instance_type, sagemaker_session=sagemaker_session, - script_mode=True, framework_version=TensorFlow.LATEST_VERSION, py_version=PY_VERSION, subnets=subnets, @@ -130,7 +128,7 @@ def test_mnist_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type): tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, reason="EFS integration tests need to be fixed before running in all regions.", ) -def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): +def test_tuning_tf_efs(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup["role_name"] subnets = [efs_fsx_setup["subnet_id"]] security_group_ids = efs_fsx_setup["security_group_ids"] @@ -140,7 +138,6 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instanc role=role, train_instance_count=1, train_instance_type=cpu_instance_type, - script_mode=True, sagemaker_session=sagemaker_session, py_version=PY_VERSION, framework_version=TensorFlow.LATEST_VERSION, @@ -178,7 +175,7 @@ def test_tuning_tf_script_mode_efs(efs_fsx_setup, sagemaker_session, cpu_instanc tests.integ.test_region() not in tests.integ.EFS_TEST_ENABLED_REGION, reason="EFS integration tests need to be fixed before running in all regions.", ) -def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type): +def test_tuning_tf_lustre(efs_fsx_setup, sagemaker_session, cpu_instance_type): role = efs_fsx_setup["role_name"] subnets = [efs_fsx_setup["subnet_id"]] security_group_ids = efs_fsx_setup["security_group_ids"] @@ -188,7 +185,6 @@ def test_tuning_tf_script_mode_lustre(efs_fsx_setup, sagemaker_session, cpu_inst role=role, train_instance_count=1, train_instance_type=cpu_instance_type, - script_mode=True, sagemaker_session=sagemaker_session, py_version=PY_VERSION, framework_version=TensorFlow.LATEST_VERSION, diff --git a/tests/integ/test_transformer.py b/tests/integ/test_transformer.py index 8eec086a11..c253596e53 100644 --- a/tests/integ/test_transformer.py +++ b/tests/integ/test_transformer.py @@ -352,7 +352,6 @@ def test_transform_tf_kms_network_isolation(sagemaker_session, cpu_instance_type train_instance_count=1, train_instance_type=cpu_instance_type, framework_version=TensorFlow.LATEST_VERSION, - script_mode=True, py_version=PYTHON_VERSION, sagemaker_session=sagemaker_session, ) diff --git a/tests/integ/test_tuner.py b/tests/integ/test_tuner.py index 705fbde56f..b646b930a6 100644 --- a/tests/integ/test_tuner.py +++ b/tests/integ/test_tuner.py @@ -599,7 +599,6 @@ def test_tuning_tf_script_mode(sagemaker_session, cpu_instance_type, tf_full_ver role="SageMakerRole", train_instance_count=1, train_instance_type=cpu_instance_type, - script_mode=True, sagemaker_session=sagemaker_session, py_version=PYTHON_VERSION, framework_version=tf_full_version, diff --git a/tests/unit/sagemaker/tensorflow/test_estimator.py b/tests/unit/sagemaker/tensorflow/test_estimator.py new file mode 100644 index 0000000000..1fe9dee1f6 --- /dev/null +++ b/tests/unit/sagemaker/tensorflow/test_estimator.py @@ -0,0 +1,495 @@ +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import json +import logging +import os + +from mock import patch, Mock, MagicMock +from packaging import version +import pytest + +from sagemaker.estimator import _TrainingJob +from sagemaker.tensorflow import defaults, serving, TensorFlow +from tests.unit import DATA_DIR + +SCRIPT_FILE = "dummy_script.py" +SCRIPT_PATH = os.path.join(DATA_DIR, SCRIPT_FILE) +SERVING_SCRIPT_FILE = "another_dummy_script.py" +TIMESTAMP = "2017-11-06-14:14:15.673" +TIME = 1510006209.073025 +BUCKET_NAME = "mybucket" +INSTANCE_COUNT = 1 +INSTANCE_TYPE = "ml.c4.4xlarge" +JOB_NAME = "sagemaker-tensorflow-scriptmode-{}".format(TIMESTAMP) +ROLE = "Dummy" +REGION = "us-west-2" +IMAGE_URI_FORMAT_STRING = ( + "520713654638.dkr.ecr.{}.amazonaws.com/sagemaker-tensorflow-scriptmode:{}-cpu-{}" +) +DISTRIBUTION_ENABLED = {"parameter_server": {"enabled": True}} +DISTRIBUTION_MPI_ENABLED = { + "mpi": {"enabled": True, "custom_mpi_options": "options", "processes_per_host": 2} +} + +ENDPOINT_DESC = {"EndpointConfigName": "test-endpoint"} + +ENDPOINT_CONFIG_DESC = {"ProductionVariants": [{"ModelName": "model-1"}, {"ModelName": "model-2"}]} + +LIST_TAGS_RESULT = {"Tags": [{"Key": "TagtestKey", "Value": "TagtestValue"}]} + +EXPERIMENT_CONFIG = { + "ExperimentName": "exp", + "TrialName": "trial", + "TrialComponentDisplayName": "tc", +} + + +@pytest.fixture() +def sagemaker_session(): + boto_mock = Mock(name="boto_session", region_name=REGION) + session = Mock( + name="sagemaker_session", + boto_session=boto_mock, + boto_region_name=REGION, + config=None, + local_mode=False, + s3_resource=None, + s3_client=None, + ) + session.default_bucket = Mock(name="default_bucket", return_value=BUCKET_NAME) + session.expand_role = Mock(name="expand_role", return_value=ROLE) + describe = {"ModelArtifacts": {"S3ModelArtifacts": "s3://m/m.tar.gz"}} + session.sagemaker_client.describe_training_job = Mock(return_value=describe) + session.sagemaker_client.describe_endpoint = Mock(return_value=ENDPOINT_DESC) + session.sagemaker_client.describe_endpoint_config = Mock(return_value=ENDPOINT_CONFIG_DESC) + session.sagemaker_client.list_tags = Mock(return_value=LIST_TAGS_RESULT) + return session + + +def _image_uri(tf_version, py_version): + return IMAGE_URI_FORMAT_STRING.format(REGION, tf_version, py_version) + + +def _hyperparameters(horovod=False): + hps = { + "sagemaker_program": json.dumps("dummy_script.py"), + "sagemaker_submit_directory": json.dumps( + "s3://{}/{}/source/sourcedir.tar.gz".format(BUCKET_NAME, JOB_NAME) + ), + "sagemaker_enable_cloudwatch_metrics": "false", + "sagemaker_container_log_level": str(logging.INFO), + "sagemaker_job_name": json.dumps(JOB_NAME), + "sagemaker_region": json.dumps("us-west-2"), + } + + if horovod: + hps["model_dir"] = json.dumps("/opt/ml/model") + else: + hps["model_dir"] = json.dumps("s3://{}/{}/model".format(BUCKET_NAME, JOB_NAME)) + + return hps + + +def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2"): + conf = { + "image": _image_uri(tf_version, py_version), + "input_mode": "File", + "input_config": [ + { + "ChannelName": "training", + "DataSource": { + "S3DataSource": { + "S3DataDistributionType": "FullyReplicated", + "S3DataType": "S3Prefix", + } + }, + } + ], + "role": ROLE, + "job_name": JOB_NAME, + "output_config": {"S3OutputPath": "s3://{}/".format(BUCKET_NAME)}, + "resource_config": { + "InstanceType": "ml.c4.4xlarge", + "InstanceCount": 1, + "VolumeSizeInGB": 30, + }, + "hyperparameters": _hyperparameters(horovod), + "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "tags": None, + "vpc_config": None, + "metric_definitions": None, + "experiment_config": None, + } + + if not ps: + conf["debugger_hook_config"] = { + "CollectionConfigurations": [], + "S3OutputPath": "s3://{}/".format(BUCKET_NAME), + } + + return conf + + +def _build_tf( + sagemaker_session, + framework_version=defaults.TF_VERSION, + train_instance_type=None, + base_job_name=None, + **kwargs +): + return TensorFlow( + entry_point=SCRIPT_PATH, + framework_version=framework_version, + role=ROLE, + sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, + train_instance_type=train_instance_type if train_instance_type else INSTANCE_TYPE, + base_job_name=base_job_name, + **kwargs + ) + + +def test_create_model(sagemaker_session, tf_version): + if version.Version(tf_version) < version.Version("1.11"): + pytest.skip( + "Legacy TF version requires explicit image URI, and " + "this logic is tested in test_create_model_with_custom_image." + ) + + container_log_level = '"logging.INFO"' + source_dir = "s3://mybucket/source" + tf = TensorFlow( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, + framework_version=tf_version, + container_log_level=container_log_level, + base_job_name="job", + source_dir=source_dir, + enable_network_isolation=True, + ) + + job_name = "doing something" + tf.fit(inputs="s3://mybucket/train", job_name=job_name) + model = tf.create_model() + + assert model.sagemaker_session == sagemaker_session + assert model._framework_version == tf_version + assert model.entry_point is None + assert model.role == ROLE + assert model.name == job_name + assert model._container_log_level == container_log_level + assert model.source_dir is None + assert model.vpc_config is None + assert model.enable_network_isolation() + + +def test_create_model_with_optional_params(sagemaker_session): + container_log_level = '"logging.INFO"' + source_dir = "s3://mybucket/source" + enable_cloudwatch_metrics = "true" + tf = TensorFlow( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, + container_log_level=container_log_level, + base_job_name="job", + source_dir=source_dir, + enable_cloudwatch_metrics=enable_cloudwatch_metrics, + ) + + job_name = "doing something" + tf.fit(inputs="s3://mybucket/train", job_name=job_name) + + new_role = "role" + vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]} + model_name = "model-name" + model = tf.create_model( + role=new_role, + vpc_config_override=vpc_config, + entry_point=SERVING_SCRIPT_FILE, + name=model_name, + enable_network_isolation=True, + ) + + assert model.role == new_role + assert model.vpc_config == vpc_config + assert model.entry_point == SERVING_SCRIPT_FILE + assert model.name == model_name + assert model.enable_network_isolation() + + +def test_create_model_with_custom_image(sagemaker_session): + container_log_level = '"logging.INFO"' + source_dir = "s3://mybucket/source" + custom_image = "tensorflow:1.0" + tf = TensorFlow( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, + image_name=custom_image, + container_log_level=container_log_level, + base_job_name="job", + source_dir=source_dir, + ) + + job_name = "doing something" + tf.fit(inputs="s3://mybucket/train", job_name=job_name) + model = tf.create_model() + + assert model.image == custom_image + + +@patch("sagemaker.tensorflow.estimator.TensorFlow.create_model") +def test_transformer_creation_with_optional_args(create_model, sagemaker_session): + model = Mock() + create_model.return_value = model + + tf = TensorFlow( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, + ) + tf.latest_training_job = _TrainingJob(sagemaker_session, "some-job-name") + + strategy = "SingleRecord" + assemble_with = "Line" + output_path = "s3://{}/batch-output".format(BUCKET_NAME) + kms_key = "kms" + accept_type = "text/bytes" + env = {"foo": "bar"} + max_concurrent_transforms = 3 + max_payload = 100 + tags = {"Key": "foo", "Value": "bar"} + new_role = "role" + vpc_config = {"Subnets": ["1234"], "SecurityGroupIds": ["5678"]} + model_name = "model-name" + + tf.transformer( + INSTANCE_COUNT, + INSTANCE_TYPE, + strategy=strategy, + assemble_with=assemble_with, + output_path=output_path, + output_kms_key=kms_key, + accept=accept_type, + env=env, + max_concurrent_transforms=max_concurrent_transforms, + max_payload=max_payload, + tags=tags, + role=new_role, + volume_kms_key=kms_key, + entry_point=SERVING_SCRIPT_FILE, + vpc_config_override=vpc_config, + enable_network_isolation=True, + model_name=model_name, + ) + + create_model.assert_called_with( + role=new_role, + vpc_config_override=vpc_config, + entry_point=SERVING_SCRIPT_FILE, + enable_network_isolation=True, + name=model_name, + ) + model.transformer.assert_called_with( + INSTANCE_COUNT, + INSTANCE_TYPE, + accept=accept_type, + assemble_with=assemble_with, + env=env, + max_concurrent_transforms=max_concurrent_transforms, + max_payload=max_payload, + output_kms_key=kms_key, + output_path=output_path, + strategy=strategy, + tags=tags, + volume_kms_key=kms_key, + ) + + +@patch("sagemaker.tensorflow.estimator.TensorFlow.create_model") +def test_transformer_creation_without_optional_args(create_model, sagemaker_session): + model = Mock() + create_model.return_value = model + + tf = TensorFlow( + entry_point=SCRIPT_PATH, + role=ROLE, + sagemaker_session=sagemaker_session, + train_instance_count=INSTANCE_COUNT, + train_instance_type=INSTANCE_TYPE, + ) + tf.latest_training_job = _TrainingJob(sagemaker_session, "some-job-name") + tf.transformer(INSTANCE_COUNT, INSTANCE_TYPE) + + create_model.assert_called_with( + role=ROLE, + vpc_config_override="VPC_CONFIG_DEFAULT", + entry_point=None, + enable_network_isolation=False, + name=None, + ) + model.transformer.assert_called_with( + INSTANCE_COUNT, + INSTANCE_TYPE, + accept=None, + assemble_with=None, + env=None, + max_concurrent_transforms=None, + max_payload=None, + output_kms_key=None, + output_path=None, + strategy=None, + tags=None, + volume_kms_key=None, + ) + + +def test_script_mode_create_model(sagemaker_session): + tf = _build_tf( + sagemaker_session=sagemaker_session, py_version="py3", enable_network_isolation=True + ) + tf._prepare_for_training() # set output_path and job name as if training happened + + model = tf.create_model() + + assert isinstance(model, serving.Model) + + assert model.model_data == tf.model_data + assert model.role == tf.role + assert model.name == tf._current_job_name + assert model.container_log_level == tf.container_log_level + assert model._framework_version == "1.11" + assert model.sagemaker_session == sagemaker_session + assert model.enable_network_isolation() + + +@patch("time.strftime", return_value=TIMESTAMP) +@patch("time.time", return_value=TIME) +@patch("sagemaker.utils.create_tar_file", MagicMock()) +def test_fit(time, strftime, sagemaker_session): + tf = TensorFlow( + entry_point=SCRIPT_FILE, + role=ROLE, + sagemaker_session=sagemaker_session, + py_version="py3", + train_instance_type=INSTANCE_TYPE, + train_instance_count=1, + framework_version="1.11", + source_dir=DATA_DIR, + ) + + inputs = "s3://mybucket/train" + tf.fit(inputs=inputs) + + call_names = [c[0] for c in sagemaker_session.method_calls] + assert call_names == ["train", "logs_for_job"] + + expected_train_args = _create_train_job("1.11", py_version="py3") + expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs + + actual_train_args = sagemaker_session.method_calls[0][2] + assert actual_train_args == expected_train_args + + +@patch("time.strftime", return_value=TIMESTAMP) +@patch("time.time", return_value=TIME) +@patch("sagemaker.utils.create_tar_file", MagicMock()) +def test_fit_ps(time, strftime, sagemaker_session): + tf = TensorFlow( + entry_point=SCRIPT_FILE, + role=ROLE, + sagemaker_session=sagemaker_session, + py_version="py3", + train_instance_type=INSTANCE_TYPE, + train_instance_count=1, + framework_version="1.11", + source_dir=DATA_DIR, + distributions=DISTRIBUTION_ENABLED, + ) + + inputs = "s3://mybucket/train" + tf.fit(inputs=inputs) + + call_names = [c[0] for c in sagemaker_session.method_calls] + assert call_names == ["train", "logs_for_job"] + + expected_train_args = _create_train_job("1.11", ps=True, py_version="py3") + expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs + expected_train_args["hyperparameters"][TensorFlow.LAUNCH_PS_ENV_NAME] = json.dumps(True) + + actual_train_args = sagemaker_session.method_calls[0][2] + assert actual_train_args == expected_train_args + + +@patch("time.strftime", return_value=TIMESTAMP) +@patch("time.time", return_value=TIME) +@patch("sagemaker.utils.create_tar_file", MagicMock()) +def test_fit_mpi(time, strftime, sagemaker_session): + tf = TensorFlow( + entry_point=SCRIPT_FILE, + role=ROLE, + sagemaker_session=sagemaker_session, + py_version="py3", + train_instance_type=INSTANCE_TYPE, + train_instance_count=1, + framework_version="1.11", + source_dir=DATA_DIR, + distributions=DISTRIBUTION_MPI_ENABLED, + ) + + inputs = "s3://mybucket/train" + tf.fit(inputs=inputs) + + call_names = [c[0] for c in sagemaker_session.method_calls] + assert call_names == ["train", "logs_for_job"] + + expected_train_args = _create_train_job("1.11", horovod=True, py_version="py3") + expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs + expected_train_args["hyperparameters"][TensorFlow.LAUNCH_MPI_ENV_NAME] = json.dumps(True) + expected_train_args["hyperparameters"][TensorFlow.MPI_NUM_PROCESSES_PER_HOST] = json.dumps(2) + expected_train_args["hyperparameters"][TensorFlow.MPI_CUSTOM_MPI_OPTIONS] = json.dumps( + "options" + ) + + actual_train_args = sagemaker_session.method_calls[0][2] + assert actual_train_args == expected_train_args + + +def test_hyperparameters_no_model_dir(sagemaker_session): + tf = _build_tf(sagemaker_session, model_dir=False) + hyperparameters = tf.hyperparameters() + assert "model_dir" not in hyperparameters + + +def test_train_image_default(sagemaker_session): + tf = _build_tf(sagemaker_session) + expected_image = _image_uri(defaults.TF_VERSION, "py2") + assert expected_image == tf.train_image() + + +def test_train_image_custom_image(sagemaker_session): + custom_image = "tensorflow:latest" + tf = _build_tf(sagemaker_session, image_name=custom_image) + assert custom_image == tf.train_image() diff --git a/tests/unit/sagemaker/tensorflow/test_estimator_attach.py b/tests/unit/sagemaker/tensorflow/test_estimator_attach.py new file mode 100644 index 0000000000..9b571fe7ce --- /dev/null +++ b/tests/unit/sagemaker/tensorflow/test_estimator_attach.py @@ -0,0 +1,298 @@ +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +import pytest +from mock import MagicMock, Mock, patch + +from sagemaker.tensorflow import TensorFlow + +BUCKET_NAME = "mybucket" +LIST_TAGS_RESULT = {"Tags": [{"Key": "TagtestKey", "Value": "TagtestValue"}]} +REGION = "us-west-2" +ROLE = "Dummy" + + +@pytest.fixture() +def sagemaker_session(): + boto_mock = Mock(name="boto_session", region_name=REGION) + session = Mock( + name="sagemaker_session", + boto_session=boto_mock, + boto_region_name=REGION, + config=None, + local_mode=False, + s3_resource=None, + s3_client=None, + ) + session.default_bucket = Mock(name="default_bucket", return_value=BUCKET_NAME) + session.expand_role = Mock(name="expand_role", return_value=ROLE) + describe = {"ModelArtifacts": {"S3ModelArtifacts": "s3://m/m.tar.gz"}} + session.sagemaker_client.describe_training_job = Mock(return_value=describe) + session.sagemaker_client.list_tags = Mock(return_value=LIST_TAGS_RESULT) + return session + + +@patch("sagemaker.utils.create_tar_file", MagicMock()) +def test_attach(sagemaker_session, tf_version): + training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:{}-cpu-py2".format( + tf_version + ) + rjd = { + "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, + "HyperParameters": { + "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', + "sagemaker_program": '"iris-dnn-classifier.py"', + "sagemaker_enable_cloudwatch_metrics": "false", + "sagemaker_container_log_level": '"logging.INFO"', + "sagemaker_job_name": '"neo"', + }, + "RoleArn": "arn:aws:iam::366:role/SageMakerRole", + "ResourceConfig": { + "VolumeSizeInGB": 30, + "InstanceCount": 1, + "InstanceType": "ml.c4.xlarge", + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "TrainingJobName": "neo", + "TrainingJobStatus": "Completed", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", + "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, + "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, + } + sagemaker_session.sagemaker_client.describe_training_job = Mock( + name="describe_training_job", return_value=rjd + ) + + estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) + assert estimator.latest_training_job.job_name == "neo" + assert estimator.py_version == "py2" + assert estimator.framework_version == tf_version + assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" + assert estimator.train_instance_count == 1 + assert estimator.train_max_run == 24 * 60 * 60 + assert estimator.input_mode == "File" + assert estimator.input_mode == "File" + assert estimator.base_job_name == "neo" + assert estimator.output_path == "s3://place/output/neo" + assert estimator.output_kms_key == "" + assert estimator.source_dir == "s3://some/sourcedir.tar.gz" + assert estimator.entry_point == "iris-dnn-classifier.py" + + +@patch("sagemaker.utils.create_tar_file", MagicMock()) +def test_attach_new_repo_name(sagemaker_session, tf_version): + training_image = "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:{}-cpu-py2".format( + tf_version + ) + rjd = { + "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, + "HyperParameters": { + "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', + "sagemaker_program": '"iris-dnn-classifier.py"', + "sagemaker_enable_cloudwatch_metrics": "false", + "sagemaker_container_log_level": '"logging.INFO"', + "sagemaker_job_name": '"neo"', + }, + "RoleArn": "arn:aws:iam::366:role/SageMakerRole", + "ResourceConfig": { + "VolumeSizeInGB": 30, + "InstanceCount": 1, + "InstanceType": "ml.c4.xlarge", + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "TrainingJobName": "neo", + "TrainingJobStatus": "Completed", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", + "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, + "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, + } + sagemaker_session.sagemaker_client.describe_training_job = Mock( + name="describe_training_job", return_value=rjd + ) + + estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) + assert estimator.latest_training_job.job_name == "neo" + assert estimator.py_version == "py2" + assert estimator.framework_version == tf_version + assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" + assert estimator.train_instance_count == 1 + assert estimator.train_max_run == 24 * 60 * 60 + assert estimator.input_mode == "File" + assert estimator.input_mode == "File" + assert estimator.base_job_name == "neo" + assert estimator.output_path == "s3://place/output/neo" + assert estimator.output_kms_key == "" + assert estimator.source_dir == "s3://some/sourcedir.tar.gz" + assert estimator.entry_point == "iris-dnn-classifier.py" + assert estimator.train_image() == training_image + + +@patch("sagemaker.utils.create_tar_file", MagicMock()) +def test_attach_old_container(sagemaker_session): + training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:1.0" + rjd = { + "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, + "HyperParameters": { + "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', + "sagemaker_program": '"iris-dnn-classifier.py"', + "sagemaker_enable_cloudwatch_metrics": "false", + "sagemaker_container_log_level": '"logging.INFO"', + "sagemaker_job_name": '"neo"', + }, + "RoleArn": "arn:aws:iam::366:role/SageMakerRole", + "ResourceConfig": { + "VolumeSizeInGB": 30, + "InstanceCount": 1, + "InstanceType": "ml.c4.xlarge", + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "TrainingJobName": "neo", + "TrainingJobStatus": "Completed", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", + "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, + "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, + } + sagemaker_session.sagemaker_client.describe_training_job = Mock( + name="describe_training_job", return_value=rjd + ) + + estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) + assert estimator.latest_training_job.job_name == "neo" + assert estimator.py_version == "py2" + assert estimator.framework_version == "1.4" + assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" + assert estimator.train_instance_count == 1 + assert estimator.train_max_run == 24 * 60 * 60 + assert estimator.input_mode == "File" + assert estimator.input_mode == "File" + assert estimator.base_job_name == "neo" + assert estimator.output_path == "s3://place/output/neo" + assert estimator.output_kms_key == "" + assert estimator.source_dir == "s3://some/sourcedir.tar.gz" + assert estimator.entry_point == "iris-dnn-classifier.py" + + +def test_attach_wrong_framework(sagemaker_session): + returned_job_description = { + "AlgorithmSpecification": { + "TrainingInputMode": "File", + "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0", + }, + "HyperParameters": { + "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', + "sagemaker_program": '"iris-dnn-classifier.py"', + "sagemaker_enable_cloudwatch_metrics": "false", + "sagemaker_container_log_level": '"logging.INFO"', + }, + "RoleArn": "arn:aws:iam::366:role/SageMakerRole", + "ResourceConfig": { + "VolumeSizeInGB": 30, + "InstanceCount": 1, + "InstanceType": "ml.c4.xlarge", + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "TrainingJobName": "neo", + "TrainingJobStatus": "Completed", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", + "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, + "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, + } + sagemaker_session.sagemaker_client.describe_training_job = Mock( + name="describe_training_job", return_value=returned_job_description + ) + + with pytest.raises(ValueError) as error: + TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) + assert "didn't use image for requested framework" in str(error) + + +def test_attach_custom_image(sagemaker_session): + training_image = "1.dkr.ecr.us-west-2.amazonaws.com/tensorflow_with_custom_binary:1.0" + rjd = { + "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, + "HyperParameters": { + "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', + "sagemaker_program": '"iris-dnn-classifier.py"', + "sagemaker_enable_cloudwatch_metrics": "false", + "sagemaker_container_log_level": '"logging.INFO"', + "sagemaker_job_name": '"neo"', + }, + "RoleArn": "arn:aws:iam::366:role/SageMakerRole", + "ResourceConfig": { + "VolumeSizeInGB": 30, + "InstanceCount": 1, + "InstanceType": "ml.c4.xlarge", + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "TrainingJobName": "neo", + "TrainingJobStatus": "Completed", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", + "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, + "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, + } + sagemaker_session.sagemaker_client.describe_training_job = Mock( + name="describe_training_job", return_value=rjd + ) + + estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) + assert estimator.image_name == training_image + assert estimator.train_image() == training_image + + +@patch("sagemaker.utils.create_tar_file", MagicMock()) +def test_tf_script_mode_attach(sagemaker_session, tf_version): + training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py3-cpu:{}-cpu-py3".format( + tf_version + ) + rjd = { + "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, + "HyperParameters": { + "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', + "sagemaker_program": '"iris-dnn-classifier.py"', + "sagemaker_enable_cloudwatch_metrics": "false", + "sagemaker_container_log_level": '"logging.INFO"', + "sagemaker_job_name": '"neo"', + }, + "RoleArn": "arn:aws:iam::366:role/SageMakerRole", + "ResourceConfig": { + "VolumeSizeInGB": 30, + "InstanceCount": 1, + "InstanceType": "ml.c4.xlarge", + }, + "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, + "TrainingJobName": "neo", + "TrainingJobStatus": "Completed", + "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", + "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, + "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, + } + sagemaker_session.sagemaker_client.describe_training_job = Mock( + name="describe_training_job", return_value=rjd + ) + + estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) + assert estimator.latest_training_job.job_name == "neo" + assert estimator.py_version == "py3" + assert estimator.framework_version == tf_version + assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" + assert estimator.train_instance_count == 1 + assert estimator.train_max_run == 24 * 60 * 60 + assert estimator.input_mode == "File" + assert estimator.input_mode == "File" + assert estimator.base_job_name == "neo" + assert estimator.output_path == "s3://place/output/neo" + assert estimator.output_kms_key == "" + assert estimator.hyperparameters() is not None + assert estimator.source_dir == "s3://some/sourcedir.tar.gz" + assert estimator.entry_point == "iris-dnn-classifier.py" diff --git a/tests/unit/sagemaker/tensorflow/test_estimator_init.py b/tests/unit/sagemaker/tensorflow/test_estimator_init.py new file mode 100644 index 0000000000..374e9e2fe4 --- /dev/null +++ b/tests/unit/sagemaker/tensorflow/test_estimator_init.py @@ -0,0 +1,125 @@ +# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"). You +# may not use this file except in compliance with the License. A copy of +# the License is located at +# +# http://aws.amazon.com/apache2.0/ +# +# or in the "license" file accompanying this file. This file is +# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF +# ANY KIND, either express or implied. See the License for the specific +# language governing permissions and limitations under the License. +from __future__ import absolute_import + +from mock import Mock, patch +from packaging import version +import pytest + +from sagemaker.tensorflow import defaults, TensorFlow + +REGION = "us-west-2" + + +@pytest.fixture() +def sagemaker_session(): + return Mock(name="sagemaker_session", boto_region_name=REGION) + + +def _build_tf(sagemaker_session, **kwargs): + return TensorFlow( + sagemaker_session=sagemaker_session, + entry_point="dummy.py", + role="dummy-role", + train_instance_count=1, + train_instance_type="ml.c4.xlarge", + **kwargs + ) + + +@patch("sagemaker.fw_utils.empty_framework_version_warning") +def test_empty_framework_version(warning, sagemaker_session): + estimator = _build_tf(sagemaker_session, framework_version=None) + + assert estimator.framework_version == defaults.TF_VERSION + warning.assert_called_with(defaults.TF_VERSION, estimator.LATEST_VERSION) + + +@patch("sagemaker.fw_utils.python_deprecation_warning") +def test_estimator_py2_deprecation_warning(warning, sagemaker_session): + estimator = _build_tf(sagemaker_session, py_version="py2") + + assert estimator.py_version == "py2" + warning.assert_called_with("tensorflow", "2.1.0") + + +def test_py2_version_deprecated(sagemaker_session): + with pytest.raises(AttributeError) as e: + _build_tf(sagemaker_session, framework_version="2.1.1", py_version="py2") + + msg = ( + "Python 2 containers are only available with 2.1.0 and lower versions. " + "Please use a Python 3 container." + ) + assert msg in str(e.value) + + +def test_py2_version_is_not_deprecated(sagemaker_session): + estimator = _build_tf(sagemaker_session, framework_version="1.15.0", py_version="py2") + assert estimator.py_version == "py2" + estimator = _build_tf(sagemaker_session, framework_version="2.0.0", py_version="py2") + assert estimator.py_version == "py2" + + +def test_py2_is_default_version_before_tf1_14(sagemaker_session): + estimator = _build_tf(sagemaker_session, framework_version="1.13") + assert estimator.py_version == "py2" + + +def test_framework_name(sagemaker_session): + tf = _build_tf(sagemaker_session, framework_version="1.15.2") + assert tf.__framework_name__ == "tensorflow" + + +def test_enable_sm_metrics(sagemaker_session): + tf = _build_tf(sagemaker_session, enable_sagemaker_metrics=True) + assert tf.enable_sagemaker_metrics + + +def test_disable_sm_metrics(sagemaker_session): + tf = _build_tf(sagemaker_session, enable_sagemaker_metrics=False) + assert not tf.enable_sagemaker_metrics + + +def test_disable_sm_metrics_if_fw_ver_is_less_than_1_15(sagemaker_session, tf_version): + if version.Version(tf_version) > version.Version("1.14"): + pytest.skip("This test is for TF 1.14 and lower.") + + tf = _build_tf(sagemaker_session, framework_version=tf_version, image_name="old-image") + assert tf.enable_sagemaker_metrics is None + + +def test_enable_sm_metrics_if_fw_ver_is_at_least_1_15(sagemaker_session, tf_version): + if version.Version(tf_version) < version.Version("1.15"): + pytest.skip("This test is for TF 1.15 and higher.") + + tf = _build_tf(sagemaker_session, framework_version=tf_version) + assert tf.enable_sagemaker_metrics + + +def test_require_image_name_if_fw_ver_is_less_than_1_11(sagemaker_session, tf_version): + if version.Version(tf_version) > version.Version("1.10"): + pytest.skip("This test is for TF 1.10 and lower.") + + with pytest.raises(ValueError) as e: + _build_tf(sagemaker_session, framework_version=tf_version) + + expected_msg = ( + "TF {version} supports only legacy mode. Please supply the image URI directly with " + "'image_name=520713654638.dkr.ecr.{region}.amazonaws.com/" + "sagemaker-tensorflow:{version}-cpu-py2' and set 'model_dir=False'. " + "If you are using any legacy parameters (training_steps, evaluation_steps, " + "checkpoint_path, requirements_file), make sure to pass them directly as hyperparameters instead." + ).format(version=tf_version, region=REGION) + + assert expected_msg in str(e) diff --git a/tests/unit/test_tfs.py b/tests/unit/sagemaker/tensorflow/test_tfs.py similarity index 94% rename from tests/unit/test_tfs.py rename to tests/unit/sagemaker/tensorflow/test_tfs.py index 173ab89874..c90401ad76 100644 --- a/tests/unit/test_tfs.py +++ b/tests/unit/sagemaker/tensorflow/test_tfs.py @@ -18,13 +18,14 @@ import mock import pytest -from mock import Mock +from mock import Mock, patch from sagemaker.predictor import csv_serializer from sagemaker.tensorflow import TensorFlow from sagemaker.tensorflow.serving import Model, Predictor JSON_CONTENT_TYPE = "application/json" CSV_CONTENT_TYPE = "text/csv" +IMAGE = "tensorflow-inference:2.0.0-cpu" INSTANCE_COUNT = 1 INSTANCE_TYPE = "ml.c4.4xlarge" ACCELERATOR_TYPE = "ml.eia1.medium" @@ -69,7 +70,8 @@ def sagemaker_session(): return session -def test_tfs_model(sagemaker_session, tf_version): +@patch("sagemaker.tensorflow.serving.create_image_uri", return_value=IMAGE) +def test_tfs_model(create_image_uri, sagemaker_session, tf_version): model = Model( "s3://some/data.tar.gz", role=ROLE, @@ -77,14 +79,18 @@ def test_tfs_model(sagemaker_session, tf_version): sagemaker_session=sagemaker_session, ) cdef = model.prepare_container_def(INSTANCE_TYPE) - assert cdef["Image"].endswith("sagemaker-tensorflow-serving:{}-cpu".format(tf_version)) - assert cdef["Environment"] == {} + create_image_uri.assert_called_with( + REGION, "tensorflow-serving", INSTANCE_TYPE, tf_version, accelerator_type=None + ) + assert IMAGE == cdef["Image"] + assert {} == cdef["Environment"] predictor = model.deploy(INSTANCE_COUNT, INSTANCE_TYPE) assert isinstance(predictor, Predictor) -def test_tfs_model_image_accelerator(sagemaker_session, tf_version): +@patch("sagemaker.tensorflow.serving.create_image_uri", return_value=IMAGE) +def test_tfs_model_accelerator(create_image_uri, sagemaker_session, tf_version): model = Model( "s3://some/data.tar.gz", role=ROLE, @@ -92,7 +98,10 @@ def test_tfs_model_image_accelerator(sagemaker_session, tf_version): sagemaker_session=sagemaker_session, ) cdef = model.prepare_container_def(INSTANCE_TYPE, accelerator_type=ACCELERATOR_TYPE) - assert cdef["Image"].endswith("sagemaker-tensorflow-serving-eia:{}-cpu".format(tf_version)) + create_image_uri.assert_called_with( + REGION, "tensorflow-serving", INSTANCE_TYPE, tf_version, accelerator_type=ACCELERATOR_TYPE + ) + assert IMAGE == cdef["Image"] predictor = model.deploy(INSTANCE_COUNT, INSTANCE_TYPE) assert isinstance(predictor, Predictor) diff --git a/tests/unit/test_airflow.py b/tests/unit/test_airflow.py index 7652e8ef18..55ee042396 100644 --- a/tests/unit/test_airflow.py +++ b/tests/unit/test_airflow.py @@ -169,21 +169,18 @@ def test_byo_training_config_all_args(sagemaker_session): @patch( "sagemaker.fw_utils.parse_s3_url", MagicMock( - return_value=[ - "output", - "sagemaker-tensorflow-{}/source/sourcedir.tar.gz".format(TIME_STAMP), - ] + return_value=["output", "tensorflow-training-{}/source/sourcedir.tar.gz".format(TIME_STAMP)] ), ) @patch( "sagemaker.fw_utils.get_ecr_image_uri_prefix", - return_value="520713654638.dkr.ecr.us-west-2.amazonaws.com", + return_value="763104351884.dkr.ecr.us-west-2.amazonaws.com", ) def test_framework_training_config_required_args(ecr_prefix, sagemaker_session): tf = tensorflow.TensorFlow( entry_point="/some/script.py", - framework_version="1.10.0", - hyperparameters={"training_steps": 1000, "evaluation_steps": 100}, + framework_version="1.15.2", + py_version="py3", role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", @@ -195,11 +192,11 @@ def test_framework_training_config_required_args(ecr_prefix, sagemaker_session): config = airflow.training_config(tf, data) expected_config = { "AlgorithmSpecification": { - "TrainingImage": "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:1.10.0-cpu-py2", + "TrainingImage": "763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.15.2-cpu-py3", "TrainingInputMode": "File", }, "OutputDataConfig": {"S3OutputPath": "s3://output/"}, - "TrainingJobName": "sagemaker-tensorflow-%s" % TIME_STAMP, + "TrainingJobName": "tensorflow-training-%s" % TIME_STAMP, "StoppingCondition": {"MaxRuntimeInSeconds": 86400}, "ResourceConfig": { "InstanceCount": "{{ instance_count }}", @@ -220,22 +217,21 @@ def test_framework_training_config_required_args(ecr_prefix, sagemaker_session): } ], "HyperParameters": { - "sagemaker_submit_directory": '"s3://output/sagemaker-tensorflow-%s/source/sourcedir.tar.gz"' + "sagemaker_submit_directory": '"s3://output/tensorflow-training-%s/source/sourcedir.tar.gz"' % TIME_STAMP, "sagemaker_program": '"script.py"', "sagemaker_enable_cloudwatch_metrics": "false", "sagemaker_container_log_level": "20", - "sagemaker_job_name": '"sagemaker-tensorflow-%s"' % TIME_STAMP, + "sagemaker_job_name": '"tensorflow-training-%s"' % TIME_STAMP, "sagemaker_region": '"us-west-2"', - "training_steps": "1000", - "evaluation_steps": "100", + "model_dir": '"s3://output/tensorflow-training-%s/model"' % TIME_STAMP, }, "S3Operations": { "S3Upload": [ { "Path": "/some/script.py", "Bucket": "output", - "Key": "sagemaker-tensorflow-%s/source/sourcedir.tar.gz" % TIME_STAMP, + "Key": "tensorflow-training-%s/source/sourcedir.tar.gz" % TIME_STAMP, "Tar": True, } ] @@ -253,7 +249,7 @@ def test_framework_training_config_required_args(ecr_prefix, sagemaker_session): ) @patch( "sagemaker.fw_utils.get_ecr_image_uri_prefix", - return_value="520713654638.dkr.ecr.us-west-2.amazonaws.com", + return_value="763104351884.dkr.ecr.us-west-2.amazonaws.com", ) def test_framework_training_config_all_args(ecr_prefix, sagemaker_session): tf = tensorflow.TensorFlow( @@ -262,14 +258,9 @@ def test_framework_training_config_all_args(ecr_prefix, sagemaker_session): enable_cloudwatch_metrics=False, container_log_level="{{ log_level }}", code_location="s3://{{ bucket_name }}/{{ prefix }}", - hyperparameters={ - "training_steps": 1000, - "evaluation_steps": 100, - "checkpoint_path": "{{ checkpoint_path }}", - "sagemaker_requirements": "", - }, - py_version="py2", - framework_version="1.10.0", + hyperparameters={"epochs": 1}, + py_version="py3", + framework_version="1.15.2", role="{{ role }}", train_instance_count="{{ instance_count }}", train_instance_type="ml.c4.2xlarge", @@ -292,7 +283,7 @@ def test_framework_training_config_all_args(ecr_prefix, sagemaker_session): config = airflow.training_config(tf, data) expected_config = { "AlgorithmSpecification": { - "TrainingImage": "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:1.10.0-cpu-py2", + "TrainingImage": "763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.15.2-cpu-py3", "TrainingInputMode": "Pipe", "MetricDefinitions": [{"Name": "{{ name }}", "Regex": "{{ regex }}"}], }, @@ -333,10 +324,8 @@ def test_framework_training_config_all_args(ecr_prefix, sagemaker_session): "sagemaker_container_log_level": '"{{ log_level }}"', "sagemaker_job_name": '"{{ base_job_name }}-%s"' % TIME_STAMP, "sagemaker_region": '"us-west-2"', - "checkpoint_path": '"{{ checkpoint_path }}"', - "training_steps": "1000", - "evaluation_steps": "100", - "sagemaker_requirements": '""', + "model_dir": '"{{ output_path }}/{{ base_job_name }}-%s/model"' % TIME_STAMP, + "epochs": "1", }, "Tags": [{"{{ key }}": "{{ value }}"}], "S3Operations": { diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py index 1c6388c38b..5b015c924f 100644 --- a/tests/unit/test_fw_utils.py +++ b/tests/unit/test_fw_utils.py @@ -57,10 +57,6 @@ "1.9.0", "1.10", "1.10.0", - "1.11", - "1.11.0", - "1.12", - "1.12.0", ], } @@ -108,13 +104,6 @@ def is_mxnet_1_4_py2(framework, framework_version, py_version): return framework == "mxnet" and py_version == "py2" and framework_version in ["1.4", "1.4.1"] -@pytest.fixture( - scope="module", params=["1.11", "1.11.0", "1.12", "1.12.0", "1.14", "1.14.0", "1.15", "1.15.0"] -) -def tf_version(request): - return request.param - - @pytest.fixture( scope="module", params=["0.4", "0.4.0", "1.0", "1.0.0", "1.1", "1.1.0", "1.2", "1.2.0", "1.3", "1.3.1"], @@ -630,14 +619,25 @@ def test_create_image_uri_mxnet(mxnet_version): def test_create_image_uri_tensorflow(tf_version): - image_uri = fw_utils.create_image_uri( - "us-west-2", "tensorflow-scriptmode", "ml.p3.2xlarge", tf_version, "py3" - ) - assert image_uri == "{}.dkr.ecr.us-west-2.amazonaws.com/{}:{}-gpu-py3".format( - get_account("tensorflow", tf_version), get_repo_name("tensorflow", tf_version), tf_version - ) + if tf_version in ORIGINAL_FW_VERSIONS["tensorflow"]: + image_uri = fw_utils.create_image_uri( + "us-west-2", "tensorflow", "ml.p3.2xlarge", tf_version, "py2" + ) + assert image_uri == "{}.dkr.ecr.us-west-2.amazonaws.com/{}:{}-gpu-py2".format( + get_account("tensorflow", tf_version), + get_repo_name("tensorflow", tf_version), + tf_version, + ) + else: + image_uri = fw_utils.create_image_uri( + "us-west-2", "tensorflow-scriptmode", "ml.p3.2xlarge", tf_version, "py3" + ) + assert image_uri == "{}.dkr.ecr.us-west-2.amazonaws.com/{}:{}-gpu-py3".format( + get_account("tensorflow", tf_version), + get_repo_name("tensorflow", tf_version), + tf_version, + ) - if tf_version not in ORIGINAL_FW_VERSIONS: image_uri = fw_utils.create_image_uri( "us-west-2", "tensorflow-serving", "ml.c4.2xlarge", tf_version ) diff --git a/tests/unit/test_tf_estimator.py b/tests/unit/test_tf_estimator.py deleted file mode 100644 index 82c5533a0d..0000000000 --- a/tests/unit/test_tf_estimator.py +++ /dev/null @@ -1,893 +0,0 @@ -# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"). You -# may not use this file except in compliance with the License. A copy of -# the License is located at -# -# http://aws.amazon.com/apache2.0/ -# -# or in the "license" file accompanying this file. This file is -# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF -# ANY KIND, either express or implied. See the License for the specific -# language governing permissions and limitations under the License. -from __future__ import absolute_import - -import json -import logging -import os - -import pytest -from mock import patch, Mock, MagicMock - -from sagemaker.estimator import _TrainingJob -from sagemaker.tensorflow import defaults, serving, TensorFlow - -DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data") -SCRIPT_FILE = "dummy_script.py" -SCRIPT_PATH = os.path.join(DATA_DIR, SCRIPT_FILE) -SERVING_SCRIPT_FILE = "another_dummy_script.py" -MODEL_DATA = "s3://some/data.tar.gz" -TIMESTAMP = "2017-11-06-14:14:15.673" -TIME = 1510006209.073025 -BUCKET_NAME = "mybucket" -INSTANCE_COUNT = 1 -INSTANCE_TYPE = "ml.c4.4xlarge" -ACCELERATOR_TYPE = "ml.eia.medium" -IMAGE_REPO_NAME = "sagemaker-tensorflow" -SM_IMAGE_REPO_NAME = "sagemaker-tensorflow-scriptmode" -JOB_NAME = "{}-{}".format(IMAGE_REPO_NAME, TIMESTAMP) -SM_JOB_NAME = "{}-{}".format(SM_IMAGE_REPO_NAME, TIMESTAMP) -ROLE = "Dummy" -REGION = "us-west-2" -DOCKER_TAG = "1.0" -IMAGE_URI_FORMAT_STRING = "520713654638.dkr.ecr.{}.amazonaws.com/{}:{}-{}-{}" -SCRIPT_MODE_REPO_NAME = "sagemaker-tensorflow-scriptmode" -DISTRIBUTION_ENABLED = {"parameter_server": {"enabled": True}} -DISTRIBUTION_MPI_ENABLED = { - "mpi": {"enabled": True, "custom_mpi_options": "options", "processes_per_host": 2} -} - -ENDPOINT_DESC = {"EndpointConfigName": "test-endpoint"} - -ENDPOINT_CONFIG_DESC = {"ProductionVariants": [{"ModelName": "model-1"}, {"ModelName": "model-2"}]} - -LIST_TAGS_RESULT = {"Tags": [{"Key": "TagtestKey", "Value": "TagtestValue"}]} - -EXPERIMENT_CONFIG = { - "ExperimentName": "exp", - "TrialName": "trial", - "TrialComponentDisplayName": "tc", -} - - -@pytest.fixture() -def sagemaker_session(): - boto_mock = Mock(name="boto_session", region_name=REGION) - session = Mock( - name="sagemaker_session", - boto_session=boto_mock, - boto_region_name=REGION, - config=None, - local_mode=False, - s3_resource=None, - s3_client=None, - ) - session.default_bucket = Mock(name="default_bucket", return_value=BUCKET_NAME) - session.expand_role = Mock(name="expand_role", return_value=ROLE) - describe = {"ModelArtifacts": {"S3ModelArtifacts": "s3://m/m.tar.gz"}} - session.sagemaker_client.describe_training_job = Mock(return_value=describe) - session.sagemaker_client.describe_endpoint = Mock(return_value=ENDPOINT_DESC) - session.sagemaker_client.describe_endpoint_config = Mock(return_value=ENDPOINT_CONFIG_DESC) - session.sagemaker_client.list_tags = Mock(return_value=LIST_TAGS_RESULT) - return session - - -def _get_full_cpu_image_uri(version, repo=IMAGE_REPO_NAME, py_version="py2"): - return IMAGE_URI_FORMAT_STRING.format(REGION, repo, version, "cpu", py_version) - - -def _get_full_gpu_image_uri(version, repo=IMAGE_REPO_NAME, py_version="py2"): - return IMAGE_URI_FORMAT_STRING.format(REGION, repo, version, "gpu", py_version) - - -def _get_full_cpu_image_uri_with_ei(version): - return _get_full_cpu_image_uri(version, repo="{}-eia".format(IMAGE_REPO_NAME)) - - -def _hyperparameters(script_mode=False, horovod=False): - job_name = SM_JOB_NAME if script_mode else JOB_NAME - hps = { - "sagemaker_program": json.dumps("dummy_script.py"), - "sagemaker_submit_directory": json.dumps( - "s3://{}/{}/source/sourcedir.tar.gz".format(BUCKET_NAME, job_name) - ), - "sagemaker_enable_cloudwatch_metrics": "false", - "sagemaker_container_log_level": str(logging.INFO), - "sagemaker_job_name": json.dumps(job_name), - "sagemaker_region": json.dumps("us-west-2"), - } - - if horovod: - hps["model_dir"] = json.dumps("/opt/ml/model") - else: - hps["model_dir"] = json.dumps("s3://{}/{}/model".format(BUCKET_NAME, job_name)) - - return hps - - -def _create_train_job( - tf_version, - script_mode=False, - horovod=False, - ps=False, - repo_name=IMAGE_REPO_NAME, - py_version="py2", -): - conf = { - "image": _get_full_cpu_image_uri(tf_version, repo=repo_name, py_version=py_version), - "input_mode": "File", - "input_config": [ - { - "ChannelName": "training", - "DataSource": { - "S3DataSource": { - "S3DataDistributionType": "FullyReplicated", - "S3DataType": "S3Prefix", - } - }, - } - ], - "role": ROLE, - "job_name": "{}-{}".format(repo_name, TIMESTAMP), - "output_config": {"S3OutputPath": "s3://{}/".format(BUCKET_NAME)}, - "resource_config": { - "InstanceType": "ml.c4.4xlarge", - "InstanceCount": 1, - "VolumeSizeInGB": 30, - }, - "hyperparameters": _hyperparameters(script_mode, horovod), - "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, - "tags": None, - "vpc_config": None, - "metric_definitions": None, - "experiment_config": None, - } - - if not ps: - conf["debugger_hook_config"] = { - "CollectionConfigurations": [], - "S3OutputPath": "s3://{}/".format(BUCKET_NAME), - } - - return conf - - -def _build_tf( - sagemaker_session, - framework_version=defaults.TF_VERSION, - train_instance_type=None, - base_job_name=None, - **kwargs -): - return TensorFlow( - entry_point=SCRIPT_PATH, - framework_version=framework_version, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=train_instance_type if train_instance_type else INSTANCE_TYPE, - base_job_name=base_job_name, - **kwargs - ) - - -def test_tf_cpu_images(sagemaker_session, tf_version): - tf = _build_tf(sagemaker_session, tf_version, train_instance_type="ml.c2.2xlarge") - assert tf.train_image() == _get_full_cpu_image_uri(tf_version) - - tf = _build_tf(sagemaker_session, tf_version, train_instance_type="ml.c4.2xlarge") - assert tf.train_image() == _get_full_cpu_image_uri(tf_version) - - tf = _build_tf(sagemaker_session, tf_version, train_instance_type="ml.m16") - assert tf.train_image() == _get_full_cpu_image_uri(tf_version) - - -def test_tf_gpu_images(sagemaker_session, tf_version): - tf = _build_tf(sagemaker_session, tf_version, train_instance_type="ml.g2.2xlarge") - assert tf.train_image() == _get_full_gpu_image_uri(tf_version) - - tf = _build_tf(sagemaker_session, tf_version, train_instance_type="ml.p2.2xlarge") - assert tf.train_image() == _get_full_gpu_image_uri(tf_version) - - -def test_create_model(sagemaker_session, tf_version): - container_log_level = '"logging.INFO"' - source_dir = "s3://mybucket/source" - tf = TensorFlow( - entry_point=SCRIPT_PATH, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - framework_version=tf_version, - container_log_level=container_log_level, - base_job_name="job", - source_dir=source_dir, - enable_network_isolation=True, - ) - - job_name = "doing something" - tf.fit(inputs="s3://mybucket/train", job_name=job_name) - model = tf.create_model() - - assert model.sagemaker_session == sagemaker_session - assert model._framework_version == tf_version - assert model.entry_point is None - assert model.role == ROLE - assert model.name == job_name - assert model._container_log_level == container_log_level - assert model.source_dir is None - assert model.vpc_config is None - assert model.enable_network_isolation() - - -def test_create_model_with_optional_params(sagemaker_session): - container_log_level = '"logging.INFO"' - source_dir = "s3://mybucket/source" - enable_cloudwatch_metrics = "true" - tf = TensorFlow( - entry_point=SCRIPT_PATH, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - container_log_level=container_log_level, - base_job_name="job", - source_dir=source_dir, - enable_cloudwatch_metrics=enable_cloudwatch_metrics, - ) - - job_name = "doing something" - tf.fit(inputs="s3://mybucket/train", job_name=job_name) - - new_role = "role" - vpc_config = {"Subnets": ["foo"], "SecurityGroupIds": ["bar"]} - model_name = "model-name" - model = tf.create_model( - role=new_role, - vpc_config_override=vpc_config, - entry_point=SERVING_SCRIPT_FILE, - name=model_name, - enable_network_isolation=True, - ) - - assert model.role == new_role - assert model.vpc_config == vpc_config - assert model.entry_point == SERVING_SCRIPT_FILE - assert model.name == model_name - assert model.enable_network_isolation() - - -def test_create_model_with_custom_image(sagemaker_session): - container_log_level = '"logging.INFO"' - source_dir = "s3://mybucket/source" - custom_image = "tensorflow:1.0" - tf = TensorFlow( - entry_point=SCRIPT_PATH, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - image_name=custom_image, - container_log_level=container_log_level, - base_job_name="job", - source_dir=source_dir, - ) - - job_name = "doing something" - tf.fit(inputs="s3://mybucket/train", job_name=job_name) - model = tf.create_model() - - assert model.image == custom_image - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_train_image_default(sagemaker_session): - tf = TensorFlow( - entry_point=SCRIPT_PATH, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - ) - - assert _get_full_cpu_image_uri(defaults.TF_VERSION, repo=SM_IMAGE_REPO_NAME) == tf.train_image() - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_attach(sagemaker_session, tf_version): - training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:{}-cpu-py2".format( - tf_version - ) - rjd = { - "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, - "HyperParameters": { - "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', - "sagemaker_program": '"iris-dnn-classifier.py"', - "sagemaker_enable_cloudwatch_metrics": "false", - "sagemaker_container_log_level": '"logging.INFO"', - "sagemaker_job_name": '"neo"', - }, - "RoleArn": "arn:aws:iam::366:role/SageMakerRole", - "ResourceConfig": { - "VolumeSizeInGB": 30, - "InstanceCount": 1, - "InstanceType": "ml.c4.xlarge", - }, - "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, - "TrainingJobName": "neo", - "TrainingJobStatus": "Completed", - "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", - "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, - "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, - } - sagemaker_session.sagemaker_client.describe_training_job = Mock( - name="describe_training_job", return_value=rjd - ) - - estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) - assert estimator.latest_training_job.job_name == "neo" - assert estimator.py_version == "py2" - assert estimator.framework_version == tf_version - assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" - assert estimator.train_instance_count == 1 - assert estimator.train_max_run == 24 * 60 * 60 - assert estimator.input_mode == "File" - assert estimator.input_mode == "File" - assert estimator.base_job_name == "neo" - assert estimator.output_path == "s3://place/output/neo" - assert estimator.output_kms_key == "" - assert estimator.source_dir == "s3://some/sourcedir.tar.gz" - assert estimator.entry_point == "iris-dnn-classifier.py" - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_attach_new_repo_name(sagemaker_session, tf_version): - training_image = "520713654638.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow:{}-cpu-py2".format( - tf_version - ) - rjd = { - "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, - "HyperParameters": { - "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', - "sagemaker_program": '"iris-dnn-classifier.py"', - "sagemaker_enable_cloudwatch_metrics": "false", - "sagemaker_container_log_level": '"logging.INFO"', - "sagemaker_job_name": '"neo"', - }, - "RoleArn": "arn:aws:iam::366:role/SageMakerRole", - "ResourceConfig": { - "VolumeSizeInGB": 30, - "InstanceCount": 1, - "InstanceType": "ml.c4.xlarge", - }, - "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, - "TrainingJobName": "neo", - "TrainingJobStatus": "Completed", - "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", - "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, - "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, - } - sagemaker_session.sagemaker_client.describe_training_job = Mock( - name="describe_training_job", return_value=rjd - ) - - estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) - assert estimator.latest_training_job.job_name == "neo" - assert estimator.py_version == "py2" - assert estimator.framework_version == tf_version - assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" - assert estimator.train_instance_count == 1 - assert estimator.train_max_run == 24 * 60 * 60 - assert estimator.input_mode == "File" - assert estimator.input_mode == "File" - assert estimator.base_job_name == "neo" - assert estimator.output_path == "s3://place/output/neo" - assert estimator.output_kms_key == "" - assert estimator.source_dir == "s3://some/sourcedir.tar.gz" - assert estimator.entry_point == "iris-dnn-classifier.py" - assert estimator.train_image() == training_image - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_attach_old_container(sagemaker_session): - training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py2-cpu:1.0" - rjd = { - "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, - "HyperParameters": { - "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', - "sagemaker_program": '"iris-dnn-classifier.py"', - "sagemaker_enable_cloudwatch_metrics": "false", - "sagemaker_container_log_level": '"logging.INFO"', - "sagemaker_job_name": '"neo"', - }, - "RoleArn": "arn:aws:iam::366:role/SageMakerRole", - "ResourceConfig": { - "VolumeSizeInGB": 30, - "InstanceCount": 1, - "InstanceType": "ml.c4.xlarge", - }, - "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, - "TrainingJobName": "neo", - "TrainingJobStatus": "Completed", - "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", - "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, - "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, - } - sagemaker_session.sagemaker_client.describe_training_job = Mock( - name="describe_training_job", return_value=rjd - ) - - estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) - assert estimator.latest_training_job.job_name == "neo" - assert estimator.py_version == "py2" - assert estimator.framework_version == "1.4" - assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" - assert estimator.train_instance_count == 1 - assert estimator.train_max_run == 24 * 60 * 60 - assert estimator.input_mode == "File" - assert estimator.input_mode == "File" - assert estimator.base_job_name == "neo" - assert estimator.output_path == "s3://place/output/neo" - assert estimator.output_kms_key == "" - assert estimator.source_dir == "s3://some/sourcedir.tar.gz" - assert estimator.entry_point == "iris-dnn-classifier.py" - - -def test_attach_wrong_framework(sagemaker_session): - returned_job_description = { - "AlgorithmSpecification": { - "TrainingInputMode": "File", - "TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-mxnet-py2-cpu:1.0", - }, - "HyperParameters": { - "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', - "sagemaker_program": '"iris-dnn-classifier.py"', - "sagemaker_enable_cloudwatch_metrics": "false", - "sagemaker_container_log_level": '"logging.INFO"', - }, - "RoleArn": "arn:aws:iam::366:role/SageMakerRole", - "ResourceConfig": { - "VolumeSizeInGB": 30, - "InstanceCount": 1, - "InstanceType": "ml.c4.xlarge", - }, - "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, - "TrainingJobName": "neo", - "TrainingJobStatus": "Completed", - "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", - "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, - "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, - } - sagemaker_session.sagemaker_client.describe_training_job = Mock( - name="describe_training_job", return_value=returned_job_description - ) - - with pytest.raises(ValueError) as error: - TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) - assert "didn't use image for requested framework" in str(error) - - -@patch("sagemaker.tensorflow.estimator.TensorFlow.create_model") -def test_transformer_creation_with_optional_args(create_model, sagemaker_session): - model = Mock() - create_model.return_value = model - - tf = TensorFlow( - entry_point=SCRIPT_PATH, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - ) - tf.latest_training_job = _TrainingJob(sagemaker_session, "some-job-name") - - strategy = "SingleRecord" - assemble_with = "Line" - output_path = "s3://{}/batch-output".format(BUCKET_NAME) - kms_key = "kms" - accept_type = "text/bytes" - env = {"foo": "bar"} - max_concurrent_transforms = 3 - max_payload = 100 - tags = {"Key": "foo", "Value": "bar"} - new_role = "role" - vpc_config = {"Subnets": ["1234"], "SecurityGroupIds": ["5678"]} - model_name = "model-name" - - tf.transformer( - INSTANCE_COUNT, - INSTANCE_TYPE, - strategy=strategy, - assemble_with=assemble_with, - output_path=output_path, - output_kms_key=kms_key, - accept=accept_type, - env=env, - max_concurrent_transforms=max_concurrent_transforms, - max_payload=max_payload, - tags=tags, - role=new_role, - volume_kms_key=kms_key, - entry_point=SERVING_SCRIPT_FILE, - vpc_config_override=vpc_config, - enable_network_isolation=True, - model_name=model_name, - ) - - create_model.assert_called_with( - role=new_role, - vpc_config_override=vpc_config, - entry_point=SERVING_SCRIPT_FILE, - enable_network_isolation=True, - name=model_name, - ) - model.transformer.assert_called_with( - INSTANCE_COUNT, - INSTANCE_TYPE, - accept=accept_type, - assemble_with=assemble_with, - env=env, - max_concurrent_transforms=max_concurrent_transforms, - max_payload=max_payload, - output_kms_key=kms_key, - output_path=output_path, - strategy=strategy, - tags=tags, - volume_kms_key=kms_key, - ) - - -@patch("sagemaker.tensorflow.estimator.TensorFlow.create_model") -def test_transformer_creation_without_optional_args(create_model, sagemaker_session): - model = Mock() - create_model.return_value = model - - tf = TensorFlow( - entry_point=SCRIPT_PATH, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - ) - tf.latest_training_job = _TrainingJob(sagemaker_session, "some-job-name") - tf.transformer(INSTANCE_COUNT, INSTANCE_TYPE) - - create_model.assert_called_with( - role=ROLE, - vpc_config_override="VPC_CONFIG_DEFAULT", - entry_point=None, - enable_network_isolation=False, - name=None, - ) - model.transformer.assert_called_with( - INSTANCE_COUNT, - INSTANCE_TYPE, - accept=None, - assemble_with=None, - env=None, - max_concurrent_transforms=None, - max_payload=None, - output_kms_key=None, - output_path=None, - strategy=None, - tags=None, - volume_kms_key=None, - ) - - -def test_attach_custom_image(sagemaker_session): - training_image = "1.dkr.ecr.us-west-2.amazonaws.com/tensorflow_with_custom_binary:1.0" - rjd = { - "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, - "HyperParameters": { - "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', - "sagemaker_program": '"iris-dnn-classifier.py"', - "sagemaker_enable_cloudwatch_metrics": "false", - "sagemaker_container_log_level": '"logging.INFO"', - "sagemaker_job_name": '"neo"', - }, - "RoleArn": "arn:aws:iam::366:role/SageMakerRole", - "ResourceConfig": { - "VolumeSizeInGB": 30, - "InstanceCount": 1, - "InstanceType": "ml.c4.xlarge", - }, - "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, - "TrainingJobName": "neo", - "TrainingJobStatus": "Completed", - "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", - "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, - "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, - } - sagemaker_session.sagemaker_client.describe_training_job = Mock( - name="describe_training_job", return_value=rjd - ) - - estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) - assert estimator.image_name == training_image - assert estimator.train_image() == training_image - - -@patch("sagemaker.fw_utils.python_deprecation_warning") -def test_estimator_py2_deprecation_warning(warning, sagemaker_session): - estimator = TensorFlow( - entry_point=SCRIPT_PATH, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - py_version="py2", - ) - - assert estimator.py_version == "py2" - warning.assert_called_with("tensorflow", "2.1.0") - - -@patch("sagemaker.fw_utils.empty_framework_version_warning") -def test_empty_framework_version(warning, sagemaker_session): - estimator = TensorFlow( - entry_point=SCRIPT_PATH, - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - framework_version=None, - ) - - assert estimator.framework_version == defaults.TF_VERSION - warning.assert_called_with(defaults.TF_VERSION, estimator.LATEST_VERSION) - - -def test_py2_version_deprecated(sagemaker_session): - with pytest.raises(AttributeError) as e: - TensorFlow( - entry_point=SCRIPT_PATH, - framework_version="2.1.1", - role=ROLE, - sagemaker_session=sagemaker_session, - train_instance_count=INSTANCE_COUNT, - train_instance_type=INSTANCE_TYPE, - py_version="py2", - ) - - msg = ( - "Python 2 containers are only available with 2.1.0 and lower versions. " - "Please use a Python 3 container." - ) - assert msg in str(e.value) - - -def test_py2_version_is_not_deprecated(sagemaker_session): - estimator = _build_tf( - sagemaker_session=sagemaker_session, framework_version="1.15.0", py_version="py2" - ) - assert estimator.py_version == "py2" - estimator = _build_tf( - sagemaker_session=sagemaker_session, framework_version="2.0.0", py_version="py2" - ) - assert estimator.py_version == "py2" - - -def test_py3_is_default_version_before_tf1_14(sagemaker_session): - estimator = _build_tf(sagemaker_session=sagemaker_session, framework_version="1.13") - - assert estimator.py_version == "py2" - - estimator = _build_tf(sagemaker_session=sagemaker_session, framework_version="1.10") - - assert estimator.py_version == "py2" - - -def test_legacy_mode_framework_name(sagemaker_session): - tf = _build_tf(sagemaker_session=sagemaker_session, framework_version="1.10") - assert tf.__framework_name__ == "tensorflow" - - -def test_script_mode_create_model(sagemaker_session): - tf = _build_tf( - sagemaker_session=sagemaker_session, py_version="py3", enable_network_isolation=True - ) - tf._prepare_for_training() # set output_path and job name as if training happened - - model = tf.create_model() - - assert isinstance(model, serving.Model) - - assert model.model_data == tf.model_data - assert model.role == tf.role - assert model.name == tf._current_job_name - assert model.container_log_level == tf.container_log_level - assert model._framework_version == "1.11" - assert model.sagemaker_session == sagemaker_session - assert model.enable_network_isolation() - - -@patch("time.strftime", return_value=TIMESTAMP) -@patch("time.time", return_value=TIME) -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_tf_script_mode(time, strftime, sagemaker_session): - tf = TensorFlow( - entry_point=SCRIPT_FILE, - role=ROLE, - sagemaker_session=sagemaker_session, - py_version="py3", - train_instance_type=INSTANCE_TYPE, - train_instance_count=1, - framework_version="1.11", - source_dir=DATA_DIR, - ) - - inputs = "s3://mybucket/train" - tf.fit(inputs=inputs) - - call_names = [c[0] for c in sagemaker_session.method_calls] - assert call_names == ["train", "logs_for_job"] - - expected_train_args = _create_train_job( - "1.11", script_mode=True, repo_name=SM_IMAGE_REPO_NAME, py_version="py3" - ) - expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs - - actual_train_args = sagemaker_session.method_calls[0][2] - assert actual_train_args == expected_train_args - - -@patch("time.strftime", return_value=TIMESTAMP) -@patch("time.time", return_value=TIME) -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_tf_script_mode_ps(time, strftime, sagemaker_session): - tf = TensorFlow( - entry_point=SCRIPT_FILE, - role=ROLE, - sagemaker_session=sagemaker_session, - py_version="py3", - train_instance_type=INSTANCE_TYPE, - train_instance_count=1, - framework_version="1.11", - source_dir=DATA_DIR, - distributions=DISTRIBUTION_ENABLED, - ) - - inputs = "s3://mybucket/train" - tf.fit(inputs=inputs) - - call_names = [c[0] for c in sagemaker_session.method_calls] - assert call_names == ["train", "logs_for_job"] - - expected_train_args = _create_train_job( - "1.11", script_mode=True, ps=True, repo_name=SM_IMAGE_REPO_NAME, py_version="py3" - ) - expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs - expected_train_args["hyperparameters"][TensorFlow.LAUNCH_PS_ENV_NAME] = json.dumps(True) - - actual_train_args = sagemaker_session.method_calls[0][2] - assert actual_train_args == expected_train_args - - -@patch("time.strftime", return_value=TIMESTAMP) -@patch("time.time", return_value=TIME) -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_tf_script_mode_mpi(time, strftime, sagemaker_session): - tf = TensorFlow( - entry_point=SCRIPT_FILE, - role=ROLE, - sagemaker_session=sagemaker_session, - py_version="py3", - train_instance_type=INSTANCE_TYPE, - train_instance_count=1, - framework_version="1.11", - source_dir=DATA_DIR, - distributions=DISTRIBUTION_MPI_ENABLED, - ) - - inputs = "s3://mybucket/train" - tf.fit(inputs=inputs) - - call_names = [c[0] for c in sagemaker_session.method_calls] - assert call_names == ["train", "logs_for_job"] - - expected_train_args = _create_train_job( - "1.11", script_mode=True, horovod=True, repo_name=SM_IMAGE_REPO_NAME, py_version="py3" - ) - expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs - expected_train_args["hyperparameters"][TensorFlow.LAUNCH_MPI_ENV_NAME] = json.dumps(True) - expected_train_args["hyperparameters"][TensorFlow.MPI_NUM_PROCESSES_PER_HOST] = json.dumps(2) - expected_train_args["hyperparameters"][TensorFlow.MPI_CUSTOM_MPI_OPTIONS] = json.dumps( - "options" - ) - - actual_train_args = sagemaker_session.method_calls[0][2] - assert actual_train_args == expected_train_args - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_tf_script_mode_attach(sagemaker_session, tf_version): - training_image = "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-py3-cpu:{}-cpu-py3".format( - tf_version - ) - rjd = { - "AlgorithmSpecification": {"TrainingInputMode": "File", "TrainingImage": training_image}, - "HyperParameters": { - "sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"', - "sagemaker_program": '"iris-dnn-classifier.py"', - "sagemaker_enable_cloudwatch_metrics": "false", - "sagemaker_container_log_level": '"logging.INFO"', - "sagemaker_job_name": '"neo"', - }, - "RoleArn": "arn:aws:iam::366:role/SageMakerRole", - "ResourceConfig": { - "VolumeSizeInGB": 30, - "InstanceCount": 1, - "InstanceType": "ml.c4.xlarge", - }, - "StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60}, - "TrainingJobName": "neo", - "TrainingJobStatus": "Completed", - "TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo", - "OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"}, - "TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"}, - } - sagemaker_session.sagemaker_client.describe_training_job = Mock( - name="describe_training_job", return_value=rjd - ) - - estimator = TensorFlow.attach(training_job_name="neo", sagemaker_session=sagemaker_session) - assert estimator.latest_training_job.job_name == "neo" - assert estimator.py_version == "py3" - assert estimator.framework_version == tf_version - assert estimator.role == "arn:aws:iam::366:role/SageMakerRole" - assert estimator.train_instance_count == 1 - assert estimator.train_max_run == 24 * 60 * 60 - assert estimator.input_mode == "File" - assert estimator.input_mode == "File" - assert estimator.base_job_name == "neo" - assert estimator.output_path == "s3://place/output/neo" - assert estimator.output_kms_key == "" - assert estimator.hyperparameters() is not None - assert estimator.source_dir == "s3://some/sourcedir.tar.gz" - assert estimator.entry_point == "iris-dnn-classifier.py" - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_tf_enable_sm_metrics(sagemaker_session): - tf = _build_tf(sagemaker_session, enable_sagemaker_metrics=True) - assert tf.enable_sagemaker_metrics - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_tf_disable_sm_metrics(sagemaker_session): - tf = _build_tf(sagemaker_session, enable_sagemaker_metrics=False) - assert not tf.enable_sagemaker_metrics - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_tf_disable_sm_metrics_if_fw_ver_is_less_than_1_15(sagemaker_session): - for fw_version in ["1.11", "1.12", "1.13", "1.14"]: - tf = _build_tf(sagemaker_session, framework_version=fw_version) - assert tf.enable_sagemaker_metrics is None - - -@patch("sagemaker.utils.create_tar_file", MagicMock()) -def test_tf_enable_sm_metrics_if_fw_ver_is_at_least_1_15(sagemaker_session): - for fw_version in ["1.15", "1.16", "2.0", "2.1"]: - tf = _build_tf(sagemaker_session, framework_version=fw_version) - assert tf.enable_sagemaker_metrics - - -def test_custom_image_estimator_deploy(sagemaker_session): - custom_image = "mycustomimage:latest" - tf = _build_tf(sagemaker_session) - tf.fit(inputs="s3://mybucket/train", job_name="new_name") - model = tf.create_model(image=custom_image) - assert model.image == custom_image