From 14651e370e7f6b6c20197ef3964d61efebdc51b1 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Tue, 20 Feb 2024 16:11:16 -0800 Subject: [PATCH 01/11] fix: add fixes for tarfile extractall functionality PEP-721 --- src/sagemaker/local/image.py | 4 ++- .../serve/model_server/djl_serving/prepare.py | 5 ++-- .../serve/model_server/tgi/prepare.py | 5 ++-- src/sagemaker/utils.py | 27 +++++++++++++++++-- src/sagemaker/workflow/_repack_model.py | 5 +++- src/sagemaker/workflow/_utils.py | 10 +++++-- tests/integ/s3_utils.py | 5 +++- tests/integ/utils.py | 1 + tests/unit/test_fw_utils.py | 5 ++-- tests/unit/test_utils.py | 15 +++++++++++ 10 files changed, 69 insertions(+), 13 deletions(-) diff --git a/src/sagemaker/local/image.py b/src/sagemaker/local/image.py index f38bc1fbe5..7893ee9260 100644 --- a/src/sagemaker/local/image.py +++ b/src/sagemaker/local/image.py @@ -40,6 +40,7 @@ import sagemaker.local.data import sagemaker.local.utils import sagemaker.utils +from sagemaker.utils import check_tarfile_data_filter_attribute CONTAINER_PREFIX = "algo" STUDIO_HOST_NAME = "sagemaker-local" @@ -686,7 +687,8 @@ def _prepare_serving_volumes(self, model_location): for filename in model_data_source.get_file_list(): if tarfile.is_tarfile(filename): with tarfile.open(filename) as tar: - tar.extractall(path=model_data_source.get_root_dir()) + check_tarfile_data_filter_attribute() + tar.extractall(path=model_data_source.get_root_dir(), filter="data") volumes.append(_Volume(model_data_source.get_root_dir(), "/opt/ml/model")) diff --git a/src/sagemaker/serve/model_server/djl_serving/prepare.py b/src/sagemaker/serve/model_server/djl_serving/prepare.py index 386c5fb66e..6bdada0b6c 100644 --- a/src/sagemaker/serve/model_server/djl_serving/prepare.py +++ b/src/sagemaker/serve/model_server/djl_serving/prepare.py @@ -20,7 +20,7 @@ from typing import List from pathlib import Path -from sagemaker.utils import _tmpdir +from sagemaker.utils import _tmpdir, check_tarfile_data_filter_attribute from sagemaker.s3 import S3Downloader from sagemaker.djl_inference import DJLModel from sagemaker.djl_inference.model import _read_existing_serving_properties @@ -53,7 +53,8 @@ def _extract_js_resource(js_model_dir: str, js_id: str): """Uncompress the jumpstart resource""" tmp_sourcedir = Path(js_model_dir).joinpath(f"infer-prepack-{js_id}.tar.gz") with tarfile.open(str(tmp_sourcedir)) as resources: - resources.extractall(path=js_model_dir) + check_tarfile_data_filter_attribute() + resources.extractall(path=js_model_dir, filter="data") def _copy_jumpstart_artifacts(model_data: str, js_id: str, code_dir: Path): diff --git a/src/sagemaker/serve/model_server/tgi/prepare.py b/src/sagemaker/serve/model_server/tgi/prepare.py index fe1162e505..9b187dd2ed 100644 --- a/src/sagemaker/serve/model_server/tgi/prepare.py +++ b/src/sagemaker/serve/model_server/tgi/prepare.py @@ -19,7 +19,7 @@ from pathlib import Path from sagemaker.serve.utils.local_hardware import _check_disk_space, _check_docker_disk_usage -from sagemaker.utils import _tmpdir +from sagemaker.utils import _tmpdir, check_tarfile_data_filter_attribute from sagemaker.s3 import S3Downloader logger = logging.getLogger(__name__) @@ -29,7 +29,8 @@ def _extract_js_resource(js_model_dir: str, code_dir: Path, js_id: str): """Uncompress the jumpstart resource""" tmp_sourcedir = Path(js_model_dir).joinpath(f"infer-prepack-{js_id}.tar.gz") with tarfile.open(str(tmp_sourcedir)) as resources: - resources.extractall(path=code_dir) + check_tarfile_data_filter_attribute() + resources.extractall(path=code_dir, filter="data") def _copy_jumpstart_artifacts(model_data: str, js_id: str, code_dir: Path) -> bool: diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 15a3d128de..2c4f868c15 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -22,6 +22,7 @@ import random import re import shutil +import sys import tarfile import tempfile import time @@ -591,7 +592,8 @@ def _create_or_update_code_dir( download_file_from_url(source_directory, local_code_path, sagemaker_session) with tarfile.open(name=local_code_path, mode="r:gz") as t: - t.extractall(path=code_dir) + check_tarfile_data_filter_attribute() + t.extractall(path=code_dir, filter="data") elif source_directory: if os.path.exists(code_dir): @@ -628,7 +630,8 @@ def _extract_model(model_uri, sagemaker_session, tmp): else: local_model_path = model_uri.replace("file://", "") with tarfile.open(name=local_model_path, mode="r:gz") as t: - t.extractall(path=tmp_model_dir) + check_tarfile_data_filter_attribute() + t.extractall(path=tmp_model_dir, filter="data") return tmp_model_dir @@ -1489,3 +1492,23 @@ def format_tags(tags: Tags) -> List[TagsDict]: return [{"Key": str(k), "Value": str(v)} for k, v in tags.items()] return tags + + +class PythonVersionError(Exception): + pass + + +def check_tarfile_data_filter_attribute(): + """Check if tarfile has data_filter utility which has guardrails against untrusted + de-serialisation. + + Raises: + PythonVersionError: if `tarfile.data_filter` is not available. + """ + if not hasattr(tarfile, "data_filter"): + raise PythonVersionError( + f"Since tarfile extraction is unsafe the operation is prohibited " + f"per PEP-721. Please update your Python [{sys.version}] " + f"to latest patch [refer to https://www.python.org/downloads/] " + f"to consume the security patch" + ) diff --git a/src/sagemaker/workflow/_repack_model.py b/src/sagemaker/workflow/_repack_model.py index 3cfa6760b3..8117410e6f 100644 --- a/src/sagemaker/workflow/_repack_model.py +++ b/src/sagemaker/workflow/_repack_model.py @@ -33,6 +33,8 @@ # repacking is some short-lived hackery, right?? from distutils.dir_util import copy_tree +from sagemaker.utils import check_tarfile_data_filter_attribute + def repack(inference_script, model_archive, dependencies=None, source_dir=None): # pragma: no cover """Repack custom dependencies and code into an existing model TAR archive @@ -60,7 +62,8 @@ def repack(inference_script, model_archive, dependencies=None, source_dir=None): # extract the contents of the previous training job's model archive to the "src" # directory of this training job with tarfile.open(name=local_path, mode="r:gz") as tf: - tf.extractall(path=src_dir) + check_tarfile_data_filter_attribute() + tf.extractall(path=src_dir, filter="data") if source_dir: # copy /opt/ml/code to code/ diff --git a/src/sagemaker/workflow/_utils.py b/src/sagemaker/workflow/_utils.py index 9c4fa114ab..1b88bfd924 100644 --- a/src/sagemaker/workflow/_utils.py +++ b/src/sagemaker/workflow/_utils.py @@ -32,7 +32,12 @@ Step, ConfigurableRetryStep, ) -from sagemaker.utils import _save_model, download_file_from_url, format_tags +from sagemaker.utils import ( + _save_model, + download_file_from_url, + format_tags, + check_tarfile_data_filter_attribute, +) from sagemaker.workflow.retry import RetryPolicy from sagemaker.workflow.utilities import trim_request_dict @@ -257,7 +262,8 @@ def _inject_repack_script_and_launcher(self): download_file_from_url(self._source_dir, old_targz_path, self.sagemaker_session) with tarfile.open(name=old_targz_path, mode="r:gz") as t: - t.extractall(path=targz_contents_dir) + check_tarfile_data_filter_attribute() + t.extractall(path=targz_contents_dir, filter="data") shutil.copy2(fname, os.path.join(targz_contents_dir, REPACK_SCRIPT)) with open( diff --git a/tests/integ/s3_utils.py b/tests/integ/s3_utils.py index 58a403341e..500dc4a33a 100644 --- a/tests/integ/s3_utils.py +++ b/tests/integ/s3_utils.py @@ -19,6 +19,8 @@ import boto3 from six.moves.urllib.parse import urlparse +from sagemaker.utils import check_tarfile_data_filter_attribute + def assert_s3_files_exist(sagemaker_session, s3_url, files): parsed_url = urlparse(s3_url) @@ -55,4 +57,5 @@ def extract_files_from_s3(s3_url, tmpdir, sagemaker_session): s3.Bucket(parsed_url.netloc).download_file(parsed_url.path.lstrip("/"), model) with tarfile.open(model, "r") as tar_file: - tar_file.extractall(tmpdir) + check_tarfile_data_filter_attribute() + tar_file.extractall(tmpdir, filter="data") diff --git a/tests/integ/utils.py b/tests/integ/utils.py index 0283f09f6e..adc12a5aa4 100644 --- a/tests/integ/utils.py +++ b/tests/integ/utils.py @@ -13,6 +13,7 @@ from __future__ import absolute_import import logging import shutil +import tarfile from functools import wraps from botocore.exceptions import ClientError diff --git a/tests/unit/test_fw_utils.py b/tests/unit/test_fw_utils.py index fa419eb848..4600785159 100644 --- a/tests/unit/test_fw_utils.py +++ b/tests/unit/test_fw_utils.py @@ -24,7 +24,7 @@ from mock import Mock, patch from sagemaker import fw_utils -from sagemaker.utils import name_from_image +from sagemaker.utils import name_from_image, check_tarfile_data_filter_attribute from sagemaker.session_settings import SessionSettings from sagemaker.instance_group import InstanceGroup @@ -424,7 +424,8 @@ def list_tar_files(folder, tar_ball, tmpdir): startpath = str(tmpdir.ensure(folder, dir=True)) with tarfile.open(name=tar_ball, mode="r:gz") as t: - t.extractall(path=startpath) + check_tarfile_data_filter_attribute() + t.extractall(path=startpath, filter="data") def walk(): for root, dirs, files in os.walk(startpath): diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index d733752428..fafd3361d5 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -42,6 +42,8 @@ resolve_nested_dict_value_from_config, update_list_of_dicts_with_values_from_config, volume_size_supported, + PythonVersionError, + check_tarfile_data_filter_attribute, ) from tests.unit.sagemaker.workflow.helpers import CustomStep from sagemaker.workflow.parameters import ParameterString, ParameterInteger @@ -1748,3 +1750,16 @@ def test_instance_family_from_full_instance_type(self): for instance_type, family in instance_type_to_family_test_dict.items(): self.assertEqual(family, get_instance_type_family(instance_type)) + + +class TestCheckTarfileDataFilterAttribute(TestCase): + def test_check_tarfile_data_filter_attribute_unhappy_case(self): + with pytest.raises(PythonVersionError): + with patch("tarfile.data_filter", None): + delattr(tarfile, "data_filter") + check_tarfile_data_filter_attribute() + + def test_check_tarfile_data_filter_attribute_happy_case(self): + with patch("tarfile.data_filter", "some_value"): + delattr(tarfile, "data_filter") + check_tarfile_data_filter_attribute() From 239128153c532a3004cc23620ed69b52396dc827 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Tue, 20 Feb 2024 16:23:59 -0800 Subject: [PATCH 02/11] fix: add unit test fixes --- src/sagemaker/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 2c4f868c15..5d78905aa7 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -1495,6 +1495,7 @@ def format_tags(tags: Tags) -> List[TagsDict]: class PythonVersionError(Exception): + """Raise when a secure [/patched] version of Python is not used.""" pass @@ -1505,6 +1506,7 @@ def check_tarfile_data_filter_attribute(): Raises: PythonVersionError: if `tarfile.data_filter` is not available. """ + # The function and it's usages can be deprecated post support of python >= 3.12 if not hasattr(tarfile, "data_filter"): raise PythonVersionError( f"Since tarfile extraction is unsafe the operation is prohibited " From 55011f280e7d6ca273055d39517309a827df1005 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Wed, 21 Feb 2024 09:17:07 -0800 Subject: [PATCH 03/11] fix: add unit test fixes --- src/sagemaker/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 5d78905aa7..339f7437a2 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -1496,6 +1496,7 @@ def format_tags(tags: Tags) -> List[TagsDict]: class PythonVersionError(Exception): """Raise when a secure [/patched] version of Python is not used.""" + pass From 2ee4ce1eb02d0a7f8d6dd47d8ae0e44b0676f230 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Wed, 21 Feb 2024 09:42:45 -0800 Subject: [PATCH 04/11] fix: add unit test fixes --- src/sagemaker/utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 339f7437a2..3eeb857402 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -1497,8 +1497,6 @@ def format_tags(tags: Tags) -> List[TagsDict]: class PythonVersionError(Exception): """Raise when a secure [/patched] version of Python is not used.""" - pass - def check_tarfile_data_filter_attribute(): """Check if tarfile has data_filter utility which has guardrails against untrusted From de58ca766028fc5754d60129512a08a2bf21d2bd Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Wed, 21 Feb 2024 10:20:49 -0800 Subject: [PATCH 05/11] fix flake8 --- src/sagemaker/utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/sagemaker/utils.py b/src/sagemaker/utils.py index 3eeb857402..a6d26db48b 100644 --- a/src/sagemaker/utils.py +++ b/src/sagemaker/utils.py @@ -1499,8 +1499,9 @@ class PythonVersionError(Exception): def check_tarfile_data_filter_attribute(): - """Check if tarfile has data_filter utility which has guardrails against untrusted - de-serialisation. + """Check if tarfile has data_filter utility. + + Tarfile-data_filter utility has guardrails against untrusted de-serialisation. Raises: PythonVersionError: if `tarfile.data_filter` is not available. From 116e6a5f7cc5430992faab5dfcb2f5bf2e5ea038 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Wed, 21 Feb 2024 10:28:55 -0800 Subject: [PATCH 06/11] fix flake8 --- tests/integ/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integ/utils.py b/tests/integ/utils.py index adc12a5aa4..0283f09f6e 100644 --- a/tests/integ/utils.py +++ b/tests/integ/utils.py @@ -13,7 +13,6 @@ from __future__ import absolute_import import logging import shutil -import tarfile from functools import wraps from botocore.exceptions import ClientError From 8001d1ede171126c50437580d3e574da8dd9ec2a Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Wed, 21 Feb 2024 11:46:47 -0800 Subject: [PATCH 07/11] fix flake8 --- tests/unit/test_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index fafd3361d5..8488a8308e 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1761,5 +1761,4 @@ def test_check_tarfile_data_filter_attribute_unhappy_case(self): def test_check_tarfile_data_filter_attribute_happy_case(self): with patch("tarfile.data_filter", "some_value"): - delattr(tarfile, "data_filter") check_tarfile_data_filter_attribute() From 298423ab8b0042d92b4a54f80c5fe0a4e42f9176 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Wed, 21 Feb 2024 16:45:07 -0800 Subject: [PATCH 08/11] fix unit tests --- .../serve/model_server/djl_serving/test_djl_prepare.py | 2 +- tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/sagemaker/serve/model_server/djl_serving/test_djl_prepare.py b/tests/unit/sagemaker/serve/model_server/djl_serving/test_djl_prepare.py index 40d3edb251..caa8884186 100644 --- a/tests/unit/sagemaker/serve/model_server/djl_serving/test_djl_prepare.py +++ b/tests/unit/sagemaker/serve/model_server/djl_serving/test_djl_prepare.py @@ -272,4 +272,4 @@ def test_extract_js_resources_success(self, mock_tarfile, mock_path): mock_path.assert_called_once_with(js_model_dir) mock_path_obj.joinpath.assert_called_once_with(f"infer-prepack-{MOCK_JUMPSTART_ID}.tar.gz") - mock_resource_obj.extractall.assert_called_once_with(path=js_model_dir) + mock_resource_obj.extractall.assert_called_once_with(path=js_model_dir, filter="data") diff --git a/tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py b/tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py index c055be1f7d..c072f3cb99 100644 --- a/tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py +++ b/tests/unit/sagemaker/serve/model_server/tgi/test_tgi_prepare.py @@ -156,4 +156,4 @@ def test_extract_js_resources_success(self, mock_tarfile, mock_path): mock_path.assert_called_once_with(js_model_dir) mock_path_obj.joinpath.assert_called_once_with(f"infer-prepack-{MOCK_JUMPSTART_ID}.tar.gz") - mock_resource_obj.extractall.assert_called_once_with(path=code_dir) + mock_resource_obj.extractall.assert_called_once_with(path=code_dir, filter="data") From cfc46c7ded40e9db218cf4a113e9829969a39e94 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Thu, 22 Feb 2024 16:35:00 -0800 Subject: [PATCH 09/11] remove fix for repack model --- src/sagemaker/workflow/_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/sagemaker/workflow/_utils.py b/src/sagemaker/workflow/_utils.py index 1b88bfd924..626623fa18 100644 --- a/src/sagemaker/workflow/_utils.py +++ b/src/sagemaker/workflow/_utils.py @@ -262,8 +262,7 @@ def _inject_repack_script_and_launcher(self): download_file_from_url(self._source_dir, old_targz_path, self.sagemaker_session) with tarfile.open(name=old_targz_path, mode="r:gz") as t: - check_tarfile_data_filter_attribute() - t.extractall(path=targz_contents_dir, filter="data") + t.extractall(path=targz_contents_dir) shutil.copy2(fname, os.path.join(targz_contents_dir, REPACK_SCRIPT)) with open( From 209285dace87291a3dc39d201b28517fd9d91028 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Thu, 22 Feb 2024 16:38:01 -0800 Subject: [PATCH 10/11] remove import --- src/sagemaker/workflow/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sagemaker/workflow/_utils.py b/src/sagemaker/workflow/_utils.py index 626623fa18..7067a5a03c 100644 --- a/src/sagemaker/workflow/_utils.py +++ b/src/sagemaker/workflow/_utils.py @@ -36,7 +36,6 @@ _save_model, download_file_from_url, format_tags, - check_tarfile_data_filter_attribute, ) from sagemaker.workflow.retry import RetryPolicy from sagemaker.workflow.utilities import trim_request_dict From f174fa49fcbacce737262893be0df05238cf7824 Mon Sep 17 00:00:00 2001 From: Mufaddal Rohawala Date: Thu, 22 Feb 2024 19:42:51 -0800 Subject: [PATCH 11/11] remove import --- src/sagemaker/workflow/_repack_model.py | 5 +---- src/sagemaker/workflow/_utils.py | 4 +++- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/sagemaker/workflow/_repack_model.py b/src/sagemaker/workflow/_repack_model.py index 8117410e6f..3cfa6760b3 100644 --- a/src/sagemaker/workflow/_repack_model.py +++ b/src/sagemaker/workflow/_repack_model.py @@ -33,8 +33,6 @@ # repacking is some short-lived hackery, right?? from distutils.dir_util import copy_tree -from sagemaker.utils import check_tarfile_data_filter_attribute - def repack(inference_script, model_archive, dependencies=None, source_dir=None): # pragma: no cover """Repack custom dependencies and code into an existing model TAR archive @@ -62,8 +60,7 @@ def repack(inference_script, model_archive, dependencies=None, source_dir=None): # extract the contents of the previous training job's model archive to the "src" # directory of this training job with tarfile.open(name=local_path, mode="r:gz") as tf: - check_tarfile_data_filter_attribute() - tf.extractall(path=src_dir, filter="data") + tf.extractall(path=src_dir) if source_dir: # copy /opt/ml/code to code/ diff --git a/src/sagemaker/workflow/_utils.py b/src/sagemaker/workflow/_utils.py index 7067a5a03c..1b88bfd924 100644 --- a/src/sagemaker/workflow/_utils.py +++ b/src/sagemaker/workflow/_utils.py @@ -36,6 +36,7 @@ _save_model, download_file_from_url, format_tags, + check_tarfile_data_filter_attribute, ) from sagemaker.workflow.retry import RetryPolicy from sagemaker.workflow.utilities import trim_request_dict @@ -261,7 +262,8 @@ def _inject_repack_script_and_launcher(self): download_file_from_url(self._source_dir, old_targz_path, self.sagemaker_session) with tarfile.open(name=old_targz_path, mode="r:gz") as t: - t.extractall(path=targz_contents_dir) + check_tarfile_data_filter_attribute() + t.extractall(path=targz_contents_dir, filter="data") shutil.copy2(fname, os.path.join(targz_contents_dir, REPACK_SCRIPT)) with open(