diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py index bd7ae440c5..3ace575682 100644 --- a/src/sagemaker/session.py +++ b/src/sagemaker/session.py @@ -41,6 +41,7 @@ secondary_training_status_changed, secondary_training_status_message, sts_regional_endpoint, + retries, ) from sagemaker import exceptions from sagemaker.session_settings import SessionSettings @@ -4699,21 +4700,30 @@ def _train_done(sagemaker_client, job_name, last_desc): """Placeholder docstring""" in_progress_statuses = ["InProgress", "Created"] - desc = sagemaker_client.describe_training_job(TrainingJobName=job_name) - status = desc["TrainingJobStatus"] + for _ in retries( + max_retry_count=10, # 10*30 = 5min + exception_message_prefix="Waiting for schedule to leave 'Pending' status", + seconds_to_sleep=30, + ): + try: + desc = sagemaker_client.describe_training_job(TrainingJobName=job_name) + status = desc["TrainingJobStatus"] - if secondary_training_status_changed(desc, last_desc): - print() - print(secondary_training_status_message(desc, last_desc), end="") - else: - print(".", end="") - sys.stdout.flush() + if secondary_training_status_changed(desc, last_desc): + print() + print(secondary_training_status_message(desc, last_desc), end="") + else: + print(".", end="") + sys.stdout.flush() - if status in in_progress_statuses: - return desc, False + if status in in_progress_statuses: + return desc, False - print() - return desc, True + print() + return desc, True + except botocore.exceptions.ClientError as err: + if err.response["Error"]["Code"] == "AccessDeniedException": + pass def _processing_job_status(sagemaker_client, job_name):