diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index fd3b7a5d16..2f8aafdf77 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -291,6 +291,7 @@ def add_arv_hints():
         "http://arvados.org/cwl#OutputCollectionProperties",
         "http://arvados.org/cwl#KeepCacheTypeRequirement",
         "http://arvados.org/cwl#OutOfMemoryRetry",
+        "http://arvados.org/cwl#SpotInstanceRetry",
     ])
 
 def exit_signal_handler(sigcode, frame):
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 84b98378f4..f45fdcc62d 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -429,6 +429,25 @@ class ArvadosContainer(JobBase):
             logger.debug("Container request was %s", container_request)
             self.output_callback({}, "permanentFail")
 
+    def spot_instance_retry(self, record, container):
+        """Return True if the container should be retried without preemption.
+
+        The retry is requested only when get_requirement() finds
+        http://arvados.org/cwl#SpotInstanceRetry for this tool and the
+        container record carries a truthy "preemptionNotice".
+
+        `record` is not consulted; the signature mirrors
+        out_of_memory_retry().
+        """
+        spot_instance_retry_req, _ = self.get_requirement("http://arvados.org/cwl#SpotInstanceRetry")
+        if spot_instance_retry_req is None:
+            return False
+        # A record without the key means "not preempted"; indexing it
+        # directly would raise KeyError in the caller instead.
+        # NOTE(review): confirm the API reports preemptionNotice at the
+        # top level of the container record.
+        return bool(container.get("preemptionNotice"))
+
     def out_of_memory_retry(self, record, container):
         oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
         if oom_retry_req is None:
@@ -485,7 +504,16 @@ class ArvadosContainer(JobBase):
                     self.run(None)
                     retried = True
                     return
-
+                # A reclaimed preemptible instance gets one more attempt, with
+                # preemptible scheduling disabled for the resubmission.
+                if processStatus == "permanentFail" and self.attempt_count == 1 and self.spot_instance_retry(record, container):
+                    logger.warning("%s Container failed with preemptible instance reclaimed, trying again nonpreemptible",
+                                   self.arvrunner.label(self))
+                    self.job_runtime.enable_preemptible = False
+                    self.run(None)
+                    retried = True
+                    return
+
                 if rcode == 137:
                     logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
                                  self.arvrunner.label(self))
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
index 8e3a8ab85e..47b74ab35f 100644
--- a/sdk/cwl/tests/test_container.py
+++ b/sdk/cwl/tests/test_container.py
@@ -85,8 +85,7 @@ class TestContainer(unittest.TestCase):
              "construct_tool_object": runner.arv_make_tool,
              "fetcher_constructor": functools.partial(arvados_cwl.CollectionFetcher, api_client=runner.api, fs_access=fs_access),
              "loader": Loader({}),
-             "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}),
-             "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__
+             "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"})
              })
         runtimeContext = arvados_cwl.context.ArvRuntimeContext(
             {"work_api": "containers",
@@ -1464,8 +1463,7 @@ class TestWorkflow(unittest.TestCase):
              "make_fs_access": make_fs_access,
              "loader": document_loader,
              "metadata": {"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"},
-             "construct_tool_object": runner.arv_make_tool,
-             "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__})
+             "construct_tool_object": runner.arv_make_tool})
         runtimeContext = arvados_cwl.context.ArvRuntimeContext(
             {"work_api": "containers",
              "basedir": "",
@@ -1710,3 +1708,59 @@ class TestWorkflow(unittest.TestCase):
         api._rootDesc = copy.deepcopy(get_rootDesc())
         runner = arvados_cwl.executor.ArvCwlExecutor(api)
         self.assertEqual(runner.work_api, 'containers')
+
+    # Finish a failed container through done() and check that the job
+    # was resubmitted.
+    @mock.patch("arvados.collection.Collection")
+    def test_spot_instance_retry(self, collection_mock):
+        arvados_cwl.add_arv_hints()
+
+        api = mock.MagicMock()
+
+        runner = mock.MagicMock()
+        runner.api = api
+        runner.num_retries = 0
+        runner.ignore_docker_for_reuse = False
+        runner.intermediate_output_ttl = 0
+        runner.secret_store = cwltool.secrets.SecretStore()
+
+        # NOTE(review): this container record has no "preemptionNotice"
+        # key and the job below is built with empty lists (presumably its
+        # requirements and hints), so the retry path may never be taken.
+        # Confirm the fixture before relying on the assertions at the end.
+        runner.api.containers().get().execute.return_value = {
+            "state": "Complete",
+            "output": "abc+123",
+            "exit_code": 138 # Want exit code to be failure
+        }
+        # Add assertions to make sure it reran as nonpreemptible
+        loadingContext, runtimeContext = self.helper(runner)
+        arvjob = arvados_cwl.ArvadosContainer(runner,
+                                              runtimeContext,
+                                              mock.MagicMock(),
+                                              {},
+                                              None,
+                                              [],
+                                              [],
+                                              "testjob")
+        arvjob.output_callback = mock.MagicMock()
+        arvjob.collect_outputs = mock.MagicMock()
+        arvjob.successCodes = [0]
+        arvjob.outdir = "/var/spool/cwl"
+        arvjob.output_ttl = 3600
+        arvjob.uuid = "zzzzz-xvhdp-zzzzzzzzzzzzzz1"
+
+        arvjob.collect_outputs.return_value = {"out": "stuff"}
+
+        arvjob.done({
+            "state": "Final",
+            "log_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
+            "output_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
+            "uuid": "zzzzz-xvhdp-zzzzzzzzzzzzzzz",
+            "container_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz",
+            "modified_at": "2017-05-26T12:01:22Z",
+            "properties": {}
+        })
+
+        self.assertTrue(api.container_requests().create.called)
+        self.assertEqual(arvjob.attempt_count, 2)