|
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
|
|
index fd3b7a5d16..2f8aafdf77 100644
|
|
--- a/sdk/cwl/arvados_cwl/__init__.py
|
|
+++ b/sdk/cwl/arvados_cwl/__init__.py
|
|
@@ -291,6 +291,7 @@ def add_arv_hints():
|
|
"http://arvados.org/cwl#OutputCollectionProperties",
|
|
"http://arvados.org/cwl#KeepCacheTypeRequirement",
|
|
"http://arvados.org/cwl#OutOfMemoryRetry",
|
|
+ "http://arvados.org/cwl#SpotInstanceRetry",
|
|
])
|
|
|
|
def exit_signal_handler(sigcode, frame):
|
|
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
|
|
index 84b98378f4..f45fdcc62d 100644
|
|
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
|
|
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
|
|
@@ -429,6 +429,17 @@ class ArvadosContainer(JobBase):
|
|
logger.debug("Container request was %s", container_request)
|
|
self.output_callback({}, "permanentFail")
|
|
|
|
+
|
|
+ def spot_instance_retry(self, record, container):
+     """Decide whether a failed container should be resubmitted after preemption.
+
+     Returns True only when the job carries the
+     http://arvados.org/cwl#SpotInstanceRetry requirement/hint and the
+     container record has a truthy "preemptionNotice" value; otherwise
+     returns False.  `record` is accepted so the signature matches
+     out_of_memory_retry() and is not consulted here.
+     """
+     spot_instance_retry_req, _ = self.get_requirement("http://arvados.org/cwl#SpotInstanceRetry")
+     if spot_instance_retry_req is None:
+         # The feature is opt-in: without the requirement, never retry.
+         return False
+     # Use .get() so a container record that lacks the key simply means
+     # "no preemption notice" instead of raising KeyError.
+     return bool(container.get("preemptionNotice"))
+
|
|
def out_of_memory_retry(self, record, container):
|
|
oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
|
|
if oom_retry_req is None:
|
|
@@ -485,7 +496,12 @@ class ArvadosContainer(JobBase):
|
|
self.run(None)
|
|
retried = True
|
|
return
|
|
-
|
|
+ # Only the first attempt (attempt_count == 1) is eligible: if the
+ # container failed and spot_instance_retry() says so, resubmit the job
+ # with preemptible scheduling switched off.
+ if processStatus == "permanentFail" and self.attempt_count == 1 and self.spot_instance_retry(record, container):
+     # The "%s" placeholder needs the job label, as in the other warnings here.
+     logger.warning("%s Container failed with preemptible instance reclaimed, trying again nonpreemptible",
+                    self.arvrunner.label(self))
+     self.job_runtime.enable_preemptible = False
+     self.run(None)
+     retried = True
+     return
+
|
|
if rcode == 137:
|
|
logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
|
|
self.arvrunner.label(self))
|
|
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
|
|
index 8e3a8ab85e..47b74ab35f 100644
|
|
--- a/sdk/cwl/tests/test_container.py
|
|
+++ b/sdk/cwl/tests/test_container.py
|
|
@@ -85,8 +85,7 @@ class TestContainer(unittest.TestCase):
|
|
"construct_tool_object": runner.arv_make_tool,
|
|
"fetcher_constructor": functools.partial(arvados_cwl.CollectionFetcher, api_client=runner.api, fs_access=fs_access),
|
|
"loader": Loader({}),
|
|
- "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}),
|
|
- "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__
|
|
+ "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"})
|
|
})
|
|
runtimeContext = arvados_cwl.context.ArvRuntimeContext(
|
|
{"work_api": "containers",
|
|
@@ -580,7 +579,7 @@ class TestContainer(unittest.TestCase):
|
|
self.fail("RuntimeStatusLoggingHandler should not be called recursively")
|
|
|
|
|
|
- # Test to make sure that an exception raised from
|
|
+ # Test to make sure that an exception raised from
|
|
# get_current_container doesn't cause the logger to raise an
|
|
# exception
|
|
@mock.patch("arvados_cwl.util.get_current_container")
|
|
@@ -1464,8 +1463,7 @@ class TestWorkflow(unittest.TestCase):
|
|
"make_fs_access": make_fs_access,
|
|
"loader": document_loader,
|
|
"metadata": {"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"},
|
|
- "construct_tool_object": runner.arv_make_tool,
|
|
- "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__})
|
|
+ "construct_tool_object": runner.arv_make_tool})
|
|
runtimeContext = arvados_cwl.context.ArvRuntimeContext(
|
|
{"work_api": "containers",
|
|
"basedir": "",
|
|
@@ -1710,3 +1708,53 @@ class TestWorkflow(unittest.TestCase):
|
|
api._rootDesc = copy.deepcopy(get_rootDesc())
|
|
runner = arvados_cwl.executor.ArvCwlExecutor(api)
|
|
self.assertEqual(runner.work_api, 'containers')
|
|
+
|
|
+ @mock.patch("arvados.collection.Collection")
+ def test_spot_instance_retry(self, blah):
+     """A container that fails after a preemption notice is resubmitted.
+
+     Expects a new container request to be created, the attempt counter
+     to reach 2, and preemptible scheduling to be switched off on the
+     job's runtime context.
+     """
+     arvados_cwl.add_arv_hints()
+
+     api = mock.MagicMock()
+
+     runner = mock.MagicMock()
+     runner.api = api
+     runner.num_retries = 0
+     runner.ignore_docker_for_reuse = False
+     runner.intermediate_output_ttl = 0
+     runner.secret_store = cwltool.secrets.SecretStore()
+
+     runner.api.containers().get().execute.return_value = {
+         "state": "Complete",
+         "output": "abc+123",
+         # Not listed in successCodes below, so the container counts as failed.
+         "exit_code": 138,
+         # spot_instance_retry() reads this key to decide whether to rerun.
+         "preemptionNotice": True,
+     }
+
+     loadingContext, runtimeContext = self.helper(runner)
+     arvjob = arvados_cwl.ArvadosContainer(
+         runner,
+         runtimeContext,
+         mock.MagicMock(),
+         {},
+         None,
+         [],
+         # The retry is opt-in: without this entry spot_instance_retry()
+         # returns False before it ever looks at the container.
+         # NOTE(review): assumes the two list arguments are the job's
+         # requirements and hints -- confirm against ArvadosContainer.__init__.
+         [{"class": "http://arvados.org/cwl#SpotInstanceRetry"}],
+         "testjob")
+     arvjob.output_callback = mock.MagicMock()
+     arvjob.collect_outputs = mock.MagicMock()
+     arvjob.successCodes = [0]
+     arvjob.outdir = "/var/spool/cwl"
+     arvjob.output_ttl = 3600
+     arvjob.uuid = "zzzzz-xvhdp-zzzzzzzzzzzzzz1"
+
+     arvjob.collect_outputs.return_value = {"out": "stuff"}
+
+     arvjob.done({
+         "state": "Final",
+         "log_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
+         "output_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
+         "uuid": "zzzzz-xvhdp-zzzzzzzzzzzzzzz",
+         "container_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz",
+         "modified_at": "2017-05-26T12:01:22Z",
+         "properties": {}
+     })
+
+     self.assertTrue(api.container_requests().create.called)
+     # assertEqual reports both values on failure, unlike assertTrue(a == b).
+     self.assertEqual(arvjob.attempt_count, 2)
+     # The rerun must have been requested as non-preemptible.
+     self.assertIs(arvjob.job_runtime.enable_preemptible, False)
|
|
\ No newline at end of file
|