Project

General

Profile

Feature #19982 » diff_spot_instance.txt

Alex Coleman, 06/11/2024 05:54 PM

 
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
index fd3b7a5d16..2f8aafdf77 100644
--- a/sdk/cwl/arvados_cwl/__init__.py
+++ b/sdk/cwl/arvados_cwl/__init__.py
@@ -291,6 +291,7 @@ def add_arv_hints():
"http://arvados.org/cwl#OutputCollectionProperties",
"http://arvados.org/cwl#KeepCacheTypeRequirement",
"http://arvados.org/cwl#OutOfMemoryRetry",
+ "http://arvados.org/cwl#SpotInstanceRetry",
])
def exit_signal_handler(sigcode, frame):
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
index 84b98378f4..f45fdcc62d 100644
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
@@ -429,6 +429,17 @@ class ArvadosContainer(JobBase):
logger.debug("Container request was %s", container_request)
self.output_callback({}, "permanentFail")
+
+ def spot_instance_retry(self, record, container):
+ spot_instance_retry_req, _ = self.get_requirement("http://arvados.org/cwl#SpotInstanceRetry")
+ if spot_instance_retry_req is None:
+ return False
+ if container["preemptionNotice"]:
+ return True
+ return False
+
+
+
def out_of_memory_retry(self, record, container):
oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
if oom_retry_req is None:
@@ -485,7 +496,12 @@ class ArvadosContainer(JobBase):
self.run(None)
retried = True
return
-
+ if processStatus == "permanentFail" and self.attempt_count == 1 and self.spot_instance_retry(record, container):
+ logger.warning("%s Container failed with preemptible instance reclaimed, trying again nonpreemptible", self.arvrunner.label(self))
+ self.job_runtime.enable_preemptible = False
+ self.run(None)
+ retried = True
+ return
if rcode == 137:
logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
self.arvrunner.label(self))
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
index 8e3a8ab85e..47b74ab35f 100644
--- a/sdk/cwl/tests/test_container.py
+++ b/sdk/cwl/tests/test_container.py
@@ -85,8 +85,7 @@ class TestContainer(unittest.TestCase):
"construct_tool_object": runner.arv_make_tool,
"fetcher_constructor": functools.partial(arvados_cwl.CollectionFetcher, api_client=runner.api, fs_access=fs_access),
"loader": Loader({}),
- "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}),
- "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__
+ "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"})
})
runtimeContext = arvados_cwl.context.ArvRuntimeContext(
{"work_api": "containers",
@@ -580,7 +579,7 @@ class TestContainer(unittest.TestCase):
self.fail("RuntimeStatusLoggingHandler should not be called recursively")
- # Test to make sure that an exception raised from
+ # Test to make sure that an exception raised from
# get_current_container doesn't cause the logger to raise an
# exception
@mock.patch("arvados_cwl.util.get_current_container")
@@ -1464,8 +1463,7 @@ class TestWorkflow(unittest.TestCase):
"make_fs_access": make_fs_access,
"loader": document_loader,
"metadata": {"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"},
- "construct_tool_object": runner.arv_make_tool,
- "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__})
+ "construct_tool_object": runner.arv_make_tool})
runtimeContext = arvados_cwl.context.ArvRuntimeContext(
{"work_api": "containers",
"basedir": "",
@@ -1710,3 +1708,53 @@ class TestWorkflow(unittest.TestCase):
api._rootDesc = copy.deepcopy(get_rootDesc())
runner = arvados_cwl.executor.ArvCwlExecutor(api)
self.assertEqual(runner.work_api, 'containers')
+
+ @mock.patch("arvados.collection.Collection")
+ def test_spot_instance_retry(self, blah):
+ arvados_cwl.add_arv_hints()
+
+ api = mock.MagicMock()
+
+ runner = mock.MagicMock()
+ runner.api = api
+ runner.num_retries = 0
+ runner.ignore_docker_for_reuse = False
+ runner.intermediate_output_ttl = 0
+ runner.secret_store = cwltool.secrets.SecretStore()
+
+ runner.api.containers().get().execute.return_value = {
+ "state": "Complete",
+ "output": "abc+123",
+ "exit_code": 138 # Want exit code to be failure
+ }
+ # Add assertions to make sure it reran as nonpreemptible
+ loadingContext, runtimeContext = self.helper(runner)
+ arvjob = arvados_cwl.ArvadosContainer(runner,
+ runtimeContext,
+ mock.MagicMock(),
+ {},
+ None,
+ [],
+ [],
+ "testjob")
+ arvjob.output_callback = mock.MagicMock()
+ arvjob.collect_outputs = mock.MagicMock()
+ arvjob.successCodes = [0]
+ arvjob.outdir = "/var/spool/cwl"
+ arvjob.output_ttl = 3600
+ arvjob.uuid = "zzzzz-xvhdp-zzzzzzzzzzzzzz1"
+
+ arvjob.collect_outputs.return_value = {"out": "stuff"}
+
+ arvjob.done({
+ "state": "Final",
+ "log_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
+ "output_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
+ "uuid": "zzzzz-xvhdp-zzzzzzzzzzzzzzz",
+ "container_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz",
+ "modified_at": "2017-05-26T12:01:22Z",
+ "properties": {}
+ })
+
+ self.assertTrue(api.container_requests().create.called)
+ self.assertTrue(arvjob.attempt_count == 2)
\ No newline at end of file
    (1-1/1)