1
|
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
|
2
|
index fd3b7a5d16..2f8aafdf77 100644
|
3
|
--- a/sdk/cwl/arvados_cwl/__init__.py
|
4
|
+++ b/sdk/cwl/arvados_cwl/__init__.py
|
5
|
@@ -291,6 +291,7 @@ def add_arv_hints():
|
6
|
"http://arvados.org/cwl#OutputCollectionProperties",
|
7
|
"http://arvados.org/cwl#KeepCacheTypeRequirement",
|
8
|
"http://arvados.org/cwl#OutOfMemoryRetry",
|
9
|
+ "http://arvados.org/cwl#SpotInstanceRetry",
|
10
|
])
|
11
|
|
12
|
def exit_signal_handler(sigcode, frame):
|
13
|
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
|
14
|
index 84b98378f4..f45fdcc62d 100644
|
15
|
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
|
16
|
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
|
17
|
@@ -429,6 +429,17 @@ class ArvadosContainer(JobBase):
|
18
|
logger.debug("Container request was %s", container_request)
|
19
|
self.output_callback({}, "permanentFail")
|
20
|
|
21
|
+
|
22
|
+ def spot_instance_retry(self, record, container):
|
23
|
+ spot_instance_retry_req, _ = self.get_requirement("http://arvados.org/cwl#SpotInstanceRetry")
|
24
|
+ if spot_instance_retry_req is None:
|
25
|
+ return False
|
26
|
+ if container["preemptionNotice"]:
|
27
|
+ return True
|
28
|
+ return False
|
29
|
+
|
30
|
+
|
31
|
+
|
32
|
def out_of_memory_retry(self, record, container):
|
33
|
oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
|
34
|
if oom_retry_req is None:
|
35
|
@@ -485,7 +496,12 @@ class ArvadosContainer(JobBase):
|
36
|
self.run(None)
|
37
|
retried = True
|
38
|
return
|
39
|
-
|
40
|
+ if processStatus == "permanentFail" and self.attempt_count == 1 and self.spot_instance_retry(record, container):
|
41
|
+ logger.warning("%s Container failed with preemptible instance reclaimed, trying again nonpreemptible", self.arvrunner.label(self))
|
42
|
+ self.job_runtime.enable_preemptible = False
|
43
|
+ self.run(None)
|
44
|
+ retried = True
|
45
|
+ return
|
46
|
if rcode == 137:
|
47
|
logger.warning("%s Container may have been killed for using too much RAM. Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
|
48
|
self.arvrunner.label(self))
|
49
|
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
|
50
|
index 8e3a8ab85e..47b74ab35f 100644
|
51
|
--- a/sdk/cwl/tests/test_container.py
|
52
|
+++ b/sdk/cwl/tests/test_container.py
|
53
|
@@ -85,8 +85,7 @@ class TestContainer(unittest.TestCase):
|
54
|
"construct_tool_object": runner.arv_make_tool,
|
55
|
"fetcher_constructor": functools.partial(arvados_cwl.CollectionFetcher, api_client=runner.api, fs_access=fs_access),
|
56
|
"loader": Loader({}),
|
57
|
- "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}),
|
58
|
- "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__
|
59
|
+ "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"})
|
60
|
})
|
61
|
runtimeContext = arvados_cwl.context.ArvRuntimeContext(
|
62
|
{"work_api": "containers",
|
63
|
@@ -580,7 +579,7 @@ class TestContainer(unittest.TestCase):
|
64
|
self.fail("RuntimeStatusLoggingHandler should not be called recursively")
|
65
|
|
66
|
|
67
|
- # Test to make sure that an exception raised from
|
68
|
+ # Test to make sure that an exception raised from
|
69
|
# get_current_container doesn't cause the logger to raise an
|
70
|
# exception
|
71
|
@mock.patch("arvados_cwl.util.get_current_container")
|
72
|
@@ -1464,8 +1463,7 @@ class TestWorkflow(unittest.TestCase):
|
73
|
"make_fs_access": make_fs_access,
|
74
|
"loader": document_loader,
|
75
|
"metadata": {"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"},
|
76
|
- "construct_tool_object": runner.arv_make_tool,
|
77
|
- "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__})
|
78
|
+ "construct_tool_object": runner.arv_make_tool})
|
79
|
runtimeContext = arvados_cwl.context.ArvRuntimeContext(
|
80
|
{"work_api": "containers",
|
81
|
"basedir": "",
|
82
|
@@ -1710,3 +1708,53 @@ class TestWorkflow(unittest.TestCase):
|
83
|
api._rootDesc = copy.deepcopy(get_rootDesc())
|
84
|
runner = arvados_cwl.executor.ArvCwlExecutor(api)
|
85
|
self.assertEqual(runner.work_api, 'containers')
|
86
|
+
|
87
|
+ @mock.patch("arvados.collection.Collection")
|
88
|
+ def test_spot_instance_retry(self, blah):
|
89
|
+ arvados_cwl.add_arv_hints()
|
90
|
+
|
91
|
+ api = mock.MagicMock()
|
92
|
+
|
93
|
+ runner = mock.MagicMock()
|
94
|
+ runner.api = api
|
95
|
+ runner.num_retries = 0
|
96
|
+ runner.ignore_docker_for_reuse = False
|
97
|
+ runner.intermediate_output_ttl = 0
|
98
|
+ runner.secret_store = cwltool.secrets.SecretStore()
|
99
|
+
|
100
|
+ runner.api.containers().get().execute.return_value = {
|
101
|
+ "state": "Complete",
|
102
|
+ "output": "abc+123",
|
103
|
+ "exit_code": 138 # Want exit code to be failure
|
104
|
+ }
|
105
|
+ # Add assertions to make sure it reran as nonpreemptible
|
106
|
+ loadingContext, runtimeContext = self.helper(runner)
|
107
|
+ arvjob = arvados_cwl.ArvadosContainer(runner,
|
108
|
+ runtimeContext,
|
109
|
+ mock.MagicMock(),
|
110
|
+ {},
|
111
|
+ None,
|
112
|
+ [],
|
113
|
+ [],
|
114
|
+ "testjob")
|
115
|
+ arvjob.output_callback = mock.MagicMock()
|
116
|
+ arvjob.collect_outputs = mock.MagicMock()
|
117
|
+ arvjob.successCodes = [0]
|
118
|
+ arvjob.outdir = "/var/spool/cwl"
|
119
|
+ arvjob.output_ttl = 3600
|
120
|
+ arvjob.uuid = "zzzzz-xvhdp-zzzzzzzzzzzzzz1"
|
121
|
+
|
122
|
+ arvjob.collect_outputs.return_value = {"out": "stuff"}
|
123
|
+
|
124
|
+ arvjob.done({
|
125
|
+ "state": "Final",
|
126
|
+ "log_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
|
127
|
+ "output_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
|
128
|
+ "uuid": "zzzzz-xvhdp-zzzzzzzzzzzzzzz",
|
129
|
+ "container_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz",
|
130
|
+ "modified_at": "2017-05-26T12:01:22Z",
|
131
|
+ "properties": {}
|
132
|
+ })
|
133
|
+
|
134
|
+ self.assertTrue(api.container_requests().create.called)
|
135
|
+ self.assertEqual(arvjob.attempt_count, 2)
|
136
|
\ No newline at end of file
|