Project

General

Profile

Feature #19982 » diff_spot_instance.txt

Alex Coleman, 06/11/2024 05:54 PM

 
1
diff --git a/sdk/cwl/arvados_cwl/__init__.py b/sdk/cwl/arvados_cwl/__init__.py
2
index fd3b7a5d16..2f8aafdf77 100644
3
--- a/sdk/cwl/arvados_cwl/__init__.py
4
+++ b/sdk/cwl/arvados_cwl/__init__.py
5
@@ -291,6 +291,7 @@ def add_arv_hints():
6
         "http://arvados.org/cwl#OutputCollectionProperties",
7
         "http://arvados.org/cwl#KeepCacheTypeRequirement",
8
         "http://arvados.org/cwl#OutOfMemoryRetry",
9
+        "http://arvados.org/cwl#SpotInstanceRetry",
10
     ])
11
 
12
 def exit_signal_handler(sigcode, frame):
13
diff --git a/sdk/cwl/arvados_cwl/arvcontainer.py b/sdk/cwl/arvados_cwl/arvcontainer.py
14
index 84b98378f4..f45fdcc62d 100644
15
--- a/sdk/cwl/arvados_cwl/arvcontainer.py
16
+++ b/sdk/cwl/arvados_cwl/arvcontainer.py
17
@@ -429,6 +429,17 @@ class ArvadosContainer(JobBase):
18
             logger.debug("Container request was %s", container_request)
19
             self.output_callback({}, "permanentFail")
20
 
21
+
22
+    def spot_instance_retry(self, record, container):
23
+        spot_instance_retry_req, _ = self.get_requirement("http://arvados.org/cwl#SpotInstanceRetry")
24
+        if spot_instance_retry_req is None:
25
+            return False
26
+        if container["preemptionNotice"]:
27
+            return True
28
+        return False
29
+    
30
+        
31
+
32
     def out_of_memory_retry(self, record, container):
33
         oom_retry_req, _ = self.get_requirement("http://arvados.org/cwl#OutOfMemoryRetry")
34
         if oom_retry_req is None:
35
@@ -485,7 +496,12 @@ class ArvadosContainer(JobBase):
36
                     self.run(None)
37
                     retried = True
38
                     return
39
-
40
+                if processStatus == "permanentFail" and self.attempt_count == 1 and self.spot_instance_retry(record, container):
41
+                    logger.warning("%s Container failed with preemptible instance reclaimed, trying again nonpreemptible")
42
+                    self.job_runtime.enable_preemptible = False
43
+                    self.run(None)
44
+                    retried = True
45
+                    return
46
                 if rcode == 137:
47
                     logger.warning("%s Container may have been killed for using too much RAM.  Try resubmitting with a higher 'ramMin' or use the arv:OutOfMemoryRetry feature.",
48
                                  self.arvrunner.label(self))
49
diff --git a/sdk/cwl/tests/test_container.py b/sdk/cwl/tests/test_container.py
50
index 8e3a8ab85e..47b74ab35f 100644
51
--- a/sdk/cwl/tests/test_container.py
52
+++ b/sdk/cwl/tests/test_container.py
53
@@ -85,8 +85,7 @@ class TestContainer(unittest.TestCase):
54
              "construct_tool_object": runner.arv_make_tool,
55
              "fetcher_constructor": functools.partial(arvados_cwl.CollectionFetcher, api_client=runner.api, fs_access=fs_access),
56
              "loader": Loader({}),
57
-             "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"}),
58
-             "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__
59
+             "metadata": cmap({"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"})
60
              })
61
         runtimeContext = arvados_cwl.context.ArvRuntimeContext(
62
             {"work_api": "containers",
63
@@ -580,7 +579,7 @@ class TestContainer(unittest.TestCase):
64
             self.fail("RuntimeStatusLoggingHandler should not be called recursively")
65
 
66
 
67
-    # Test to make sure that an exception raised from
68
+    # Test to make sure that an exception raised from
69
     # get_current_container doesn't cause the logger to raise an
70
     # exception
71
     @mock.patch("arvados_cwl.util.get_current_container")
72
@@ -1464,8 +1463,7 @@ class TestWorkflow(unittest.TestCase):
73
              "make_fs_access": make_fs_access,
74
              "loader": document_loader,
75
              "metadata": {"cwlVersion": INTERNAL_VERSION, "http://commonwl.org/cwltool#original_cwlVersion": "v1.0"},
76
-             "construct_tool_object": runner.arv_make_tool,
77
-             "default_docker_image": "arvados/jobs:"+arvados_cwl.__version__})
78
+             "construct_tool_object": runner.arv_make_tool})
79
         runtimeContext = arvados_cwl.context.ArvRuntimeContext(
80
             {"work_api": "containers",
81
              "basedir": "",
82
@@ -1710,3 +1708,53 @@ class TestWorkflow(unittest.TestCase):
83
         api._rootDesc = copy.deepcopy(get_rootDesc())
84
         runner = arvados_cwl.executor.ArvCwlExecutor(api)
85
         self.assertEqual(runner.work_api, 'containers')
86
+    
87
+    @mock.patch("arvados.collection.Collection")
88
+    def test_spot_instance_retry(self, blah):
89
+        arvados_cwl.add_arv_hints()
90
+
91
+        api = mock.MagicMock()
92
+
93
+        runner = mock.MagicMock()
94
+        runner.api = api
95
+        runner.num_retries = 0
96
+        runner.ignore_docker_for_reuse = False
97
+        runner.intermediate_output_ttl = 0
98
+        runner.secret_store = cwltool.secrets.SecretStore()
99
+
100
+        runner.api.containers().get().execute.return_value = {
101
+            "state": "Complete",
102
+            "output": "abc+123",
103
+            "exit_code": 138 # Want exit code to be failure
104
+        }
105
+        # Add assertions to make sure it reran as nonpreemptible
106
+        loadingContext, runtimeContext = self.helper(runner)
107
+        arvjob = arvados_cwl.ArvadosContainer(runner,
108
+                                              runtimeContext,
109
+                                              mock.MagicMock(),
110
+                                              {},
111
+                                              None,
112
+                                              [],
113
+                                              [],
114
+                                              "testjob")
115
+        arvjob.output_callback = mock.MagicMock()
116
+        arvjob.collect_outputs = mock.MagicMock()
117
+        arvjob.successCodes = [0]
118
+        arvjob.outdir = "/var/spool/cwl"
119
+        arvjob.output_ttl = 3600
120
+        arvjob.uuid = "zzzzz-xvhdp-zzzzzzzzzzzzzz1"
121
+
122
+        arvjob.collect_outputs.return_value = {"out": "stuff"}
123
+
124
+        arvjob.done({
125
+            "state": "Final",
126
+            "log_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz1",
127
+            "output_uuid": "zzzzz-4zz18-zzzzzzzzzzzzzz2",
128
+            "uuid": "zzzzz-xvhdp-zzzzzzzzzzzzzzz",
129
+            "container_uuid": "zzzzz-8i9sb-zzzzzzzzzzzzzzz",
130
+            "modified_at": "2017-05-26T12:01:22Z",
131
+            "properties": {}
132
+        })
133
+
134
+        self.assertTrue(api.container_requests().create.called)
135
+        self.assertTrue(arvjob.attempt_count == 2)
136
\ No newline at end of file
    (1-1/1)