Project

General

Profile

Bug #21190

Updated by Tom Clegg 6 months ago

An instance was shut down after multiple failed probes, but the container that had been running on that instance remained in the 'Running' state. Shouldn't the dispatcher have marked the container as cancelled, or rescheduled it?

 <pre> 
 Nov    9 05:42:18 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","InstanceType":"m4large","PID":185885,"Priority":562948253941774988,"State":"Queued","level":"info","msg":"adding container to queue","time":"2023-11-09T05:42:18.948527443Z"} 
 Nov    9 05:55:33 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","IdleBehavior":"run","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"State":"booting","level":"info","msg":"instance appeared in cloud","time":"2023-11-09T05:55:33.929679063Z"} 
 Nov    9 05:57:01 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","Command":"systemctl is-system-running","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"level":"info","msg":"boot probe succeeded","stderr":"","stdout":"running\n","time":"2023-11-09T05:57:01.197430954Z"} 
 Nov    9 05:57:01 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"level":"info","msg":"crunch-run process started","time":"2023-11-09T05:57:01.649759362Z"} 
 Nov    9 05:57:01 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"level":"info","msg":"crunch-run process started","time":"2023-11-09T05:57:01.649759362Z"} 
 Nov    9 05:57:01 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"cmd":"sudo sh -c 'set -e; dstdir=\"/tmp/arvados-crunch-run/\"; dstfile=\"/tmp/arvados-crunch-run/crunch-run~493931196c222d127ddac3e798507810\"; mkdir -p \"$dstdir\"; touch \"$dstfile\"; chmod 0755 \"$dstdir\" \"$dstfile\"; cat \u003e\"$dstfile\"'","hash":"493931196c222d127ddac3e798507810","level":"info","msg":"installing runner binary on worker","path":"/tmp/arvados-crunch-run/crunch-run~493931196c222d127ddac3e798507810","time":"2023-11-09T05:57:01.203216535Z"} 
 Nov    9 05:57:01 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"ProbeStart":"2023-11-09T05:57:01.190511228Z","level":"info","msg":"instance booted; will try probeRunning","time":"2023-11-09T05:57:01.572380813Z"} 
 Nov    9 05:57:01 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"ProbeStart":"2023-11-09T05:57:01.190511228Z","RunningContainers":0,"State":"idle","level":"info","msg":"probes succeeded, instance is in service","time":"2023-11-09T05:57:01.607380217Z"} 
 Nov    9 06:23:40 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","Command":"sudo /tmp/arvados-crunch-run/crunch-run~493931196c222d127ddac3e798507810 --list","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"error":"dial tcp 172.25.138.199:22: i/o timeout","level":"warning","msg":"probe failed","stderr":"","stdout":"","time":"2023-11-09T06:23:40.129518363Z"} 
 Nov    9 06:24:49 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","Command":"sudo /tmp/arvados-crunch-run/crunch-run~493931196c222d127ddac3e798507810 --list","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"error":"dial tcp 172.25.138.199:22: i/o timeout","level":"warning","msg":"probe failed","stderr":"","stdout":"","time":"2023-11-09T06:24:49.340496750Z"} 
 Nov    9 06:24:49 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","Duration":1034553577012,"Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"Since":"2023-11-09T06:06:34.786359676Z","State":"running","level":"warning","msg":"instance unresponsive, shutting down","time":"2023-11-09T06:24:49.340590651Z"} 
 Nov    9 06:25:27 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"level":"info","msg":"crunch-run process abandoned","time":"2023-11-09T06:25:27.132571823Z"} 
 Nov    9 06:25:27 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"Address":"172.25.138.199","ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","Instance":"i-0da7c27cae59bb18b","InstanceType":"m4large","PID":185885,"level":"info","msg":"crunch-run process abandoned","time":"2023-11-09T06:25:27.132571823Z"} 
 Nov    9 06:25:27 ip-172-25-144-184 arvados-dispatch-cloud[185885]: {"ClusterID":"xxxxx","Instance":"i-0da7c27cae59bb18b","PID":185885,"WorkerState":"shutdown","level":"info","msg":"instance disappeared in cloud","time":"2023-11-09T06:25:27.129985051Z"} 
 Nov    9 18:10:22 ip-172-25-144-184 arvados-dispatch-cloud[245221]: {"ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","InstanceType":"m4large","PID":245221,"Priority":562948253941774988,"State":"Running","level":"info","msg":"adding container to queue","time":"2023-11-09T18:10:22.189212178Z"} 
 Nov    9 19:52:17 ip-172-25-144-184 arvados-dispatch-cloud[251616]: {"ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","InstanceType":"m4large","PID":251616,"Priority":562948253941774988,"State":"Running","level":"info","msg":"adding container to queue","time":"2023-11-09T19:52:17.781172241Z"} 
 Nov    9 20:11:47 ip-172-25-144-184 arvados-dispatch-cloud[252730]: {"ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","InstanceTypes":"m4large, c4.large","PID":252730,"Priority":562948253941774988,"State":"Running","level":"info","msg":"adding container to queue","time":"2023-11-09T20:11:47.677520485Z"} 
 Nov    9 20:32:09 ip-172-25-144-184 arvados-dispatch-cloud[255676]: {"ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","InstanceTypes":"c5alarge, c5large, m5alarge, m4large, c4.large","PID":255676,"Priority":562948253941774988,"State":"Running","level":"info","msg":"adding container to queue","time":"2023-11-09T20:32:09.325508159Z"} 
 Nov    9 20:44:20 ip-172-25-144-184 arvados-dispatch-cloud[258192]: {"ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","InstanceType":"c5alarge","PID":258192,"Priority":562948253941774988,"State":"Running","level":"info","msg":"adding container to queue","time":"2023-11-09T20:44:20.215070124Z"} 
 Nov    9 20:44:49 ip-172-25-144-184 arvados-dispatch-cloud[258553]: {"ClusterID":"xxxxx","ContainerUUID":"xxxxx-dz642-m4el5nk6i59n69p","InstanceTypes":"c5alarge, c5large, m5alarge, m4large, c4.large","PID":258553,"Priority":562948253941774988,"State":"Running","level":"info","msg":"adding container to queue","time":"2023-11-09T20:44:49.877242519Z"} 
 </pre>

Back