Project

General

Profile

Bug #8288

Updated by Tom Clegg about 8 years ago

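The srun / arv-mount / crunchstat process chain does not exit. Apparently, although the docker container has exited and `docker run` has returned, the docker daemon still holds an open file on the keep mount that `arv-mount` created (see the `fuser` output below, where the docker daemon, PID 37715, is listed with an open file on the mount point).  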

 <pre> 
 # ps auxwww | awk '$1=="crunch"' 
 crunch      8848    0.0    0.0 608616 31096 ?          Sl     Jan22     0:01 /usr/bin/python2.7 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep --exec crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=/data/crunch-tmp/crunch-job/z8ta6-ot0gb-v0p7tcarqimrnmj-0.cid -poll=10000 /usr/bin/docker run --name=z8ta6-ot0gb-v0p7tcarqimrnmj-0 --attach=stdout --attach=stderr --attach=stdin -i --cidfile=/data/crunch-tmp/crunch-job/z8ta6-ot0gb-v0p7tcarqimrnmj-0.cid --sig-proxy --memory=188032493k --memory-swap=190129641k --volume=/data/crunch-tmp/crunch-job/src:/data/crunch-tmp/crunch-job/src:ro --volume=/data/crunch-tmp/crunch-job/opt:/data/crunch-tmp/crunch-job/opt:ro --volume=/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep/by_pdh:/keep:ro --volume=/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep/tmp:/keep_tmp --volume=/tmp --env=TASK_SEQUENCE=0 --env=TASK_KEEPMOUNT=/keep --env=JOB_PARAMETER_INPUTS_COLLECTION=e8ccf6ac8348615a9873149f4ff07353+14726 --env=CRUNCH_SRC_COMMIT=15ee3e583ff9eeb273b52826c8944b43ae21e8bd --env=TASK_QSEQUENCE=0 --env=CRUNCH_INSTALL=/data/crunch-tmp/crunch-job/opt --env=CRUNCH_GIT_ARCHIVE_HASH=085e92f18ba93c6b8eb2507dbfbdf929 --env=CRUNCH_REFRESH_TRIGGER=/tmp/crunch_refresh_trigger --env=ARVADOS_API_TOKEN=2xb1o74nhxb7h2k2fculw4k6xahvss7v1ux8lhmd52conglznk --env=JOB_PARAMETER_SKIP_SQ_SN_REGEX=_decoy$ --env=CRUNCH_WORK=/data/crunch-tmp/crunch-job/work --env=CRUNCH_TMP=/data/crunch-tmp/crunch-job --env=TASK_TMPDIR=/tmp/crunch-job-task-work/humgen-03-02.1 --env=JOB_UUID=z8ta6-8i9sb-fsm22dece9hd043 --env=CRUNCH_JOB_UUID=z8ta6-8i9sb-fsm22dece9hd043 --env=TASK_SLOT_NUMBER=1 --env=CRUNCH_SRC_URL=/var/lib/arvados/internal.git --env=TASK_SLOT_NODE=humgen-03-02 --env=JOB_SCRIPT=bcftools-exp-gvcf-mpileup-cram-chunked.py --env=CRUNCH_NODE_SLOTS=1 
--env=JOB_PARAMETER_REFERENCE_COLLECTION=a83bd4e5a26a64612322f21515d93bab+6190 --env=JOB_PARAMETER_GENOME_CHUNKS=400 --env=CRUNCH_JOB_DOCKER_BIN=/usr/bin/docker --env=TASK_WORK=/tmp/crunch-job-task-work/humgen-03-02.1 --env=TASK_KEEPMOUNT_TMP=/keep_tmp --env=ARVADOS_API_HOST=api.arvados.sanger.ac.uk --env=JOB_WORK=/tmp/crunch-job-work --env=TASK_UUID=z8ta6-ot0gb-v0p7tcarqimrnmj --env=CRUNCH_SRC=/data/crunch-tmp/crunch-job/src --env=HOME=/tmp/crunch-job-task-work/humgen-03-02.1 bfdf5bfbb2858302a6b82242018f6d39aade1707e25541710ff85b804c35056c /bin/sh -c python -c "from pkg_resources import get_distribution as get; print \"Using Arvados SDK version\", get(\"arvados-python-client\").version">&2 2>/dev/null; mkdir -p "/tmp/crunch-job-work" "/tmp/crunch-job-task-work/humgen-03-02.1" && if which stdbuf >/dev/null ; then     exec    stdbuf --output=0 --error=0    \/data\/crunch\-tmp\/crunch\-job\/src\/crunch_scripts\/bcftools\-exp\-gvcf\-mpileup\-cram\-chunked\.py ; else     exec \/data\/crunch\-tmp\/crunch\-job\/src\/crunch_scripts\/bcftools\-exp\-gvcf\-mpileup\-cram\-chunked\.py ; fi 
 # pstree -Aap 8848 
 arv-mount,8848 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep--e 
   |-{arv-mount},8860 
   `-{arv-mount},8863 
 # fuser -v -m /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep 
                      USER PID ACCESS COMMAND 
 /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep: 
                      root       kernel mount /data 
                      crunch       8848 ..c.. arv-mount 
                      root        37715 F.... docker 
 # ps auxwww | grep 37715 | grep -v grep 
 root       37715    0.1    0.0 12555452 94712 ?        Ssl     2015 166:27 /usr/bin/docker -d --graph=/data/docker 
 # docker ps 
 CONTAINER ID          IMAGE                 COMMAND               CREATED               STATUS                PORTS                 NAMES 
 # docker version 
 Client version: 1.7.1 
 Client API version: 1.19 
 Go version (client): go1.4.2 
 Git commit (client): 786b29d 
 OS/Arch (client): linux/amd64 
 Server version: 1.7.1 
 Server API version: 1.19 
 Go version (server): go1.4.2 
 Git commit (server): 786b29d 
 OS/Arch (server): linux/amd64 
 # lsof -p 37715 
 COMMAND     PID USER     FD     TYPE               DEVICE SIZE/OFF        NODE NAME 
 docker    37715 root    cwd      DIR                  8,6       4096           2 / 
 docker    37715 root    rtd      DIR                  8,6       4096           2 / 
 docker    37715 root    txt      REG                  8,6 16296881     1977972 /usr/bin/docker 
 docker    37715 root      0u     CHR                  1,3        0t0        1029 /dev/null 
 docker    37715 root      1u     CHR                136,2        0t0           5 /dev/pts/2 
 docker    37715 root      2u     CHR                136,2        0t0           5 /dev/pts/2 
 docker    37715 root      3u    unix 0xffff881881334280        0t0    48220470 /var/run/docker.sock 
 docker    37715 root      4u    unix 0xffff882ff7a0f700        0t0     1501908 /var/run/docker.sock 
 docker    37715 root      5r     CHR                  1,9        0t0        1034 /dev/urandom 
 docker    37715 root      6u    0000                  0,9          0        6847 anon_inode 
 docker    37715 root      9u    unix 0xffff881ff0922300        0t0    48310485 /var/run/docker.sock 
 docker    37715 root     11u    unix 0xffff881ff0927a80        0t0    48326780 /var/run/docker.sock 
 docker    37715 root     12u    unix 0xffff88191467b800        0t0    48698892 /var/run/docker.sock 
 docker    37715 root     14u    unix 0xffff880106038000        0t0    48603069 /var/run/docker.sock 
 docker    37715 root     16u    unix 0xffff882005fa6c80        0t0    48334610 /var/run/docker.sock 
 docker    37715 root     17u     REG                252,0    1996800 366018572 /data/docker/linkgraph.db 
 docker    37715 root     19u     REG                252,0    1996800 366018572 /data/docker/linkgraph.db 
 docker    37715 root     23u    unix 0xffff881ff0922d80        0t0    48247780 /var/run/docker.sock 
 docker    37715 root     24u    unix 0xffff88191467e580        0t0    48663201 /var/run/docker.sock 
 docker    37715 root     25u    unix 0xffff88010603c980        0t0    48683402 /var/run/docker.sock 
 docker    37715 root     26u    unix 0xffff881ef218b100        0t0    48310488 /var/run/docker.sock 
 docker    37715 root     27u    unix 0xffff882005fa6580        0t0    48309653 /var/run/docker.sock 
 # stat /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep 
   File: `/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep' 
   Size: 4096       	 Blocks: 8            IO Block: 4096     directory 
 Device: fc00h/64512d 	 Inode: 262078526     Links: 2 
 Access: (0755/drwxr-xr-x)    Uid: (15324/    crunch)     Gid: ( 1593/ arvados) 
 Access: 2016-01-22 21:49:38.729986521 +0000 
 Modify: 2016-01-22 21:49:38.441989497 +0000 
 Change: 2016-01-22 21:49:38.441989497 +0000 
  Birth: - 
 # stat -f /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep 
   File: "/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep" 
     ID: 6fef924393f3e2f8 Namelen: 255       Type: ext2/ext3 
 Block size: 4096         Fundamental block size: 4096 
 Blocks: Total: 17485339344 Free: 16349728046 Available: 16174171418 
 Inodes: Total: 1097203712 Free: 1096592099 
 # uname -a 
 Linux humgen-03-02 3.13.0-63-generic #104~precise1-Ubuntu SMP Tue Aug 18 17:03:03 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux 
 # readlink /proc/37715/fd/6 
 anon_inode:[eventpoll] 
 # fusermount -u /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep 
 fusermount: failed to unmount /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep: Invalid argument 
 # mount -t fuse 
 # umount /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep 
 umount: /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep: not mounted 
 # rm -rf /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep 
 # pstree -Aasp 8848 
 init,1 
   `-slurmstepd,8845 
       `-arv-mount,8848 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other ... 
           |-{arv-mount},8860 
           `-{arv-mount},8863 
 # strace -f -p 8848 
 Process 8848 attached with 3 threads - interrupt to quit 
 [pid    8863] futex(0x7fcc728c7a84, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...> 
 [pid    8860] select(0, NULL, NULL, NULL, {1, 754286} <unfinished ...> 
 [pid    8848] futex(0x32c6e80, FUTEX_WAIT_PRIVATE, 0, NULL <unfinished ...> 
 [pid    8860] <... select resumed> )        = 0 (Timeout) 
 [pid    8860] write(2, "crunchstat: keepcalls 0 put 0 ge"..., 74) = 74 
 [pid    8860] write(2, "crunchstat: net:keep0 0 tx 0 rx "..., 70) = 70 
 [pid    8860] write(2, "crunchstat: keepcache 0 hit 0 mi"..., 76) = 76 
 [pid    8860] write(2, "crunchstat: fuseops 0 write 0 re"..., 78) = 78 
 [pid    8860] write(2, "crunchstat: blkio:0:0 0 write 0 "..., 80) = 80 
 [pid    8860] select(0, NULL, NULL, NULL, {10, 0}) = 0 (Timeout) 
 [pid    8860] write(2, "crunchstat: keepcalls 0 put 0 ge"..., 74) = 74 
 [pid    8860] write(2, "crunchstat: net:keep0 0 tx 0 rx "..., 70) = 70 
 [pid    8860] write(2, "crunchstat: keepcache 0 hit 0 mi"..., 76) = 76 
 [pid    8860] write(2, "crunchstat: fuseops 0 write 0 re"..., 78) = 78 
 [pid    8860] write(2, "crunchstat: blkio:0:0 0 write 0 "..., 80) = 80 
 [pid    8860] select(0, NULL, NULL, NULL, {10, 0}^C <unfinished ...> 
 Process 8848 detached 
 Process 8860 detached 
 Process 8863 detached 
 # kill 8860 
 </pre>

Back