Project

General

Profile

Actions

Bug #8288

closed

arv-mount / crunchstat in a crunch job fails to exit because reasons

Added by Joshua Randall over 8 years ago. Updated about 8 years ago.

Status:
Resolved
Priority:
Normal
Assigned To:
Category:
FUSE
Target version:
Story points:
0.5

Description

The srun / arv-mount / crunchstat process chain doesn't exit: although the docker container has exited and `docker run` has returned, the docker daemon apparently still has an open file on the keep mount that `arv-mount` created.

# ps auxwww | awk '$1=="crunch"'
crunch    8848  0.0  0.0 608616 31096 ?        Sl   Jan22   0:01 /usr/bin/python2.7 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep --exec crunchstat -cgroup-root=/sys/fs/cgroup -cgroup-parent=docker -cgroup-cid=/data/crunch-tmp/crunch-job/z8ta6-ot0gb-v0p7tcarqimrnmj-0.cid -poll=10000 /usr/bin/docker run --name=z8ta6-ot0gb-v0p7tcarqimrnmj-0 --attach=stdout --attach=stderr --attach=stdin -i --cidfile=/data/crunch-tmp/crunch-job/z8ta6-ot0gb-v0p7tcarqimrnmj-0.cid --sig-proxy --memory=188032493k --memory-swap=190129641k --volume=/data/crunch-tmp/crunch-job/src:/data/crunch-tmp/crunch-job/src:ro --volume=/data/crunch-tmp/crunch-job/opt:/data/crunch-tmp/crunch-job/opt:ro --volume=/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep/by_pdh:/keep:ro --volume=/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep/tmp:/keep_tmp --volume=/tmp --env=TASK_SEQUENCE=0 --env=TASK_KEEPMOUNT=/keep --env=JOB_PARAMETER_INPUTS_COLLECTION=e8ccf6ac8348615a9873149f4ff07353+14726 --env=CRUNCH_SRC_COMMIT=15ee3e583ff9eeb273b52826c8944b43ae21e8bd --env=TASK_QSEQUENCE=0 --env=CRUNCH_INSTALL=/data/crunch-tmp/crunch-job/opt --env=CRUNCH_GIT_ARCHIVE_HASH=085e92f18ba93c6b8eb2507dbfbdf929 --env=CRUNCH_REFRESH_TRIGGER=/tmp/crunch_refresh_trigger --env=ARVADOS_API_TOKEN=2xb1o74nhxb7h2k2fculw4k6xahvss7v1ux8lhmd52conglznk --env=JOB_PARAMETER_SKIP_SQ_SN_REGEX=_decoy$ --env=CRUNCH_WORK=/data/crunch-tmp/crunch-job/work --env=CRUNCH_TMP=/data/crunch-tmp/crunch-job --env=TASK_TMPDIR=/tmp/crunch-job-task-work/humgen-03-02.1 --env=JOB_UUID=z8ta6-8i9sb-fsm22dece9hd043 --env=CRUNCH_JOB_UUID=z8ta6-8i9sb-fsm22dece9hd043 --env=TASK_SLOT_NUMBER=1 --env=CRUNCH_SRC_URL=/var/lib/arvados/internal.git --env=TASK_SLOT_NODE=humgen-03-02 --env=JOB_SCRIPT=bcftools-exp-gvcf-mpileup-cram-chunked.py --env=CRUNCH_NODE_SLOTS=1 --env=JOB_PARAMETER_REFERENCE_COLLECTION=a83bd4e5a26a64612322f21515d93bab+6190 
--env=JOB_PARAMETER_GENOME_CHUNKS=400 --env=CRUNCH_JOB_DOCKER_BIN=/usr/bin/docker --env=TASK_WORK=/tmp/crunch-job-task-work/humgen-03-02.1 --env=TASK_KEEPMOUNT_TMP=/keep_tmp --env=ARVADOS_API_HOST=api.arvados.sanger.ac.uk --env=JOB_WORK=/tmp/crunch-job-work --env=TASK_UUID=z8ta6-ot0gb-v0p7tcarqimrnmj --env=CRUNCH_SRC=/data/crunch-tmp/crunch-job/src --env=HOME=/tmp/crunch-job-task-work/humgen-03-02.1 bfdf5bfbb2858302a6b82242018f6d39aade1707e25541710ff85b804c35056c /bin/sh -c python -c "from pkg_resources import get_distribution as get; print \"Using Arvados SDK version\", get(\"arvados-python-client\").version">&2 2>/dev/null; mkdir -p "/tmp/crunch-job-work" "/tmp/crunch-job-task-work/humgen-03-02.1" && if which stdbuf >/dev/null ; then   exec  stdbuf --output=0 --error=0  \/data\/crunch\-tmp\/crunch\-job\/src\/crunch_scripts\/bcftools\-exp\-gvcf\-mpileup\-cram\-chunked\.py ; else   exec \/data\/crunch\-tmp\/crunch\-job\/src\/crunch_scripts\/bcftools\-exp\-gvcf\-mpileup\-cram\-chunked\.py ; fi
# pstree -Aap 8848
arv-mount,8848 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep--e
  |-{arv-mount},8860
  `-{arv-mount},8863
# fuser -v -m /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep
                     USER PID ACCESS COMMAND
/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep:
                     root     kernel mount /data
                     crunch     8848 ..c.. arv-mount
                     root      37715 F.... docker
# ps auxwww | grep 37715 | grep -v grep
root     37715  0.1  0.0 12555452 94712 ?      Ssl   2015 166:27 /usr/bin/docker -d --graph=/data/docker
# docker ps
CONTAINER ID        IMAGE               COMMAND             CREATED             STATUS              PORTS               NAMES
# docker version
Client version: 1.7.1
Client API version: 1.19
Go version (client): go1.4.2
Git commit (client): 786b29d
OS/Arch (client): linux/amd64
Server version: 1.7.1
Server API version: 1.19
Go version (server): go1.4.2
Git commit (server): 786b29d
OS/Arch (server): linux/amd64
# lsof -p 37715
COMMAND   PID USER   FD   TYPE             DEVICE SIZE/OFF      NODE NAME
docker  37715 root  cwd    DIR                8,6     4096         2 /
docker  37715 root  rtd    DIR                8,6     4096         2 /
docker  37715 root  txt    REG                8,6 16296881   1977972 /usr/bin/docker
docker  37715 root    0u   CHR                1,3      0t0      1029 /dev/null
docker  37715 root    1u   CHR              136,2      0t0         5 /dev/pts/2
docker  37715 root    2u   CHR              136,2      0t0         5 /dev/pts/2
docker  37715 root    3u  unix 0xffff881881334280      0t0  48220470 /var/run/docker.sock
docker  37715 root    4u  unix 0xffff882ff7a0f700      0t0   1501908 /var/run/docker.sock
docker  37715 root    5r   CHR                1,9      0t0      1034 /dev/urandom
docker  37715 root    6u  0000                0,9        0      6847 anon_inode
docker  37715 root    9u  unix 0xffff881ff0922300      0t0  48310485 /var/run/docker.sock
docker  37715 root   11u  unix 0xffff881ff0927a80      0t0  48326780 /var/run/docker.sock
docker  37715 root   12u  unix 0xffff88191467b800      0t0  48698892 /var/run/docker.sock
docker  37715 root   14u  unix 0xffff880106038000      0t0  48603069 /var/run/docker.sock
docker  37715 root   16u  unix 0xffff882005fa6c80      0t0  48334610 /var/run/docker.sock
docker  37715 root   17u   REG              252,0  1996800 366018572 /data/docker/linkgraph.db
docker  37715 root   19u   REG              252,0  1996800 366018572 /data/docker/linkgraph.db
docker  37715 root   23u  unix 0xffff881ff0922d80      0t0  48247780 /var/run/docker.sock
docker  37715 root   24u  unix 0xffff88191467e580      0t0  48663201 /var/run/docker.sock
docker  37715 root   25u  unix 0xffff88010603c980      0t0  48683402 /var/run/docker.sock
docker  37715 root   26u  unix 0xffff881ef218b100      0t0  48310488 /var/run/docker.sock
docker  37715 root   27u  unix 0xffff882005fa6580      0t0  48309653 /var/run/docker.sock
# stat /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep
  File: `/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep'
  Size: 4096          Blocks: 8          IO Block: 4096   directory
Device: fc00h/64512d    Inode: 262078526   Links: 2
Access: (0755/drwxr-xr-x)  Uid: (15324/  crunch)   Gid: ( 1593/ arvados)
Access: 2016-01-22 21:49:38.729986521 +0000
Modify: 2016-01-22 21:49:38.441989497 +0000
Change: 2016-01-22 21:49:38.441989497 +0000
 Birth: -
# stat -f /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep
  File: "/data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep" 
    ID: 6fef924393f3e2f8 Namelen: 255     Type: ext2/ext3
Block size: 4096       Fundamental block size: 4096
Blocks: Total: 17485339344 Free: 16349728046 Available: 16174171418
Inodes: Total: 1097203712 Free: 1096592099
# uname -a
Linux humgen-03-02 3.13.0-63-generic #104~precise1-Ubuntu SMP Tue Aug 18 17:03:03 UTC 2015 x86_64 x86_64 x86_64 GNU/Linux
# readlink /proc/37715/fd/6
anon_inode:[eventpoll]
# fusermount -u /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep
fusermount: failed to unmount /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep: Invalid argument
# mount -t fuse
# umount /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep
umount: /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep: not mounted
# rm -rf /data/crunch-tmp/crunch-job/task/humgen-03-02.1.keep
# pstree -Aasp 8848
init,1
  `-slurmstepd,8845
      `-arv-mount,8848 /usr/local/bin/arv-mount --read-write --mount-by-pdh=by_pdh --mount-tmp=tmp --crunchstat-interval=10 --allow-other ...
          |-{arv-mount},8860
          `-{arv-mount},8863
# strace -f -p 8848
Process 8848 attached with 3 threads - interrupt to quit
[pid  8863] futex(0x7fcc728c7a84, FUTEX_WAIT_PRIVATE, 1, NULL <unfinished ...>
[pid  8860] select(0, NULL, NULL, NULL, {1, 754286} <unfinished ...>
[pid  8848] futex(0x32c6e80, FUTEX_WAIT_PRIVATE, 0, NULL <unfinished ...>
[pid  8860] <... select resumed> )      = 0 (Timeout)
[pid  8860] write(2, "crunchstat: keepcalls 0 put 0 ge"..., 74) = 74
[pid  8860] write(2, "crunchstat: net:keep0 0 tx 0 rx "..., 70) = 70
[pid  8860] write(2, "crunchstat: keepcache 0 hit 0 mi"..., 76) = 76
[pid  8860] write(2, "crunchstat: fuseops 0 write 0 re"..., 78) = 78
[pid  8860] write(2, "crunchstat: blkio:0:0 0 write 0 "..., 80) = 80
[pid  8860] select(0, NULL, NULL, NULL, {10, 0}) = 0 (Timeout)
[pid  8860] write(2, "crunchstat: keepcalls 0 put 0 ge"..., 74) = 74
[pid  8860] write(2, "crunchstat: net:keep0 0 tx 0 rx "..., 70) = 70
[pid  8860] write(2, "crunchstat: keepcache 0 hit 0 mi"..., 76) = 76
[pid  8860] write(2, "crunchstat: fuseops 0 write 0 re"..., 78) = 78
[pid  8860] write(2, "crunchstat: blkio:0:0 0 write 0 "..., 80) = 80
[pid  8860] select(0, NULL, NULL, NULL, {10, 0}^C <unfinished ...>
Process 8848 detached
Process 8860 detached
Process 8863 detached
# kill 8860

Subtasks 1 (0 open, 1 closed)

Task #8355: review 8288-poll-client-close-timeout | Resolved | Peter Amstutz | 01/23/2016 | Actions

Related issues

Related to Arvados - Feature #8163: [FUSE] arv-mount should detect and log any files/dirs that are still open after unmounting | New | Actions
Related to Arvados - Bug #8388: [SDKs] WebSocketClient Exception 'NoneType' object is not callable | In Progress | Actions
Actions

Also available in: Atom PDF