Pipeline template template

Run-Command Template

  "NAME":{
   "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2",
   "repository":"arvados",
   "script":"run-command",
   "script_parameters":{
    "input":{
     "output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME" 
    },
    "reference":{
     "dataclass":"Collection",
     "default":"3514b8e5da0e8d109946bc809b20a78a+5698",
     "link_name":"human_g1k_v37 reference data",
     "title":"NAME Input Reference genome (FASTA)" 
    },
    "command":[
     "java",
     "-Xmx60g",
     "-jar",
     "$(dir $(gatk3))/GenomeAnalysisTK.jar",
     "-T",
     "PrintReads",
     "-R",
     "$(glob $(dir $(reference))/*.fasta)",
     {
      "foreach":"iterator",
      "command":[
       "-I",
       "$(iterator)" 
      ]
     },
     "-BQSR",
     "$(bqsr_table)",
     "-nct",
     "16",
     "-o",
     "$(outputname)" 
    ],
    "outputname":{
     "value":{
      "list":"iterator",
      "index":"0",
      "command":"$(basename $(iterator)).bqsrCal.bam" 
     }
    },
    "bqsr_table":{
     "value":{
      "list":"iterator",
      "index":"0",
      "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table" 
     }
    },
    "input_dir":"$(dir $(input))",
    "task.foreach":[
     "iterator" 
    ],
    "iterator":{
     "value":{
      "group":"input_dir",
      "regex":"(.*)\\.realigned.bam" 
     }
    },
    "gatk3":{
     "dataclass":"Collection",
     "default":"2e98fdc8e90f4c48a0714b711767c9ce+76",
     "link_name":"Genome Analysis Toolkit 3.2-2",
     "title":"NAME Input Version of GATK3 jar" 
    },
    "dbsnp":{
     "dataclass":"Collection",
     "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712",
     "title":"Single Nucleotide Polymorphism database",
     "description":"NAME Input DBsnp" 
    }
   },
   "runtime_constraints":{
    "max_tasks_per_node":1,
    "min_nodes":1,
    "docker_image":"bcosc/arv-base-java",
    "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2" 
   },
   "output_name":false
  },

Crunch Script Template Template

 "NAME":{
   "script_version":"GIT_BRANCH_NAME",
   "repository":"GIT_REPO_NAME",
   "script":"GIT_SCRIPT_NAME",
   "script_parameters":{
    "input":{
     "output_of":"PREVIOUS_JOB_NAME" 
    },
    "samtools":{
     "required":true,
     "dataclass":"Collection",
     "default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147",
     "title":"NAME Input Samtools Collection" 
    },
    "bcftools":{
     "required":true,
     "dataclass":"Collection",
     "default":"6a0c51bea360b487aa5c9d130435cd00+14097",
     "title":"NAME Input BCFtools Collection" 
    },
    "gatk_jar":{
     "required":true,
     "dataclass":"Collection",
     "default":"2e98fdc8e90f4c48a0714b711767c9ce+76",
     "title":"NAME Input GATK Collection" 
    },
    "reference":{
     "required":true,
     "dataclass":"Collection",
     "default":"3514b8e5da0e8d109946bc809b20a78a+5698",
     "title":"NAME Input Reference Collection" 
    },
    "picard":{
     "required":true,
     "dataclass":"Collection",
     "default":"0eaa58017c3689414a9e644a2297df5c+165",
     "title":"NAME Input Picard Collection" 
    },
    "bedtools_bin":{
     "required":true,
     "dataclass":"Collection",
     "default":"b2f86c26e05e7a0686e7f39a86d406bf+34584",
     "title":"NAME Input Bedtools Collection" 
    },
    "bed_path":{
     "required":true,
     "dataclass":"Collection",
     "default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516",
     "title":"NAME Input Bed Collection" 
    },
    "bwa_bin": {
     "required": true,
     "dataclass": "Collection",
     "default": "39c6f22d40001074f4200a72559ae7eb+5745",
     "title": "NAME Input BWA Binary" 
    },
    "gatk_ref":{
     "required":true,
     "dataclass":"Collection",
     "default":"25b68283b442c1a921ac826296103426+9636",
     "title":"NAME GATK Reference Collection" 
    },
    "tabix": {
     "required": true,
     "dataclass": "Collection",
     "default": "180c32253e97ab7a117f8c9c15e95e8b+1131",
     "title": "NAME Input Tabix/Bgzip" 
    }
   },
   "runtime_constraints":{
    "max_tasks_per_node":1,
    "min_nodes":1,
    "docker_image":"bcosc/arv-base-java",
    "arvados_sdk_version":"a4d63932d669acd5011a7fa5afcbeec513acfe2c" 
   }
  },

Crunch Script Template
#!/usr/bin/env python

import arvados
import subprocess
import os
import sys
import re
from arvados.collection import Collection as coll
import arvados_tools
import shutil

# Crunch script skeleton. "NAME" is a placeholder for the tool being wrapped;
# replace it throughout when copying this template.

# Presumably queues one task per file in the 'input' collection whose name
# matches the regex and then ends the current task (and_end_task=True).
# NOTE(review): behaviour is defined in the project-local arvados_tools
# module -- confirm there.
arvados_tools.spawn_new_task_per_file('input','.*(bam|fastq)$',if_sequence=0, and_end_task=True)

# Handles for the current job and task, plus the task's scratch directory.
this_job = arvados.current_job()
this_task = arvados.current_task()
tmpdir = arvados.current_task().tmpdir

# File name handed to this task via its 'input_1' parameter (presumably set
# by spawn_new_task_per_file above -- TODO confirm), joined onto the mount
# point of the "input" job parameter to get a readable path.
input_1 = this_task['parameters']['input_1']
input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1)
#tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir
#shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir
#os.chdir(tmpdir)

# Placeholder argument vector: fill in the executable and its arguments.
NAME_args = ['','','','','','','','','']
print NAME_args
# Output path inside tmpdir. Fill in the replace() arguments to derive the
# output file name from the input name (e.g. swap the extension); as written,
# replace('','') leaves the name unchanged.
NAME_out_file = os.path.join(tmpdir,input_1.replace('',''))

# Variant 1: run the tool with its stdout redirected into the output file,
# wait for it to exit, and print its return code.
NAME_out_handle = open(NAME_out_file,'w')
NAME_pipe = subprocess.Popen(NAME_args,stdout=NAME_out_handle)
NAME_pipe.wait()
print NAME_pipe.returncode
NAME_out_handle.close()

# Variant 2: run the tool and capture its stdout as the return value;
# check_output raises CalledProcessError on a non-zero exit status.
# NOTE(review): this looks like an alternative to variant 1 -- as written
# both run, so keep only the one you need.
NAME_pipe = subprocess.check_output(NAME_args)

# Presumably saves the contents of tmpdir as this task's output -- confirm
# against arvados_tools.write_tmpdir.
arvados_tools.write_tmpdir(tmpdir)

# Resolve paths to tools and reference data. Each call passes a job parameter
# name and a regex; presumably the helper returns the path of the file in
# that parameter's collection whose name matches the regex -- confirm against
# arvados_tools.get_file_path.
samtools_path = arvados_tools.get_file_path('samtools','^samtools$')
gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$')
reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$')
dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$')
# bgzip and tabix are both looked up under the same 'tabix' parameter.
bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$')
tabix_path = arvados_tools.get_file_path('tabix','^tabix$')
bwa_path = arvados_tools.get_file_path('bwa_bin','^bwa$')

tmp_picard_path = arvados_tools.get_file_path('picard','^picard.jar$')
# Copy picard over to tmpdir because java cannot handle "+" characters in the
# jar path (presumably the mounted collection path contains the collection
# locator, which includes a "+" -- TODO confirm).
picard_path = os.path.join(tmpdir,"picard.jar")
shutil.copyfile(tmp_picard_path,picard_path)

others:
# BED file lookup via the 'bed_path' job parameter (any file name ending in
# "bed").
bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$')

# Skeleton for wiring in another tool ("inserttool" is a placeholder name).
# The lines below are incomplete on purpose and must be filled in before use:
#   *_args        -> argument vector for the command
#   *_output_name -> assign the output file name
#   *_output_path -> assign the full output path
#   check_output  -> pass the argument vector
gatk_inserttool_args = []
gatk_inserttool_output_name
gatk_inserttool_output_path
gatk_inserttool_pipe = subprocess.check_output()
samtools_inserttool_args = []

Script Parameter Template

for grabbing random script parameters

# Read one value out of the job's script_parameters; 'param' is a placeholder
# for the parameter name, and this_job is the object returned by
# arvados.current_job().
num_files = this_job['script_parameters']['param']

Random stuff

Latest arvados_sdk_version:

https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python

Random tools I use

# Build the path to file `name` under the mount point of the job parameter
# "param" ("param" and `name` are placeholders to substitute).
os.path.join(arvados.get_job_param_mount("param"),name)

Pipe through tools

# Shell-free equivalent of `bwa ... | samtools ... > output_bam_path`.
# `args`, `sam_args` and `output_bam_path` are placeholders supplied by the
# surrounding script.
#
# The output file is opened first (and via `with`) so that a failure to open
# it does not leave a bwa process running with nobody reading its pipe, and so
# the handle is closed even if launching or waiting raises.
with open(output_bam_path, 'w') as output_file:
    bwa_pipe = subprocess.Popen(args, stdout=subprocess.PIPE)
    samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file)
    # Close the parent's copy of the pipe's read end: samtools now holds the
    # only reader, so bwa receives SIGPIPE instead of blocking forever if
    # samtools exits early.
    bwa_pipe.stdout.close()
    bwa_pipe.wait()
    samtools_pipe.wait()

Get name of file without n extensions

# Strip the last n dot-separated extensions from input_1
# (e.g. "sample.realigned.bam" with n=2 -> "sample").
# rsplit caps the number of splits at n, so asking for more extensions than
# the name has simply yields the part before the first dot instead of the
# truncated/garbled result a negative slice bound would give.
# n <= 0 leaves the name untouched.
base_name = input_1.rsplit('.', n)[0] if n > 0 else input_1
# Single-argument parenthesised print behaves the same on Python 2 and 3.
print(base_name)