Pipeline template template » History » Revision 14
Revision 13 (Bryan Cosca, 05/21/2015 09:22 PM) → Revision 14/25 (Bryan Cosca, 05/22/2015 03:59 PM)
h1. Pipeline template template
h2. Run-Command Template
<pre>
"NAME":{
"script_version":"29009a1c1f8a9653042c5853832881aca4141cf2",
"repository":"arvados",
"script":"run-command",
"script_parameters":{
"input":{
"output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME"
},
"reference":{
"dataclass":"Collection",
"default":"3514b8e5da0e8d109946bc809b20a78a+5698",
"link_name":"human_g1k_v37 reference data",
"title":"NAME Input Reference genome (FASTA)"
},
"command":[
"java",
"-Xmx60g",
"-jar",
"$(dir $(gatk3))/GenomeAnalysisTK.jar",
"-T",
"PrintReads",
"-R",
"$(glob $(dir $(reference))/*.fasta)",
{
"foreach":"iterator",
"command":[
"-I",
"$(iterator)"
]
},
"-BQSR",
"$(bqsr_table)",
"-nct",
"16",
"-o",
"$(outputname)"
],
"outputname":{
"value":{
"list":"iterator",
"index":"0",
"command":"$(basename $(iterator)).bqsrCal.bam"
}
},
"bqsr_table":{
"value":{
"list":"iterator",
"index":"0",
"command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table"
}
},
"input_dir":"$(dir $(input))",
"task.foreach":[
"iterator"
],
"iterator":{
"value":{
"group":"input_dir",
"regex":"(.*)\\.realigned.bam"
}
},
"gatk3":{
"dataclass":"Collection",
"default":"2e98fdc8e90f4c48a0714b711767c9ce+76",
"link_name":"Genome Analysis Toolkit 3.2-2",
"title":"NAME Input Version of GATK3 jar"
},
"dbsnp":{
"dataclass":"Collection",
"default":"8ac324bfa3dfff1ff81ed34b433869b1+6712",
"title":"Single Nucleotide Polymorphism database",
"description":"NAME Input DBsnp"
}
},
"runtime_constraints":{
"max_tasks_per_node":1,
"min_nodes":1,
"docker_image":"bcosc/arv-base-java",
"arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2"
},
"output_name":false
},
</pre>
h2. Crunch Script Template Template
<pre>
"NAME":{
"script_version":"GIT_BRANCH_NAME",
"repository":"GIT_REPO_NAME",
"script":"GIT_SCRIPT_NAME",
"script_parameters":{
"input":{
"output_of":"PREVIOUS_JOB_NAME"
},
"samtools":{
"required":true,
"dataclass":"Collection",
"default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147",
"title":"NAME Input Samtools Collection"
},
"bcftools":{
"required":true,
"dataclass":"Collection",
"default":"6a0c51bea360b487aa5c9d130435cd00+14097",
"title":"NAME Input BCFtools Collection"
},
"gatk_jar":{
"required":true,
"dataclass":"Collection",
"default":"2e98fdc8e90f4c48a0714b711767c9ce+76",
"title":"NAME Input GATK Collection"
},
"reference":{
"required":true,
"dataclass":"Collection",
"default":"3514b8e5da0e8d109946bc809b20a78a+5698",
"title":"NAME Input Reference Collection"
},
"picard":{
"required":true,
"dataclass":"Collection",
"default":"0eaa58017c3689414a9e644a2297df5c+165",
"title":"NAME Input Picard Collection"
},
"bedtools_bin":{
"required":true,
"dataclass":"Collection",
"default":"b2f86c26e05e7a0686e7f39a86d406bf+34584",
"title":"NAME Input Bedtools Collection"
},
"bed_path":{
"required":true,
"dataclass":"Collection",
"default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516",
"title":"NAME Input Bed Collection"
}
},
"runtime_constraints":{
"max_tasks_per_node":1,
"min_nodes":1,
"docker_image":"bcosc/arv-base-java",
"arvados_sdk_version":"749b87143ebb0bdcbe2d49deee9c66f6de9f86dd"
},
"output_name":false
},
</pre>
h2. Crunch Script Template
<pre>
#!/usr/bin/env python
import arvados
import subprocess
import os
import sys
import re
from arvados.collection import Collection as coll
import arvados_tools
import shutil
# Fan-out step. NOTE(review): arvados_tools is a project helper that is not
# shown on this page; judging by its name and arguments it presumably queues
# one task per file of the 'input' parameter whose name matches the regex and
# then ends the current task -- confirm against arvados_tools itself.
arvados_tools.spawn_new_task_per_file('input','.*realigned.bqsrCal.bam$',if_sequence=0, and_end_task=True)
# Handles for the job and the task this script is currently running as.
this_job = arvados.current_job()
this_task = arvados.current_task()
# The current task's tmpdir attribute, used below as the copy destination.
tmpdir = arvados.current_task().tmpdir
# Value of the 'input_1' task parameter -- presumably the file name assigned
# to this task by the fan-out call above (confirm).
input_1 = this_task['parameters']['input_1']
# Location of that file under the mount of the job's "input" parameter.
input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1)
# Optional: the next two lines copy the input file into tmpdir; drop them if
# the tool can work on the file where it is mounted.
tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir
shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir
</pre>
<pre>
samtools_path = arvados_tools.get_file_path('samtools','^samtools$')
gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$')
reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$')
dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$')
bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$')
tabix_path = arvados_tools.get_file_path('tabix','^tabix$')
# others:
bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$')
</pre>
<pre>
gatk_inserttool_args = []
gatk_inserttool_output_name
gatk_inserttool_output_path
gatk_inserttool_pipe = subprocess.check_output(gatk_inserttool_args)
</pre>
<pre>
samtools_inserttool_args = []
</pre>
h2. Script Parameter Template
h2. Random stuff
h2. Latest arvados_sdk_version:
https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python
h2. Random tools I use
os.path.join(arvados.get_job_param_mount("param"),name)
h2. Pipe through tools
# Stream the first tool's stdout straight into the second tool and write the
# second tool's stdout to output_bam_path -- the equivalent of the shell
# pipeline `bwa ... | samtools ... > output_bam_path`.
#   args            -- argv list for the producer (bwa)
#   sam_args        -- argv list for the consumer (samtools)
#   output_bam_path -- file that receives the consumer's stdout
bwa_pipe = subprocess.Popen(args, stdout=subprocess.PIPE)
# The context manager closes the output file even if a later step raises.
with open(output_bam_path, 'w') as output_file:
    samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file)
    # Drop the parent's copy of the pipe's read end so that bwa receives
    # SIGPIPE (instead of blocking forever on a full pipe) if samtools
    # exits early.
    bwa_pipe.stdout.close()
    bwa_pipe.wait()
    samtools_pipe.wait()