Pipeline template template » History » Revision 22
« Previous |
Revision 22/25
(diff)
| Next »
Bryan Cosca, 09/25/2015 07:16 PM
Pipeline template template¶
Run-Command Template¶
"NAME":{ "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2", "repository":"arvados", "script":"run-command", "script_parameters":{ "input":{ "output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME" }, "reference":{ "dataclass":"Collection", "default":"3514b8e5da0e8d109946bc809b20a78a+5698", "link_name":"human_g1k_v37 reference data", "title":"NAME Input Reference genome (FASTA)" }, "command":[ "java", "-Xmx60g", "-jar", "$(dir $(gatk3))/GenomeAnalysisTK.jar", "-T", "PrintReads", "-R", "$(glob $(dir $(reference))/*.fasta)", { "foreach":"iterator", "command":[ "-I", "$(iterator)" ] }, "-BQSR", "$(bqsr_table)", "-nct", "16", "-o", "$(outputname)" ], "outputname":{ "value":{ "list":"iterator", "index":"0", "command":"$(basename $(iterator)).bqsrCal.bam" } }, "bqsr_table":{ "value":{ "list":"iterator", "index":"0", "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table" } }, "input_dir":"$(dir $(input))", "task.foreach":[ "iterator" ], "iterator":{ "value":{ "group":"input_dir", "regex":"(.*)\\.realigned.bam" } }, "gatk3":{ "dataclass":"Collection", "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", "link_name":"Genome Analysis Toolkit 3.2-2", "title":"NAME Input Version of GATK3 jar" }, "dbsnp":{ "dataclass":"Collection", "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712", "title":"Single Nucleotide Polymorphism database", "description":"NAME Input DBsnp" } }, "runtime_constraints":{ "max_tasks_per_node":1, "min_nodes":1, "docker_image":"bcosc/arv-base-java", "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2" }, "output_name":false
Crunch Script Template Template¶
"NAME":{ "script_version":"GIT_BRANCH_NAME", "repository":"GIT_REPO_NAME", "script":"GIT_SCRIPT_NAME", "script_parameters":{ "input":{ "output_of":"PREVIOUS_JOB_NAME" }, "samtools":{ "required":true, "dataclass":"Collection", "default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147", "title":"NAME Input Samtools Collection" }, "bcftools":{ "required":true, "dataclass":"Collection", "default":"6a0c51bea360b487aa5c9d130435cd00+14097", "title":"NAME Input BCFtools Collection" }, "gatk_jar":{ "required":true, "dataclass":"Collection", "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", "title":"NAME Input GATK Collection" }, "reference":{ "required":true, "dataclass":"Collection", "default":"3514b8e5da0e8d109946bc809b20a78a+5698", "title":"NAME Input Reference Collection" }, "picard":{ "required":true, "dataclass":"Collection", "default":"0eaa58017c3689414a9e644a2297df5c+165", "title":"NAME Input Picard Collection" }, "bedtools_bin":{ "required":true, "dataclass":"Collection", "default":"b2f86c26e05e7a0686e7f39a86d406bf+34584", "title":"NAME Input Bedtools Collection" }, "bed_path":{ "required":true, "dataclass":"Collection", "default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516", "title":"NAME Input Bed Collection" }, "bwa_bin": { "required": true, "dataclass": "Collection", "default": "39c6f22d40001074f4200a72559ae7eb+5745", "title": "NAME Input BWA Binary" }, "gatk_ref":{ "required":true, "dataclass":"Collection", "default":"25b68283b442c1a921ac826296103426+9636", "title":"NAME GATK Reference Collection" }, "tabix": { "required": true, "dataclass": "Collection", "default": "180c32253e97ab7a117f8c9c15e95e8b+1131", "title": "NAME Input Tabix/Bgzip" } }, "runtime_constraints":{ "max_tasks_per_node":1, "min_nodes":1, "docker_image":"bcosc/arv-base-java", "arvados_sdk_version":"a4d63932d669acd5011a7fa5afcbeec513acfe2c" } },
Crunch Script Template
#!/usr/bin/env python
import arvados
import subprocess
import os
import sys
import re
from arvados.collection import Collection as coll
import arvados_tools
import shutil
# Fan out over the files of the 'input' job parameter whose names match the regex.
# NOTE(review): spawn_new_task_per_file is a project-local arvados_tools helper;
# presumably it queues one child task per matching file (passing the file name as
# 'input_1') when run as sequence 0 and then ends the parent task -- confirm
# against arvados_tools before relying on this.
arvados_tools.spawn_new_task_per_file('input','.*realigned.bqsrCal.bam$',if_sequence=0, and_end_task=True)
# Handles for the currently running job and task.
this_job = arvados.current_job()
this_task = arvados.current_task()
# Temporary directory of the current task.
tmpdir = arvados.current_task().tmpdir
# Name of the file this task works on, read from the task parameters.
input_1 = this_task['parameters']['input_1']
# Path of that file under the mount point of the "input" job parameter.
input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1)
tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir
shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir
# Resolve the path of every tool/reference file from its job parameter.
# NOTE(review): get_file_path is a project-local arvados_tools helper;
# presumably it returns the path of the file matching the regex inside the
# collection mounted for the named parameter -- confirm.
samtools_path = arvados_tools.get_file_path('samtools','^samtools$')
gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$')
reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$')
dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$')
bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$')
tabix_path = arvados_tools.get_file_path('tabix','^tabix$')
tmp_picard_path = arvados_tools.get_file_path('picard','^picard.jar$')
# Copy picard over to tmpdir because java cannot handle "+" characters
picard_path = os.path.join(tmpdir,"picard.jar")
shutil.copyfile(tmp_picard_path,picard_path)
# others:
bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$')
# Per-tool boilerplate: replace "inserttool" with the tool name and fill in
# every TODO before running.
gatk_inserttool_args = []  # TODO: command line as a list of strings
gatk_inserttool_output_name = None  # TODO: name of the output file
gatk_inserttool_output_path = None  # TODO: full path of the output file
# check_output needs the command to run; calling it with no arguments raises TypeError.
gatk_inserttool_pipe = subprocess.check_output(gatk_inserttool_args)
samtools_inserttool_args = []  # TODO: command line as a list of strings
Script Parameter Template¶
For grabbing an arbitrary script parameter (replace 'param' with the parameter's name):
num_files = this_job['script_parameters']['param']
Random stuff¶
Latest arvados_sdk_version:¶
https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python
Random tools I use¶
os.path.join(arvados.get_job_param_mount("param"),name)
Pipe through tools¶
# Stream the stdout of the first tool (bwa) straight into the second
# (samtools) and write the second tool's stdout to output_bam_path.
bwa_pipe = subprocess.Popen(args,stdout=subprocess.PIPE)
# The with-block closes the output file even if starting samtools fails.
with open(output_bam_path,'w') as output_file:
    samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file)
    # Drop the parent's copy of the pipe's read end; otherwise bwa never gets
    # SIGPIPE when samtools exits early and bwa_pipe.wait() can block forever.
    bwa_pipe.stdout.close()
    bwa_pipe.wait()
    samtools_pipe.wait()
Get name of file without n extensions¶
# Strip the last n dot-separated extensions from input_1.
# rsplit performs at most n splits from the right, so n = 0 keeps the whole
# name and an n larger than the number of extensions leaves just the first
# component (the slice-based version returned a wrong partial name there).
base_name = input_1.rsplit('.', n)[0]
# Parenthesised form prints the same under Python 2 and Python 3.
print(base_name)
Updated by Bryan Cosca over 9 years ago · 25 revisions