Pipeline template template » History » Revision 22
Revision 21 (Bryan Cosca, 07/31/2015 03:34 PM) → Revision 22/25 (Bryan Cosca, 09/25/2015 07:16 PM)
h1. Pipeline template template h2. Run-Command Template <pre> "NAME":{ "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2", "repository":"arvados", "script":"run-command", "script_parameters":{ "input":{ "output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME" }, "reference":{ "dataclass":"Collection", "default":"3514b8e5da0e8d109946bc809b20a78a+5698", "link_name":"human_g1k_v37 reference data", "title":"NAME Input Reference genome (FASTA)" }, "command":[ "java", "-Xmx60g", "-jar", "$(dir $(gatk3))/GenomeAnalysisTK.jar", "-T", "PrintReads", "-R", "$(glob $(dir $(reference))/*.fasta)", { "foreach":"iterator", "command":[ "-I", "$(iterator)" ] }, "-BQSR", "$(bqsr_table)", "-nct", "16", "-o", "$(outputname)" ], "outputname":{ "value":{ "list":"iterator", "index":"0", "command":"$(basename $(iterator)).bqsrCal.bam" } }, "bqsr_table":{ "value":{ "list":"iterator", "index":"0", "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table" } }, "input_dir":"$(dir $(input))", "task.foreach":[ "iterator" ], "iterator":{ "value":{ "group":"input_dir", "regex":"(.*)\\.realigned.bam" } }, "gatk3":{ "dataclass":"Collection", "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", "link_name":"Genome Analysis Toolkit 3.2-2", "title":"NAME Input Version of GATK3 jar" }, "dbsnp":{ "dataclass":"Collection", "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712", "title":"Single Nucleotide Polymorphism database", "description":"NAME Input DBsnp" } }, "runtime_constraints":{ "max_tasks_per_node":1, "min_nodes":1, "docker_image":"bcosc/arv-base-java", "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2" }, "output_name":false </pre> h2. 
Crunch Script Template Template <pre> "NAME":{ "script_version":"GIT_BRANCH_NAME", "repository":"GIT_REPO_NAME", "script":"GIT_SCRIPT_NAME", "script_parameters":{ "input":{ "output_of":"PREVIOUS_JOB_NAME" }, "samtools":{ "required":true, "dataclass":"Collection", "default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147", "title":"NAME Input Samtools Collection" }, "bcftools":{ "required":true, "dataclass":"Collection", "default":"6a0c51bea360b487aa5c9d130435cd00+14097", "title":"NAME Input BCFtools Collection" }, "gatk_jar":{ "required":true, "dataclass":"Collection", "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", "title":"NAME Input GATK Collection" }, "reference":{ "required":true, "dataclass":"Collection", "default":"3514b8e5da0e8d109946bc809b20a78a+5698", "title":"NAME Input Reference Collection" }, "picard":{ "required":true, "dataclass":"Collection", "default":"0eaa58017c3689414a9e644a2297df5c+165", "title":"NAME Input Picard Collection" }, "bedtools_bin":{ "required":true, "dataclass":"Collection", "default":"b2f86c26e05e7a0686e7f39a86d406bf+34584", "title":"NAME Input Bedtools Collection" }, "bed_path":{ "required":true, "dataclass":"Collection", "default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516", "title":"NAME Input Bed Collection" }, "bwa_bin": { "required": true, "dataclass": "Collection", "default": "39c6f22d40001074f4200a72559ae7eb+5745", "title": "NAME Input BWA Binary" }, "gatk_ref":{ "required":true, "dataclass":"Collection", "default":"25b68283b442c1a921ac826296103426+9636", "title":"NAME GATK Reference Collection" }, "tabix": { "required": true, "dataclass": "Collection", "default": "180c32253e97ab7a117f8c9c15e95e8b+1131", "title": "NAME Input Tabix/Bgzip" } }, "runtime_constraints":{ "max_tasks_per_node":1, "min_nodes":1, "docker_image":"bcosc/arv-base-java", "arvados_sdk_version":"749b87143ebb0bdcbe2d49deee9c66f6de9f86dd" }, "output_name":false }, </pre> h2. 
Crunch Script Template <pre> #!/usr/bin/env python import arvados import subprocess import os import sys import re from arvados.collection import Collection as coll import arvados_tools import shutil arvados_tools.spawn_new_task_per_file('input','.*realigned.bqsrCal.bam$',if_sequence=0, and_end_task=True) this_job = arvados.current_job() this_task = arvados.current_task() tmpdir = arvados.current_task().tmpdir input_1 = this_task['parameters']['input_1'] input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1) tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir </pre> <pre> samtools_path = arvados_tools.get_file_path('samtools','^samtools$') gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$') reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$') dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$') bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$') tabix_path = arvados_tools.get_file_path('tabix','^tabix$') tmp_picard_path = arvados_tools.get_file_path('picard','^picard.jar$') # Copy picard over to tmpdir because java cannot handle "+" characters picard_path = os.path.join(tmpdir,"picard.jar") shutil.copyfile(tmp_picard_path,picard_path) others: bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$') </pre> <pre> gatk_inserttool_args = [] gatk_inserttool_output_name gatk_inserttool_output_path gatk_inserttool_pipe = subprocess.check_output() </pre> <pre> samtools_inserttool_args = [] </pre> h2. Script Parameter Template for grabbing random script parameters num_files = this_job['script_parameters']['param'] h2. Random stuff h2. Latest arvados_sdk_version: https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python h2. Random tools I use os.path.join(arvados.get_job_param_mount("param"),name) h2. 
Pipe through tools bwa_pipe = subprocess.Popen(args,stdout=subprocess.PIPE) output_file = open(output_bam_path,'w') samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file) bwa_pipe.wait() samtools_pipe.wait() output_file.close() h2. Get name of file without n extensions base_input_split = re.split('(\.)',input_1) base_input_list = base_input_split[0:len(base_input_split)-n*2] base_name = ''.join(base_input_list) print base_name