Pipeline template template » History » Version 14
Bryan Cosca, 05/22/2015 03:59 PM
1 | 1 | Bryan Cosca | h1. Pipeline template template |
2 | |||
3 | h2. Run-Command Template |
4 | |||
5 | 4 | Bryan Cosca | <pre> |
6 | 7 | Bryan Cosca | "NAME":{ |
7 | 6 | Bryan Cosca | "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2", |
8 | 4 | Bryan Cosca | "repository":"arvados", |
9 | "script":"run-command", |
10 | "script_parameters":{ |
11 | "input":{ |
12 | 7 | Bryan Cosca | "output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME" |
13 | 4 | Bryan Cosca | }, |
14 | "reference":{ |
15 | "dataclass":"Collection", |
16 | "default":"3514b8e5da0e8d109946bc809b20a78a+5698", |
17 | "link_name":"human_g1k_v37 reference data", |
18 | 7 | Bryan Cosca | "title":"NAME Input Reference genome (FASTA)" |
19 | 4 | Bryan Cosca | }, |
20 | "command":[ |
21 | "java", |
22 | "-Xmx60g", |
23 | "-jar", |
24 | "$(dir $(gatk3))/GenomeAnalysisTK.jar", |
25 | "-T", |
26 | "PrintReads", |
27 | "-R", |
28 | "$(glob $(dir $(reference))/*.fasta)", |
29 | { |
30 | "foreach":"iterator", |
31 | "command":[ |
32 | "-I", |
33 | "$(iterator)" |
34 | ] |
35 | }, |
36 | "-BQSR", |
37 | "$(bqsr_table)", |
38 | "-nct", |
39 | "16", |
40 | "-o", |
41 | "$(outputname)" |
42 | ], |
43 | "outputname":{ |
44 | "value":{ |
45 | "list":"iterator", |
46 | "index":"0", |
47 | "command":"$(basename $(iterator)).bqsrCal.bam" |
48 | } |
49 | }, |
50 | "bqsr_table":{ |
51 | "value":{ |
52 | "list":"iterator", |
53 | "index":"0", |
54 | "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table" |
55 | } |
56 | }, |
57 | "input_dir":"$(dir $(input))", |
58 | "task.foreach":[ |
59 | "iterator" |
60 | ], |
61 | "iterator":{ |
62 | "value":{ |
63 | "group":"input_dir", |
64 | "regex":"(.*)\\.realigned.bam" |
65 | 1 | Bryan Cosca | } |
66 | 4 | Bryan Cosca | }, |
67 | "gatk3":{ |
68 | "dataclass":"Collection", |
69 | "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", |
70 | 1 | Bryan Cosca | "link_name":"Genome Analysis Toolkit 3.2-2", |
71 | 7 | Bryan Cosca | "title":"NAME Input Version of GATK3 jar" |
72 | 4 | Bryan Cosca | }, |
73 | "dbsnp":{ |
74 | "dataclass":"Collection", |
75 | "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712", |
76 | "title":"Single Nucleotide Polymorphism database", |
77 | 7 | Bryan Cosca | "description":"NAME Input DBsnp" |
78 | 4 | Bryan Cosca | } |
79 | }, |
80 | "runtime_constraints":{ |
81 | "max_tasks_per_node":1, |
82 | "min_nodes":1, |
83 | "docker_image":"bcosc/arv-base-java", |
84 | 6 | Bryan Cosca | "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2" |
85 | 4 | Bryan Cosca | }, |
86 | "output_name":false |
87 | 6 | Bryan Cosca | },
88 | 4 | Bryan Cosca | </pre> |
89 | 1 | Bryan Cosca | |
90 | 8 | Bryan Cosca | h2. Crunch Script Template Template |
91 | |||
92 | <pre> |
93 | 9 | Bryan Cosca | "NAME":{ |
94 | "script_version":"GIT_BRANCH_NAME", |
95 | "repository":"GIT_REPO_NAME", |
96 | "script":"GIT_SCRIPT_NAME", |
97 | 8 | Bryan Cosca | "script_parameters":{ |
98 | "input":{ |
99 | 9 | Bryan Cosca | "output_of":"PREVIOUS_JOB_NAME" |
100 | 8 | Bryan Cosca | }, |
101 | "samtools":{ |
102 | "required":true, |
103 | "dataclass":"Collection", |
104 | "default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147", |
105 | 9 | Bryan Cosca | "title":"NAME Input Samtools Collection" |
106 | 8 | Bryan Cosca | }, |
107 | 13 | Bryan Cosca | "bcftools":{ |
108 | "required":true, |
109 | "dataclass":"Collection", |
110 | "default":"6a0c51bea360b487aa5c9d130435cd00+14097", |
111 | "title":"NAME Input BCFtools Collection" |
112 | }, |
113 | 8 | Bryan Cosca | "gatk_jar":{ |
114 | "required":true, |
115 | "dataclass":"Collection", |
116 | 1 | Bryan Cosca | "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", |
117 | 9 | Bryan Cosca | "title":"NAME Input GATK Collection" |
118 | 1 | Bryan Cosca | }, |
119 | "reference":{ |
120 | "required":true, |
121 | "dataclass":"Collection", |
122 | "default":"3514b8e5da0e8d109946bc809b20a78a+5698", |
123 | 9 | Bryan Cosca | "title":"NAME Input Reference Collection" |
124 | }, |
125 | "picard":{ |
126 | "required":true, |
127 | "dataclass":"Collection", |
128 | "default":"0eaa58017c3689414a9e644a2297df5c+165", |
129 | "title":"NAME Input Picard Collection" |
130 | }, |
131 | "bedtools_bin":{ |
132 | "required":true, |
133 | "dataclass":"Collection", |
134 | "default":"b2f86c26e05e7a0686e7f39a86d406bf+34584", |
135 | "title":"NAME Input Bedtools Collection" |
136 | }, |
137 | "bed_path":{ |
138 | "required":true, |
139 | "dataclass":"Collection", |
140 | "default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516", |
141 | "title":"NAME Input Bed Collection" |
142 | 8 | Bryan Cosca | } |
143 | }, |
144 | "runtime_constraints":{ |
145 | "max_tasks_per_node":1, |
146 | "min_nodes":1, |
147 | "docker_image":"bcosc/arv-base-java", |
148 | "arvados_sdk_version":"749b87143ebb0bdcbe2d49deee9c66f6de9f86dd" |
149 | }, |
150 | "output_name":false |
151 | }, |
152 | </pre> |
153 | |||
154 | 1 | Bryan Cosca | h2. Crunch Script Template |
155 | <pre> |
156 | #!/usr/bin/env python |
157 | |||
158 | import arvados |
159 | import subprocess |
160 | import os |
161 | import sys |
162 | import re |
163 | from arvados.collection import Collection as coll |
164 | import arvados_tools |
165 | import shutil |
166 | |||
167 | arvados_tools.spawn_new_task_per_file('input','.*realigned.bqsrCal.bam$',if_sequence=0, and_end_task=True) |
168 | |||
169 | this_job = arvados.current_job() |
170 | this_task = arvados.current_task() |
171 | tmpdir = arvados.current_task().tmpdir |
172 | |||
173 | input_1 = this_task['parameters']['input_1'] |
174 | input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1) |
175 | tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir |
176 | shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir |
177 | </pre> |
178 | |||
179 | |||
180 | <pre> |
181 | samtools_path = arvados_tools.get_file_path('samtools','^samtools$') |
182 | gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$') |
183 | reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$') |
184 | dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$') |
185 | 11 | Bryan Cosca | bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$') |
186 | tabix_path = arvados_tools.get_file_path('tabix','^tabix$') |
187 | 12 | Bryan Cosca | |
188 | # Others:
189 | bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$') |
190 | |||
191 | 1 | Bryan Cosca | </pre> |
192 | |||
193 | <pre> |
194 | gatk_inserttool_args = [] |
195 | gatk_inserttool_output_name = 'OUTPUT_NAME'
196 | gatk_inserttool_output_path = os.path.join(tmpdir, gatk_inserttool_output_name)
197 | gatk_inserttool_pipe = subprocess.check_output(gatk_inserttool_args)
198 | </pre> |
199 | |||
200 | <pre> |
201 | samtools_inserttool_args = [] |
202 | </pre> |
203 | |||
204 | h2. Script Parameter Template |
205 | |||
206 | 10 | Bryan Cosca | h2. Random stuff |
207 | 1 | Bryan Cosca | |
208 | 3 | Bryan Cosca | h2. Latest arvados_sdk_version: |
209 | 2 | Bryan Cosca | |
210 | https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python |
211 | 10 | Bryan Cosca | |
212 | h2. Random tools I use |
213 | |||
214 | <pre>os.path.join(arvados.get_job_param_mount("param"),name)</pre>
215 | 14 | Bryan Cosca | |
216 | h2. Pipe through tools |
217 | |||
218 | <pre>bwa_pipe = subprocess.Popen(args,stdout=subprocess.PIPE)
219 | output_file = open(output_bam_path,'w')
220 | samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file)
221 | bwa_pipe.wait()
222 | samtools_pipe.wait()
223 | output_file.close()</pre>