Project

General

Profile

Pipeline template template » History » Version 24

Bryan Cosca, 09/28/2015 03:29 PM

1 1 Bryan Cosca
h1. Pipeline template template
2
3
h2. Run-Command Template
4
5 4 Bryan Cosca
<pre>
6 7 Bryan Cosca
  "NAME":{
7 6 Bryan Cosca
   "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2",
8 4 Bryan Cosca
   "repository":"arvados",
9
   "script":"run-command",
10
   "script_parameters":{
11
    "input":{
12 7 Bryan Cosca
     "output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME"
13 4 Bryan Cosca
    },
14
    "reference":{
15
     "dataclass":"Collection",
16
     "default":"3514b8e5da0e8d109946bc809b20a78a+5698",
17
     "link_name":"human_g1k_v37 reference data",
18 7 Bryan Cosca
     "title":"NAME Input Reference genome (FASTA)"
19 4 Bryan Cosca
    },
20
    "command":[
21
     "java",
22
     "-Xmx60g",
23
     "-jar",
24
     "$(dir $(gatk3))/GenomeAnalysisTK.jar",
25
     "-T",
26
     "PrintReads",
27
     "-R",
28
     "$(glob $(dir $(reference))/*.fasta)",
29
     {
30
      "foreach":"iterator",
31
      "command":[
32
       "-I",
33
       "$(iterator)"
34
      ]
35
     },
36
     "-BQSR",
37
     "$(bqsr_table)",
38
     "-nct",
39
     "16",
40
     "-o",
41
     "$(outputname)"
42
    ],
43
    "outputname":{
44
     "value":{
45
      "list":"iterator",
46
      "index":"0",
47
      "command":"$(basename $(iterator)).bqsrCal.bam"
48
     }
49
    },
50
    "bqsr_table":{
51
     "value":{
52
      "list":"iterator",
53
      "index":"0",
54
      "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table"
55
     }
56
    },
57
    "input_dir":"$(dir $(input))",
58
    "task.foreach":[
59
     "iterator"
60
    ],
61
    "iterator":{
62
     "value":{
63
      "group":"input_dir",
64
      "regex":"(.*)\\.realigned.bam"
65 1 Bryan Cosca
     }
66 4 Bryan Cosca
    },
67
    "gatk3":{
68
     "dataclass":"Collection",
69
     "default":"2e98fdc8e90f4c48a0714b711767c9ce+76",
70 1 Bryan Cosca
     "link_name":"Genome Analysis Toolkit 3.2-2",
71 7 Bryan Cosca
     "title":"NAME Input Version of GATK3 jar"
72 4 Bryan Cosca
    },
73
    "dbsnp":{
74
     "dataclass":"Collection",
75
     "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712",
76
     "title":"Single Nucleotide Polymorphism database",
77 7 Bryan Cosca
     "description":"NAME Input DBsnp"
78 4 Bryan Cosca
    }
79
   },
80
   "runtime_constraints":{
81
    "max_tasks_per_node":1,
82
    "min_nodes":1,
83
    "docker_image":"bcosc/arv-base-java",
84 6 Bryan Cosca
    "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2" 
85 4 Bryan Cosca
   },
86
   "output_name":false
87 6 Bryan Cosca
  
88 4 Bryan Cosca
</pre>
89 1 Bryan Cosca
90 8 Bryan Cosca
h2. Crunch Script Template Template
91
92
<pre>
93 9 Bryan Cosca
 "NAME":{
94
   "script_version":"GIT_BRANCH_NAME",
95
   "repository":"GIT_REPO_NAME",
96
   "script":"GIT_SCRIPT_NAME",
97 8 Bryan Cosca
   "script_parameters":{
98
    "input":{
99 9 Bryan Cosca
     "output_of":"PREVIOUS_JOB_NAME"
100 8 Bryan Cosca
    },
101
    "samtools":{
102
     "required":true,
103
     "dataclass":"Collection",
104
     "default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147",
105 9 Bryan Cosca
     "title":"NAME Input Samtools Collection"
106 8 Bryan Cosca
    },
107 13 Bryan Cosca
    "bcftools":{
108
     "required":true,
109
     "dataclass":"Collection",
110
     "default":"6a0c51bea360b487aa5c9d130435cd00+14097",
111
     "title":"NAME Input BCFtools Collection"
112
    },
113 8 Bryan Cosca
    "gatk_jar":{
114
     "required":true,
115
     "dataclass":"Collection",
116 1 Bryan Cosca
     "default":"2e98fdc8e90f4c48a0714b711767c9ce+76",
117 9 Bryan Cosca
     "title":"NAME Input GATK Collection"
118 1 Bryan Cosca
    },
119
    "reference":{
120
     "required":true,
121
     "dataclass":"Collection",
122
     "default":"3514b8e5da0e8d109946bc809b20a78a+5698",
123 9 Bryan Cosca
     "title":"NAME Input Reference Collection"
124
    },
125
    "picard":{
126
     "required":true,
127
     "dataclass":"Collection",
128
     "default":"0eaa58017c3689414a9e644a2297df5c+165",
129
     "title":"NAME Input Picard Collection"
130
    },
131
    "bedtools_bin":{
132
     "required":true,
133
     "dataclass":"Collection",
134
     "default":"b2f86c26e05e7a0686e7f39a86d406bf+34584",
135
     "title":"NAME Input Bedtools Collection"
136
    },
137
    "bed_path":{
138
     "required":true,
139
     "dataclass":"Collection",
140
     "default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516",
141
     "title":"NAME Input Bed Collection"
142 18 Bryan Cosca
    },
143
    "bwa_bin": {
144
     "required": true,
145
     "dataclass": "Collection",
146
     "default": "39c6f22d40001074f4200a72559ae7eb+5745",
147
     "title": "NAME Input BWA Binary"
148
    },
149 19 Bryan Cosca
    "gatk_ref":{
150
     "required":true,
151
     "dataclass":"Collection",
152
     "default":"25b68283b442c1a921ac826296103426+9636",
153
     "title":"NAME GATK Reference Collection"
154 20 Bryan Cosca
    },
155
    "tabix": {
156
     "required": true,
157
     "dataclass": "Collection",
158
     "default": "180c32253e97ab7a117f8c9c15e95e8b+1131",
159
     "title": "NAME Input Tabix/Bgzip"
160 19 Bryan Cosca
    }
161 8 Bryan Cosca
   },
162
   "runtime_constraints":{
163
    "max_tasks_per_node":1,
164
    "min_nodes":1,
165
    "docker_image":"bcosc/arv-base-java",
166 22 Bryan Cosca
    "arvados_sdk_version":"a4d63932d669acd5011a7fa5afcbeec513acfe2c"
167
   }
168 8 Bryan Cosca
  },
169
</pre>
170
171 1 Bryan Cosca
h2. Crunch Script Template
172
<pre>
173
#!/usr/bin/env python
174
175
import arvados
176
import subprocess
177
import os
178
import sys
179
import re
180
from arvados.collection import Collection as coll
181
import arvados_tools
182
import shutil
183
184 23 Bryan Cosca
arvados_tools.spawn_new_task_per_file('input','.*(bam|fastq)$',if_sequence=0, and_end_task=True)
185 1 Bryan Cosca
186
this_job = arvados.current_job()
187
this_task = arvados.current_task()
188
tmpdir = arvados.current_task().tmpdir
189
190
input_1 = this_task['parameters']['input_1']
191
input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1)
192 23 Bryan Cosca
#tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir
193
#shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir
194
#os.chdir(tmpdir)
195
196
NAME_args = ['','','','','','','','','']
197
print NAME_args
198
NAME_out_file = input_1_path.replace('','')
199
200
NAME_out_handle = open(NAME_out_file,'w')
201
NAME_pipe = subprocess.Popen(NAME_args,stdout=NAME_out_handle)
202
NAME_pipe.wait()
203
print NAME_pipe.returncode
204
NAME_out_handle.close()
205
206
NAME_pipe = subprocess.check_output(NAME_args)
207
208
arvados_tools.write_tmpdir(tmpdir)
209
210 1 Bryan Cosca
</pre> 
211
212
213
<pre>
214
samtools_path = arvados_tools.get_file_path('samtools','^samtools$')
215
gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$')
216
reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$')
217
dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$')
218 11 Bryan Cosca
bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$')
219
tabix_path = arvados_tools.get_file_path('tabix','^tabix$')
220 24 Bryan Cosca
bwa_path = arvados_tools.get_file_path('bwa_bin','^bwa$')
221 12 Bryan Cosca
222 17 Bryan Cosca
tmp_picard_path = arvados_tools.get_file_path('picard','^picard.jar$')
223
# Copy picard over to tmpdir because java cannot handle "+" characters in the path
224
picard_path = os.path.join(tmpdir,"picard.jar")
225
shutil.copyfile(tmp_picard_path,picard_path)
226
227 12 Bryan Cosca
others:
228
bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$')
229 19 Bryan Cosca
230 12 Bryan Cosca
231 1 Bryan Cosca
</pre>
232
233
<pre>
234
gatk_inserttool_args = []
235
gatk_inserttool_output_name
236
gatk_inserttool_output_path
237
gatk_inserttool_pipe = subprocess.check_output()
238
</pre>
239
240
<pre>
241
samtools_inserttool_args = []
242
</pre>
243
244
h2. Script Parameter Template
245
246 21 Bryan Cosca
To read an arbitrary script parameter from the current job:
247
248
num_files = this_job['script_parameters']['param']
249
250
251 10 Bryan Cosca
h2. Random stuff
252 1 Bryan Cosca
253 3 Bryan Cosca
h2. Latest arvados_sdk_version: 
254 2 Bryan Cosca
255
https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python
256 10 Bryan Cosca
257
h2. Random tools I use
258
259
os.path.join(arvados.get_job_param_mount("param"),name)
260 14 Bryan Cosca
261
h2. Pipe through tools
262
263
bwa_pipe = subprocess.Popen(args,stdout=subprocess.PIPE)
264
output_file = open(output_bam_path,'w')
265
samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file)
266
bwa_pipe.wait()
267
samtools_pipe.wait()
268
output_file.close()
269 15 Bryan Cosca
270
h2. Get the file name without its last n extensions
271 16 Bryan Cosca
272 15 Bryan Cosca
base_input_split = re.split('(\.)',input_1)
273
base_input_list = base_input_split[0:len(base_input_split)-n*2]
274
base_name = ''.join(base_input_list)
275
print base_name