Project

General

Profile

Pipeline template template » History » Revision 24

Revision 23 (Bryan Cosca, 09/25/2015 07:35 PM) → Revision 24/25 (Bryan Cosca, 09/28/2015 03:29 PM)

h1. Pipeline template template 

 h2. Run-Command Template 

 <pre> 
   "NAME":{ 
    "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2", 
    "repository":"arvados", 
    "script":"run-command", 
    "script_parameters":{ 
     "input":{ 
      "output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME" 
     }, 
     "reference":{ 
      "dataclass":"Collection", 
      "default":"3514b8e5da0e8d109946bc809b20a78a+5698", 
      "link_name":"human_g1k_v37 reference data", 
      "title":"NAME Input Reference genome (FASTA)" 
     }, 
     "command":[ 
      "java", 
      "-Xmx60g", 
      "-jar", 
      "$(dir $(gatk3))/GenomeAnalysisTK.jar", 
      "-T", 
      "PrintReads", 
      "-R", 
      "$(glob $(dir $(reference))/*.fasta)", 
      { 
       "foreach":"iterator", 
       "command":[ 
        "-I", 
        "$(iterator)" 
       ] 
      }, 
      "-BQSR", 
      "$(bqsr_table)", 
      "-nct", 
      "16", 
      "-o", 
      "$(outputname)" 
     ], 
     "outputname":{ 
      "value":{ 
       "list":"iterator", 
       "index":"0", 
       "command":"$(basename $(iterator)).bqsrCal.bam" 
      } 
     }, 
     "bqsr_table":{ 
      "value":{ 
       "list":"iterator", 
       "index":"0", 
       "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table" 
      } 
     }, 
     "input_dir":"$(dir $(input))", 
     "task.foreach":[ 
      "iterator" 
     ], 
     "iterator":{ 
      "value":{ 
       "group":"input_dir", 
       "regex":"(.*)\\.realigned.bam" 
      } 
     }, 
     "gatk3":{ 
      "dataclass":"Collection", 
      "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", 
      "link_name":"Genome Analysis Toolkit 3.2-2", 
      "title":"NAME Input Version of GATK3 jar" 
     }, 
     "dbsnp":{ 
      "dataclass":"Collection", 
      "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712", 
      "title":"Single Nucleotide Polymorphism database", 
      "description":"NAME Input DBsnp" 
     } 
    }, 
    "runtime_constraints":{ 
     "max_tasks_per_node":1, 
     "min_nodes":1, 
     "docker_image":"bcosc/arv-base-java", 
     "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2"  
    }, 
    "output_name":false 
   }, 
 </pre> 

 h2. Crunch Script Template Template 

 <pre> 
  "NAME":{ 
    "script_version":"GIT_BRANCH_NAME", 
    "repository":"GIT_REPO_NAME", 
    "script":"GIT_SCRIPT_NAME", 
    "script_parameters":{ 
     "input":{ 
      "output_of":"PREVIOUS_JOB_NAME" 
     }, 
     "samtools":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147", 
      "title":"NAME Input Samtools Collection" 
     }, 
     "bcftools":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"6a0c51bea360b487aa5c9d130435cd00+14097", 
      "title":"NAME Input BCFtools Collection" 
     }, 
     "gatk_jar":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", 
      "title":"NAME Input GATK Collection" 
     }, 
     "reference":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"3514b8e5da0e8d109946bc809b20a78a+5698", 
      "title":"NAME Input Reference Collection" 
     }, 
     "picard":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"0eaa58017c3689414a9e644a2297df5c+165", 
      "title":"NAME Input Picard Collection" 
     }, 
     "bedtools_bin":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"b2f86c26e05e7a0686e7f39a86d406bf+34584", 
      "title":"NAME Input Bedtools Collection" 
     }, 
     "bed_path":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516", 
      "title":"NAME Input Bed Collection" 
     }, 
     "bwa_bin": { 
      "required": true, 
      "dataclass": "Collection", 
      "default": "39c6f22d40001074f4200a72559ae7eb+5745", 
      "title": "NAME Input BWA Binary" 
     }, 
     "gatk_ref":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"25b68283b442c1a921ac826296103426+9636", 
      "title":"NAME GATK Reference Collection" 
     }, 
     "tabix": { 
      "required": true, 
      "dataclass": "Collection", 
      "default": "180c32253e97ab7a117f8c9c15e95e8b+1131", 
      "title": "NAME Input Tabix/Bgzip" 
     } 
    }, 
    "runtime_constraints":{ 
     "max_tasks_per_node":1, 
     "min_nodes":1, 
     "docker_image":"bcosc/arv-base-java", 
     "arvados_sdk_version":"a4d63932d669acd5011a7fa5afcbeec513acfe2c" 
    } 
   }, 
 </pre> 

 h2. Crunch Script Template 
 <pre> 
 #!/usr/bin/env python 
 # Crunch script skeleton: replace every NAME placeholder with the tool name and 
 # fill in the empty strings before use. Written for Python 2 (print statements). 

 import arvados 
 import subprocess 
 import os 
 import sys 
 import re 
 from arvados.collection import Collection as coll 
 import arvados_tools 
 import shutil 

 # NOTE(review): presumably queues one task per file in the 'input' parameter whose 
 # name matches the regex, and ends the sequence-0 task -- confirm against arvados_tools. 
 arvados_tools.spawn_new_task_per_file('input','.*(bam|fastq)$',if_sequence=0, and_end_task=True) 

 this_job = arvados.current_job() 
 this_task = arvados.current_task() 
 tmpdir = arvados.current_task().tmpdir 

 # File name handed to this task, joined onto the mount point of the 'input' job parameter. 
 input_1 = this_task['parameters']['input_1'] 
 input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1) 
 #tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir 
 #shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir 
 #os.chdir(tmpdir) 

 # Placeholder command line: nine empty strings to be replaced with the real arguments. 
 NAME_args = ['','','','','','','','',''] 
 print NAME_args 
 # Placeholder: replace('','') returns the string unchanged, so until the old/new 
 # substrings are filled in, NAME_out_file is the same path as the input file. 
 NAME_out_file = input_1_path.replace('','') 

 # Form 1: run the command with its stdout redirected into NAME_out_file. 
 NAME_out_handle = open(NAME_out_file,'w') 
 NAME_pipe = subprocess.Popen(NAME_args,stdout=NAME_out_handle) 
 NAME_pipe.wait() 
 print NAME_pipe.returncode 
 NAME_out_handle.close() 

 # Form 2: runs NAME_args again and rebinds NAME_pipe to the captured stdout. 
 # NOTE(review): looks like an alternative to Form 1 rather than a second step -- 
 # keep only the form you need. 
 NAME_pipe = subprocess.check_output(NAME_args) 

 # NOTE(review): presumably saves the contents of tmpdir as the task output -- confirm. 
 arvados_tools.write_tmpdir(tmpdir) 

 </pre>  


 <pre> 
 # Look up the path of each tool/reference file. The first argument is the name of a 
 # script parameter and the second is a regex for the file name. 
 # NOTE(review): the exact semantics of arvados_tools.get_file_path are assumed from 
 # how it is called here -- confirm against arvados_tools. 
 samtools_path = arvados_tools.get_file_path('samtools','^samtools$') 
 gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$') 
 reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$') 
 # 'dbsnp' is not among the script_parameters of the Crunch Script Template Template 
 # on this page; add it to the pipeline template before using this line. 
 dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$') 
 bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$') 
 tabix_path = arvados_tools.get_file_path('tabix','^tabix$') 
 bwa_path = arvados_tools.get_file_path('bwa_bin','^bwa$') 

 tmp_picard_path = arvados_tools.get_file_path('picard','^picard.jar$') 
 # Copy picard over to tmpdir because java cannot handle "+" characters 
 # (tmpdir must already be defined, as in the crunch script template). 
 picard_path = os.path.join(tmpdir,"picard.jar") 
 shutil.copyfile(tmp_picard_path,picard_path) 

 # Others: 
 bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$') 


 </pre> 

 <pre> 
 # Skeleton for a GATK step; presumably "inserttool" is to be replaced with the tool name. 
 gatk_inserttool_args = [] 
 # Placeholders only: the two names below are not assigned here and must be given 
 # values before this snippet can run. 
 gatk_inserttool_output_name 
 gatk_inserttool_output_path 
 # check_output() is called with no arguments here; it needs the command list 
 # (presumably gatk_inserttool_args) before it will run. 
 gatk_inserttool_pipe = subprocess.check_output() 
 </pre> 

 <pre> 
 # Skeleton for a samtools step; presumably "inserttool" is to be replaced with the 
 # tool name. The argument list starts empty and must be filled in. 
 samtools_inserttool_args = [] 
 </pre> 

 h2. Script Parameter Template 

 Use this to grab an arbitrary script parameter from the current job: 

 num_files = this_job['script_parameters']['param'] 


 h2. Random stuff 

 h2. Latest arvados_sdk_version:  

 https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python 

 h2. Random tools I use 

 os.path.join(arvados.get_job_param_mount("param"),name) 

 h2. Pipe through tools 

 bwa_pipe = subprocess.Popen(args,stdout=subprocess.PIPE)  # first stage writes to a pipe 
 output_file = open(output_bam_path,'w') 
 samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file)  # second stage reads the pipe, writes the file 
 bwa_pipe.stdout.close()  # drop the parent's copy of the pipe so bwa gets SIGPIPE if samtools exits early 
 bwa_pipe.wait() 
 samtools_pipe.wait() 
 output_file.close() 

 h2. Get the name of a file without its last n extensions 

 base_name = input_1.rsplit('.', n)[0]  # drop the last n dot-separated extensions (n >= 0); if there are fewer than n, keep the part before the first dot 
 print(base_name) 