Project

General

Profile

Pipeline template template » History » Revision 4

Revision 3 (Bryan Cosca, 05/21/2015 05:25 PM) → Revision 4/25 (Bryan Cosca, 05/21/2015 06:22 PM)

h1. Pipeline template template 

 h2. Run-Command Template 

 <pre> 
   "PrintReads":{ 
    "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2", 
    "repository":"arvados", 
    "script":"run-command", 
    "script_parameters":{ 
     "input":{ 
      "output_of":"Indel-Realignment" 
     }, 
     "reference":{ 
      "dataclass":"Collection", 
      "default":"3514b8e5da0e8d109946bc809b20a78a+5698", 
      "link_name":"human_g1k_v37 reference data", 
      "title":"Base-Recalibration Input Reference genome (FASTA)" 
     }, 
     "command":[ 
      "java", 
      "-Xmx60g", 
      "-jar", 
      "$(dir $(gatk3))/GenomeAnalysisTK.jar", 
      "-T", 
      "PrintReads", 
      "-R", 
      "$(glob $(dir $(reference))/*.fasta)", 
      { 
       "foreach":"iterator", 
       "command":[ 
        "-I", 
        "$(iterator)" 
       ] 
      }, 
      "-BQSR", 
      "$(bqsr_table)", 
      "-nct", 
      "16", 
      "-o", 
      "$(outputname)" 
     ], 
     "outputname":{ 
      "value":{ 
       "list":"iterator", 
       "index":"0", 
       "command":"$(basename $(iterator)).bqsrCal.bam" 
      } 
     }, 
     "bqsr_table":{ 
      "value":{ 
       "list":"iterator", 
       "index":"0", 
       "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table" 
      } 
     }, 
     "input_dir":"$(dir $(input))", 
     "task.foreach":[ 
      "iterator" 
     ], 
     "iterator":{
      "value":{
       "group":"input_dir",
       "regex":"(.*)\\.realigned\\.bam"
      }
     },
     "bqsr":{ 
      "output_of":"Base-Recalibration" 
     }, 
     "gatk3":{ 
      "dataclass":"Collection", 
      "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", 
      "link_name":"Genome Analysis Toolkit 3.2-2", 
      "title":"PrintReads Input Version of GATK3 jar" 
     }, 
     "dbsnp":{ 
      "dataclass":"Collection", 
      "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712", 
      "title":"Single Nucleotide Polymorphism database", 
      "description":"Base-Recalibration Input DBsnp" 
     } 
    }, 
    "runtime_constraints":{ 
     "max_tasks_per_node":1, 
     "min_nodes":1, 
     "docker_image":"bcosc/arv-base-java", 
     "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2" 
    }, 
    "output_name":false 
   } 
 </pre> 

 


 h2. Crunch Script Template 
 <pre> 
 #!/usr/bin/env python 

 import arvados 
 import subprocess 
 import os 
 import sys 
 import re 
 from arvados.collection import Collection as coll 
 import arvados_tools 
 import shutil 

 # Hand the 'input' job parameter and a filename pattern to the project-local
 # arvados_tools helper. NOTE(review): presumably this queues one task per
 # matching file and then ends the parent task (and_end_task=True) -- confirm
 # against arvados_tools, which is not shown here.
 # The dots are escaped (raw string) so that only a literal
 # "realigned.bqsrCal.bam" suffix matches, not any character in their place.
 arvados_tools.spawn_new_task_per_file('input', r'.*realigned\.bqsrCal\.bam$', if_sequence=0, and_end_task=True)

 # Handles for the running job and task, plus the task's tmpdir attribute.
 this_job = arvados.current_job()
 this_task = arvados.current_task()
 tmpdir = arvados.current_task().tmpdir

 # 'input_1' is read from this task's parameters and joined onto the mount
 # path of the 'input' job parameter to locate the file.
 # NOTE(review): presumably 'input_1' is set by spawn_new_task_per_file -- confirm.
 input_1 = this_task['parameters']['input_1']
 input_1_path = os.path.join(arvados.get_job_param_mount("input"), input_1)

 # Optional staging step: copy the input into the task's tmpdir.
 tmp_input_1_path = os.path.join(tmpdir, input_1) # If we need to copy to tmpdir
 shutil.copyfile(input_1_path, tmp_input_1_path) # If we need to copy to tmpdir
 </pre>  


 <pre> 
 # Look up the files this task needs through the project-local arvados_tools
 # helper. NOTE(review): presumably get_file_path takes a job parameter name
 # and a filename regex and returns the matching path -- confirm against
 # arvados_tools, which is not shown here.
 # Patterns are raw strings with literal dots escaped, so ".jar" and ".gz"
 # no longer match an arbitrary character in place of the dot.
 samtools_path = arvados_tools.get_file_path('samtools', r'^samtools$')
 gatk_path = arvados_tools.get_file_path('gatk_jar', r'^GenomeAnalysisTK\.jar$')
 reference_path = arvados_tools.get_file_path('reference', r'.*f(ast)?a(\.gz)?$')
 dbsnp_path = arvados_tools.get_file_path('dbsnp', r'^dbsnp.*vcf$')
 </pre> 

 <pre> 
 # Template stub for a GATK tool invocation.
 # NOTE(review): "inserttool" looks like a placeholder for the actual tool
 # name -- confirm and rename when filling the template in.
 gatk_inserttool_args = []  # starts empty; the template supplies no arguments
 # The next two lines are bare names with no value assigned yet.
 gatk_inserttool_output_name
 gatk_inserttool_output_path
 # check_output() is called here without a command; one must be supplied.
 gatk_inserttool_pipe = subprocess.check_output()
 </pre> 

 <pre> 
 # Template stub: empty argument list for a samtools invocation.
 # NOTE(review): "inserttool" looks like a placeholder for the samtools
 # subcommand name -- confirm when filling the template in.
 samtools_inserttool_args = []
 </pre> 

 h2. Script Parameter Template 


 h2. Latest arvados_sdk_version:  

 https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python