Project

General

Profile

Pipeline template template » History » Revision 15

Revision 14 (Bryan Cosca, 05/22/2015 03:59 PM) → Revision 15/25 (Bryan Cosca, 05/22/2015 06:18 PM)

h1. Pipeline template template 

 h2. Run-Command Template 

 <pre> 
   "NAME":{ 
    "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2", 
    "repository":"arvados", 
    "script":"run-command", 
    "script_parameters":{ 
     "input":{ 
      "output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME" 
     }, 
     "reference":{ 
      "dataclass":"Collection", 
      "default":"3514b8e5da0e8d109946bc809b20a78a+5698", 
      "link_name":"human_g1k_v37 reference data", 
      "title":"NAME Input Reference genome (FASTA)" 
     }, 
     "command":[ 
      "java", 
      "-Xmx60g", 
      "-jar", 
      "$(dir $(gatk3))/GenomeAnalysisTK.jar", 
      "-T", 
      "PrintReads", 
      "-R", 
      "$(glob $(dir $(reference))/*.fasta)", 
      { 
       "foreach":"iterator", 
       "command":[ 
        "-I", 
        "$(iterator)" 
       ] 
      }, 
      "-BQSR", 
      "$(bqsr_table)", 
      "-nct", 
      "16", 
      "-o", 
      "$(outputname)" 
     ], 
     "outputname":{ 
      "value":{ 
       "list":"iterator", 
       "index":"0", 
       "command":"$(basename $(iterator)).bqsrCal.bam" 
      } 
     }, 
     "bqsr_table":{ 
      "value":{ 
       "list":"iterator", 
       "index":"0", 
       "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table" 
      } 
     }, 
     "input_dir":"$(dir $(input))", 
     "task.foreach":[ 
      "iterator" 
     ], 
     "iterator":{ 
      "value":{ 
       "group":"input_dir", 
       "regex":"(.*)\\.realigned.bam" 
      } 
     }, 
     "gatk3":{ 
      "dataclass":"Collection", 
      "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", 
      "link_name":"Genome Analysis Toolkit 3.2-2", 
      "title":"NAME Input Version of GATK3 jar" 
     }, 
     "dbsnp":{ 
      "dataclass":"Collection", 
      "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712", 
      "title":"Single Nucleotide Polymorphism database", 
      "description":"NAME Input DBsnp" 
     } 
    }, 
    "runtime_constraints":{ 
     "max_tasks_per_node":1, 
     "min_nodes":1, 
     "docker_image":"bcosc/arv-base-java", 
     "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2"  
    }, 
    "output_name":false 
   }, 
 </pre> 

 h2. Crunch Script Template Template 

 <pre> 
  "NAME":{ 
    "script_version":"GIT_BRANCH_NAME", 
    "repository":"GIT_REPO_NAME", 
    "script":"GIT_SCRIPT_NAME", 
    "script_parameters":{ 
     "input":{ 
      "output_of":"PREVIOUS_JOB_NAME" 
     }, 
     "samtools":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147", 
      "title":"NAME Input Samtools Collection" 
     }, 
     "bcftools":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"6a0c51bea360b487aa5c9d130435cd00+14097", 
      "title":"NAME Input BCFtools Collection" 
     }, 
     "gatk_jar":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", 
      "title":"NAME Input GATK Collection" 
     }, 
     "reference":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"3514b8e5da0e8d109946bc809b20a78a+5698", 
      "title":"NAME Input Reference Collection" 
     }, 
     "picard":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"0eaa58017c3689414a9e644a2297df5c+165", 
      "title":"NAME Input Picard Collection" 
     }, 
     "bedtools_bin":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"b2f86c26e05e7a0686e7f39a86d406bf+34584", 
      "title":"NAME Input Bedtools Collection" 
     }, 
     "bed_path":{ 
      "required":true, 
      "dataclass":"Collection", 
      "default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516", 
      "title":"NAME Input Bed Collection" 
     } 
    }, 
    "runtime_constraints":{ 
     "max_tasks_per_node":1, 
     "min_nodes":1, 
     "docker_image":"bcosc/arv-base-java", 
     "arvados_sdk_version":"749b87143ebb0bdcbe2d49deee9c66f6de9f86dd" 
    }, 
    "output_name":false 
   }, 
 </pre> 

 h2. Crunch Script Template 

 <pre> 
 #!/usr/bin/env python 

 import arvados 
 import subprocess 
 import os 
 import sys 
 import re 
 from arvados.collection import Collection as coll 
 import arvados_tools 
 import shutil 

 arvados_tools.spawn_new_task_per_file('input','.*realigned.bqsrCal.bam$',if_sequence=0, and_end_task=True) 

 this_job = arvados.current_job() 
 this_task = arvados.current_task() 
 tmpdir = arvados.current_task().tmpdir 

 input_1 = this_task['parameters']['input_1'] 
 input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1) 
 tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir 
 shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir 
 </pre>  


 <pre> 
 samtools_path = arvados_tools.get_file_path('samtools','^samtools$') 
 gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$') 
 reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$') 
 dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$') 
 bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$') 
 tabix_path = arvados_tools.get_file_path('tabix','^tabix$') 

 # Others: 
 bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$') 

 </pre> 

 <pre> 
 gatk_inserttool_args = [] 
 gatk_inserttool_output_name 
 gatk_inserttool_output_path 
 gatk_inserttool_pipe = subprocess.check_output() 
 </pre> 

 <pre> 
 samtools_inserttool_args = [] 
 </pre> 

 h2. Script Parameter Template 

 h2. Random stuff 

 h2. Latest arvados_sdk_version:  

 https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python 

 h2. Random tools I use 

 <pre> 
 os.path.join(arvados.get_job_param_mount("param"),name) 
 </pre> 

 h2. Pipe through tools 

 <pre> 
 bwa_pipe = subprocess.Popen(args,stdout=subprocess.PIPE) 
 output_file = open(output_bam_path,'w') 
 samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file) 
 bwa_pipe.wait() 
 samtools_pipe.wait() 
 output_file.close() 
 </pre> 

 h2. Get the name of a file without its last n extensions 

 <pre> 
 base_input_split = re.split('(\.)',input_1) 
 base_input_list = base_input_split[0:len(base_input_split)-n*2] 
 base_name = ''.join(base_input_list) 
 print base_name
 </pre> 