################################ #This pipeline is for single sample variants detection including #Single Nucleotide Variants and Short Indels from #Whole Genome Sequencing (WGS) or Whole Exome Sequencing (WES) #with a minimum 10x of average depth of coverage. # #Sequencing platform: illumina HiSeq/MiSeq #Pair-end reads with length from 50bp to 150bp. ################################ #env settings, Java Runtime required. (openjdk-7 is used here) CUROVERSE_HOME=/mnt/curoverse APP=$CUROVERSE_HOME/app DATA=$CUROVERSE_HOME/data ################################ #prepare applications ################################ #1. BWA wget http://downloads.sourceforge.net/project/bio-bwa/bwa-0.7.9a.tar.bz2 tar -jxvf bwa-0.7.9a.tar.bz2 cd bwa-0.7.9a && make && cd $CUROVERSE_HOME mkdir -p $APP/bwa/0.7.9a/ find bwa-0.7.9a -executable -type f -print0 | xargs -0 -I {} cp {} $APP/bwa/0.7.9a/ rm -fr bwa* #2. samtools wget http://downloads.sourceforge.net/project/samtools/samtools/0.1.19/samtools-0.1.19.tar.bz2 tar -jxvf samtools-0.1.19.tar.bz2 cd samtools-0.1.19 && make && cd $CUROVERSE_HOME mkdir -p $APP/samtools/0.1.19/ find samtools-0.1.19 -executable -type f -print0 | xargs -0 -I {} mv {} $APP/samtools/0.1.19/ rm -fr samtools* #3. picard wget http://downloads.sourceforge.net/project/picard/picard-tools/1.114/picard-tools-1.114.zip unzip picard-tools-1.114.zip mkdir -p $APP/picard/1.114/ mv picard-tools-1.114/* $APP/picard/1.114/ rm -fr picard* #4. GATK #no automatic way to install GATK. Register and download the GATK package. unpack it to /mnt/curoverse/app/gatk/3.1-1/ ################################ #prepare reference data ################################ mkdir -p $DATA/fasta/hg19/ && cd $DATA/fasta/hg19/ for i in $(seq 1 22) X Y M; do echo $i; wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/chr${i}.fa.gz; done gunzip *.gz for i in $(seq 1 22) X Y M; do cat chr${i}.fa >> hg19.fasta; done rm -fr chr*.fasta #create index $APP/samtools/0.1.19/samtools faidx hg19.fasta #create dictionary java -Xmx8g -jar $APP/picard/0.1.19/CreateSequenceDictionary.jar R=hg19.fasta O=hg19.dict java -Xmx8g -jar /home/hadoop/curoverse/app/picard/1.114/CreateSequenceDictionary.jar R=hg19.fasta O=hg19.dict cd $CUROVERSE_HOME ################################ #prepare library files (dbSNP, 1000G, etc) ################################ mkdir -p $DATA/lib/hg19/ && cd $DATA/lib/hg19/ #dbSNP138 #wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp138Common.txt.gz #GATK resource bundle GATK_FTP=ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19 #download library file for GATK's VariantRecalibrator for f in hapmap_3.3.hg19.vcf.gz 1000G_omni2.5.hg19.vcf.gz 1000G_phase1.indels.hg19.vcf.gz Mills_and_1000G_gold_standard.indels.hg19.vcf.gz do curl --remote-name ${GATK_FTP}/${f} done ################################ #Demo sample data (NA12878 from 1000G) #NA12878: pair-end, WGS, 50x depth, Illumina HiSeq 2000 system #URL: http://www.ebi.ac.uk/ena/data/view/ERX237515 #NA12878_R1: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz #NA12878_R2: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz ################################ #~100GB disk space required to download the NA12878 wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz ################################ #Begin of pipeline #For testing purpose, two PE fastq files R1 and R2 were generated by pIRS (http://code.google.com/p/pirs/) #and only chr1 of hg19 was used as reference genome cd /mnt/ #0.1 create simulated PE Reads on 50x coverage pirs simulate -i $DATA/hg19.chr1/hg19.chr1.fa \ -s ~/app/pIRS_111/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz \ -d ~/app/pIRS_111/Profiles/GC-depth_Profiles/humNew.gcdep_200.dat \ -b ~/app/pIRS_111/Profiles/InDel_Profiles/phixv2.InDel.matrix \ -l 100 -x 50 -Q 33 -o A R1=/mnt/A_100_500_1.fq.gz R2=/mnt/A_100_500_2.fq.gz #0.2 build genome index for BWA $APP/app/bwa/0.7.9a/bwa index -p hg19.chr1 chr1.fa #1. align with BWA, with 4 threads (-t 4). Reading Group is required (-R ...). time $APP/bwa/0.7.9a/bwa mem -t 4 -R '@RG\tID:group_id\tPL:illumina\tSM:sample_id' $DATA/hg19.chr1/hg19.chr1 ${R1} ${R2} > A.chr1.sam #2. Sort into BAM java -Xmx4g -Djava.io.tmpdir=/tmp -jar $APP/picard/1.114/SortSam.jar \ CREATE_INDEX=True SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT \ INPUT=A.chr1.sam OUTPUT=A.chr1.sort.bam #3. remove PCR duplicate java -Xmx4g -Djava.io.tmpdir=/tmp -jar $APP/picard/1.114/MarkDuplicates.jar \ CREATE_INDEX=true REMOVE_DUPLICATES=True ASSUME_SORTED=True VALIDATION_STRINGENCY=LENIENT METRICS_FILE=/dev/null \ INPUT=A.chr1.sort.bam OUTPUT=A.chr1.sort.dedup.bam #4. fix mate information java -Djava.io.tmpdir=/tmp/ -jar $APP/picard/1.114/FixMateInformation.jar \ SO=coordinate VALIDATION_STRINGENCY=LENIENT CREATE_INDEX=true \ INPUT=A.chr1.sort.dedup.bam OUTPUT=A.chr1.sort.dedup.mate.bam #5. local alignment around Indels. create de novo intervals java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \ -T RealignerTargetCreator \ -I A.chr1.sort.dedup.mate.bam -o A.intervals java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \ -T IndelRealigner \ -targetIntervals A.intervals \ -I A.chr1.sort.dedup.mate.bam -o A.chr1.sort.dedup.mate.relaign.bam #6. base recalibrator, use dbSNP139 as "truth". If we know the population of subject sample, we can #use specific SNP datasets extract from 1000G with same population. java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \ -T BaseRecalibrator \ -knownSites $DATA/dbsnp_138.hg19.vcf \ -o A.recal \ -I A.chr1.sort.dedup.mate.relaign.bam java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \ -T PrintReads \ -BQSR A.recal \ -o A.chr1.sort.dedup.mate.relaign.recal.bam \ -I A.chr1.sort.dedup.mate.relaign.bam #7. call variants with GATK's UnifiedGenotyper (faster, lots of raw calls, must run filter after) time java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \ -T UnifiedGenotyper \ -glm BOTH \ -D $DATA/dbsnp_138.hg19.vcf \ -metrics A.snps.metrics \ -stand_call_conf 50.0 -stand_emit_conf 10.0 \ -o A.ug.vcf \ -I A.chr1.sort.dedup.mate.relaign.recal.bam #8. call variants with GATK's HaplotypeCaller (2x slower than UG, much less accurate calls) time java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \ -T HaplotypeCaller \ --dbsnp $DATA/dbsnp_138.hg19.vcf \ -stand_call_conf 50.0 -stand_emit_conf 10.0 \ -o A.hc.vcf \ -I A.chr1.sort.dedup.mate.relaign.recal.bam #End of pipeline ################################