Single_Sample_SNV_Pipeline.txt

Tom Clegg, 06/18/2014 01:04 AM

 
1
################################
2
#This pipeline is for single sample variants detection including
3
#Single Nucleotide Variants and Short Indels from 
4
#Whole Genome Sequencing (WGS) or Whole Exome Sequencing (WES)
5
#with a minimum 10x of average depth of coverage.
6
#
7
#Sequencing platform: illumina HiSeq/MiSeq
8
#Pair-end reads with length from 50bp to 150bp.
9
################################
10

    
11
#env settings, Java Runtime required. (openjdk-7 is used here) 
12
CUROVERSE_HOME=/mnt/curoverse
13
APP=$CUROVERSE_HOME/app
14
DATA=$CUROVERSE_HOME/data
15

    
16
################################
17
#prepare applications
18
################################
19

    
20
#1. BWA
21
wget http://downloads.sourceforge.net/project/bio-bwa/bwa-0.7.9a.tar.bz2
22
tar -jxvf bwa-0.7.9a.tar.bz2
23
cd bwa-0.7.9a && make && cd $CUROVERSE_HOME
24
mkdir -p $APP/bwa/0.7.9a/
25
find bwa-0.7.9a -executable -type f -print0 | xargs -0 -I {} cp {} $APP/bwa/0.7.9a/
26
rm -fr bwa* 
27

    
28
#2. samtools
29
wget http://downloads.sourceforge.net/project/samtools/samtools/0.1.19/samtools-0.1.19.tar.bz2
30
tar -jxvf samtools-0.1.19.tar.bz2
31
cd samtools-0.1.19 && make && cd $CUROVERSE_HOME
32
mkdir -p $APP/samtools/0.1.19/
33
find  samtools-0.1.19 -executable -type f -print0 | xargs -0 -I {} mv {} $APP/samtools/0.1.19/
34
rm -fr samtools*
35

    
36
#3. picard
37
wget http://downloads.sourceforge.net/project/picard/picard-tools/1.114/picard-tools-1.114.zip
38
unzip picard-tools-1.114.zip
39
mkdir -p $APP/picard/1.114/
40
mv picard-tools-1.114/* $APP/picard/1.114/
41
rm -fr picard*
42

    
43

    
44
#4. GATK
45
#no automatic way to install GATK.
46
Register and download the GATK package. unpack it to /mnt/curoverse/app/gatk/3.1-1/
47

    
48

    
49
################################
50
#prepare reference data
51
################################
52

    
53
mkdir -p $DATA/fasta/hg19/ && cd $DATA/fasta/hg19/
54

    
55
for i in $(seq 1 22) X Y M; do echo $i; wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/chr${i}.fa.gz; done
56
gunzip *.gz
57
for i in $(seq 1 22) X Y M; do cat chr${i}.fa >> hg19.fasta; done
58
rm -fr chr*.fasta
59

    
60
#create index
61
$APP/samtools/0.1.19/samtools faidx hg19.fasta
62
#create dictionary
63
java -Xmx8g -jar $APP/picard/0.1.19/CreateSequenceDictionary.jar R=hg19.fasta O=hg19.dict
64

    
65
java -Xmx8g -jar /home/hadoop/curoverse/app/picard/1.114/CreateSequenceDictionary.jar R=hg19.fasta O=hg19.dict
66

    
67
cd $CUROVERSE_HOME
68

    
69
################################
70
#prepare library files (dbSNP, 1000G, etc)
71
################################
72
mkdir -p $DATA/lib/hg19/ && cd $DATA/lib/hg19/
73

    
74
#dbSNP138
75
#wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp138Common.txt.gz
76

    
77
#GATK resource bundle
78
GATK_FTP=ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19
79

    
80
#download library file for GATK's VariantRecalibrator
81
for f in hapmap_3.3.hg19.vcf.gz 1000G_omni2.5.hg19.vcf.gz 1000G_phase1.indels.hg19.vcf.gz Mills_and_1000G_gold_standard.indels.hg19.vcf.gz
82
do
83
curl --remote-name ${GATK_FTP}/${f}
84
done
85

    
86
################################
87
#Demo sample data (NA12878 from 1000G)
88
#NA12878: pair-end, WGS, 50x depth, Illumina HiSeq 2000 system  
89
#URL: http://www.ebi.ac.uk/ena/data/view/ERX237515
90
#NA12878_R1: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz
91
#NA12878_R2: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz
92
################################
93

    
94
#~100GB disk space required to download the NA12878
95
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz
96
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz
97

    
98
################################
99
#Begin of pipeline
100

    
101
#For testing purpose, two PE fastq files R1 and R2 were generated by pIRS (http://code.google.com/p/pirs/)
102
#and only chr1 of hg19 was used as reference genome
103

    
104
cd /mnt/
105

    
106
#0.1 create simulated PE Reads on 50x coverage
107
pirs simulate -i $DATA/hg19.chr1/hg19.chr1.fa \
108
-s ~/app/pIRS_111/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz \
109
-d ~/app/pIRS_111/Profiles/GC-depth_Profiles/humNew.gcdep_200.dat \
110
-b ~/app/pIRS_111/Profiles/InDel_Profiles/phixv2.InDel.matrix \
111
-l 100 -x 50 -Q 33 -o A
112

    
113
R1=/mnt/A_100_500_1.fq.gz
114
R2=/mnt/A_100_500_2.fq.gz
115

    
116
#0.2 build genome index for BWA
117
$APP/app/bwa/0.7.9a/bwa index -p hg19.chr1 chr1.fa
118

    
119
#1. align with BWA, with 4 threads (-t 4). Reading Group is required (-R ...).
120
time $APP/bwa/0.7.9a/bwa mem -t 4 -R '@RG\tID:group_id\tPL:illumina\tSM:sample_id' $DATA/hg19.chr1/hg19.chr1 ${R1} ${R2} > A.chr1.sam
121

    
122
#2. Sort into BAM
123
java -Xmx4g -Djava.io.tmpdir=/tmp -jar $APP/picard/1.114/SortSam.jar \
124
CREATE_INDEX=True SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT \
125
INPUT=A.chr1.sam OUTPUT=A.chr1.sort.bam
126

    
127
#3. remove PCR duplicate
128
java -Xmx4g -Djava.io.tmpdir=/tmp -jar $APP/picard/1.114/MarkDuplicates.jar \
129
CREATE_INDEX=true REMOVE_DUPLICATES=True ASSUME_SORTED=True VALIDATION_STRINGENCY=LENIENT METRICS_FILE=/dev/null \
130
INPUT=A.chr1.sort.bam OUTPUT=A.chr1.sort.dedup.bam
131

    
132
#4. fix mate information
133
java -Djava.io.tmpdir=/tmp/ -jar $APP/picard/1.114/FixMateInformation.jar \
134
SO=coordinate VALIDATION_STRINGENCY=LENIENT CREATE_INDEX=true \
135
INPUT=A.chr1.sort.dedup.bam OUTPUT=A.chr1.sort.dedup.mate.bam
136

    
137
#5. local alignment around Indels. create de novo intervals 
138
java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
139
-T RealignerTargetCreator \
140
-I A.chr1.sort.dedup.mate.bam -o A.intervals
141

    
142
java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
143
-T IndelRealigner \
144
-targetIntervals A.intervals \
145
-I A.chr1.sort.dedup.mate.bam -o A.chr1.sort.dedup.mate.relaign.bam
146

    
147
#6. base recalibrator, use dbSNP139 as "truth". If we know the population of subject sample, we can 
148
#use specific SNP datasets extract from 1000G with same population.
149
java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
150
-T BaseRecalibrator \
151
-knownSites $DATA/dbsnp_138.hg19.vcf \
152
-o A.recal \
153
-I A.chr1.sort.dedup.mate.relaign.bam
154

    
155
java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
156
-T PrintReads \
157
-BQSR A.recal \
158
-o A.chr1.sort.dedup.mate.relaign.recal.bam \
159
-I A.chr1.sort.dedup.mate.relaign.bam
160

    
161
#7. call variants with GATK's UnifiedGenotyper (faster, lots of raw calls, must run filter after)
162
time java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
163
-T UnifiedGenotyper \
164
-glm BOTH \
165
-D $DATA/dbsnp_138.hg19.vcf \
166
-metrics A.snps.metrics \
167
-stand_call_conf 50.0 -stand_emit_conf 10.0 \
168
-o A.ug.vcf \
169
-I A.chr1.sort.dedup.mate.relaign.recal.bam
170

    
171
#8. call variants with GATK's HaplotypeCaller (2x slower than UG, much less accurate calls)
172
time java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
173
-T HaplotypeCaller \
174
--dbsnp $DATA/dbsnp_138.hg19.vcf \
175
-stand_call_conf 50.0 -stand_emit_conf 10.0 \
176
-o A.hc.vcf \
177
-I A.chr1.sort.dedup.mate.relaign.recal.bam
178

    
179
#End of pipeline
180
################################