1
|
################################
|
2
|
#This pipeline is for single sample variants detection including
|
3
|
#Single Nucleotide Variants and Short Indels from
|
4
|
#Whole Genome Sequencing (WGS) or Whole Exome Sequencing (WES)
|
5
|
#with a minimum 10x of average depth of coverage.
|
6
|
#
|
7
|
#Sequencing platform: illumina HiSeq/MiSeq
|
8
|
#Pair-end reads with length from 50bp to 150bp.
|
9
|
################################
|
10
|
|
11
|
#env settings, Java Runtime required. (openjdk-7 is used here)
|
12
|
CUROVERSE_HOME=/mnt/curoverse
|
13
|
APP=$CUROVERSE_HOME/app
|
14
|
DATA=$CUROVERSE_HOME/data
|
15
|
|
16
|
################################
|
17
|
#prepare applications
|
18
|
################################
|
19
|
|
20
|
#1. BWA
|
21
|
wget http://downloads.sourceforge.net/project/bio-bwa/bwa-0.7.9a.tar.bz2
|
22
|
tar -jxvf bwa-0.7.9a.tar.bz2
|
23
|
cd bwa-0.7.9a && make && cd $CUROVERSE_HOME
|
24
|
mkdir -p $APP/bwa/0.7.9a/
|
25
|
find bwa-0.7.9a -executable -type f -print0 | xargs -0 -I {} cp {} $APP/bwa/0.7.9a/
|
26
|
rm -fr bwa*
|
27
|
|
28
|
#2. samtools
|
29
|
wget http://downloads.sourceforge.net/project/samtools/samtools/0.1.19/samtools-0.1.19.tar.bz2
|
30
|
tar -jxvf samtools-0.1.19.tar.bz2
|
31
|
cd samtools-0.1.19 && make && cd $CUROVERSE_HOME
|
32
|
mkdir -p $APP/samtools/0.1.19/
|
33
|
find samtools-0.1.19 -executable -type f -print0 | xargs -0 -I {} mv {} $APP/samtools/0.1.19/
|
34
|
rm -fr samtools*
|
35
|
|
36
|
#3. picard
|
37
|
wget http://downloads.sourceforge.net/project/picard/picard-tools/1.114/picard-tools-1.114.zip
|
38
|
unzip picard-tools-1.114.zip
|
39
|
mkdir -p $APP/picard/1.114/
|
40
|
mv picard-tools-1.114/* $APP/picard/1.114/
|
41
|
rm -fr picard*
|
42
|
|
43
|
|
44
|
#4. GATK
|
45
|
#no automatic way to install GATK.
|
46
|
Register and download the GATK package. unpack it to /mnt/curoverse/app/gatk/3.1-1/
|
47
|
|
48
|
|
49
|
################################
|
50
|
#prepare reference data
|
51
|
################################
|
52
|
|
53
|
mkdir -p $DATA/fasta/hg19/ && cd $DATA/fasta/hg19/
|
54
|
|
55
|
for i in $(seq 1 22) X Y M; do echo $i; wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/chromosomes/chr${i}.fa.gz; done
|
56
|
gunzip *.gz
|
57
|
for i in $(seq 1 22) X Y M; do cat chr${i}.fa >> hg19.fasta; done
|
58
|
rm -fr chr*.fasta
|
59
|
|
60
|
#create index
|
61
|
$APP/samtools/0.1.19/samtools faidx hg19.fasta
|
62
|
#create dictionary
|
63
|
java -Xmx8g -jar $APP/picard/0.1.19/CreateSequenceDictionary.jar R=hg19.fasta O=hg19.dict
|
64
|
|
65
|
java -Xmx8g -jar /home/hadoop/curoverse/app/picard/1.114/CreateSequenceDictionary.jar R=hg19.fasta O=hg19.dict
|
66
|
|
67
|
cd $CUROVERSE_HOME
|
68
|
|
69
|
################################
|
70
|
#prepare library files (dbSNP, 1000G, etc)
|
71
|
################################
|
72
|
mkdir -p $DATA/lib/hg19/ && cd $DATA/lib/hg19/
|
73
|
|
74
|
#dbSNP138
|
75
|
#wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/snp138Common.txt.gz
|
76
|
|
77
|
#GATK resource bundle
|
78
|
GATK_FTP=ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/2.8/hg19
|
79
|
|
80
|
#download library file for GATK's VariantRecalibrator
|
81
|
for f in hapmap_3.3.hg19.vcf.gz 1000G_omni2.5.hg19.vcf.gz 1000G_phase1.indels.hg19.vcf.gz Mills_and_1000G_gold_standard.indels.hg19.vcf.gz
|
82
|
do
|
83
|
curl --remote-name ${GATK_FTP}/${f}
|
84
|
done
|
85
|
|
86
|
################################
|
87
|
#Demo sample data (NA12878 from 1000G)
|
88
|
#NA12878: pair-end, WGS, 50x depth, Illumina HiSeq 2000 system
|
89
|
#URL: http://www.ebi.ac.uk/ena/data/view/ERX237515
|
90
|
#NA12878_R1: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz
|
91
|
#NA12878_R2: ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz
|
92
|
################################
|
93
|
|
94
|
#~100GB disk space required to download the NA12878
|
95
|
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_1.fastq.gz
|
96
|
wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR262/ERR262997/ERR262997_2.fastq.gz
|
97
|
|
98
|
################################
|
99
|
#Begin of pipeline
|
100
|
|
101
|
#For testing purpose, two PE fastq files R1 and R2 were generated by pIRS (http://code.google.com/p/pirs/)
|
102
|
#and only chr1 of hg19 was used as reference genome
|
103
|
|
104
|
cd /mnt/
|
105
|
|
106
|
#0.1 create simulated PE Reads on 50x coverage
|
107
|
pirs simulate -i $DATA/hg19.chr1/hg19.chr1.fa \
|
108
|
-s ~/app/pIRS_111/Profiles/Base-Calling_Profiles/humNew.PE100.matrix.gz \
|
109
|
-d ~/app/pIRS_111/Profiles/GC-depth_Profiles/humNew.gcdep_200.dat \
|
110
|
-b ~/app/pIRS_111/Profiles/InDel_Profiles/phixv2.InDel.matrix \
|
111
|
-l 100 -x 50 -Q 33 -o A
|
112
|
|
113
|
R1=/mnt/A_100_500_1.fq.gz
|
114
|
R2=/mnt/A_100_500_2.fq.gz
|
115
|
|
116
|
#0.2 build genome index for BWA
|
117
|
$APP/app/bwa/0.7.9a/bwa index -p hg19.chr1 chr1.fa
|
118
|
|
119
|
#1. align with BWA, with 4 threads (-t 4). Reading Group is required (-R ...).
|
120
|
time $APP/bwa/0.7.9a/bwa mem -t 4 -R '@RG\tID:group_id\tPL:illumina\tSM:sample_id' $DATA/hg19.chr1/hg19.chr1 ${R1} ${R2} > A.chr1.sam
|
121
|
|
122
|
#2. Sort into BAM
|
123
|
java -Xmx4g -Djava.io.tmpdir=/tmp -jar $APP/picard/1.114/SortSam.jar \
|
124
|
CREATE_INDEX=True SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT \
|
125
|
INPUT=A.chr1.sam OUTPUT=A.chr1.sort.bam
|
126
|
|
127
|
#3. remove PCR duplicate
|
128
|
java -Xmx4g -Djava.io.tmpdir=/tmp -jar $APP/picard/1.114/MarkDuplicates.jar \
|
129
|
CREATE_INDEX=true REMOVE_DUPLICATES=True ASSUME_SORTED=True VALIDATION_STRINGENCY=LENIENT METRICS_FILE=/dev/null \
|
130
|
INPUT=A.chr1.sort.bam OUTPUT=A.chr1.sort.dedup.bam
|
131
|
|
132
|
#4. fix mate information
|
133
|
java -Djava.io.tmpdir=/tmp/ -jar $APP/picard/1.114/FixMateInformation.jar \
|
134
|
SO=coordinate VALIDATION_STRINGENCY=LENIENT CREATE_INDEX=true \
|
135
|
INPUT=A.chr1.sort.dedup.bam OUTPUT=A.chr1.sort.dedup.mate.bam
|
136
|
|
137
|
#5. local alignment around Indels. create de novo intervals
|
138
|
java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
|
139
|
-T RealignerTargetCreator \
|
140
|
-I A.chr1.sort.dedup.mate.bam -o A.intervals
|
141
|
|
142
|
java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
|
143
|
-T IndelRealigner \
|
144
|
-targetIntervals A.intervals \
|
145
|
-I A.chr1.sort.dedup.mate.bam -o A.chr1.sort.dedup.mate.relaign.bam
|
146
|
|
147
|
#6. base recalibrator, use dbSNP139 as "truth". If we know the population of subject sample, we can
|
148
|
#use specific SNP datasets extract from 1000G with same population.
|
149
|
java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
|
150
|
-T BaseRecalibrator \
|
151
|
-knownSites $DATA/dbsnp_138.hg19.vcf \
|
152
|
-o A.recal \
|
153
|
-I A.chr1.sort.dedup.mate.relaign.bam
|
154
|
|
155
|
java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
|
156
|
-T PrintReads \
|
157
|
-BQSR A.recal \
|
158
|
-o A.chr1.sort.dedup.mate.relaign.recal.bam \
|
159
|
-I A.chr1.sort.dedup.mate.relaign.bam
|
160
|
|
161
|
#7. call variants with GATK's UnifiedGenotyper (faster, lots of raw calls, must run filter after)
|
162
|
time java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
|
163
|
-T UnifiedGenotyper \
|
164
|
-glm BOTH \
|
165
|
-D $DATA/dbsnp_138.hg19.vcf \
|
166
|
-metrics A.snps.metrics \
|
167
|
-stand_call_conf 50.0 -stand_emit_conf 10.0 \
|
168
|
-o A.ug.vcf \
|
169
|
-I A.chr1.sort.dedup.mate.relaign.recal.bam
|
170
|
|
171
|
#8. call variants with GATK's HaplotypeCaller (2x slower than UG, much less accurate calls)
|
172
|
time java -Xmx4g -jar $APP/gatk/3.1-1/GenomeAnalysisTK.jar -R $DATA/hg19.chr1/chr1.fa \
|
173
|
-T HaplotypeCaller \
|
174
|
--dbsnp $DATA/dbsnp_138.hg19.vcf \
|
175
|
-stand_call_conf 50.0 -stand_emit_conf 10.0 \
|
176
|
-o A.hc.vcf \
|
177
|
-I A.chr1.sort.dedup.mate.relaign.recal.bam
|
178
|
|
179
|
#End of pipeline
|
180
|
################################
|