0
|
1 #!/bin/bash
|
|
2
|
|
3 test="N"
|
|
4
|
|
5
|
|
6 function usage(){
|
|
7 echo "usage: $0 todo"
|
|
8 }
|
|
9
|
|
10 function runfilter(){
|
|
11 ifile=$1
|
|
12 columnname=$2
|
|
13 threshold=$3
|
|
14
|
|
15 if [[ $threshold == "-1" ]]
|
|
16 then
|
|
17 echo "not filtering"
|
|
18 return
|
|
19 fi
|
|
20
|
|
21 echo "filtering: $columnname, $threshold"
|
|
22 cat $ifile
|
|
23
|
|
24 #get column number corresponding to column header
|
|
25 column=`awk 'BEGIN{
|
|
26 FS="\t";
|
|
27 col=-1
|
|
28 }{
|
|
29 if(FNR==1){
|
|
30 for(i=1;i<=NF;i++){
|
|
31 if($i == "'"${columnname}"'")
|
|
32 col=i
|
|
33 }
|
|
34 print col
|
|
35 }
|
|
36 }' $ifile `
|
|
37
|
|
38 if [ $column == -1 ]
|
|
39 then
|
|
40 echo "no such column, exiting"
|
|
41 return
|
|
42 fi
|
|
43
|
|
44 #perform filtering using the threshold
|
|
45 awk 'BEGIN{
|
|
46 FS="\t";
|
|
47 OFS="\t";
|
|
48 }{
|
|
49 if(FNR==1)
|
|
50 print $0;
|
|
51 if(FNR>1){
|
|
52 if( $"'"${column}"'" == "" ) # empty column, then print
|
|
53 print $0
|
|
54 else if ("'"${threshold}"'" == "text"){} #if set to text dont check threshold
|
|
55
|
|
56 else if ($"'"${column}"'" < "'"${threshold}"'") #else do check it
|
|
57 print $0
|
|
58 }
|
|
59 }' $ifile > tmpfile
|
|
60
|
|
61 mv tmpfile $ifile
|
|
62 }
|
|
63
|
|
64 # arguments: originalfile,resultfile,chrcol,startcol,endcol,refcol,obscol,addcols
|
|
65 function joinresults(){
|
|
66 ofile=$1
|
|
67 rfile=$2
|
|
68 colchr=$3
|
|
69 colstart=$4
|
|
70 colend=$5
|
|
71 colref=$6
|
|
72 colobs=$7
|
|
73 addcols=$8 #e.g. "B.col1,B.col2"
|
|
74
|
|
75 test="N"
|
|
76
|
|
77 # echo "joining result with original file"
|
|
78 if [ $test == "Y" ]
|
|
79 then
|
|
80 echo "ofile: $ofile"
|
|
81 head $ofile
|
|
82 echo "rfile: $rfile"
|
|
83 head $rfile
|
|
84 fi
|
|
85 numlines=`wc $rfile | cut -d" " -f2`
|
|
86
|
|
87 # if empty results file, just add header fields
|
|
88 if [[ ! -s $rfile ]]
|
|
89 then
|
|
90 dummycol=${addcols:2}
|
|
91 outputcol=${dummycol//",B."/" "}
|
|
92 numcommas=`echo "$addcols" | grep -o "," | wc -l`
|
|
93
|
|
94 awk 'BEGIN{FS="\t";OFS="\t"}{
|
|
95 if(FNR==1)
|
|
96 print $0,"'"$outputcol"'";
|
|
97 else{
|
|
98 printf $0
|
|
99 for(i=0;i<="'"$numcommas"'"+1;i++)
|
|
100 printf "\t"
|
|
101 printf "\n"
|
|
102 }
|
|
103 }END{}' $ofile > tempofile
|
|
104
|
|
105 mv tempofile $ofile
|
|
106 return
|
|
107 fi
|
|
108
|
|
109
|
|
110 #get input file column names for cgatools join
|
|
111 col_chr_name=`head -1 $rfile | cut -f${colchr}`
|
|
112 col_start_name=`head -1 $rfile | cut -f${colstart}`
|
|
113 col_end_name=`head -1 $rfile | cut -f${colend}`
|
|
114 col_ref_name=`head -1 $rfile | cut -f${colref}`
|
|
115 col_obs_name=`head -1 $rfile | cut -f${colobs}`
|
|
116
|
|
117 #get annotation file column names for cgatools join
|
|
118 chr_name=`head -1 $ofile | cut -f${chrcol}`
|
|
119 start_name=`head -1 $ofile | cut -f${startcol}`
|
|
120 end_name=`head -1 $ofile | cut -f${endcol}`
|
|
121 ref_name=`head -1 $ofile | cut -f${refcol}`
|
|
122 obs_name=`head -1 $ofile | cut -f${obscol}`
|
|
123
|
|
124 if [ $test == "Y" ]
|
|
125 then
|
|
126 echo "input file"
|
|
127 echo "chr col: $col_chr_name ($colchr)"
|
|
128 echo "start col: $col_start_name ($colstart)"
|
|
129 echo "end col: $col_end_name ($colend)"
|
|
130 echo "ref col: $col_ref_name ($colref)"
|
|
131 echo "obs col: $col_obs_name ($colobs)"
|
|
132 echo ""
|
|
133 echo "annotation file"
|
|
134 echo "chr col: $chr_name ($chrcol)"
|
|
135 echo "start col: $start_name ($startcol)"
|
|
136 echo "end col: $end_name ($endcol)"
|
|
137 echo "ref col: $ref_name ($refcol)"
|
|
138 echo "obs col: $obs_name ($obscol)"
|
|
139 fi
|
|
140
|
|
141 #perform join
|
|
142 cgatools join --beta \
|
|
143 --input $ofile $rfile \
|
|
144 --output temporiginal \
|
|
145 --match ${chr_name}:${col_chr_name} \
|
|
146 --match ${start_name}:${col_start_name} \
|
|
147 --match ${end_name}:${col_end_name} \
|
|
148 --match ${ref_name}:${col_ref_name} \
|
|
149 --match ${obs_name}:${col_obs_name} \
|
|
150 --select A.*,$addcols \
|
|
151 --always-dump \
|
|
152 --output-mode compact
|
|
153
|
|
154 #replace originalfile
|
|
155 sed -i 's/^>//g' temporiginal #join sometimes adds a '>' symbol to header
|
|
156 mv temporiginal originalfile
|
|
157
|
|
158 if [ $test == "Y" ]
|
|
159 then
|
|
160 echo "joining complete"
|
|
161 head originalfile
|
|
162 echo ""
|
|
163 fi
|
|
164
|
|
165 }
|
|
166
|
|
167
|
|
168
|
|
169
|
|
170 set -- `getopt -n$0 -u -a --longoptions="inputfile: buildver: humandb: varfile: VCF: chrcol: startcol: endcol: refcol: obscol: vartypecol: convertcoords: geneanno: verdbsnp: tfbs: mce: cytoband: segdup: dgv: gwas: ver1000g: cg46: cg69: impactscores: esp: gerp: cosmic61: cosmic63: cosmic64: cosmic65: outall: outfilt: outinvalid: scriptsdir: dorunannovar: dofilter: filt_dbsnp: filt1000GALL: filt1000GAFR: filt1000GAMR: filt1000GASN: filt1000GEUR: filtESP6500ALL: filtESP6500EA: filtESP6500AA: filtcg46: filtcg69: dummy:" "h:" "$@"` || usage
|
|
171 [ $# -eq 0 ] && usage
|
|
172
|
|
173
|
|
174
|
|
175 while [ $# -gt 0 ]
|
|
176 do
|
|
177 case "$1" in
|
|
178 --inputfile) infile=$2;shift;; # inputfile
|
|
179 --buildver) buildver=$2;shift;; # hg18 or hg19
|
|
180 --humandb) humandb=$2;shift;; # location of humandb database
|
|
181 --varfile) varfile=$2;shift;; # Y or N
|
|
182 --VCF) vcf=$2;shift;; #Y or N
|
|
183 --chrcol) chrcol=$2;shift;; # which column has chr
|
|
184 --startcol) startcol=$2;shift;; # which column has start
|
|
185 --endcol) endcol=$2;shift;; # which column has end
|
|
186 --refcol) refcol=$2;shift;; # which column has ref
|
|
187 --obscol) obscol=$2;shift;; # which column has alt
|
|
188 --vartypecol) vartypecol=$2;shift;; # which column has vartype
|
|
189 --convertcoords) convertcoords=$2;shift;; # Y or N convert coordinate from CG to 1-based?
|
|
190 --geneanno) geneanno=$2;shift;; # comma-separated list of strings refSeq, knowngene, ensgene
|
|
191 --verdbsnp) verdbsnp=$2;shift;; #comma-separated list of dbsnp version to annotate with (e.g. "132,135NonFlagged,137")"
|
|
192 --tfbs) tfbs=$2;shift;; # Y or N
|
|
193 --mce) mce=$2;shift;; # Y or N
|
|
194 --cytoband) cytoband=$2;shift;; # Y or N
|
|
195 --segdup) segdup=$2;shift;; # Y or N
|
|
196 --dgv) dgv=$2;shift;; # Y or N
|
|
197 --gwas) gwas=$2;shift;; # Y or N
|
|
198 --ver1000g) ver1000g=$2;shift;; # Y or N
|
|
199 --cg46) cg46=$2;shift;;
|
|
200 --cg69) cg69=$2;shift;;
|
|
201 --impactscores) impactscores=$2;shift;; # Y or N
|
|
202 --scriptsdir) scriptsdir=$2;shift;; # Y or N
|
|
203 --esp) esp=$2;shift;; # Y or N
|
|
204 --gerp) gerp=$2;shift;; # Y or N
|
|
205 --cosmic61) cosmic61=$2;shift;; # Y or N
|
|
206 --cosmic63) cosmic63=$2;shift;; # Y or N
|
|
207 --cosmic64) cosmic64=$2;shift;; # Y or N
|
|
208 --cosmic65) cosmic65=$2;shift;; # Y or N
|
|
209 --filt_dbsnp) filt_dbsnp=$2;shift;;
|
|
210 --filt1000GALL) threshold_1000g_ALL=$2;shift;; #threshold value
|
|
211 --filt1000GAFR) threshold_1000g_AFR=$2;shift;; #threshold value
|
|
212 --filt1000GAMR) threshold_1000g_AMR=$2;shift;; #threshold value
|
|
213 --filt1000GASN) threshold_1000g_ASN=$2;shift;; #threshold value
|
|
214 --filt1000GEUR) threshold_1000g_EUR=$2;shift;; #threshold value
|
|
215 --filtESP6500ALL) threshold_ESP6500_ALL=$2;shift;; #threshold value
|
|
216 --filtESP6500EA) threshold_ESP6500_EA=$2;shift;; #threshold value
|
|
217 --filtESP6500AA) threshold_ESP6500_AA=$2;shift;; #threshold value
|
|
218 --filtcg46) threshold_cg46=$2;shift;;
|
|
219 --filtcg69) threshold_cg69=$2;shift;;
|
|
220 --outall) outfile_all=$2;shift;; # file
|
|
221 --outfilt) outfile_filt=$2;shift;; # file
|
|
222 --outinvalid) outfile_invalid=$2;shift;; #file
|
|
223 --dorunannovar) dorunannovar=$2;shift;; #Y or N
|
|
224 --dofilter) dofilter=$2;shift;; #Y or N
|
|
225 -h) shift;;
|
|
226 --) shift;break;;
|
|
227 -*) usage;;
|
|
228 *) break;;
|
|
229 esac
|
|
230 shift
|
|
231 done
|
|
232
|
|
233
|
|
234 if [ $test == "Y" ]
|
|
235 then
|
|
236 echo "dorunannovar: $dorunannovar"
|
|
237 echo "infile: $infile"
|
|
238 echo "buildver: $buildver"
|
|
239 echo "annovardb: $humandb"
|
|
240 echo "verdbnsp: $verdbsnp"
|
|
241 echo "geneanno: $geneanno"
|
|
242 echo "tfbs: $tfbs"
|
|
243 echo "mce: $mce"
|
|
244 echo "cytoband: $cytoband"
|
|
245 echo "segdup: $segdup"
|
|
246 echo "dgv: $dgv"
|
|
247 echo "gwas: $gwas"
|
|
248 echo "g1000: ${g1000}"
|
|
249 echo "cg46: ${cg46}"
|
|
250 echo "cg69: ${cg69}"
|
|
251 echo "impactscores: $impactscores"
|
|
252 echo "esp: $esp"
|
|
253 echo "gerp: $gerp"
|
|
254 echo "cosmic: $cosmic"
|
|
255 echo "outfile: $outfile_all"
|
|
256 echo "outinvalid: $outfile_invalid"
|
|
257 echo "outfiltered: $outfile_filt"
|
|
258 echo "varfile: $varfile"
|
|
259 echo "vcf" $vcf
|
|
260 echo "chrcol: $chrcol"
|
|
261 echo "startcol: $startcol"
|
|
262 echo "endcol: $endcol"
|
|
263 echo "refcol: $refcol"
|
|
264 echo "obscol: $obscol"
|
|
265 echo "convertcoords: $convertcoords"
|
|
266 echo "vartypecol: $vartypecol"
|
|
267 echo "dofilter: $dofilter"
|
|
268 echo "threshold_1000g_ALL : $threshold_1000g_ALL"
|
|
269 echo "threshold_1000g_AFR : $threshold_1000g_AFR"
|
|
270 echo "threshold_1000g_AMR : $threshold_1000g_AMR"
|
|
271 echo "threshold_1000g_ASN : $threshold_1000g_ASN"
|
|
272 echo "threshold_1000g_EUR : $threshold_1000g_EUR"
|
|
273 echo "threshold_ESP6500_ALL: $threshold_ESP6500_ALL"
|
|
274 echo "threshold_ESP6500_EA : $threshold_ESP6500_EA"
|
|
275 echo "threshold_ESP6500_AA : $threshold_ESP6500_AA"
|
|
276
|
|
277 fi
|
|
278
|
|
279
|
|
280 refgene="N"
|
|
281 knowngene="N"
|
|
282 ensgene="N"
|
|
283 #parse geneanno param
|
|
284 if [[ $geneanno =~ "refSeq" ]]
|
|
285 then
|
|
286 refgene="Y"
|
|
287 fi
|
|
288 if [[ $geneanno =~ "knowngene" ]]
|
|
289 then
|
|
290 knowngene="Y"
|
|
291 fi
|
|
292 if [[ $geneanno =~ "ensgene" ]]
|
|
293 then
|
|
294 ensgene="Y"
|
|
295 fi
|
|
296
|
|
297
|
|
298 #parse verdbsnp/1000g/esp strings
|
|
299 dbsnpstr=${verdbsnp//,/ }
|
|
300 filt_dbsnpstr=${filt_dbsnp//,/ }
|
|
301 g1000str=${ver1000g//,/ }
|
|
302 espstr=${esp//,/ }
|
|
303
|
|
304 if [ $test == "Y" ]
|
|
305 then
|
|
306 echo "annotate dbsnp: $dbsnpstr"
|
|
307 echo "annotate esp: $espstr"
|
|
308 echo "filter dbsnp: $filt_dbsnpstr"
|
|
309 fi
|
|
310
|
|
311 mutationtaster="N"
|
|
312 avsift="N"
|
|
313 lrt="N"
|
|
314 polyphen2="N"
|
|
315 phylop="N"
|
|
316 ljbsift="N"
|
|
317 #parse impactscores param
|
|
318 if [[ $impactscores =~ "mutationtaster" ]]
|
|
319 then
|
|
320 mutationtaster="Y"
|
|
321 fi
|
|
322 if [[ $impactscores =~ "sift" ]]
|
|
323 then
|
|
324 avsift="Y"
|
|
325 fi
|
|
326 if [[ $impactscores =~ "lrt" ]]
|
|
327 then
|
|
328 lrt="Y"
|
|
329 fi
|
|
330 if [[ $impactscores =~ "ljbsift" ]]
|
|
331 then
|
|
332 ljbsift="Y"
|
|
333 fi
|
|
334 if [[ $impactscores =~ "pp2" ]]
|
|
335 then
|
|
336 polyphen2="Y"
|
|
337 fi
|
|
338 if [[ $impactscores =~ "phylop" ]]
|
|
339 then
|
|
340 phylop="Y"
|
|
341 fi
|
|
342
|
|
343 if [[ $varfile == "Y" ]]
|
|
344 then
|
|
345 convertcoords="Y"
|
|
346 fi
|
|
347
|
|
348 #ljb refers to Liu, Jian, Boerwinkle paper in Human Mutation with pubmed ID 21520341. Cite this paper if you use the scores
|
|
349
|
|
350
|
|
351 #column header names we will be adding
|
|
352 # ESP 6500
|
|
353 esp6500si_colheader_ALL="ESP6500si_ALL"
|
|
354 esp6500si_colheader_EA="ESP6500si_EA"
|
|
355 esp6500si_colheader_AA="ESP6500si_AA"
|
|
356 esp6500_colheader_ALL="ESP6500_ALL"
|
|
357 esp6500_colheader_EA="ESP6500_EA"
|
|
358 esp6500_colheader_AA="ESP6500_AA"
|
|
359 esp5400si_colheader_ALL="ESP5400si_ALL"
|
|
360 esp5400si_colheader_EA="ESP5400si_EA"
|
|
361 esp5400si_colheader_AA="ESP5400si_AA"
|
|
362 esp5400_colheader_ALL="ESP5400_ALL"
|
|
363 esp5400_colheader_EA="ESP5400_EA"
|
|
364 esp5400_colheader_AA="ESP5400_AA"
|
|
365
|
|
366
|
|
367 # cg46 cg69
|
|
368 cg46_colheader="CG_46_genomes"
|
|
369 cg69_colheader="CG_69_genomes"
|
|
370
|
|
371 cp $infile originalfile
|
|
372 #run annovar or filter only?
|
|
373 if [ $dorunannovar == "Y" ]
|
|
374 then
|
|
375
|
|
376
|
|
377 ####################################
|
|
378 #
|
|
379 # PREPARE INPUT FILE
|
|
380 #
|
|
381 ####################################
|
|
382
|
|
383 echo "converting input file"
|
|
384 vcfheader=""
|
|
385 if [ $vcf == "Y" ] #if CG varfile, convert
|
|
386 then
|
|
387 # convert vcf to annovarinput
|
|
388 $scriptsdir/convert2annovar.pl --format vcf4 --includeinfo --outfile annovarinput $infile 2>&1
|
|
389
|
|
390 #construct header line from vcf file
|
|
391 cat $infile | grep "#CHROM" > additionalcols
|
|
392 sed -i 's/#//g' additionalcols
|
|
393 vcfheader="\t`cat additionalcols`"
|
|
394 echo "vcfheader:$vcfheader"
|
|
395 echo -e "chromosome\tbegin\tend\treference\tobserved\t`cat additionalcols`" > originalfile
|
|
396 cat annovarinput >> originalfile
|
|
397
|
|
398 chrcol=1
|
|
399 startcol=2
|
|
400 endcol=3
|
|
401 refcol=4
|
|
402 obscol=5
|
|
403
|
|
404
|
|
405 elif [ $varfile == "Y" ] #if CG varfile, convert
|
|
406 then
|
|
407 # convert varfile
|
|
408 $scriptsdir/convert2annovar.pl --format cg --outfile annovarinput $infile 2>&1
|
|
409 echo -e "chromosome\tbegin\tend\treference\talleleSeq\tvarType\thaplotype" > originalfile
|
|
410 cat annovarinput | cut -f1-6,8 >> originalfile
|
|
411 cat annovarinput | cut -f1-5 >> annovarinput2
|
|
412 mv annovarinput2 annovarinput
|
|
413
|
|
414 chrcol=1
|
|
415 startcol=2
|
|
416 endcol=3
|
|
417 refcol=4
|
|
418 obscol=5
|
|
419
|
|
420 elif [ $convertcoords == "Y" ] # if CG-coordinates, convert
|
|
421 then
|
|
422 #echo "rearranging columns and converting coordinates"
|
|
423 awk 'BEGIN{
|
|
424 FS="\t";
|
|
425 OFS="\t";
|
|
426 }{
|
|
427 if(FNR>1) {
|
|
428 if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 };
|
|
429 if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" };
|
|
430 if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" };
|
|
431 if( $"'"${vartypecol}"'" == "sub" ){ $"'"${startcol}"'" += 1 };
|
|
432
|
|
433 printf("%s\t%s\t%s\t%s\t%s\n" ,$"'"${chrcol}"'",$"'"${startcol}"'",$"'"${endcol}"'",$"'"${refcol}"'",$"'"${obscol}"'");
|
|
434 }
|
|
435 }
|
|
436 END{
|
|
437 }' $infile > annovarinput
|
|
438
|
|
439 #remove any "chr" prefixes
|
|
440 sed -i 's/chr//g' annovarinput
|
|
441
|
|
442 awk 'BEGIN{
|
|
443 FS="\t";
|
|
444 OFS="\t";
|
|
445 }{
|
|
446 if(FNR>=1) {
|
|
447 if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" += 1 };
|
|
448 if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "-" };
|
|
449 if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" +=1; $"'"${obscol}"'" = "-" };
|
|
450 if( $"'"${vartypecol}"'" == "sub" ){ $"'"${startcol}"'" += 1 };
|
|
451
|
|
452 print $0
|
|
453 }
|
|
454 }
|
|
455 END{
|
|
456 }' $infile > originalfile
|
|
457
|
|
458 #remove any "chr" prefixes
|
|
459 sed -i 's/chr//g' originalfile
|
|
460 sed -i 's/omosome/chromosome/g' originalfile
|
|
461
|
|
462
|
|
463 else #only rearrange columns if already 1-based coordinates
|
|
464 echo "rearranging columns "
|
|
465 awk 'BEGIN{
|
|
466 FS="\t";
|
|
467 OFS="\t";
|
|
468 }{
|
|
469 if(FNR>1) {
|
|
470 printf("%s\t%s\t%s\t%s\t%s\n",$"'"${chrcol}"'",$"'"${startcol}"'",$"'"${endcol}"'",$"'"${refcol}"'",$"'"${obscol}"'");
|
|
471 }
|
|
472 }
|
|
473 END{
|
|
474 }' $infile > annovarinput
|
|
475
|
|
476 #remove any "chr" prefixes
|
|
477 sed -i 's/chr//g' annovarinput
|
|
478 sed 's/chr//g' $infile > originalfile
|
|
479 sed -i 's/omosome/chromosome/g' originalfile
|
|
480 fi
|
|
481
|
|
482 echo "...finished conversion"
|
|
483
|
|
484
|
|
485
|
|
486
|
|
487 ####################################
|
|
488 #
|
|
489 # RUN ANNOVAR COMMANDS
|
|
490 #
|
|
491 ####################################
|
|
492
|
|
493
|
|
494
|
|
495 ###### gene-based annotation #######
|
|
496
|
|
497 # RefSeq Gene
|
|
498 if [ $refgene == "Y" ]
|
|
499 then
|
|
500 echo -e "\nrefSeq gene"
|
|
501 $scriptsdir/annotate_variation.pl --geneanno --buildver $buildver -dbtype gene annovarinput $humandb 2>&1
|
|
502
|
|
503 annovarout=annovarinput.variant_function
|
|
504 sed -i '1i\RefSeq_Func\tRefSeq_Gene\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
505 joinresults originalfile $annovarout 3 4 5 6 7 B.RefSeq_Func,B.RefSeq_Gene
|
|
506
|
|
507 annovarout=annovarinput.exonic_variant_function
|
|
508 sed -i '1i\linenum\tRefSeq_ExonicFunc\tRefSeq_AAChange\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
509 joinresults originalfile $annovarout 4 5 6 7 8 B.RefSeq_ExonicFunc,B.RefSeq_AAChange
|
|
510 fi
|
|
511
|
|
512
|
|
513 # UCSC KnownGene
|
|
514 if [ $knowngene == "Y" ]
|
|
515 then
|
|
516 echo -e "\nUCSC known gene"
|
|
517 $scriptsdir/annotate_variation.pl --geneanno --buildver $buildver -dbtype knowngene annovarinput $humandb 2>&1
|
|
518
|
|
519 annovarout=annovarinput.variant_function
|
|
520 sed -i '1i\UCSCKnownGene_Func\tUCSCKnownGene_Gene\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
521 joinresults originalfile $annovarout 3 4 5 6 7 B.UCSCKnownGene_Func,B.UCSCKnownGene_Gene
|
|
522
|
|
523 annovarout=annovarinput.exonic_variant_function
|
|
524 sed -i '1i\linenum\tUCSCKnownGene_ExonicFunc\tUCSCKnownGene_AAChange\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
525 joinresults originalfile $annovarout 4 5 6 7 8 B.UCSCKnownGene_ExonicFunc,B.UCSCKnownGene_AAChange
|
|
526 fi
|
|
527
|
|
528
|
|
529 # Emsembl Gene
|
|
530 if [ $ensgene == "Y" ]
|
|
531 then
|
|
532 echo -e "\nEnsembl gene"
|
|
533 $scriptsdir/annotate_variation.pl --geneanno --buildver $buildver -dbtype ensgene annovarinput $humandb 2>&1
|
|
534
|
|
535 annovarout=annovarinput.variant_function
|
|
536 sed -i '1i\EnsemblGene_Func\tEnsemblGene_Gene\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
537 joinresults originalfile $annovarout 3 4 5 6 7 B.EnsemblGene_Func,B.EnsemblGene_Gene
|
|
538
|
|
539 annovarout=annovarinput.exonic_variant_function
|
|
540 sed -i '1i\linenum\tEnsemblGene_ExonicFunc\tEnsemblGene_AAChange\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
541 joinresults originalfile $annovarout 4 5 6 7 8 B.EnsemblGene_ExonicFunc,B.EnsemblGene_AAChange
|
|
542 fi
|
|
543
|
|
544
|
|
545
|
|
546 ###### region-based annotation #######
|
|
547
|
|
548
|
|
549 # Transcription Factor Binding Sites Annotation
|
|
550 if [ $mce == "Y" ]
|
|
551 then
|
|
552 echo -e "\nMost Conserved Elements"
|
|
553
|
|
554 if [ $buildver == "hg18" ]
|
|
555 then
|
|
556 $scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype mce44way annovarinput $humandb 2>&1
|
|
557 annovarout=annovarinput.${buildver}_phastConsElements44way
|
|
558 sed -i '1i\db\tphastConsElements44way\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
559 joinresults originalfile $annovarout 3 4 5 6 7 B.phastConsElements44way
|
|
560
|
|
561 else #hg19
|
|
562 $scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype mce46way annovarinput $humandb 2>&1
|
|
563 annovarout=annovarinput.${buildver}_phastConsElements46way
|
|
564 sed -i '1i\db\tphastConsElements46way\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
565 joinresults originalfile $annovarout 3 4 5 6 7 B.phastConsElements46way
|
|
566 fi
|
|
567
|
|
568 fi
|
|
569
|
|
570
|
|
571
|
|
572 # Transcription Factor Binding Sites Annotation
|
|
573 if [ $tfbs == "Y" ]
|
|
574 then
|
|
575 echo -e "\nTranscription Factor Binding Site Annotation"
|
|
576 $scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype tfbs annovarinput $humandb 2>&1
|
|
577
|
|
578 # arguments: originalfile, resultfile,chrcol,startcol,endcol,refcol,obscol,selectcolumns
|
|
579 annovarout=annovarinput.${buildver}_tfbsConsSites
|
|
580 sed -i '1i\db\tTFBS\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
581 joinresults originalfile $annovarout 3 4 5 6 7 B.TFBS
|
|
582 fi
|
|
583
|
|
584
|
|
585
|
|
586 # Identify cytogenetic band for genetic variants
|
|
587 if [ $cytoband == "Y" ]
|
|
588 then
|
|
589 echo -e "\nCytogenic band Annotation"
|
|
590 $scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype band annovarinput $humandb 2>&1
|
|
591
|
|
592 annovarout=annovarinput.${buildver}_cytoBand
|
|
593 sed -i '1i\db\tBand\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
594 joinresults originalfile $annovarout 3 4 5 6 7 B.Band
|
|
595 fi
|
|
596
|
|
597
|
|
598 # Identify variants located in segmental duplications
|
|
599 if [ $segdup == "Y" ]
|
|
600 then
|
|
601 echo -e "\nSegmental Duplications Annotation"
|
|
602 $scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype segdup annovarinput $humandb 2>&1
|
|
603
|
|
604 annovarout=annovarinput.${buildver}_genomicSuperDups
|
|
605 sed -i '1i\db\tSegDup\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
606 joinresults originalfile $annovarout 3 4 5 6 7 B.SegDup
|
|
607 fi
|
|
608
|
|
609
|
|
610
|
|
611 # Identify previously reported structural variants in DGV
|
|
612 if [ $dgv == "Y" ]
|
|
613 then
|
|
614 echo -e "\nDGV Annotation"
|
|
615 $scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype dgv annovarinput $humandb 2>&1
|
|
616
|
|
617 annovarout=annovarinput.${buildver}_dgv
|
|
618 sed -i '1i\db\tDGV\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
619 joinresults originalfile $annovarout 3 4 5 6 7 B.DGV
|
|
620 fi
|
|
621
|
|
622
|
|
623 # Identify variants reported in previously published GWAS studies
|
|
624 if [ $gwas == "Y" ]
|
|
625 then
|
|
626 echo -e "\nGWAS Annotation"
|
|
627 $scriptsdir/annotate_variation.pl --regionanno --buildver $buildver -dbtype gwascatalog annovarinput $humandb 2>&1
|
|
628
|
|
629 annovarout=annovarinput.${buildver}_gwasCatalog
|
|
630 sed -i '1i\db\tGWAS\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
631 joinresults originalfile $annovarout 3 4 5 6 7 B.GWAS
|
|
632 fi
|
|
633
|
|
634
|
|
635
|
|
636
|
|
637 ###### filter-based annotation #######
|
|
638
|
|
639 #dbSNP
|
|
640 for version in $dbsnpstr
|
|
641 do
|
|
642 if [ $version == "None" ]
|
|
643 then
|
|
644 break
|
|
645 fi
|
|
646 echo -e "\ndbSNP region Annotation, version: $version"
|
|
647 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ${version} annovarinput $humandb 2>&1
|
|
648
|
|
649 annovarout=annovarinput.${buildver}_${version}_dropped
|
|
650 sed -i '1i\db\tdb'${version}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
651 joinresults originalfile $annovarout 3 4 5 6 7 B.db${version}
|
|
652
|
|
653
|
|
654 done
|
|
655
|
|
656
|
|
657
|
|
658 #1000 Genomes
|
|
659
|
|
660 if [ $ver1000g != "None" ]
|
|
661 then
|
|
662
|
|
663 for version in $g1000str
|
|
664 do
|
|
665 #column headers
|
|
666 g1000_colheader_ALL="${version}_ALL"
|
|
667 g1000_colheader_AFR="${version}_AFR"
|
|
668 g1000_colheader_AMR="${version}_AMR"
|
|
669 g1000_colheader_ASN="${version}_ASN"
|
|
670 g1000_colheader_EUR="${version}_EUR"
|
|
671 g1000_colheader_CEU="${version}_CEU"
|
|
672 g1000_colheader_YRI="${version}_YRI"
|
|
673 g1000_colheader_JPTCHB="${version}_JPTCHB"
|
|
674
|
|
675 doALL="N"
|
|
676 doAMR="N"
|
|
677 doAFR="N"
|
|
678 doASN="N"
|
|
679 doEUR="N"
|
|
680 doCEU="N"
|
|
681 doYRI="N"
|
|
682 doJPTCHB="N"
|
|
683
|
|
684
|
|
685 if [ $version == "1000g2012apr" ]
|
|
686 then
|
|
687 fileID="2012_04"
|
|
688 doALL="Y"
|
|
689 if [ $buildver == "hg19" ]
|
|
690 then
|
|
691 doAMR="Y"
|
|
692 doAFR="Y"
|
|
693 doASN="Y"
|
|
694 doEUR="Y"
|
|
695 fi
|
|
696 elif [[ $version == "1000g2012feb" && $buildver == "hg19" ]]
|
|
697 then
|
|
698 fileID="2012_02"
|
|
699 doALL="Y"
|
|
700 elif [[ $version == "1000g2010nov" && $buildver == "hg19" ]]
|
|
701 then
|
|
702 fileID="2010_11"
|
|
703 doALL="Y"
|
|
704 elif [[ $version == "1000g2010jul" && $buildver == "hg18" ]]
|
|
705 then
|
|
706 fileID="2010_07"
|
|
707 doALL="N"
|
|
708 doCEU="Y"
|
|
709 doYRI="Y"
|
|
710 doJPTCHB="Y"
|
|
711 else
|
|
712 echo "unrecognized 1000g version, skipping"
|
|
713 fi
|
|
714
|
|
715 #ALL
|
|
716 if [ $doALL == "Y" ]
|
|
717 then
|
|
718 echo -e "\n1000Genomes ALL"
|
|
719 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_all" annovarinput $humandb 2>&1
|
|
720
|
|
721 annovarout=annovarinput.${buildver}_ALL.sites.${fileID}_dropped
|
|
722 sed -i '1i\db\t'$g1000_colheader_ALL'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
723 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_ALL
|
|
724 fi
|
|
725
|
|
726 # AFR
|
|
727 if [ $doAFR == "Y" ]
|
|
728 then
|
|
729 echo -e "\n1000Genomes AFR"
|
|
730 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_afr" annovarinput $humandb 2>&1
|
|
731
|
|
732 annovarout=annovarinput.${buildver}_AFR.sites.${fileID}_dropped
|
|
733 sed -i '1i\db\t'$g1000_colheader_AFR'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
734 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_AFR
|
|
735 fi
|
|
736
|
|
737
|
|
738 # AMR
|
|
739 if [ $doAMR == "Y" ]
|
|
740 then
|
|
741 echo -e "\n1000Genomes AMR"
|
|
742 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_amr" annovarinput $humandb 2>&1
|
|
743
|
|
744 annovarout=annovarinput.${buildver}_AMR.sites.${fileID}_dropped
|
|
745 sed -i '1i\db\t'$g1000_colheader_AMR'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
746 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_AMR
|
|
747 fi
|
|
748
|
|
749 # ASN
|
|
750 if [ $doASN == "Y" ]
|
|
751 then
|
|
752 echo -e "\n1000Genomes ASN"
|
|
753 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_asn" annovarinput $humandb 2>&1
|
|
754
|
|
755 annovarout=annovarinput.${buildver}_ASN.sites.${fileID}_dropped
|
|
756 sed -i '1i\db\t'$g1000_colheader_ASN'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
757 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_ASN
|
|
758 fi
|
|
759
|
|
760 # EUR
|
|
761 if [ $doEUR == "Y" ]
|
|
762 then
|
|
763 echo -e "\n1000Genomes EUR"
|
|
764 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_eur" annovarinput $humandb 2>&1
|
|
765
|
|
766 annovarout=annovarinput.${buildver}_EUR.sites.${fileID}_dropped
|
|
767 sed -i '1i\db\t'$g1000_colheader_EUR'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
768 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_EUR
|
|
769 fi
|
|
770
|
|
771 # CEU
|
|
772 if [ $doCEU == "Y" ]
|
|
773 then
|
|
774 echo -e "\n1000Genomes CEU"
|
|
775 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_ceu" annovarinput $humandb 2>&1
|
|
776
|
|
777 annovarout=annovarinput.${buildver}_CEU.sites.${fileID}_dropped
|
|
778 sed -i '1i\db\t'$g1000_colheader_CEU'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
779 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_CEU
|
|
780 fi
|
|
781
|
|
782 # YRI
|
|
783 if [ $doYRI == "Y" ]
|
|
784 then
|
|
785 echo -e "\n1000Genomes YRI"
|
|
786 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_yri" annovarinput $humandb 2>&1
|
|
787
|
|
788 annovarout=annovarinput.${buildver}_YRI.sites.${fileID}_dropped
|
|
789 sed -i '1i\db\t'$g1000_colheader_YRI'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
790 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_YRI
|
|
791
|
|
792
|
|
793 fi
|
|
794
|
|
795 #JPTCHB
|
|
796 if [ $doJPTCHB == "Y" ]
|
|
797 then
|
|
798 echo -e "\n1000Genomes JPTCHB"
|
|
799 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype "${version}_jptchb" annovarinput $humandb 2>&1
|
|
800
|
|
801 annovarout=annovarinput.${buildver}_JPTCHB.sites.${fileID}_dropped
|
|
802 sed -i '1i\db\t'$g1000_colheader_JPTCHB'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
803 joinresults originalfile $annovarout 3 4 5 6 7 B.$g1000_colheader_JPTCHB
|
|
804 fi
|
|
805
|
|
806 done
|
|
807 fi
|
|
808
|
|
809
|
|
810 # SIFT
|
|
811 if [ $avsift == "Y" ]
|
|
812 then
|
|
813 echo -e "\nSIFT Annotation"
|
|
814 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype avsift annovarinput $humandb 2>&1
|
|
815
|
|
816 annovarout=annovarinput.${buildver}_avsift_dropped
|
|
817 sed -i '1i\db\tAVSIFT\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
818 joinresults originalfile $annovarout 3 4 5 6 7 B.AVSIFT
|
|
819 fi
|
|
820
|
|
821 #ljb refers to Liu, Jian, Boerwinkle paper in Human Mutation with pubmed ID 21520341. Cite this paper if you use the scores
|
|
822 # SIFT2
|
|
823 if [ $ljbsift == "Y" ]
|
|
824 then
|
|
825 echo -e "\nSIFT Annotation"
|
|
826 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ljb_sift annovarinput $humandb 2>&1
|
|
827
|
|
828 annovarout=annovarinput.${buildver}_ljb_sift_dropped
|
|
829 sed -i '1i\db\tLJB_SIFT\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
830 joinresults originalfile $annovarout 3 4 5 6 7 B.LJB_SIFT
|
|
831 fi
|
|
832
|
|
833
|
|
834 # PolyPhen2
|
|
835 if [ $polyphen2 == "Y" ]
|
|
836 then
|
|
837 echo -e "\nPolyPhen Annotation"
|
|
838 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ljb_pp2 annovarinput $humandb 2>&1
|
|
839
|
|
840 annovarout=annovarinput.${buildver}_ljb_pp2_dropped
|
|
841 sed -i '1i\db\tPolyPhen2\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
842 joinresults originalfile $annovarout 3 4 5 6 7 B.PolyPhen2
|
|
843 fi
|
|
844
|
|
845
|
|
846 # MutationTaster
|
|
847 if [ $mutationtaster == "Y" ]
|
|
848 then
|
|
849 echo -e "\nMutationTaster Annotation"
|
|
850 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ljb_mt annovarinput $humandb 2>&1
|
|
851
|
|
852 annovarout=annovarinput.${buildver}_ljb_mt_dropped
|
|
853 sed -i '1i\db\tMutationTaster\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
854 joinresults originalfile $annovarout 3 4 5 6 7 B.MutationTaster
|
|
855 fi
|
|
856
|
|
857
|
|
858 # LRT
|
|
859 if [ $lrt == "Y" ]
|
|
860 then
|
|
861 echo -e "\nLRT Annotation"
|
|
862 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ljb_lrt annovarinput $humandb 2>&1
|
|
863
|
|
864 annovarout=annovarinput.${buildver}_ljb_lrt_dropped
|
|
865 sed -i '1i\db\tLikelihoodRatioTestScore\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
866 joinresults originalfile $annovarout 3 4 5 6 7 B.LikelihoodRatioTestScore
|
|
867 fi
|
|
868
|
|
869 # PhyloP
|
|
870 if [ $phylop == "Y" ]
|
|
871 then
|
|
872 echo -e "\nPhyloP Annotation"
|
|
873 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype ljb_phylop annovarinput $humandb 2>&1
|
|
874
|
|
875 annovarout=annovarinput.${buildver}_ljb_phylop_dropped
|
|
876 sed -i '1i\db\tPhyloP\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
877 joinresults originalfile $annovarout 3 4 5 6 7 B.PhyloP
|
|
878 fi
|
|
879
|
|
880
|
|
881 ### ESP Exome Variant Server
|
|
882 if [ $esp != "None" ]
|
|
883 then
|
|
884 echo -e "\nESP Annotation"
|
|
885 for version in $espstr
|
|
886 do
|
|
887 echo "version: $version"
|
|
888 # 6500si ALL
|
|
889 if [ $version == "esp6500si_all" ]
|
|
890 then
|
|
891 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_all annovarinput $humandb 2>&1
|
|
892
|
|
893 annovarout=annovarinput.${buildver}_esp6500si_all_dropped
|
|
894 sed -i '1i\db\t'$esp6500si_colheader_ALL'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
895 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500si_colheader_ALL
|
|
896 fi
|
|
897
|
|
898
|
|
899 # 6500si European American
|
|
900 if [ $version == "esp6500si_ea" ]
|
|
901 then
|
|
902 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_ea annovarinput $humandb 2>&1
|
|
903
|
|
904 annovarout=annovarinput.${buildver}_esp6500si_ea_dropped
|
|
905 sed -i '1i\db\t'$esp6500si_colheader_EA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'" ' $annovarout
|
|
906 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500si_colheader_EA
|
|
907 fi
|
|
908
|
|
909 # 6500si African Americans
|
|
910 if [ $version == "esp6500si_aa" ]
|
|
911 then
|
|
912 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_aa annovarinput $humandb 2>&1
|
|
913
|
|
914 annovarout=annovarinput.${buildver}_esp6500si_aa_dropped
|
|
915 sed -i '1i\db\t'$esp6500si_colheader_AA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'" ' $annovarout
|
|
916 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500si_colheader_AA
|
|
917 fi
|
|
918
|
|
919
|
|
920 # 6500 ALL
|
|
921 if [ $version == "esp6500_all" ]
|
|
922 then
|
|
923 ls
|
|
924 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500_all annovarinput $humandb 2>&1
|
|
925
|
|
926 annovarout=annovarinput.${buildver}_esp6500_all_dropped
|
|
927 sed -i '1i\db\t'$esp6500_colheader_ALL'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'" ' $annovarout
|
|
928 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_ALL
|
|
929 fi
|
|
930
|
|
931
|
|
932 # 6500 European American
|
|
933 if [ $version == "esp6500_ea" ]
|
|
934 then
|
|
935 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500_ea annovarinput $humandb 2>&1
|
|
936 annovarout=annovarinput.${buildver}_esp6500_ea_dropped
|
|
937 sed -i '1i\db\t'$esp6500_colheader_EA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
938 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_EA
|
|
939 fi
|
|
940
|
|
941 # 6500 African Americans
|
|
942 if [ $version == "esp6500_aa" ]
|
|
943 then
|
|
944 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500_aa annovarinput $humandb 2>&1
|
|
945
|
|
946 annovarout=annovarinput.${buildver}_esp6500_aa_dropped
|
|
947 sed -i '1i\db\t'$esp6500_colheader_AA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
948 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_AA
|
|
949 fi
|
|
950
|
|
951
|
|
952 # 5400 ALL
|
|
953 if [ $version == "esp5400_all" ]
|
|
954 then
|
|
955 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp5400_all annovarinput $humandb 2>&1
|
|
956
|
|
957 annovarout=annovarinput.${buildver}_esp5400_all_dropped
|
|
958 sed -i '1i\db\t'$esp5400_colheader_ALL'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
959 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp5400_colheader_ALL
|
|
960 fi
|
|
961
|
|
962
|
|
963 # 5400 European American
|
|
964 if [ $version == "esp5400_ea" ]
|
|
965 then
|
|
966 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp5400_ea annovarinput $humandb 2>&1
|
|
967
|
|
968 annovarout=annovarinput.${buildver}_esp5400_ea_dropped
|
|
969 sed -i '1i\db\t'$esp5400_colheader_EA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
970 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp5400_colheader_EA
|
|
971 fi
|
|
972
|
|
973 # 5400 African Americans
|
|
974 if [ $version == "esp5400_aa" ]
|
|
975 then
|
|
976 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp5400_aa annovarinput $humandb 2>&1
|
|
977
|
|
978 annovarout=annovarinput.${buildver}_esp5400_aa_dropped
|
|
979 sed -i '1i\db\t'$esp5400_colheader_AA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
980 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp5400_colheader_AA
|
|
981 fi
|
|
982
|
|
983 done
|
|
984 fi
|
|
985
|
|
986
|
|
987 #ESP6500
|
|
988 if [ $esp == "Y" ]
|
|
989 then
|
|
990 echo -e "\nESP Annotation OLD"
|
|
991 # ALL
|
|
992 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_all annovarinput $humandb 2>&1
|
|
993
|
|
994 annovarout=annovarinput.${buildver}_esp6500si_all_dropped
|
|
995 sed -i '1i\db\t'$esp6500_colheader_ALL'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
996 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_ALL
|
|
997
|
|
998
|
|
999 # European American
|
|
1000 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_ea annovarinput $humandb 2>&1
|
|
1001
|
|
1002 annovarout=annovarinput.${buildver}_esp6500si_ea_dropped
|
|
1003 sed -i '1i\db\t'$esp6500_colheader_EA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1004 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_EA
|
|
1005
|
|
1006 # African Americans
|
|
1007 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype esp6500si_aa annovarinput $humandb 2>&1
|
|
1008
|
|
1009 annovarout=annovarinput.${buildver}_esp6500si_aa_dropped
|
|
1010 sed -i '1i\db\t'$esp6500_colheader_AA'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1011 joinresults originalfile $annovarout 3 4 5 6 7 B.$esp6500_colheader_AA
|
|
1012 fi
|
|
1013
|
|
1014
|
|
1015
|
|
1016 #GERP++
|
|
1017 if [ $gerp == "Y" ]
|
|
1018 then
|
|
1019 echo -e "\nGERP++ Annotation"
|
|
1020 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype gerp++gt2 annovarinput $humandb 2>&1
|
|
1021
|
|
1022 annovarout="annovarinput.${buildver}_gerp++gt2_dropped"
|
|
1023 sed -i '1i\db\tGERP++\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1024 joinresults originalfile $annovarout 3 4 5 6 7 B.GERP++
|
|
1025 fi
|
|
1026
|
|
1027
|
|
1028 #COSMIC
|
|
1029 if [[ $cosmic61 == "Y" && $buildver == "hg19" ]]
|
|
1030 then
|
|
1031 echo -e "\nCOSMIC61 Annotation"
|
|
1032 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cosmic61 annovarinput $humandb 2>&1
|
|
1033
|
|
1034 annovarout="annovarinput.${buildver}_cosmic61_dropped"
|
|
1035 sed -i '1i\db\tCOSMIC61\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1036 joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC61
|
|
1037
|
|
1038 fi
|
|
1039
|
|
1040 if [[ $cosmic63 == "Y" && $buildver == "hg19" ]]
|
|
1041 then
|
|
1042 echo -e "\nCOSMIC63 Annotation"
|
|
1043 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cosmic63 annovarinput $humandb 2>&1
|
|
1044
|
|
1045 annovarout="annovarinput.${buildver}_cosmic63_dropped"
|
|
1046 sed -i '1i\db\tCOSMIC63\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1047 joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC63
|
|
1048
|
|
1049 fi
|
|
1050
|
|
1051 if [[ $cosmic64 == "Y" && $buildver == "hg19" ]]
|
|
1052 then
|
|
1053 echo -e "\nCOSMIC64 Annotation"
|
|
1054 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cosmic64 annovarinput $humandb 2>&1
|
|
1055
|
|
1056 annovarout="annovarinput.${buildver}_cosmic64_dropped"
|
|
1057 sed -i '1i\db\tCOSMIC64\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1058 joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC64
|
|
1059
|
|
1060 fi
|
|
1061
|
|
1062 if [[ $cosmic65 == "Y" && $buildver == "hg19" ]]
|
|
1063 then
|
|
1064 echo -e "\nCOSMIC65 Annotation"
|
|
1065 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cosmic65 annovarinput $humandb 2>&1
|
|
1066
|
|
1067 annovarout="annovarinput.${buildver}_cosmic65_dropped"
|
|
1068 sed -i '1i\db\tCOSMIC65\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1069 joinresults originalfile $annovarout 3 4 5 6 7 B.COSMIC65
|
|
1070
|
|
1071 fi
|
|
1072
|
|
1073 #cg46
|
|
1074 if [[ $cg46 == "Y" ]]
|
|
1075 then
|
|
1076 echo -e "\nCG 46 genomes Annotation"
|
|
1077 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cg46 annovarinput $humandb 2>&1
|
|
1078
|
|
1079 annovarout="annovarinput.${buildver}_cg46_dropped"
|
|
1080 sed -i '1i\db\t'${cg46_colheader}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1081 joinresults originalfile $annovarout 3 4 5 6 7 B.${cg46_colheader}
|
|
1082
|
|
1083 fi
|
|
1084
|
|
1085
|
|
1086 #cg69
|
|
1087 if [[ $cg69 == "Y" ]]
|
|
1088 then
|
|
1089 echo -e "\nCG 69 genomes Annotation"
|
|
1090 $scriptsdir/annotate_variation.pl --filter --buildver $buildver -dbtype cg69 annovarinput $humandb 2>&1
|
|
1091
|
|
1092 annovarout="annovarinput.${buildver}_cg69_dropped"
|
|
1093 sed -i '1i\db\t'${cg69_colheader}'\tchromosome\tstart\tend\treference\talleleSeq"'"$vcfheader"'"' $annovarout
|
|
1094 joinresults originalfile $annovarout 3 4 5 6 7 B.${cg69_colheader}
|
|
1095
|
|
1096 fi
|
|
1097
|
|
1098
|
|
1099
|
|
1100 if [ $convertcoords == "Y" ]
|
|
1101 then
|
|
1102 echo "converting back coordinates"
|
|
1103 awk 'BEGIN{
|
|
1104 FS="\t";
|
|
1105 OFS="\t";
|
|
1106 }{
|
|
1107 if (FNR==1)
|
|
1108 print $0
|
|
1109 if(FNR>1) {
|
|
1110 $"'"${chrcol}"'" = "chr"$"'"${chrcol}"'"
|
|
1111 if( $"'"${vartypecol}"'" == "snp" ){ $"'"${startcol}"'" -= 1 };
|
|
1112 if( $"'"${vartypecol}"'" == "ins" ){ $"'"${refcol}"'" = "" };
|
|
1113 if( $"'"${vartypecol}"'" == "del" ){ $"'"${startcol}"'" -=1; $"'"${obscol}"'" = "" };
|
|
1114 if( $"'"${vartypecol}"'" == "sub" ){ $"'"${startcol}"'" -= 1 };
|
|
1115 print $0
|
|
1116
|
|
1117 }
|
|
1118 }
|
|
1119 END{
|
|
1120 }' originalfile > originalfile_coords
|
|
1121 else
|
|
1122 mv originalfile originalfile_coords
|
|
1123 fi
|
|
1124
|
|
1125 #restore "chr" prefix?
|
|
1126
|
|
1127 #move to outputfile
|
|
1128 if [ ! -s annovarinput.invalid_input ]
|
|
1129 then
|
|
1130 echo "Congrats, your input file contained no invalid lines!" > annovarinput.invalid_input
|
|
1131 fi
|
|
1132
|
|
1133 cp originalfile_coords $outfile_all
|
|
1134 cp annovarinput.invalid_input $outfile_invalid 2>&1
|
|
1135 fi #if $dorunannovar
|
|
1136
|
|
1137
|
|
1138
|
|
1139
|
|
1140
|
|
1141 ############################################
|
|
1142 #
|
|
1143 # Filter Annotated Variants
|
|
1144 #
|
|
1145 ############################################
|
|
1146
|
|
1147
|
|
1148 if [[ $dofilter == "Y" ]]
|
|
1149 then
|
|
1150 echo "starting filtering"
|
|
1151 cp originalfile filteredfile
|
|
1152
|
|
1153 ### do the filtering
|
|
1154 # usage: runfilter <column name> <threshold> (-1=do not filter, 0=filter any value)
|
|
1155
|
|
1156 #1000genomes
|
|
1157 runfilter filteredfile ${g1000_colheader_ALL} ${threshold_1000g_ALL}
|
|
1158 runfilter filteredfile ${g1000_colheader_AFR} ${threshold_1000g_AFR}
|
|
1159 runfilter filteredfile ${g1000_colheader_AMR} ${threshold_1000g_AMR}
|
|
1160 runfilter filteredfile ${g1000_colheader_ASN} ${threshold_1000g_ASN}
|
|
1161 runfilter filteredfile ${g1000_colheader_EUR} ${threshold_1000g_EUR}
|
|
1162
|
|
1163 #esp
|
|
1164 runfilter filteredfile ${esp6500_colheader_ALL} ${threshold_ESP6500_ALL}
|
|
1165 runfilter filteredfile ${esp6500_colheader_EA} ${threshold_ESP6500_EA}
|
|
1166 runfilter filteredfile ${esp6500_colheader_AA} ${threshold_ESP6500_AA}
|
|
1167
|
|
1168 #dbsnp
|
|
1169 for version in $filt_dbsnpstr
|
|
1170 do
|
|
1171 if [ $version == "None" ]
|
|
1172 then
|
|
1173 break
|
|
1174 fi
|
|
1175 runfilter filteredfile "db$version" "text" #-42 will filter any non-empty string in that field
|
|
1176
|
|
1177 done
|
|
1178
|
|
1179 #complete genomics
|
|
1180 runfilter filteredfile ${cg46_colheader} ${threshold_cg46}
|
|
1181 runfilter filteredfile ${cg69_colheader} ${threshold_cg69}
|
|
1182
|
|
1183 #move filtered output file to galaxy output file
|
|
1184 cp filteredfile $outfile_filt
|
|
1185
|
|
1186 fi
|
|
1187
|
|
1188
|
|
1189
|
|
1190
|
|
1191
|
|
1192
|
|
1193
|
|
1194
|
|
1195
|
|
1196
|
|
1197
|
|
1198
|
|
1199
|
|
1200
|
|
1201
|
|
1202
|
|
1203
|