Mercurial > repos > saskia-hiltemann > virtual_normal_analysis
comparison TV-vs-background.sh @ 0:1209f18a5a83 draft
Uploaded
author | saskia-hiltemann |
---|---|
date | Mon, 03 Aug 2015 05:01:15 -0400 |
parents | |
children | 885ba15c2564 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1209f18a5a83 |
---|---|
1 #!/bin/bash | |
2 | |
3 #TV-vs-background.sh $variants $genomes ${reference.fields.crr_path} ${reference.fields.31G_var_paths} ${reference.54G_var_paths} $threshold $output_all $output_filtered | |
4 | |
5 echo $@ | |
6 | |
7 set -- `getopt -n$0 -u -a --longoptions="variants: reference: VN_varfiles: outputfile_filtered: outputfile_all: threshold: thresholdhc:" "h:" "$@"` || usage | |
8 [ $# -eq 0 ] && usage | |
9 | |
10 while [ $# -gt 0 ] | |
11 do | |
12 case "$1" in | |
13 --variants) variants=$2;shift;; | |
14 --reference) crr=$2;shift;; | |
15 --VN_varfiles) VN_varfiles_list=$2;shift;; | |
16 --outputfile_filtered) output_filtered=$2;shift;; | |
17 --outputfile_all) output_all=$2;shift;; | |
18 --threshold) threshold=$2;shift;; | |
19 --thresholdhc) thresholdhc=$2;shift;; | |
20 -h) shift;; | |
21 --) shift;break;; | |
22 -*) usage;; | |
23 *) break;; | |
24 esac | |
25 shift | |
26 done | |
27 | |
28 # replace newline chars with spaces for input to testvariants | |
29 tr '\n' ' ' < $VN_varfiles_list > VN_varfiles.txt | |
30 | |
31 | |
32 ### run TestVariants against 31G, 54G or 85G | |
33 | |
34 echo "number of normals: $VNsetsize" | |
35 echo "list of normals: ($VN_varfiles_list)" | |
36 cat VN_varfiles.txt | |
37 | |
38 | |
39 echo "running TV against Virtual Normal set" | |
40 echo "command: cgatools testvariants\ | |
41 --beta \ | |
42 --reference $crr \ | |
43 --input $variants \ | |
44 --output $output_all \ | |
45 --variants `cat VN_varfiles.txt`" | |
46 | |
47 cgatools testvariants \ | |
48 --beta \ | |
49 --reference $crr \ | |
50 --input $variants \ | |
51 --output $output_all \ | |
52 --variants `cat VN_varfiles.txt` | |
53 | |
54 | |
55 | |
56 VNsetsize=`cat $VN_varfiles_list | wc -l` | |
57 | |
58 | |
59 | |
60 ### filter file based on occurrence in background genomes | |
61 cp $output_all $output_filtered | |
62 cp $output_all output_expanded | |
63 | |
64 ### condens file to columns with counts for all background genomes | |
65 echo "Counting..." | |
66 awk 'BEGIN{ | |
67 FS="\t"; | |
68 OFS="\t"; | |
69 totalnormals="'"$VNsetsize"'"+0 | |
70 count["00"]="0"; | |
71 count["01"]="0"; | |
72 count["11"]="0"; | |
73 count["0N"]="0"; | |
74 count["1N"]="0"; | |
75 count["NN"]="0"; | |
76 count["0"]="0"; | |
77 count["1"]="0"; | |
78 count["N"]="0"; | |
79 }{ | |
80 if(FNR==1) # header | |
81 print $1,$2,$3,$4,$5,$6,$7,$8,"VN_occurrences","VN_frequency","VN_fullycalled_count","VN_fullycalled_frequency","VN_00","VN_01","VN_11","VN_0N","VN_1N","VN_NN","VN_0","VN_1","VN_N" | |
82 else{ | |
83 #count entries in reference genomes | |
84 for (c in count) | |
85 count[c]=0; | |
86 for (i=9; i<=NF; i++){ | |
87 count[$i]++; | |
88 } | |
89 occurrences=count["11"]+count["01"]+count["1N"]+count["1"] | |
90 fullycalled=count["11"]+count["01"]+count["00"]+count["1"]+count["0"] | |
91 print $1,$2,$3,$4,$5,$6,$7,$8,occurrences,occurrences/totalnormals,fullycalled,fullycalled/totalnormals,count["00"],count["01"],count["11"],count["0N"],count["1N"],count["NN"],count["0"],count["1"],count["N"] | |
92 } | |
93 }END{ | |
94 | |
95 | |
96 }' $output_all > "${output_all}-counted" | |
97 | |
98 | |
99 # this counted file is the final output file | |
100 rm $output_all | |
101 mv "${output_all}-counted" $output_all | |
102 | |
103 | |
104 | |
105 ### filter out variants occurring in more than <threshold> of the background genomes | |
106 # if total of columns containing a 1 (01,11,1N,1) is >= threshold | |
107 awk 'BEGIN{ | |
108 FS="\t"; | |
109 OFS="\t"; | |
110 }{ | |
111 if(FNR==1){ | |
112 print $0 | |
113 } | |
114 if(FNR>1){ | |
115 if($9 < "'"$threshold"'" ) | |
116 print $0 | |
117 } | |
118 }END{}' $output_all > $output_filtered | |
119 | |
120 | |
121 awk 'BEGIN{ | |
122 FS="\t"; | |
123 OFS="\t"; | |
124 threshold="'"${thresholdhc}"'"+0 | |
125 }{ | |
126 if(FNR==1) | |
127 print $0 | |
128 else if($11 >= threshold) | |
129 print $0 | |
130 | |
131 }END{}' $output_filtered > "output_filtered_highconf.tsv" | |
132 | |
133 | |
134 | |
135 | |
136 | |
137 | |
138 | |
139 |