Mercurial > repos > gandres > vcftools_filter_stats_diversity
comparison VCFToolsSlidingWindow/VCFToolsSlidingWindow.pl @ 3:612066e3f57d draft
Uploaded
author | gandres |
---|---|
date | Mon, 09 Nov 2015 05:18:45 -0500 |
parents | ac7c9e40d601 |
children | b762ecbe2314 |
comparison
equal
deleted
inserted
replaced
2:ac7c9e40d601 | 3:612066e3f57d |
---|---|
1 | 1 |
2 #!/usr/bin/perl | 2 #!/usr/bin/perl |
3 | 3 |
4 use strict; | 4 use strict; |
5 use Getopt::Long; | 5 use Getopt::Long; |
6 use Bio::SeqIO; | |
6 | 7 |
7 my $usage = qq~Usage:$0 <args> [<opts>] | 8 my $usage = qq~Usage:$0 <args> [<opts>] |
8 | 9 |
9 where <args> are: | 10 where <args> are: |
10 | 11 |
70 else{ | 71 else{ |
71 die "Error: window size must be an integer\n"; | 72 die "Error: window size must be an integer\n"; |
72 } | 73 } |
73 | 74 |
74 | 75 |
75 my $VCFTOOLS_EXE = "vcftools"; | 76 my $input_format = "--vcf $input.sorted"; |
76 | 77 if ($input =~/\.vcf/){ |
77 system("vcf-sort $input >$input.sorted"); | 78 system("vcf-sort $input >$input.sorted"); |
78 | 79 $input_format = "--vcf $input.sorted"; |
79 | 80 } |
80 | 81 elsif ($input =~/\.bcf/){ |
81 system("$VCFTOOLS_EXE --vcf $input.sorted --out $out --window-pi $window --window-pi-step $step >>$out.vcftools.log 2>&1"); | 82 $input_format = "--bcf $input"; |
82 system("$VCFTOOLS_EXE --vcf $input.sorted --out $out --TajimaD $window >>$out.vcftools.log 2>&1"); | 83 } |
83 system("$VCFTOOLS_EXE --vcf $input.sorted --out $out --TsTv $window >>$out.vcftools.log 2>&1"); | 84 system("vcftools $input_format --out $out --window-pi $window --window-pi-step $step >>$out.vcftools.log 2>&1"); |
84 system("$VCFTOOLS_EXE --vcf $input.sorted --out $out --SNPdensity $window >>$out.vcftools.log 2>&1"); | 85 system("vcftools $input_format --out $out --TajimaD $window >>$out.vcftools.log 2>&1"); |
86 system("vcftools $input_format --out $out --TsTv $window >>$out.vcftools.log 2>&1"); | |
87 system("vcftools $input_format --out $out --SNPdensity $window >>$out.vcftools.log 2>&1"); | |
88 | |
89 | |
90 | |
91 ################################################## | |
92 # Tajima distribution | |
93 ################################################## | |
94 my %counts3; | |
95 my @values; | |
96 open(my $TAJIMA,"$out.Tajima.D"); | |
97 <$TAJIMA>; | |
98 my $max = 0; | |
99 while(<$TAJIMA>){ | |
100 my $line = $_; | |
101 $line =~s/\n//g; | |
102 $line =~s/\r//g; | |
103 my @inf = split(/\t/,$line); | |
104 my $val = $inf[3]; | |
105 my $arrondi = sprintf("%.1f",$val); | |
106 $counts3{$arrondi}++; | |
107 } | |
108 close($TAJIMA); | |
109 | |
110 | |
111 | |
112 open(my $M3,">$out.tajima_distrib"); | |
113 print $M3 "Values nb of values\n"; | |
114 for (my $i = -2.5; $i <= 2.5; $i=$i+0.1) | |
115 { | |
116 my $arrondi = sprintf("%.1f",$i); | |
117 my $nb = 0; | |
118 if ($counts3{$arrondi}) | |
119 { | |
120 $nb = $counts3{$arrondi}; | |
121 } | |
122 print $M3 "Tajima $arrondi $nb\n"; | |
123 } | |
124 close($M3); | |
125 | |
85 | 126 |
86 if (keys(%hash) > 0) | 127 if (keys(%hash) > 0) |
87 { | 128 { |
88 my $files_pi = ""; | 129 my $files_pi = ""; |
89 my $files_dtajima = ""; | 130 my $files_dtajima = ""; |
131 my $cmd_fst = "--fst-window-size $window"; | |
132 my $cmd_fst_by_marker = ""; | |
90 foreach my $pop(sort(keys(%hash))) | 133 foreach my $pop(sort(keys(%hash))) |
91 { | 134 { |
135 my $cmd_part = $hash{$pop}; | |
136 | |
137 | |
138 open(my $POP,">$input.$pop.txt"); | |
139 my @ind_of_pop = split(" --indv ",$cmd_part); | |
140 print $POP join("\n",@ind_of_pop); | |
141 close($POP); | |
142 $cmd_fst .= " --weir-fst-pop $input.$pop.txt"; | |
143 $cmd_fst_by_marker .= " --weir-fst-pop $input.$pop.txt"; | |
144 | |
92 my $cmd_part = $hash{$pop}; | 145 my $cmd_part = $hash{$pop}; |
93 system("$VCFTOOLS_EXE --vcf $input.sorted --remove-filtered-all --out $out.$pop --window-pi $window --window-pi-step $step $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); | 146 system("vcftools $input_format --remove-filtered-all --out $out.$pop --window-pi $window --window-pi-step $step $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); |
94 my $sed_cmd = "sed -i \"s\/PI\/$pop\/g\" $out.$pop.windowed.pi"; | 147 my $sed_cmd = "sed -i \"s\/PI\/$pop\/g\" $out.$pop.windowed.pi"; |
95 system($sed_cmd); | 148 system($sed_cmd); |
96 $files_pi .= "$out.$pop.windowed.pi "; | 149 $files_pi .= "$out.$pop.windowed.pi "; |
97 | 150 |
98 system("$VCFTOOLS_EXE --vcf $input.sorted --remove-filtered-all --out $out.$pop --SNPdensity $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); | 151 system("vcftools $input_format --remove-filtered-all --out $out.$pop --SNPdensity $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); |
99 | 152 |
100 system("$VCFTOOLS_EXE --vcf $input.sorted --remove-filtered-all --out $out.$pop --TajimaD $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); | 153 system("vcftools $input_format --remove-filtered-all --out $out.$pop --TajimaD $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); |
101 my $sed_cmd = "sed -i \"s\/TajimaD\/$pop\/g\" $out.$pop.Tajima.D"; | 154 my $sed_cmd = "sed -i \"s\/TajimaD\/$pop\/g\" $out.$pop.Tajima.D"; |
102 system($sed_cmd); | 155 system($sed_cmd); |
103 $sed_cmd = "sed -i \"s/nan/0/g\" $out.Tajima.D"; | 156 $sed_cmd = "sed -i \"s/nan/0/g\" $out.Tajima.D"; |
104 system($sed_cmd); | 157 system($sed_cmd); |
105 $files_dtajima .= "$out.$pop.Tajima.D "; | 158 $files_dtajima .= "$out.$pop.Tajima.D "; |
106 | 159 |
107 system("$VCFTOOLS_EXE --vcf $input.sorted --remove-filtered-all --out $out.$pop --TsTv $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); | 160 system("vcftools $input_format --remove-filtered-all --out $out.$pop --TsTv $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); |
108 } | 161 } |
109 system("paste $files_pi >>$out.combined.pi"); | 162 system("paste $files_pi >>$out.combined.pi"); |
110 my $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$5\"\t\"\$10'} $out.combined.pi >$out.combined.pi.txt"; | 163 my $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$5\"\t\"\$10'} $out.combined.pi >$out.combined.pi.txt"; |
111 system($awk_cmd); | 164 system($awk_cmd); |
112 | 165 |
113 system("paste $files_dtajima >>$out.combined.dtajima"); | 166 system("paste $files_dtajima >>$out.combined.dtajima"); |
114 my $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$4\"\t\"\$8'} $out.combined.dtajima >$out.combined.dtajima.txt"; | 167 $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$4\"\t\"\$8'} $out.combined.dtajima >$out.combined.dtajima.txt"; |
115 system($awk_cmd); | 168 system($awk_cmd); |
169 | |
170 system("vcftools $input_format --remove-filtered-all --out $out.fst $cmd_fst --maf 0.001 >>$out.vcftools.log 2>&1"); | |
171 $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$5'} $out.fst.windowed.weir.fst >$out.fst.txt"; | |
172 system($awk_cmd); | |
173 | |
174 system("vcftools $input_format --remove-filtered-all --out $out.fst $cmd_fst_by_marker --maf 0.001 >>$out.vcftools.log 2>&1"); | |
175 | |
176 my %genes; | |
177 open(my $V,$input); | |
178 my $go = 0; | |
179 while(<$V>){ | |
180 if ($go){ | |
181 my @infos = split(/\t/,$_); | |
182 my $chr = $infos[0]; | |
183 my $pos = $infos[1]; | |
184 my $annot = $infos[7]; | |
185 my $gene = "#"; | |
186 if ($annot =~/EFF=\w+\((.*)\)/){ | |
187 my @parts = split(/\|/,$1); | |
188 $gene = $parts[4]; | |
189 } | |
190 $genes{"$chr:$pos"} = $gene; | |
191 } | |
192 if (/^#CHROM/){$go = 1;} | |
193 } | |
194 | |
195 my $cumul = 0; | |
196 my $previous_pos; | |
197 my %chrs; | |
198 open(my $F,"$out.fst.weir.fst"); | |
199 open(my $O,">$out.fst.by_marker.txt"); | |
200 open(my $O2,">$out.fst.by_marker.genes.txt"); | |
201 print $O "Chromosome Marker Position Fst\n"; | |
202 print $O2 "Chromosome Marker Position Fst Gene\n"; | |
203 while(<$F>){ | |
204 my $line = $_; | |
205 chomp($line); | |
206 my ($chr,$pos,$val) = split(/\t/,$line); | |
207 if (!$chrs{$chr}){ | |
208 $cumul = $previous_pos; | |
209 $chrs{$chr} = 1; | |
210 } | |
211 if ($val > 0.6 && $val !~/-/ && $val ne 'nan'){ | |
212 my $x = $pos+$cumul; | |
213 print $O2 "$chr $chr:$pos $pos $val " . $genes{"$chr:$pos"} . "\n"; | |
214 print $O "$chr $chr:$pos $x $val\n"; | |
215 $previous_pos = $x; | |
216 } | |
217 | |
218 } | |
219 close($F); | |
220 close($O); | |
221 close($O2); | |
222 | |
223 #$awk_cmd = "awk {'if (\$3>0.6 && \$3<=1 && \$3!~\"nan\"){print \$1\"\t\"\$2\"\t\"\$2\"\t\"\$3}'} $out.fst.weir.fst >$out.fst.by_marker.txt"; | |
224 #system($awk_cmd); | |
116 } | 225 } |
117 | 226 |
118 | 227 |
119 | 228 |
120 | 229 |