comparison VCFToolsSlidingWindow/VCFToolsSlidingWindow.pl @ 3:612066e3f57d draft

Uploaded
author gandres
date Mon, 09 Nov 2015 05:18:45 -0500
parents ac7c9e40d601
children b762ecbe2314
comparison
equal deleted inserted replaced
2:ac7c9e40d601 3:612066e3f57d
1 1
2 #!/usr/bin/perl 2 #!/usr/bin/perl
3 3
4 use strict; 4 use strict;
5 use Getopt::Long; 5 use Getopt::Long;
6 use Bio::SeqIO;
6 7
7 my $usage = qq~Usage:$0 <args> [<opts>] 8 my $usage = qq~Usage:$0 <args> [<opts>]
8 9
9 where <args> are: 10 where <args> are:
10 11
70 else{ 71 else{
71 die "Error: window size must be an integer\n"; 72 die "Error: window size must be an integer\n";
72 } 73 }
73 74
74 75
75 my $VCFTOOLS_EXE = "vcftools"; 76 my $input_format = "--vcf $input.sorted";
76 77 if ($input =~/\.vcf/){
77 system("vcf-sort $input >$input.sorted"); 78 system("vcf-sort $input >$input.sorted");
78 79 $input_format = "--vcf $input.sorted";
79 80 }
80 81 elsif ($input =~/\.bcf/){
81 system("$VCFTOOLS_EXE --vcf $input.sorted --out $out --window-pi $window --window-pi-step $step >>$out.vcftools.log 2>&1"); 82 $input_format = "--bcf $input";
82 system("$VCFTOOLS_EXE --vcf $input.sorted --out $out --TajimaD $window >>$out.vcftools.log 2>&1"); 83 }
83 system("$VCFTOOLS_EXE --vcf $input.sorted --out $out --TsTv $window >>$out.vcftools.log 2>&1"); 84 system("vcftools $input_format --out $out --window-pi $window --window-pi-step $step >>$out.vcftools.log 2>&1");
84 system("$VCFTOOLS_EXE --vcf $input.sorted --out $out --SNPdensity $window >>$out.vcftools.log 2>&1"); 85 system("vcftools $input_format --out $out --TajimaD $window >>$out.vcftools.log 2>&1");
86 system("vcftools $input_format --out $out --TsTv $window >>$out.vcftools.log 2>&1");
87 system("vcftools $input_format --out $out --SNPdensity $window >>$out.vcftools.log 2>&1");
88
89
90
91 ##################################################
92 # Tajima distribution
93 ##################################################
94 my %counts3;
95 my @values;
96 open(my $TAJIMA,"$out.Tajima.D");
97 <$TAJIMA>;
98 my $max = 0;
99 while(<$TAJIMA>){
100 my $line = $_;
101 $line =~s/\n//g;
102 $line =~s/\r//g;
103 my @inf = split(/\t/,$line);
104 my $val = $inf[3];
105 my $arrondi = sprintf("%.1f",$val);
106 $counts3{$arrondi}++;
107 }
108 close($TAJIMA);
109
110
111
112 open(my $M3,">$out.tajima_distrib");
113 print $M3 "Values nb of values\n";
114 for (my $i = -2.5; $i <= 2.5; $i=$i+0.1)
115 {
116 my $arrondi = sprintf("%.1f",$i);
117 my $nb = 0;
118 if ($counts3{$arrondi})
119 {
120 $nb = $counts3{$arrondi};
121 }
122 print $M3 "Tajima $arrondi $nb\n";
123 }
124 close($M3);
125
85 126
86 if (keys(%hash) > 0) 127 if (keys(%hash) > 0)
87 { 128 {
88 my $files_pi = ""; 129 my $files_pi = "";
89 my $files_dtajima = ""; 130 my $files_dtajima = "";
131 my $cmd_fst = "--fst-window-size $window";
132 my $cmd_fst_by_marker = "";
90 foreach my $pop(sort(keys(%hash))) 133 foreach my $pop(sort(keys(%hash)))
91 { 134 {
135 my $cmd_part = $hash{$pop};
136
137
138 open(my $POP,">$input.$pop.txt");
139 my @ind_of_pop = split(" --indv ",$cmd_part);
140 print $POP join("\n",@ind_of_pop);
141 close($POP);
142 $cmd_fst .= " --weir-fst-pop $input.$pop.txt";
143 $cmd_fst_by_marker .= " --weir-fst-pop $input.$pop.txt";
144
92 my $cmd_part = $hash{$pop}; 145 my $cmd_part = $hash{$pop};
93 system("$VCFTOOLS_EXE --vcf $input.sorted --remove-filtered-all --out $out.$pop --window-pi $window --window-pi-step $step $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); 146 system("vcftools $input_format --remove-filtered-all --out $out.$pop --window-pi $window --window-pi-step $step $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1");
94 my $sed_cmd = "sed -i \"s\/PI\/$pop\/g\" $out.$pop.windowed.pi"; 147 my $sed_cmd = "sed -i \"s\/PI\/$pop\/g\" $out.$pop.windowed.pi";
95 system($sed_cmd); 148 system($sed_cmd);
96 $files_pi .= "$out.$pop.windowed.pi "; 149 $files_pi .= "$out.$pop.windowed.pi ";
97 150
98 system("$VCFTOOLS_EXE --vcf $input.sorted --remove-filtered-all --out $out.$pop --SNPdensity $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); 151 system("vcftools $input_format --remove-filtered-all --out $out.$pop --SNPdensity $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1");
99 152
100 system("$VCFTOOLS_EXE --vcf $input.sorted --remove-filtered-all --out $out.$pop --TajimaD $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); 153 system("vcftools $input_format --remove-filtered-all --out $out.$pop --TajimaD $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1");
101 my $sed_cmd = "sed -i \"s\/TajimaD\/$pop\/g\" $out.$pop.Tajima.D"; 154 my $sed_cmd = "sed -i \"s\/TajimaD\/$pop\/g\" $out.$pop.Tajima.D";
102 system($sed_cmd); 155 system($sed_cmd);
103 $sed_cmd = "sed -i \"s/nan/0/g\" $out.Tajima.D"; 156 $sed_cmd = "sed -i \"s/nan/0/g\" $out.Tajima.D";
104 system($sed_cmd); 157 system($sed_cmd);
105 $files_dtajima .= "$out.$pop.Tajima.D "; 158 $files_dtajima .= "$out.$pop.Tajima.D ";
106 159
107 system("$VCFTOOLS_EXE --vcf $input.sorted --remove-filtered-all --out $out.$pop --TsTv $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1"); 160 system("vcftools $input_format --remove-filtered-all --out $out.$pop --TsTv $window $cmd_part --maf 0.001 >>$out.vcftools.log 2>&1");
108 } 161 }
109 system("paste $files_pi >>$out.combined.pi"); 162 system("paste $files_pi >>$out.combined.pi");
110 my $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$5\"\t\"\$10'} $out.combined.pi >$out.combined.pi.txt"; 163 my $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$5\"\t\"\$10'} $out.combined.pi >$out.combined.pi.txt";
111 system($awk_cmd); 164 system($awk_cmd);
112 165
113 system("paste $files_dtajima >>$out.combined.dtajima"); 166 system("paste $files_dtajima >>$out.combined.dtajima");
114 my $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$4\"\t\"\$8'} $out.combined.dtajima >$out.combined.dtajima.txt"; 167 $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$4\"\t\"\$8'} $out.combined.dtajima >$out.combined.dtajima.txt";
115 system($awk_cmd); 168 system($awk_cmd);
169
170 system("vcftools $input_format --remove-filtered-all --out $out.fst $cmd_fst --maf 0.001 >>$out.vcftools.log 2>&1");
171 $awk_cmd = "awk {'print \$1\"\t\"\$2\"\t\"\$5'} $out.fst.windowed.weir.fst >$out.fst.txt";
172 system($awk_cmd);
173
174 system("vcftools $input_format --remove-filtered-all --out $out.fst $cmd_fst_by_marker --maf 0.001 >>$out.vcftools.log 2>&1");
175
176 my %genes;
177 open(my $V,$input);
178 my $go = 0;
179 while(<$V>){
180 if ($go){
181 my @infos = split(/\t/,$_);
182 my $chr = $infos[0];
183 my $pos = $infos[1];
184 my $annot = $infos[7];
185 my $gene = "#";
186 if ($annot =~/EFF=\w+\((.*)\)/){
187 my @parts = split(/\|/,$1);
188 $gene = $parts[4];
189 }
190 $genes{"$chr:$pos"} = $gene;
191 }
192 if (/^#CHROM/){$go = 1;}
193 }
194
195 my $cumul = 0;
196 my $previous_pos;
197 my %chrs;
198 open(my $F,"$out.fst.weir.fst");
199 open(my $O,">$out.fst.by_marker.txt");
200 open(my $O2,">$out.fst.by_marker.genes.txt");
201 print $O "Chromosome Marker Position Fst\n";
202 print $O2 "Chromosome Marker Position Fst Gene\n";
203 while(<$F>){
204 my $line = $_;
205 chomp($line);
206 my ($chr,$pos,$val) = split(/\t/,$line);
207 if (!$chrs{$chr}){
208 $cumul = $previous_pos;
209 $chrs{$chr} = 1;
210 }
211 if ($val > 0.6 && $val !~/-/ && $val ne 'nan'){
212 my $x = $pos+$cumul;
213 print $O2 "$chr $chr:$pos $pos $val " . $genes{"$chr:$pos"} . "\n";
214 print $O "$chr $chr:$pos $x $val\n";
215 $previous_pos = $x;
216 }
217
218 }
219 close($F);
220 close($O);
221 close($O2);
222
223 #$awk_cmd = "awk {'if (\$3>0.6 && \$3<=1 && \$3!~\"nan\"){print \$1\"\t\"\$2\"\t\"\$2\"\t\"\$3}'} $out.fst.weir.fst >$out.fst.by_marker.txt";
224 #system($awk_cmd);
116 } 225 }
117 226
118 227
119 228
120 229