annotate SNP_density/CalculateSlidingWindowsSNPdensitiesFromVCF.pl @ 6:ebb0ac9b6fa9 draft

planemo upload
author gandres
date Mon, 23 May 2016 17:49:17 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
1 #!/usr/bin/perl
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
2
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
3 use strict;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
4 use Switch;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
5 use Getopt::Long;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
6 use Bio::SeqIO;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
7
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
8 my $usage = qq~Usage:$0 <args> [<opts>]
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
9 where <args> are:
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
10 -i, --input <VCF/Hapmap input>
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
11 -o, --out <output in tabular format>
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
12 -s, --step <step (in bp)>
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
13 ~;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
14 $usage .= "\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
15
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
16 my ($input,$out,$step);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
17
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
18 GetOptions(
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
19 "input=s" => \$input,
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
20 "out=s" => \$out,
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
21 "step=s" => \$step,
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
22 );
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
23
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
24
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
25 die $usage
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
26 if ( !$input || !$step || !$out );
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
27
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
28 my $max_chr_num = 100;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
29
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
30 my %counts;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
31 my %counts_by_ind;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
32
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
33 my $VCFTOOLS_EXE = "vcftools";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
34
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
35 my $is_vcf = `head -4000 $input | grep -c CHROM`;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
36 my $is_bcf = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
37 if ($input =~/\.bcf/){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
38 $is_bcf = 1;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
39 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
40
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
41
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
42 my $IN;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
43 my $headers;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
44 my $start_indiv_retrieval = 12;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
45 my $chrom_retrieval = 2;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
46 my $pos_retrieval = 3;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
47 if ($is_vcf or $is_bcf){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
48 $start_indiv_retrieval = 9;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
49 $chrom_retrieval = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
50 $pos_retrieval = 1;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
51 if ($is_vcf){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
52 $headers = `grep '#CHROM' $input`;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
53 open($IN,$input);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
54 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
55 elsif ($is_bcf){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
56 my $cmd = "$VCFTOOLS_EXE --bcf $input --stdout --recode | head -4000 | grep CHROM";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
57 $headers = `$cmd`;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
58 my $cmd2 = "$VCFTOOLS_EXE --bcf $input --stdout --recode";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
59 open $IN, '-|' , "$cmd2" or die "Can not run Vcftools";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
60 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
61 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
62 else{
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
63 $headers= <$IN>;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
64 open($IN,$input);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
65 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
66 $headers=~s/\n//g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
67 $headers=~s/\r//g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
68 my @ind_names = split(/\t/,$headers);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
69 my @individual_names;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
70 for (my $i = $start_indiv_retrieval; $i <= $#ind_names; $i++)
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
71 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
72 push(@individual_names,$ind_names[$i]);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
73 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
74 my %maximums;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
75 while(<$IN>)
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
76 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
77 my $line = $_;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
78 $line=~s/\n//g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
79 $line=~s/\r//g;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
80 my @infos = split(/\t/,$line);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
81 if (scalar @infos > 8 && !/#CHROM/){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
82 my $chrom = $infos[$chrom_retrieval];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
83 my $position = $infos[$pos_retrieval];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
84 if ($position > $maximums{$chrom}){$maximums{$chrom}=$position;}
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
85 my $classe_position = int($position/$step);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
86 $counts{$chrom}{$classe_position}++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
87
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
88 my $ref_allele = $infos[11];
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
89 if ($is_vcf or $is_bcf){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
90 $ref_allele = "0/0";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
91 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
92 for (my $i = $start_indiv_retrieval; $i <= $#infos; $i++){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
93 if (!$counts_by_ind{$chrom}{$classe_position}{$i}){$counts_by_ind{$chrom}{$classe_position}{$i} = 0;}
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
94 if ($infos[$i] ne $ref_allele){
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
95 $counts_by_ind{$chrom}{$classe_position}{$i}++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
96 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
97 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
98 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
99 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
100 close($IN);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
101
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
102 #######################################################
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
103 # global
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
104 #######################################################
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
105 open(my $OUT,">$out");
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
106 print $OUT "Chromosome Position SNPs\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
107 my $chr_num = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
108 foreach my $chrom(sort keys(%counts))
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
109 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
110 $chr_num++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
111 my $ref_counts = $counts{$chrom};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
112 my %final_counts = %$ref_counts;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
113 my $x = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
114 #foreach my $classe_position(sort {$a<=>$b} keys(%final_counts))
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
115 for (my $classe_position = 0; $classe_position <= $maximums{$chrom}/$step;$classe_position++)
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
116 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
117 my $nb = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
118 if ($counts{$chrom}{$classe_position})
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
119 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
120 $nb = $counts{$chrom}{$classe_position};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
121 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
122 $x += $step;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
123 print $OUT "$chrom $x $nb\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
124 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
125 if ($chr_num >= $max_chr_num){last;}
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
126 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
127 close($OUT);
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
128
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
129 #######################################################
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
130 # For each individual
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
131 #######################################################
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
132 open(my $OUT2,">$out.by_sample");
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
133 $chr_num = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
134 print $OUT2 "Chromosome ".join("\t",@individual_names) . "\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
135 foreach my $chrom(sort keys(%counts_by_ind))
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
136 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
137 $chr_num++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
138 my $ref_counts = $counts_by_ind{$chrom};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
139 my %final_counts = %$ref_counts;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
140 for (my $classe_position = 0; $classe_position <= $maximums{$chrom}/$step;$classe_position++)
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
141 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
142 print $OUT2 "$chrom";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
143 my $num_ind = $start_indiv_retrieval;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
144 foreach my $indiv(@individual_names)
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
145 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
146 my $val = 0;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
147
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
148 if ($counts_by_ind{$chrom}{$classe_position}{$num_ind})
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
149 {
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
150 $val = $counts_by_ind{$chrom}{$classe_position}{$num_ind};
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
151 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
152 print $OUT2 " $val";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
153 $num_ind++;
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
154 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
155 print $OUT2 "\n";
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
156 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
157 if ($chr_num >= $max_chr_num){last;}
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
158 }
ebb0ac9b6fa9 planemo upload
gandres
parents:
diff changeset
159 close($OUT2);