6
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use strict;
|
|
4 use Switch;
|
|
5 use Getopt::Long;
|
|
6 use Bio::SeqIO;
|
|
7
|
|
8 my $usage = qq~Usage:$0 <args> [<opts>]
|
|
9 where <args> are:
|
|
10 -i, --input <VCF/Hapmap input>
|
|
11 -o, --out <output in tabular format>
|
|
12 -s, --step <step (in bp)>
|
|
13 ~;
|
|
14 $usage .= "\n";
|
|
15
|
|
16 my ($input,$out,$step);
|
|
17
|
|
18 GetOptions(
|
|
19 "input=s" => \$input,
|
|
20 "out=s" => \$out,
|
|
21 "step=s" => \$step,
|
|
22 );
|
|
23
|
|
24
|
|
25 die $usage
|
|
26 if ( !$input || !$step || !$out );
|
|
27
|
|
28 my $max_chr_num = 100;
|
|
29
|
|
30 my %counts;
|
|
31 my %counts_by_ind;
|
|
32
|
|
33 my $VCFTOOLS_EXE = "vcftools";
|
|
34
|
|
35 my $is_vcf = `head -4000 $input | grep -c CHROM`;
|
|
36 my $is_bcf = 0;
|
|
37 if ($input =~/\.bcf/){
|
|
38 $is_bcf = 1;
|
|
39 }
|
|
40
|
|
41
|
|
42 my $IN;
|
|
43 my $headers;
|
|
44 my $start_indiv_retrieval = 12;
|
|
45 my $chrom_retrieval = 2;
|
|
46 my $pos_retrieval = 3;
|
|
47 if ($is_vcf or $is_bcf){
|
|
48 $start_indiv_retrieval = 9;
|
|
49 $chrom_retrieval = 0;
|
|
50 $pos_retrieval = 1;
|
|
51 if ($is_vcf){
|
|
52 $headers = `grep '#CHROM' $input`;
|
|
53 open($IN,$input);
|
|
54 }
|
|
55 elsif ($is_bcf){
|
|
56 my $cmd = "$VCFTOOLS_EXE --bcf $input --stdout --recode | head -4000 | grep CHROM";
|
|
57 $headers = `$cmd`;
|
|
58 my $cmd2 = "$VCFTOOLS_EXE --bcf $input --stdout --recode";
|
|
59 open $IN, '-|' , "$cmd2" or die "Can not run Vcftools";
|
|
60 }
|
|
61 }
|
|
62 else{
|
|
63 $headers= <$IN>;
|
|
64 open($IN,$input);
|
|
65 }
|
|
66 $headers=~s/\n//g;
|
|
67 $headers=~s/\r//g;
|
|
68 my @ind_names = split(/\t/,$headers);
|
|
69 my @individual_names;
|
|
70 for (my $i = $start_indiv_retrieval; $i <= $#ind_names; $i++)
|
|
71 {
|
|
72 push(@individual_names,$ind_names[$i]);
|
|
73 }
|
|
74 my %maximums;
|
|
75 while(<$IN>)
|
|
76 {
|
|
77 my $line = $_;
|
|
78 $line=~s/\n//g;
|
|
79 $line=~s/\r//g;
|
|
80 my @infos = split(/\t/,$line);
|
|
81 if (scalar @infos > 8 && !/#CHROM/){
|
|
82 my $chrom = $infos[$chrom_retrieval];
|
|
83 my $position = $infos[$pos_retrieval];
|
|
84 if ($position > $maximums{$chrom}){$maximums{$chrom}=$position;}
|
|
85 my $classe_position = int($position/$step);
|
|
86 $counts{$chrom}{$classe_position}++;
|
|
87
|
|
88 my $ref_allele = $infos[11];
|
|
89 if ($is_vcf or $is_bcf){
|
|
90 $ref_allele = "0/0";
|
|
91 }
|
|
92 for (my $i = $start_indiv_retrieval; $i <= $#infos; $i++){
|
|
93 if (!$counts_by_ind{$chrom}{$classe_position}{$i}){$counts_by_ind{$chrom}{$classe_position}{$i} = 0;}
|
|
94 if ($infos[$i] ne $ref_allele){
|
|
95 $counts_by_ind{$chrom}{$classe_position}{$i}++;
|
|
96 }
|
|
97 }
|
|
98 }
|
|
99 }
|
|
100 close($IN);
|
|
101
|
|
102 #######################################################
|
|
103 # global
|
|
104 #######################################################
|
|
105 open(my $OUT,">$out");
|
|
106 print $OUT "Chromosome Position SNPs\n";
|
|
107 my $chr_num = 0;
|
|
108 foreach my $chrom(sort keys(%counts))
|
|
109 {
|
|
110 $chr_num++;
|
|
111 my $ref_counts = $counts{$chrom};
|
|
112 my %final_counts = %$ref_counts;
|
|
113 my $x = 0;
|
|
114 #foreach my $classe_position(sort {$a<=>$b} keys(%final_counts))
|
|
115 for (my $classe_position = 0; $classe_position <= $maximums{$chrom}/$step;$classe_position++)
|
|
116 {
|
|
117 my $nb = 0;
|
|
118 if ($counts{$chrom}{$classe_position})
|
|
119 {
|
|
120 $nb = $counts{$chrom}{$classe_position};
|
|
121 }
|
|
122 $x += $step;
|
|
123 print $OUT "$chrom $x $nb\n";
|
|
124 }
|
|
125 if ($chr_num >= $max_chr_num){last;}
|
|
126 }
|
|
127 close($OUT);
|
|
128
|
|
129 #######################################################
|
|
130 # For each individual
|
|
131 #######################################################
|
|
132 open(my $OUT2,">$out.by_sample");
|
|
133 $chr_num = 0;
|
|
134 print $OUT2 "Chromosome ".join("\t",@individual_names) . "\n";
|
|
135 foreach my $chrom(sort keys(%counts_by_ind))
|
|
136 {
|
|
137 $chr_num++;
|
|
138 my $ref_counts = $counts_by_ind{$chrom};
|
|
139 my %final_counts = %$ref_counts;
|
|
140 for (my $classe_position = 0; $classe_position <= $maximums{$chrom}/$step;$classe_position++)
|
|
141 {
|
|
142 print $OUT2 "$chrom";
|
|
143 my $num_ind = $start_indiv_retrieval;
|
|
144 foreach my $indiv(@individual_names)
|
|
145 {
|
|
146 my $val = 0;
|
|
147
|
|
148 if ($counts_by_ind{$chrom}{$classe_position}{$num_ind})
|
|
149 {
|
|
150 $val = $counts_by_ind{$chrom}{$classe_position}{$num_ind};
|
|
151 }
|
|
152 print $OUT2 " $val";
|
|
153 $num_ind++;
|
|
154 }
|
|
155 print $OUT2 "\n";
|
|
156 }
|
|
157 if ($chr_num >= $max_chr_num){last;}
|
|
158 }
|
|
159 close($OUT2);
|