comparison check_gwas_inputs/CheckGWASInputs.pl @ 1:420b57c3c185 draft

Uploaded
author dereeper
date Fri, 10 Jul 2015 04:39:30 -0400
parents
children
comparison
equal deleted inserted replaced
0:3e19d0dfcf3e 1:420b57c3c185
1 #!/usr/bin/perl
2
3 use strict;
4 use Switch;
5 use Getopt::Long;
6
7 my $usage = qq~Usage:$0 <args> [<opts>]
8 where <args> are:
9 -h, --hapmap <Hapmap input file>
10 -t, --trait <Trait input file>
11 -o, --out <Output base name>
12 ~;
13 $usage .= "\n";
14
15 my ($hapmap,$trait,$out);
16
17
18 GetOptions(
19 "trait=s" => \$trait,
20 "out=s" => \$out,
21 "hapmap=s" => \$hapmap
22 );
23
24
25 die $usage
26 if ( !$trait || !$out || !$hapmap);
27
28 my %inds;
29
30 #######################################
31 # get individuals in trait file
32 #######################################
33 my %traits;
34 my $head_trait = `head -1 $trait`;
35 open(my $T,$trait);
36 <$T>;
37 while(<$T>)
38 {
39 my @infos = split(/\t/,$_);
40 my $ind = $infos[0];
41 $inds{$ind}++;
42 $traits{$ind} = $_;
43 }
44 close($T);
45 my $nb_ind_trait = scalar keys(%traits);
46
47 #######################################
48 # get individuals in hapmap file
49 #######################################
50 my $line_ind = `head -1 $hapmap`;
51 chomp($line_ind);
52 my @infos = split(/\t/,$line_ind);
53 for (my $i = 11; $i <= $#infos; $i++)
54 {
55 my $ind = $infos[$i];
56 $inds{$ind}++;
57 }
58 my $nb_ind_hapmap = scalar @infos - 11;
59
60 #################################################################
61 # create trait output by keeping individuals found in both files
62 #################################################################
63 open(my $O,">$out.trait");
64 print $O $head_trait;
65 my $nb_common = 0;
66 foreach my $ind(keys(%inds))
67 {
68 my $nb_found = $inds{$ind};
69 if ($nb_found == 2)
70 {
71 $nb_common++;
72 print $O $traits{$ind};
73 }
74 }
75 close($O);
76
77
78 #####################################################################
79 # create hapmap output after keeping individuals found in both files
80 # and removing monomorphic positions
81 #####################################################################
82 open(my $O2,">$out.hapmap");
83 my $numline = 0;
84 my %genotypes;
85 my %columns_to_keep;
86 my $nb_monomorphic = 0;
87 my $not_biallelic = 0;
88 my $diff_variation = 0;
89 open(my $H,$hapmap);
90 while(<$H>)
91 {
92 $numline++;
93 my $line = $_;
94 $line =~s/\n//g;
95 $line =~s/\r//g;
96 my @infos = split(/\t/,$line);
97 if ($numline == 1)
98 {
99 my @titles;
100 for (my $i = 0; $i <= 10; $i++)
101 {
102 my $title = $infos[$i];
103 push(@titles,$title);
104 }
105 print $O2 join("\t",@titles);
106 for (my $i = 11; $i <= $#infos; $i++)
107 {
108 my $ind = $infos[$i];
109 my $nb_found = $inds{$ind};
110 if ($nb_found == 2)
111 {
112 print $O2 " $ind";
113 $columns_to_keep{$i} = 1;
114 }
115 }
116 print $O2 "\n";
117 }
118 else
119 {
120 my $to_be_printed = "";
121 my $variation = $infos[1];
122 for (my $i = 0; $i <= 10; $i++)
123 {
124 my $title = $infos[$i];
125 $to_be_printed .= "$title ";
126 }
127 my %letters;
128 for (my $i = 11; $i <= $#infos; $i++)
129 {
130 if ($columns_to_keep{$i})
131 {
132 my $genotype = $infos[$i];
133 if ($genotype ne 'NN')
134 {
135 my ($allele1,$allele2) = split(//,$genotype);
136 $letters{$allele1}=1;
137 $letters{$allele2}=1;
138 }
139 $to_be_printed .= "$genotype ";
140 }
141 }
142 chop($to_be_printed);
143
144 my $variation_obs = join("/",sort keys(%letters));
145
146 # print only if polymorphic
147 if (scalar keys(%letters) < 2)
148 {
149 $nb_monomorphic++;
150 }
151 elsif (scalar keys(%letters) > 2)
152 {
153 $not_biallelic++;
154 }
155 else
156 {
157 if ($variation ne $variation_obs)
158 {
159 $to_be_printed =~s/$variation/$variation_obs/;
160 $diff_variation++;
161 }
162
163 print $O2 $to_be_printed . "\n";
164 }
165 }
166 }
167 close($H);
168 close($O2);
169
170 print "==============================================\n";
171 print "Individuals\n";
172 print "==============================================\n";
173 print "Individuals in hapmap file: $nb_ind_hapmap\n";
174 print "Individuals in trait file: $nb_ind_trait\n";
175 print "Individuals found in both files: $nb_common\n";
176 print "==============================================\n";
177 print "Markers\n";
178 print "==============================================\n";
179 print "Discarded markers:\n";
180 print "Monomorphic: $nb_monomorphic\n";
181 print "Not biallelic: $not_biallelic\n";
182 print "Modified markers:\n";
183 print "Difference in variation: $diff_variation\n";
184