Mercurial > repos > iarc > mutspec
comparison mutspecFilter.pl @ 0:8c682b3a7c5b draft
Uploaded
author | iarc |
---|---|
date | Tue, 19 Apr 2016 03:07:11 -0400 |
parents | |
children | 916846f73e25 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8c682b3a7c5b |
---|---|
1 # !/usr/bin/perl | |
2 | |
3 #-----------------------------------# | |
4 # Author: Maude # | |
5 # Script: mutspecFilter.pl # | |
6 # Last update: 18/03/16 # | |
7 #-----------------------------------# | |
8 | |
9 use strict; | |
10 use warnings; | |
11 use Getopt::Long; | |
12 use Pod::Usage; | |
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); | |
14 use File::Path; | |
15 | |
16 ################################################################################################################################################################################ | |
17 # Filter an Annotaed file with Annovar # | |
18 ################################################################################################################################################################################ | |
19 | |
20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. | |
21 our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome. | |
22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use. | |
23 our ($listAVDB) = "empty"; # Text file with the list Annovar databases. | |
24 our ($dir) = ""; | |
25 | |
26 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2); | |
27 | |
28 our ($input) = @ARGV; | |
29 | |
30 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); | |
31 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); | |
32 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script | |
33 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input) | |
34 | |
35 | |
36 | |
37 # If the dbSNP value is not equal to zero filter using the dbSNP column specify | |
38 our $dbSNP = 0; | |
39 if($dbSNP_value > 0) { $dbSNP = 1; } | |
40 | |
41 | |
42 ############ Check flags ############ | |
43 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" } | |
44 | |
45 # Zero databases is specified | |
46 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) ) | |
47 { | |
48 print STDERR "There is no databases selected for filtering against!!!\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n"; | |
49 exit; | |
50 } | |
51 | |
52 | |
53 | |
54 ############ Recover the name of the databases to filter against ############ | |
55 my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", ""); | |
56 my @tab_protocol = (); | |
57 | |
58 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) ) | |
59 { | |
60 ### Recover the name of the column | |
61 my $protocol = ""; | |
62 ExtractAVDBName($listAVDB, \$protocol); | |
63 @tab_protocol = split(",", $protocol); | |
64 | |
65 for(my $i=0; $i<=$#tab_protocol; $i++) | |
66 { | |
67 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; } | |
68 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; } | |
69 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; } | |
70 } | |
71 } | |
72 | |
73 | |
74 ############ Filter the file ############ | |
75 filterAgainstPublicDB(); | |
76 | |
77 | |
78 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n"; | |
79 | |
80 | |
81 sub filterAgainstPublicDB | |
82 { | |
83 open(FILTER, ">", "$output") or die "$!: $output\n"; | |
84 | |
85 open(F1, $input) or die "$!: $input\n"; | |
86 my $header = <F1>; print FILTER $header; | |
87 while(<F1>) | |
88 { | |
89 $_ =~ s/[\r\n]+$//; | |
90 my @tab = split("\t", $_); | |
91 | |
92 my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0); | |
93 | |
94 if($segDup == 1) | |
95 { | |
96 my $segDup_value = recoverNumCol($input, $segDup_name); | |
97 $segDupInfo = formatSegDupInfo($tab[$segDup_value]); | |
98 # Replace NA by 0 for making test on the same type of variable | |
99 $segDupInfo =~ s/NA/0/; | |
100 } | |
101 if($esp == 1) | |
102 { | |
103 my $espAll_value = recoverNumCol($input, $espAll_name); | |
104 $espAllInfo = $tab[$espAll_value]; | |
105 # Replace NA by 0 for making test on the same type of variable | |
106 $espAllInfo =~ s/NA/0/; | |
107 } | |
108 if($thG == 1) | |
109 { | |
110 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name); | |
111 # Replace NA by 0 for making test on the same type of variable | |
112 $thgInfo = $tab[$thousandGenome_value]; | |
113 $thgInfo =~ s/NA/0/; | |
114 } | |
115 | |
116 | |
117 ############################## | |
118 # One Filter # | |
119 ############################## | |
120 # Remove all the variants present in dbSNP | |
121 if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } } | |
122 # Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database | |
123 if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9) { print FILTER "$_\n"; } } | |
124 # Remove all the variants with greater than 0.001 in Exome sequencing project | |
125 if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) ) { if($espAllInfo <= 0.001) { print FILTER "$_\n"; } } | |
126 # Remove all the variants with greater than 0.001 in 1000 genome database | |
127 if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) ) { if($thgInfo <= 0.001) { print FILTER "$_\n"; } } | |
128 | |
129 | |
130 ############################# | |
131 # Two Filter # | |
132 ############################## | |
133 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) ) { print FILTER "$_\n"; } } | |
134 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
135 if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
136 | |
137 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
138 if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
139 | |
140 if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
141 | |
142 | |
143 ############################# | |
144 # Three Filter # | |
145 ############################## | |
146 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) | |
147 { print FILTER "$_\n"; } } | |
148 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) | |
149 { print FILTER "$_\n"; } } | |
150 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
151 { print FILTER "$_\n"; } } | |
152 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
153 { print FILTER "$_\n"; } } | |
154 | |
155 | |
156 ############################# | |
157 # FOUR Filter # | |
158 ############################## | |
159 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
160 { print FILTER "$_\n"; } } | |
161 | |
162 } | |
163 close F1; close FILTER; | |
164 } | |
165 | |
166 | |
167 sub formatSegDupInfo | |
168 { | |
169 my ($segDup_info) = @_; | |
170 | |
171 if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931 | |
172 { | |
173 my @segDup = split(";", $segDup_info); | |
174 $segDup[0] =~ /Score=(.+)/; | |
175 return $1; | |
176 } | |
177 else { return $segDup_info; } | |
178 } | |
179 | |
180 | |
181 sub ExtractAVDBName | |
182 { | |
183 my ($listAVDB, $refS_protocol) = @_; | |
184 | |
185 open(F1, $listAVDB) or die "$!: $listAVDB\n"; | |
186 while(<F1>) | |
187 { | |
188 if ($_ =~ /^#/) { next; } | |
189 | |
190 $_ =~ s/[\r\n]+$//; | |
191 my @tab = split("\t", $_); | |
192 | |
193 # db name like refGenome_dbName.txt | |
194 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) ) | |
195 { | |
196 my $temp = $1; | |
197 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; } | |
198 } | |
199 # 1000 genome | |
200 if($tab[0] =~ /sites/) | |
201 { | |
202 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/; | |
203 my ($dbName, $year, $month) = ($1, $2, $3); | |
204 $dbName =~ tr/A-Z/a-z/; | |
205 | |
206 # convert the month number into the month name | |
207 ConvertMonth(\$month); | |
208 | |
209 my $AVdbName_final = "1000g".$year.$month."_".$dbName; | |
210 | |
211 if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; } | |
212 } | |
213 # ESP | |
214 if($tab[0] =~ /esp/) | |
215 { | |
216 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; | |
217 my $AVdbName_final = $1."_".$2; | |
218 | |
219 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; } | |
220 } | |
221 } | |
222 close F1; | |
223 | |
224 sub ConvertMonth | |
225 { | |
226 my ($refS_month) = @_; | |
227 | |
228 if($$refS_month == 1) { $$refS_month = "janv"; } | |
229 elsif($$refS_month == 2) { $$refS_month = "feb"; } | |
230 elsif($$refS_month == 3) { $$refS_month = "mar"; } | |
231 elsif($$refS_month == 4) { $$refS_month = "apr"; } | |
232 elsif($$refS_month == 5) { $$refS_month = "may"; } | |
233 elsif($$refS_month == 6) { $$refS_month = "jun"; } | |
234 elsif($$refS_month == 7) { $$refS_month = "jul"; } | |
235 elsif($$refS_month == 8) { $$refS_month = "aug"; } | |
236 elsif($$refS_month == 9) { $$refS_month = "sept"; } | |
237 elsif($$refS_month == 10) { $$refS_month = "oct"; } | |
238 elsif($$refS_month == 11) { $$refS_month = "nov"; } | |
239 elsif($$refS_month == 12) { $$refS_month = "dec"; } | |
240 else { print STDERR "Month number don't considered\n"; exit; } | |
241 } | |
242 } | |
243 | |
244 | |
245 sub recoverNumCol | |
246 { | |
247 my ($input, $name_of_column) = @_; | |
248 | |
249 # With Annovar updates the databases name changed and are present in an array | |
250 if( ref($name_of_column) eq "ARRAY" ) | |
251 { | |
252 my $test = ""; | |
253 my @tab = @$name_of_column; | |
254 foreach (@tab) | |
255 { | |
256 open(F1,$input) or die "$!: $input\n"; | |
257 # For having the name of the columns | |
258 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); | |
259 close F1; | |
260 # The number of the column | |
261 my $name_of_column_NB = "toto"; | |
262 for(my $i=0; $i<=$#tab_search_header; $i++) | |
263 { | |
264 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; } | |
265 } | |
266 if($name_of_column_NB eq "toto") { next; } | |
267 else { return $name_of_column_NB; } | |
268 } | |
269 if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } | |
270 } | |
271 # Only one name is pass | |
272 else | |
273 { | |
274 open(FT,$input) or die "$!: $input\n"; | |
275 # For having the name of the columns | |
276 my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); | |
277 close FT; | |
278 # The number of the column | |
279 my $name_of_column_NB = "toto"; | |
280 for(my $i=0; $i<=$#tab_search_header; $i++) | |
281 { | |
282 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; } | |
283 } | |
284 if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } | |
285 else { return $name_of_column_NB; } | |
286 } | |
287 } | |
288 | |
289 =head1 NAME | |
290 | |
291 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) | |
292 | |
293 =head1 SYNOPSIS | |
294 | |
295 mutspecFilter.pl [arguments] <query-file> | |
296 | |
297 <query-file> an annotated file | |
298 | |
299 Arguments: | |
300 -h, --help print help message | |
301 -m, --man print complete documentation | |
302 -v, --verbose use verbose output | |
303 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file | |
304 --segDup filter against genomic duplicate database | |
305 --esp filter against Exome Sequencing Project database (only for human) | |
306 --thG filter against 1000 genome database (onyl for human) | |
307 -o, --outfile <string> name of output file | |
308 --refGenome reference genome to use | |
309 --pathAVDBList path to the list of Annovar databases installed | |
310 | |
311 | |
312 Function: Filter out variants present in public databases | |
313 | |
314 Example: # Filter against dbSNP | |
315 mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input | |
316 | |
317 # Filter against the four databases | |
318 mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input | |
319 | |
320 | |
321 Version: 03-2016 (March 2016) | |
322 | |
323 | |
324 =head1 OPTIONS | |
325 | |
326 =over 8 | |
327 | |
328 =item B<--help> | |
329 | |
330 print a brief usage message and detailed explanation of options. | |
331 | |
332 =item B<--man> | |
333 | |
334 print the complete manual of the program. | |
335 | |
336 =item B<--verbose> | |
337 | |
338 use verbose output. | |
339 | |
340 =item B<--dbSNP> | |
341 | |
342 Remove all the variants presents in the dbSNP databases | |
343 Specify the number of column containing the annotation | |
344 For human and mouse genome | |
345 | |
346 =item B<--segDup> | |
347 | |
348 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database | |
349 For human and mouse genome | |
350 | |
351 =item B<--esp> | |
352 | |
353 Remove all the variants with a frequency greater than 0.001 in Exome sequencing project | |
354 For human genome only | |
355 | |
356 =item B<--thG> | |
357 | |
358 Remove all the variants with a frequency greater than 0.001 in 1000 genome database | |
359 | |
360 =item B<--refGenome> | |
361 | |
362 the reference genome to use, could be hg19 or mm9. | |
363 | |
364 =item B<--outfile> | |
365 | |
366 the name of the output file | |
367 | |
368 =item B<--pathAVDBList> | |
369 | |
370 the path to a texte file containing the list of the Annovar databases installed. | |
371 | |
372 =back | |
373 | |
374 =head1 DESCRIPTION | |
375 | |
376 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) | |
377 | |
378 =cut |