0
|
1 # !/usr/bin/perl
|
|
2
|
|
3 #-----------------------------------#
|
|
4 # Author: Maude #
|
|
5 # Script: mutspecFilter.pl #
|
|
6 # Last update: 18/03/16 #
|
|
7 #-----------------------------------#
|
|
8
|
|
9 use strict;
|
|
10 use warnings;
|
|
11 use Getopt::Long;
|
|
12 use Pod::Usage;
|
|
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
|
|
14 use File::Path;
|
|
15
|
|
16 ################################################################################################################################################################################
|
|
17 # Filter an Annotaed file with Annovar #
|
|
18 ################################################################################################################################################################################
|
|
19
|
|
20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
|
|
21 our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome.
|
|
22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use.
|
|
23 our ($listAVDB) = "empty"; # Text file with the list Annovar databases.
|
|
24 our ($dir) = "";
|
|
25
|
|
26 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2);
|
|
27
|
|
28 our ($input) = @ARGV;
|
|
29
|
|
30 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
|
|
31 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
|
|
32 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
|
|
33 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
|
|
34
|
|
35
|
|
36
|
|
37 # If the dbSNP value is not equal to zero filter using the dbSNP column specify
|
|
38 our $dbSNP = 0;
|
|
39 if($dbSNP_value > 0) { $dbSNP = 1; }
|
|
40
|
|
41
|
|
42 ############ Check flags ############
|
|
43 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }
|
|
44
|
|
45 # Zero databases is specified
|
|
46 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) )
|
|
47 {
|
|
48 print STDERR "There is no databases selected for filtering against!!!\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n";
|
|
49 exit;
|
|
50 }
|
|
51
|
|
52
|
|
53
|
|
54 ############ Recover the name of the databases to filter against ############
|
|
55 my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", "");
|
|
56 my @tab_protocol = ();
|
|
57
|
|
58 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) )
|
|
59 {
|
|
60 ### Recover the name of the column
|
|
61 my $protocol = "";
|
|
62 ExtractAVDBName($listAVDB, \$protocol);
|
|
63 @tab_protocol = split(",", $protocol);
|
|
64
|
|
65 for(my $i=0; $i<=$#tab_protocol; $i++)
|
|
66 {
|
|
67 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; }
|
|
68 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; }
|
|
69 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; }
|
|
70 }
|
|
71 }
|
|
72
|
|
73
|
|
74 ############ Filter the file ############
|
|
75 filterAgainstPublicDB();
|
|
76
|
|
77
|
|
78 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n";
|
|
79
|
|
80
|
|
81 sub filterAgainstPublicDB
|
|
82 {
|
|
83 open(FILTER, ">", "$output") or die "$!: $output\n";
|
|
84
|
|
85 open(F1, $input) or die "$!: $input\n";
|
|
86 my $header = <F1>; print FILTER $header;
|
|
87 while(<F1>)
|
|
88 {
|
|
89 $_ =~ s/[\r\n]+$//;
|
|
90 my @tab = split("\t", $_);
|
|
91
|
|
92 my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0);
|
|
93
|
|
94 if($segDup == 1)
|
|
95 {
|
|
96 my $segDup_value = recoverNumCol($input, $segDup_name);
|
|
97 $segDupInfo = formatSegDupInfo($tab[$segDup_value]);
|
|
98 # Replace NA by 0 for making test on the same type of variable
|
|
99 $segDupInfo =~ s/NA/0/;
|
|
100 }
|
|
101 if($esp == 1)
|
|
102 {
|
|
103 my $espAll_value = recoverNumCol($input, $espAll_name);
|
|
104 $espAllInfo = $tab[$espAll_value];
|
|
105 # Replace NA by 0 for making test on the same type of variable
|
|
106 $espAllInfo =~ s/NA/0/;
|
|
107 }
|
|
108 if($thG == 1)
|
|
109 {
|
|
110 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name);
|
|
111 # Replace NA by 0 for making test on the same type of variable
|
|
112 $thgInfo = $tab[$thousandGenome_value];
|
|
113 $thgInfo =~ s/NA/0/;
|
|
114 }
|
|
115
|
|
116
|
|
117 ##############################
|
|
118 # One Filter #
|
|
119 ##############################
|
|
120 # Remove all the variants present in dbSNP
|
|
121 if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } }
|
|
122 # Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database
|
|
123 if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9) { print FILTER "$_\n"; } }
|
|
124 # Remove all the variants with greater than 0.001 in Exome sequencing project
|
|
125 if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) ) { if($espAllInfo <= 0.001) { print FILTER "$_\n"; } }
|
|
126 # Remove all the variants with greater than 0.001 in 1000 genome database
|
|
127 if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) ) { if($thgInfo <= 0.001) { print FILTER "$_\n"; } }
|
|
128
|
|
129
|
|
130 #############################
|
|
131 # Two Filter #
|
|
132 ##############################
|
|
133 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) ) { print FILTER "$_\n"; } }
|
|
134 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
|
|
135 if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
|
|
136
|
|
137 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
|
|
138 if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
|
|
139
|
|
140 if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
|
|
141
|
|
142
|
|
143 #############################
|
|
144 # Three Filter #
|
|
145 ##############################
|
|
146 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) )
|
|
147 { print FILTER "$_\n"; } }
|
|
148 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) )
|
|
149 { print FILTER "$_\n"; } }
|
|
150 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
|
|
151 { print FILTER "$_\n"; } }
|
|
152 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
|
|
153 { print FILTER "$_\n"; } }
|
|
154
|
|
155
|
|
156 #############################
|
|
157 # FOUR Filter #
|
|
158 ##############################
|
|
159 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
|
|
160 { print FILTER "$_\n"; } }
|
|
161
|
|
162 }
|
|
163 close F1; close FILTER;
|
|
164 }
|
|
165
|
|
166
|
|
167 sub formatSegDupInfo
|
|
168 {
|
|
169 my ($segDup_info) = @_;
|
|
170
|
|
171 if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931
|
|
172 {
|
|
173 my @segDup = split(";", $segDup_info);
|
|
174 $segDup[0] =~ /Score=(.+)/;
|
|
175 return $1;
|
|
176 }
|
|
177 else { return $segDup_info; }
|
|
178 }
|
|
179
|
|
180
|
|
181 sub ExtractAVDBName
|
|
182 {
|
|
183 my ($listAVDB, $refS_protocol) = @_;
|
|
184
|
|
185 open(F1, $listAVDB) or die "$!: $listAVDB\n";
|
|
186 while(<F1>)
|
|
187 {
|
|
188 if ($_ =~ /^#/) { next; }
|
|
189
|
|
190 $_ =~ s/[\r\n]+$//;
|
|
191 my @tab = split("\t", $_);
|
|
192
|
|
193 # db name like refGenome_dbName.txt
|
|
194 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) )
|
|
195 {
|
|
196 my $temp = $1;
|
|
197 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; }
|
|
198 }
|
|
199 # 1000 genome
|
|
200 if($tab[0] =~ /sites/)
|
|
201 {
|
|
202 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
|
|
203 my ($dbName, $year, $month) = ($1, $2, $3);
|
|
204 $dbName =~ tr/A-Z/a-z/;
|
|
205
|
|
206 # convert the month number into the month name
|
|
207 ConvertMonth(\$month);
|
|
208
|
|
209 my $AVdbName_final = "1000g".$year.$month."_".$dbName;
|
|
210
|
|
211 if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; }
|
|
212 }
|
|
213 # ESP
|
|
214 if($tab[0] =~ /esp/)
|
|
215 {
|
|
216 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
|
|
217 my $AVdbName_final = $1."_".$2;
|
|
218
|
|
219 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; }
|
|
220 }
|
|
221 }
|
|
222 close F1;
|
|
223
|
|
224 sub ConvertMonth
|
|
225 {
|
|
226 my ($refS_month) = @_;
|
|
227
|
|
228 if($$refS_month == 1) { $$refS_month = "janv"; }
|
|
229 elsif($$refS_month == 2) { $$refS_month = "feb"; }
|
|
230 elsif($$refS_month == 3) { $$refS_month = "mar"; }
|
|
231 elsif($$refS_month == 4) { $$refS_month = "apr"; }
|
|
232 elsif($$refS_month == 5) { $$refS_month = "may"; }
|
|
233 elsif($$refS_month == 6) { $$refS_month = "jun"; }
|
|
234 elsif($$refS_month == 7) { $$refS_month = "jul"; }
|
|
235 elsif($$refS_month == 8) { $$refS_month = "aug"; }
|
|
236 elsif($$refS_month == 9) { $$refS_month = "sept"; }
|
|
237 elsif($$refS_month == 10) { $$refS_month = "oct"; }
|
|
238 elsif($$refS_month == 11) { $$refS_month = "nov"; }
|
|
239 elsif($$refS_month == 12) { $$refS_month = "dec"; }
|
|
240 else { print STDERR "Month number don't considered\n"; exit; }
|
|
241 }
|
|
242 }
|
|
243
|
|
244
|
|
245 sub recoverNumCol
|
|
246 {
|
|
247 my ($input, $name_of_column) = @_;
|
|
248
|
|
249 # With Annovar updates the databases name changed and are present in an array
|
|
250 if( ref($name_of_column) eq "ARRAY" )
|
|
251 {
|
|
252 my $test = "";
|
|
253 my @tab = @$name_of_column;
|
|
254 foreach (@tab)
|
|
255 {
|
|
256 open(F1,$input) or die "$!: $input\n";
|
|
257 # For having the name of the columns
|
|
258 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
|
|
259 close F1;
|
|
260 # The number of the column
|
|
261 my $name_of_column_NB = "toto";
|
|
262 for(my $i=0; $i<=$#tab_search_header; $i++)
|
|
263 {
|
|
264 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; }
|
|
265 }
|
|
266 if($name_of_column_NB eq "toto") { next; }
|
|
267 else { return $name_of_column_NB; }
|
|
268 }
|
|
269 if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
|
|
270 }
|
|
271 # Only one name is pass
|
|
272 else
|
|
273 {
|
|
274 open(FT,$input) or die "$!: $input\n";
|
|
275 # For having the name of the columns
|
|
276 my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
|
|
277 close FT;
|
|
278 # The number of the column
|
|
279 my $name_of_column_NB = "toto";
|
|
280 for(my $i=0; $i<=$#tab_search_header; $i++)
|
|
281 {
|
|
282 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; }
|
|
283 }
|
|
284 if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
|
|
285 else { return $name_of_column_NB; }
|
|
286 }
|
|
287 }
|
|
288
|
|
289 =head1 NAME
|
|
290
|
|
291 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
|
|
292
|
|
293 =head1 SYNOPSIS
|
|
294
|
|
295 mutspecFilter.pl [arguments] <query-file>
|
|
296
|
|
297 <query-file> an annotated file
|
|
298
|
|
299 Arguments:
|
|
300 -h, --help print help message
|
|
301 -m, --man print complete documentation
|
|
302 -v, --verbose use verbose output
|
|
303 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file
|
|
304 --segDup filter against genomic duplicate database
|
|
305 --esp filter against Exome Sequencing Project database (only for human)
|
|
306 --thG filter against 1000 genome database (onyl for human)
|
|
307 -o, --outfile <string> name of output file
|
|
308 --refGenome reference genome to use
|
|
309 --pathAVDBList path to the list of Annovar databases installed
|
|
310
|
|
311
|
|
312 Function: Filter out variants present in public databases
|
|
313
|
|
314 Example: # Filter against dbSNP
|
|
315 mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
|
|
316
|
|
317 # Filter against the four databases
|
|
318 mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
|
|
319
|
|
320
|
|
321 Version: 03-2016 (March 2016)
|
|
322
|
|
323
|
|
324 =head1 OPTIONS
|
|
325
|
|
326 =over 8
|
|
327
|
|
328 =item B<--help>
|
|
329
|
|
330 print a brief usage message and detailed explanation of options.
|
|
331
|
|
332 =item B<--man>
|
|
333
|
|
334 print the complete manual of the program.
|
|
335
|
|
336 =item B<--verbose>
|
|
337
|
|
338 use verbose output.
|
|
339
|
|
340 =item B<--dbSNP>
|
|
341
|
|
342 Remove all the variants presents in the dbSNP databases
|
|
343 Specify the number of column containing the annotation
|
|
344 For human and mouse genome
|
|
345
|
|
346 =item B<--segDup>
|
|
347
|
|
348 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database
|
|
349 For human and mouse genome
|
|
350
|
|
351 =item B<--esp>
|
|
352
|
|
353 Remove all the variants with a frequency greater than 0.001 in Exome sequencing project
|
|
354 For human genome only
|
|
355
|
|
356 =item B<--thG>
|
|
357
|
|
358 Remove all the variants with a frequency greater than 0.001 in 1000 genome database
|
|
359
|
|
360 =item B<--refGenome>
|
|
361
|
|
362 the reference genome to use, could be hg19 or mm9.
|
|
363
|
|
364 =item B<--outfile>
|
|
365
|
|
366 the name of the output file
|
|
367
|
|
368 =item B<--pathAVDBList>
|
|
369
|
|
370 the path to a texte file containing the list of the Annovar databases installed.
|
|
371
|
|
372 =back
|
|
373
|
|
374 =head1 DESCRIPTION
|
|
375
|
|
376 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
|
|
377
|
|
378 =cut
|