7
|
1 # !/usr/bin/perl
|
|
2
|
|
3 #-----------------------------------#
|
|
4 # Author: Maude / Vincent #
|
|
5 # Script: mutspecFilter.pl #
|
|
6 # Last update: 01/02/17 #
|
|
7 #-----------------------------------#
|
|
8
|
|
9 use strict;
|
|
10 use warnings;
|
|
11 use Getopt::Long;
|
|
12 use Pod::Usage;
|
|
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
|
|
14 use File::Path;
|
|
15
|
|
16 #########################################################################################################################################################
|
|
17 # Filter an Annotaed file with Annovar #
|
|
18 #########################################################################################################################################################
|
|
19
|
|
20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
|
|
21 our ($dbSNP_value, $segDup, $esp, $thG, $exac) = (0, 0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome, ExAC.
|
|
22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use.
|
|
23 our ($listAVDB) = "empty"; # Text file with the list of Annovar databases.
|
|
24 our ($dir) = ""; # Directory containing the script + the text file with the list of Annovar databases
|
|
25 our (@filters); # Path to BED file(s) to filter against
|
|
26
|
|
27 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'exac'=>\$exac, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB, 'filter=s'=> \@filters) or pod2usage(2);
|
|
28
|
|
29 our ($input) = @ARGV;
|
|
30
|
|
31 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
|
|
32 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
|
|
33 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
|
|
34 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
|
|
35
|
|
36
|
|
37
|
|
38 # If the dbSNP value is not equal to zero filter using the dbSNP column specify
|
|
39 our $dbSNP = 0;
|
|
40 if($dbSNP_value > 0) { $dbSNP = 1; }
|
|
41
|
|
42
|
|
43 ############ Check flags ############
|
|
44 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }
|
|
45
|
|
46 # Zero databases is specified
|
|
47 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) && ($exac == 0) && (scalar(@filters) == 0) )
|
|
48 {
|
|
49 print STDERR "Error message:\n";
|
|
50 print STDERR "There is no databases selected for filtering against!!!\n";
|
|
51 print STDERR "Please chose at least one between dbSNP, SegDup (only for human and mouse genomes), ESP (only for human genome), 1000 genome (only for human genome) or ExAC (only for human genome)\n";
|
|
52 print STDERR "Or specify a BED file\n";
|
|
53 exit;
|
|
54 }
|
|
55
|
|
56
|
|
57
|
|
58 ############ Recover the name of the databases to filter against ############
|
|
59 my ($segDup_name, $espAll_name, $thousandGenome_name, $exac_name) = ("", "", "", "");
|
|
60 my @tab_protocol = ();
|
|
61
|
|
62 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) || ($exac == 1))
|
|
63 {
|
|
64 ### Recover the name of the column
|
|
65 my $protocol = "";
|
|
66 ExtractAVDBName($listAVDB, \$protocol);
|
|
67 @tab_protocol = split(",", $protocol);
|
|
68
|
|
69 for(my $i=0; $i<=$#tab_protocol; $i++)
|
|
70 {
|
|
71 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; }
|
|
72 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; }
|
|
73 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; }
|
|
74 elsif($tab_protocol[$i] =~ /exac/i) { $exac_name = $tab_protocol[$i]; }
|
|
75 }
|
|
76 }
|
|
77
|
|
78
|
|
79 ############ Filter the file ############
|
|
80 filterAgainstPublicDB();
|
|
81
|
|
82
|
|
83 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\tEXac = ". $exac . "\n";
|
|
84
|
|
85
|
|
86 ### Write a message if the input file contains zero variants or if all the variants are filtered out
|
|
87 my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/);
|
|
88 my $nbVariantsIn = `wc -l $input`;
|
|
89 $nbVariantsIn =~ /(\d+).+/;
|
|
90 my $nbLineIn = $1;
|
|
91 if($nbLineIn == 1)
|
|
92 {
|
|
93 print STDERR "Error message:\n";
|
|
94 print STDERR "\nThere is no variant to be filtered for $filename\n";
|
|
95 print STDERR "Check MutSpecAnnot standard output for more informations\n";
|
|
96 exit;
|
|
97 }
|
|
98 else
|
|
99 {
|
|
100 my ($filenameOut, $directoriesOut, $suffixOut) = fileparse($output, qr/\.[^.]*/);
|
|
101 my $nbVariantsOut = `wc -l $output`;
|
|
102 $nbVariantsOut =~ /(\d+).+/;
|
|
103 my $nbLineOut = $1;
|
|
104 if($nbLineOut == 1)
|
|
105 {
|
|
106 print STDOUT "Warning message:\n";
|
|
107 print STDOUT "\nAll the variants were filtered out for $filenameOut\n";
|
|
108 }
|
|
109 }
|
|
110
|
|
111 ### filter versus additional VCF files if provided.
|
|
112 if ( scalar(@filters) > 0) { filterAdditionalBED(); }
|
|
113
|
|
114
|
|
115
|
|
116
|
|
117 sub filterAgainstPublicDB
|
|
118 {
|
|
119 open(FILTER, ">", "$output") or die "$!: $output\n";
|
|
120
|
|
121 open(F1, $input) or die "$!: $input\n";
|
|
122 my $header = <F1>; print FILTER $header;
|
|
123 while(<F1>)
|
|
124 {
|
|
125 $_ =~ s/[\r\n]+$//;
|
|
126 my @tab = split("\t", $_);
|
|
127
|
|
128 my ($segDupInfo, $espAllInfo, $thgInfo, $exacInfo) = (0, 0 ,0, 0);
|
|
129
|
|
130 if($segDup == 1)
|
|
131 {
|
|
132 my $segDup_value = recoverNumCol($input, $segDup_name);
|
|
133 $segDupInfo = formatSegDupInfo($tab[$segDup_value]);
|
|
134 # Replace NA by 0 for making test on the same type of variable
|
|
135 $segDupInfo =~ s/NA/0/;
|
|
136 }
|
|
137 if($esp == 1)
|
|
138 {
|
|
139 my $espAll_value = recoverNumCol($input, $espAll_name);
|
|
140 $espAllInfo = $tab[$espAll_value];
|
|
141 # Replace NA by 0 for making test on the same type of variable
|
|
142 $espAllInfo =~ s/NA/0/;
|
|
143 }
|
|
144 if($thG == 1)
|
|
145 {
|
|
146 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name);
|
|
147 # Replace NA by 0 for making test on the same type of variable
|
|
148 $thgInfo = $tab[$thousandGenome_value];
|
|
149 $thgInfo =~ s/NA/0/;
|
|
150 }
|
|
151 if($exac == 1)
|
|
152 {
|
|
153 my $exac_value = recoverNumCol($input, $exac_name);
|
|
154 # Replace NA by 0 for making test on the same type of variable
|
|
155 $exacInfo = $tab[$exac_value];
|
|
156 $exacInfo =~ s/NA/0/;
|
|
157 }
|
|
158
|
|
159 my $filter = 0;
|
|
160 if( $dbSNP == 1 && $tab[$dbSNP_value-1] ne "NA" ){ $filter = 1; }
|
|
161 if( $segDup == 1 && $segDupInfo >= 0.9) { $filter = 1; }
|
|
162 if( $esp == 1 && $espAllInfo > 0.001) { $filter = 1; }
|
|
163 if( $thG == 1 && $thgInfo > 0.001) { $filter = 1; }
|
|
164 if( $thG == 1 && $exacInfo > 0.001) { $filter = 1; }
|
|
165
|
|
166 if (!$filter) { print FILTER "$_\n"; }
|
|
167
|
|
168 }
|
|
169 close F1; close FILTER;
|
|
170 }
|
|
171
|
|
172
|
|
173 sub formatSegDupInfo
|
|
174 {
|
|
175 my ($segDup_info) = @_;
|
|
176
|
|
177 if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931
|
|
178 {
|
|
179 my @segDup = split(";", $segDup_info);
|
|
180 $segDup[0] =~ /Score=(.+)/;
|
|
181 return $1;
|
|
182 }
|
|
183 else { return $segDup_info; }
|
|
184 }
|
|
185
|
|
186
|
|
187 sub ExtractAVDBName
|
|
188 {
|
|
189 my ($listAVDB, $refS_protocol) = @_;
|
|
190
|
|
191 open(F1, $listAVDB) or die "$!: $listAVDB\n";
|
|
192 while(<F1>)
|
|
193 {
|
|
194 if ($_ =~ /^#/) { next; }
|
|
195
|
|
196 $_ =~ s/[\r\n]+$//;
|
|
197 my @tab = split("\t", $_);
|
|
198
|
|
199 # db name like refGenome_dbName.txt
|
|
200 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) && ($tab[0] !~ /exac/i) )
|
|
201 {
|
|
202 my $temp = $1;
|
|
203 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; }
|
|
204 }
|
|
205 # 1000 genome
|
|
206 if($tab[0] =~ /sites/)
|
|
207 {
|
|
208 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
|
|
209 my ($dbName, $year, $month) = ($1, $2, $3);
|
|
210 $dbName =~ tr/A-Z/a-z/;
|
|
211
|
|
212 # convert the month number into the month name
|
|
213 ConvertMonth(\$month);
|
|
214
|
|
215 my $AVdbName_final = "1000g".$year.$month."_".$dbName;
|
|
216
|
|
217 if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; }
|
|
218 }
|
|
219 # ESP
|
|
220 if($tab[0] =~ /esp/)
|
|
221 {
|
|
222 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
|
|
223 my $AVdbName_final = $1."_".$2;
|
|
224
|
|
225 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; }
|
|
226 }
|
|
227 # EXAC
|
|
228 if($tab[0] =~ /exac/i)
|
|
229 {
|
|
230 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
|
|
231 my $AVdbName_final = "ExAC_ALL";
|
|
232
|
|
233 $$refS_protocol .= $AVdbName_final.",";
|
|
234 }
|
|
235
|
|
236 }
|
|
237 close F1;
|
|
238
|
|
239 sub ConvertMonth
|
|
240 {
|
|
241 my ($refS_month) = @_;
|
|
242
|
|
243 if($$refS_month == 1) { $$refS_month = "janv"; }
|
|
244 elsif($$refS_month == 2) { $$refS_month = "feb"; }
|
|
245 elsif($$refS_month == 3) { $$refS_month = "mar"; }
|
|
246 elsif($$refS_month == 4) { $$refS_month = "apr"; }
|
|
247 elsif($$refS_month == 5) { $$refS_month = "may"; }
|
|
248 elsif($$refS_month == 6) { $$refS_month = "jun"; }
|
|
249 elsif($$refS_month == 7) { $$refS_month = "jul"; }
|
|
250 elsif($$refS_month == 8) { $$refS_month = "aug"; }
|
|
251 elsif($$refS_month == 9) { $$refS_month = "sept"; }
|
|
252 elsif($$refS_month == 10) { $$refS_month = "oct"; }
|
|
253 elsif($$refS_month == 11) { $$refS_month = "nov"; }
|
|
254 elsif($$refS_month == 12) { $$refS_month = "dec"; }
|
|
255 }
|
|
256 }
|
|
257
|
|
258
|
|
259 sub recoverNumCol
|
|
260 {
|
|
261 my ($input, $name_of_column) = @_;
|
|
262
|
|
263 # With Annovar updates the databases name changed and are present in an array
|
|
264 if( ref($name_of_column) eq "ARRAY" )
|
|
265 {
|
|
266 my $test = "";
|
|
267 my @tab = @$name_of_column;
|
|
268 foreach (@tab)
|
|
269 {
|
|
270 open(F1,$input) or die "$!: $input\n";
|
|
271 # For having the name of the columns
|
|
272 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
|
|
273 close F1;
|
|
274 # The number of the column
|
|
275 my $name_of_column_NB = "toto";
|
|
276 for(my $i=0; $i<=$#tab_search_header; $i++)
|
|
277 {
|
|
278 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; }
|
|
279 }
|
|
280 if($name_of_column_NB eq "toto") { next; }
|
|
281 else { return $name_of_column_NB; }
|
|
282 }
|
|
283 if($name_of_column eq "toto")
|
|
284 {
|
|
285 print STDERR "Error message:\n";
|
|
286 print STDERR "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n";
|
|
287 exit;
|
|
288 }
|
|
289 }
|
|
290 # Only one name is pass
|
|
291 else
|
|
292 {
|
|
293 open(FT,$input) or die "$!: $input\n";
|
|
294 # For having the name of the columns
|
|
295 my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
|
|
296 close FT;
|
|
297 # The number of the column
|
|
298 my $name_of_column_NB = "toto";
|
|
299 for(my $i=0; $i<=$#tab_search_header; $i++)
|
|
300 {
|
|
301 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; }
|
|
302 }
|
|
303 if($name_of_column_NB eq "toto")
|
|
304 {
|
|
305 print STDERR "Error message:\n";
|
|
306 print STDERR "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n";
|
|
307 exit;
|
|
308 }
|
|
309 else
|
|
310 {
|
|
311 return $name_of_column_NB;
|
|
312 }
|
|
313 }
|
|
314 }
|
|
315
|
|
316
|
|
317 sub filterAdditionalBED{
|
|
318
|
|
319 #create bed
|
|
320 open(TABLE, "$output") or die "$!: $output\n";
|
|
321 open(F1, ">bed") or die "cannot create bed file";
|
|
322 my $NL=1;
|
|
323 my $headers=<TABLE>;
|
|
324 while(<TABLE>)
|
|
325 {
|
|
326 $NL++;
|
|
327 my @line=split("\t", $_);
|
|
328 print F1 "$line[0]\t$line[1]\t$line[2]\t$NL\n";
|
|
329 }
|
|
330 close F1;
|
|
331 close TABLE;
|
|
332 #and sort it
|
|
333 `sort -k1,1 -k2,2n bed > sorted`;
|
|
334
|
|
335 foreach my $filter (@filters){
|
|
336
|
|
337 my ($filename, $directories, $suffix) = fileparse($filter, qr/\.[^.]*/);
|
|
338
|
|
339 print STDOUT "\tFilter against BED: $filename\n";
|
|
340
|
|
341 #find intersect
|
|
342 `sort -k1,1 -k2,2n $filter > ref`;
|
|
343 `bedtools intersect -a sorted -b ref -v -sorted > bed`;
|
|
344 `sort -k1,1 -k2,2n bed > sorted`;
|
|
345 }
|
|
346
|
|
347 #generate new output
|
|
348 `sort -k4n sorted > bed`;
|
|
349 `cp $output table`;
|
|
350
|
|
351 open(F1, "bed") or die "error no sorted file";
|
|
352 open(F2, "table") or die "error no table file";
|
|
353 open(OUT, ">$output") or die "error cannot open output file";
|
|
354 print OUT $headers;
|
|
355 $NL=1;
|
|
356 my $line = <F2>;
|
|
357 while(<F1>)
|
|
358 {
|
|
359 my @NR=split("\t", $_);
|
|
360 while( $NL < $NR[3]){ $line = <F2>; $NL++; }
|
|
361 print OUT $line;
|
|
362 }
|
|
363 close F1;
|
|
364 close F2;
|
|
365 close OUT;
|
|
366
|
|
367 }
|
|
368
|
|
369
|
|
370
|
|
371 =head1 NAME
|
|
372
|
|
373 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
|
|
374
|
|
375 =head1 SYNOPSIS
|
|
376
|
|
377 mutspecFilter.pl [arguments] <query-file>
|
|
378
|
|
379 <query-file> an annotated file
|
|
380
|
|
381 Arguments:
|
|
382 -h, --help print help message
|
|
383 -m, --man print complete documentation
|
|
384 -v, --verbose use verbose output
|
|
385 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file (start to count from 1)
|
|
386 --segDup filter against genomic duplicate database
|
|
387 --esp filter against Exome Sequencing Project database (only for human)
|
|
388 --thG filter against 1000 genome database (onyl for human)
|
|
389 -o, --outfile <string> path to output file
|
|
390 --refGenome reference genome to use
|
|
391 --pathAVDBList path to the list of Annovar databases installed
|
|
392 --filter path to a bed file
|
|
393
|
|
394
|
|
395 Function: Filter out variants present in public databases
|
|
396
|
|
397 Example: # Filter against dbSNP
|
|
398 mutspecFilter.pl --dbSNP col_number (start to count from 1) --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
|
|
399
|
|
400 # Filter against all Annovar databases
|
|
401 mutspecFilter.pl --dbSNP col_number (start to count from 1) --segDup --esp --thG --exac --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
|
|
402
|
|
403 # Filter against additional databases in BED format
|
|
404 mutspecFilter.pl --filter path_to_bed --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
|
|
405
|
|
406
|
|
407 Version: 02-2017 (February 2017)
|
|
408
|
|
409
|
|
410 =head1 OPTIONS
|
|
411
|
|
412 =over 8
|
|
413
|
|
414 =item B<--help>
|
|
415
|
|
416 print a brief usage message and detailed explanation of options.
|
|
417
|
|
418 =item B<--man>
|
|
419
|
|
420 print the complete manual of the program.
|
|
421
|
|
422 =item B<--verbose>
|
|
423
|
|
424 use verbose output.
|
|
425
|
|
426 =item B<--dbSNP>
|
|
427
|
|
428 Remove all the variants presents in the dbSNP databases
|
|
429 Specify the number of the dbSNP column in the file (start to count from 1)
|
|
430 For human and mouse genome
|
|
431
|
|
432 =item B<--segDup>
|
|
433
|
|
434 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database
|
|
435 For human and mouse genome
|
|
436
|
|
437 =item B<--esp>
|
|
438
|
|
439 Remove all the variants with a frequency greater than 0.001 in Exome sequencing project
|
|
440 For human genome only
|
|
441
|
|
442 =item B<--thG>
|
|
443
|
|
444 Remove all the variants with a frequency greater than 0.001 in 1000 genome database
|
|
445
|
|
446
|
|
447 =item B<--exac>
|
|
448
|
|
449 Remove all the variants with a frequency greater than 0.001 in ExAC database
|
|
450
|
|
451
|
|
452 =item B<--filter>
|
|
453
|
|
454 Remove all variants present in the BED file
|
|
455
|
|
456
|
|
457 =item B<--refGenome>
|
|
458
|
|
459 The reference genome to use.
|
|
460
|
|
461 =item B<--outfile>
|
|
462
|
|
463 path to output file
|
|
464
|
|
465 =item B<--pathAVDBList>
|
|
466
|
|
467 the path to a texte file containing the list of the Annovar databases installed.
|
|
468
|
|
469 =back
|
|
470
|
|
471 =head1 DESCRIPTION
|
|
472
|
|
473 mutspecFilter - Filter a file annotated with MutSpec-Annot tool.
|
|
474 Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome, exac obtained from Annovar) will be removed from the input file (with frequency limits described above).
|
|
475 Additionally, using the --filter option, any variants present in a specified bed file will be removed from the input file.
|
|
476
|
|
477 =cut
|