comparison mutspecFilter.pl @ 0:8c682b3a7c5b draft

Uploaded
author iarc
date Tue, 19 Apr 2016 03:07:11 -0400
parents
children 916846f73e25
comparison
equal deleted inserted replaced
-1:000000000000 0:8c682b3a7c5b
1 # !/usr/bin/perl
2
3 #-----------------------------------#
4 # Author: Maude #
5 # Script: mutspecFilter.pl #
6 # Last update: 18/03/16 #
7 #-----------------------------------#
8
9 use strict;
10 use warnings;
11 use Getopt::Long;
12 use Pod::Usage;
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
14 use File::Path;
15
16 ################################################################################################################################################################################
17 # Filter an Annotaed file with Annovar #
18 ################################################################################################################################################################################
19
20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
21 our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome.
22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use.
23 our ($listAVDB) = "empty"; # Text file with the list Annovar databases.
24 our ($dir) = "";
25
26 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2);
27
28 our ($input) = @ARGV;
29
30 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
31 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
32 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
33 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
34
35
36
37 # If the dbSNP value is not equal to zero filter using the dbSNP column specify
38 our $dbSNP = 0;
39 if($dbSNP_value > 0) { $dbSNP = 1; }
40
41
42 ############ Check flags ############
43 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }
44
45 # Zero databases is specified
46 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) )
47 {
48 print STDERR "There is no databases selected for filtering against!!!\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n";
49 exit;
50 }
51
52
53
54 ############ Recover the name of the databases to filter against ############
55 my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", "");
56 my @tab_protocol = ();
57
58 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) )
59 {
60 ### Recover the name of the column
61 my $protocol = "";
62 ExtractAVDBName($listAVDB, \$protocol);
63 @tab_protocol = split(",", $protocol);
64
65 for(my $i=0; $i<=$#tab_protocol; $i++)
66 {
67 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; }
68 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; }
69 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; }
70 }
71 }
72
73
74 ############ Filter the file ############
75 filterAgainstPublicDB();
76
77
78 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n";
79
80
81 sub filterAgainstPublicDB
82 {
83 open(FILTER, ">", "$output") or die "$!: $output\n";
84
85 open(F1, $input) or die "$!: $input\n";
86 my $header = <F1>; print FILTER $header;
87 while(<F1>)
88 {
89 $_ =~ s/[\r\n]+$//;
90 my @tab = split("\t", $_);
91
92 my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0);
93
94 if($segDup == 1)
95 {
96 my $segDup_value = recoverNumCol($input, $segDup_name);
97 $segDupInfo = formatSegDupInfo($tab[$segDup_value]);
98 # Replace NA by 0 for making test on the same type of variable
99 $segDupInfo =~ s/NA/0/;
100 }
101 if($esp == 1)
102 {
103 my $espAll_value = recoverNumCol($input, $espAll_name);
104 $espAllInfo = $tab[$espAll_value];
105 # Replace NA by 0 for making test on the same type of variable
106 $espAllInfo =~ s/NA/0/;
107 }
108 if($thG == 1)
109 {
110 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name);
111 # Replace NA by 0 for making test on the same type of variable
112 $thgInfo = $tab[$thousandGenome_value];
113 $thgInfo =~ s/NA/0/;
114 }
115
116
117 ##############################
118 # One Filter #
119 ##############################
120 # Remove all the variants present in dbSNP
121 if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } }
122 # Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database
123 if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9) { print FILTER "$_\n"; } }
124 # Remove all the variants with greater than 0.001 in Exome sequencing project
125 if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) ) { if($espAllInfo <= 0.001) { print FILTER "$_\n"; } }
126 # Remove all the variants with greater than 0.001 in 1000 genome database
127 if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) ) { if($thgInfo <= 0.001) { print FILTER "$_\n"; } }
128
129
130 #############################
131 # Two Filter #
132 ##############################
133 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) ) { print FILTER "$_\n"; } }
134 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
135 if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
136
137 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
138 if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
139
140 if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
141
142
143 #############################
144 # Three Filter #
145 ##############################
146 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) )
147 { print FILTER "$_\n"; } }
148 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) )
149 { print FILTER "$_\n"; } }
150 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
151 { print FILTER "$_\n"; } }
152 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
153 { print FILTER "$_\n"; } }
154
155
156 #############################
157 # FOUR Filter #
158 ##############################
159 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
160 { print FILTER "$_\n"; } }
161
162 }
163 close F1; close FILTER;
164 }
165
166
167 sub formatSegDupInfo
168 {
169 my ($segDup_info) = @_;
170
171 if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931
172 {
173 my @segDup = split(";", $segDup_info);
174 $segDup[0] =~ /Score=(.+)/;
175 return $1;
176 }
177 else { return $segDup_info; }
178 }
179
180
181 sub ExtractAVDBName
182 {
183 my ($listAVDB, $refS_protocol) = @_;
184
185 open(F1, $listAVDB) or die "$!: $listAVDB\n";
186 while(<F1>)
187 {
188 if ($_ =~ /^#/) { next; }
189
190 $_ =~ s/[\r\n]+$//;
191 my @tab = split("\t", $_);
192
193 # db name like refGenome_dbName.txt
194 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) )
195 {
196 my $temp = $1;
197 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; }
198 }
199 # 1000 genome
200 if($tab[0] =~ /sites/)
201 {
202 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
203 my ($dbName, $year, $month) = ($1, $2, $3);
204 $dbName =~ tr/A-Z/a-z/;
205
206 # convert the month number into the month name
207 ConvertMonth(\$month);
208
209 my $AVdbName_final = "1000g".$year.$month."_".$dbName;
210
211 if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; }
212 }
213 # ESP
214 if($tab[0] =~ /esp/)
215 {
216 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
217 my $AVdbName_final = $1."_".$2;
218
219 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; }
220 }
221 }
222 close F1;
223
224 sub ConvertMonth
225 {
226 my ($refS_month) = @_;
227
228 if($$refS_month == 1) { $$refS_month = "janv"; }
229 elsif($$refS_month == 2) { $$refS_month = "feb"; }
230 elsif($$refS_month == 3) { $$refS_month = "mar"; }
231 elsif($$refS_month == 4) { $$refS_month = "apr"; }
232 elsif($$refS_month == 5) { $$refS_month = "may"; }
233 elsif($$refS_month == 6) { $$refS_month = "jun"; }
234 elsif($$refS_month == 7) { $$refS_month = "jul"; }
235 elsif($$refS_month == 8) { $$refS_month = "aug"; }
236 elsif($$refS_month == 9) { $$refS_month = "sept"; }
237 elsif($$refS_month == 10) { $$refS_month = "oct"; }
238 elsif($$refS_month == 11) { $$refS_month = "nov"; }
239 elsif($$refS_month == 12) { $$refS_month = "dec"; }
240 else { print STDERR "Month number don't considered\n"; exit; }
241 }
242 }
243
244
245 sub recoverNumCol
246 {
247 my ($input, $name_of_column) = @_;
248
249 # With Annovar updates the databases name changed and are present in an array
250 if( ref($name_of_column) eq "ARRAY" )
251 {
252 my $test = "";
253 my @tab = @$name_of_column;
254 foreach (@tab)
255 {
256 open(F1,$input) or die "$!: $input\n";
257 # For having the name of the columns
258 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
259 close F1;
260 # The number of the column
261 my $name_of_column_NB = "toto";
262 for(my $i=0; $i<=$#tab_search_header; $i++)
263 {
264 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; }
265 }
266 if($name_of_column_NB eq "toto") { next; }
267 else { return $name_of_column_NB; }
268 }
269 if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
270 }
271 # Only one name is pass
272 else
273 {
274 open(FT,$input) or die "$!: $input\n";
275 # For having the name of the columns
276 my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
277 close FT;
278 # The number of the column
279 my $name_of_column_NB = "toto";
280 for(my $i=0; $i<=$#tab_search_header; $i++)
281 {
282 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; }
283 }
284 if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
285 else { return $name_of_column_NB; }
286 }
287 }
288
289 =head1 NAME
290
291 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
292
293 =head1 SYNOPSIS
294
295 mutspecFilter.pl [arguments] <query-file>
296
297 <query-file> an annotated file
298
299 Arguments:
300 -h, --help print help message
301 -m, --man print complete documentation
302 -v, --verbose use verbose output
303 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file
304 --segDup filter against genomic duplicate database
305 --esp filter against Exome Sequencing Project database (only for human)
306 --thG filter against 1000 genome database (onyl for human)
307 -o, --outfile <string> name of output file
308 --refGenome reference genome to use
309 --pathAVDBList path to the list of Annovar databases installed
310
311
312 Function: Filter out variants present in public databases
313
314 Example: # Filter against dbSNP
315 mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
316
317 # Filter against the four databases
318 mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
319
320
321 Version: 03-2016 (March 2016)
322
323
324 =head1 OPTIONS
325
326 =over 8
327
328 =item B<--help>
329
330 print a brief usage message and detailed explanation of options.
331
332 =item B<--man>
333
334 print the complete manual of the program.
335
336 =item B<--verbose>
337
338 use verbose output.
339
340 =item B<--dbSNP>
341
342 Remove all the variants presents in the dbSNP databases
343 Specify the number of column containing the annotation
344 For human and mouse genome
345
346 =item B<--segDup>
347
348 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database
349 For human and mouse genome
350
351 =item B<--esp>
352
353 Remove all the variants with a frequency greater than 0.001 in Exome sequencing project
354 For human genome only
355
356 =item B<--thG>
357
358 Remove all the variants with a frequency greater than 0.001 in 1000 genome database
359
360 =item B<--refGenome>
361
362 the reference genome to use, could be hg19 or mm9.
363
364 =item B<--outfile>
365
366 the name of the output file
367
368 =item B<--pathAVDBList>
369
370 the path to a texte file containing the list of the Annovar databases installed.
371
372 =back
373
374 =head1 DESCRIPTION
375
376 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
377
378 =cut