Mercurial > repos > iarc > mutspec
comparison mutspecFilter.pl @ 7:eda59b985b1c draft default tip
Uploaded
author | iarc |
---|---|
date | Mon, 13 Mar 2017 08:21:19 -0400 |
parents | 46a10309dfe2 |
children |
comparison
equal
deleted
inserted
replaced
6:46a10309dfe2 | 7:eda59b985b1c |
---|---|
1 # !/usr/bin/perl | 1 # !/usr/bin/perl |
2 | 2 |
3 #-----------------------------------# | 3 #-----------------------------------# |
4 # Author: Maude # | 4 # Author: Maude / Vincent # |
5 # Script: mutspecFilter.pl # | 5 # Script: mutspecFilter.pl # |
6 # Last update: 18/03/16 # | 6 # Last update: 01/02/17 # |
7 #-----------------------------------# | 7 #-----------------------------------# |
8 | 8 |
9 use strict; | 9 use strict; |
10 use warnings; | 10 use warnings; |
11 use Getopt::Long; | 11 use Getopt::Long; |
12 use Pod::Usage; | 12 use Pod::Usage; |
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); | 13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); |
14 use File::Path; | 14 use File::Path; |
15 | 15 |
16 ################################################################################################################################################################################ | 16 ######################################################################################################################################################### |
17 # Filter an Annotaed file with Annovar # | 17 # Filter an Annotaed file with Annovar # |
18 ################################################################################################################################################################################ | 18 ######################################################################################################################################################### |
19 | 19 |
20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. | 20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. |
21 our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome. | 21 our ($dbSNP_value, $segDup, $esp, $thG, $exac) = (0, 0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome, ExAC. |
22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use. | 22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use. |
23 our ($listAVDB) = "empty"; # Text file with the list Annovar databases. | 23 our ($listAVDB) = "empty"; # Text file with the list of Annovar databases. |
24 our ($dir) = ""; | 24 our ($dir) = ""; # Directory containing the script + the text file with the list of Annovar databases |
25 | 25 our (@filters); # Path to BED file(s) to filter against |
26 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2); | 26 |
27 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'exac'=>\$exac, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB, 'filter=s'=> \@filters) or pod2usage(2); | |
27 | 28 |
28 our ($input) = @ARGV; | 29 our ($input) = @ARGV; |
29 | 30 |
30 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); | 31 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); |
31 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); | 32 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); |
41 | 42 |
42 ############ Check flags ############ | 43 ############ Check flags ############ |
43 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" } | 44 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" } |
44 | 45 |
45 # Zero databases is specified | 46 # Zero databases is specified |
46 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) ) | 47 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) && ($exac == 0) && (scalar(@filters) == 0) ) |
47 { | 48 { |
48 print STDERR "There is no databases selected for filtering against!!!\nPlease chose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n"; | 49 print STDERR "Error message:\n"; |
50 print STDERR "There is no databases selected for filtering against!!!\n"; | |
51 print STDERR "Please chose at least one between dbSNP, SegDup (only for human and mouse genomes), ESP (only for human genome), 1000 genome (only for human genome) or ExAC (only for human genome)\n"; | |
52 print STDERR "Or specify a BED file\n"; | |
49 exit; | 53 exit; |
50 } | 54 } |
51 | 55 |
52 | 56 |
53 | 57 |
54 ############ Recover the name of the databases to filter against ############ | 58 ############ Recover the name of the databases to filter against ############ |
55 my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", ""); | 59 my ($segDup_name, $espAll_name, $thousandGenome_name, $exac_name) = ("", "", "", ""); |
56 my @tab_protocol = (); | 60 my @tab_protocol = (); |
57 | 61 |
58 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) ) | 62 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) || ($exac == 1)) |
59 { | 63 { |
60 ### Recover the name of the column | 64 ### Recover the name of the column |
61 my $protocol = ""; | 65 my $protocol = ""; |
62 ExtractAVDBName($listAVDB, \$protocol); | 66 ExtractAVDBName($listAVDB, \$protocol); |
63 @tab_protocol = split(",", $protocol); | 67 @tab_protocol = split(",", $protocol); |
65 for(my $i=0; $i<=$#tab_protocol; $i++) | 69 for(my $i=0; $i<=$#tab_protocol; $i++) |
66 { | 70 { |
67 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; } | 71 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; } |
68 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; } | 72 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; } |
69 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; } | 73 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; } |
74 elsif($tab_protocol[$i] =~ /exac/i) { $exac_name = $tab_protocol[$i]; } | |
70 } | 75 } |
71 } | 76 } |
72 | 77 |
73 | 78 |
74 ############ Filter the file ############ | 79 ############ Filter the file ############ |
75 filterAgainstPublicDB(); | 80 filterAgainstPublicDB(); |
76 | 81 |
77 | 82 |
78 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n"; | 83 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\tEXac = ". $exac . "\n"; |
84 | |
85 | |
86 ### Write a message if the input file contains zero variants or if all the variants are filtered out | |
87 my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/); | |
88 my $nbVariantsIn = `wc -l $input`; | |
89 $nbVariantsIn =~ /(\d+).+/; | |
90 my $nbLineIn = $1; | |
91 if($nbLineIn == 1) | |
92 { | |
93 print STDERR "Error message:\n"; | |
94 print STDERR "\nThere is no variant to be filtered for $filename\n"; | |
95 print STDERR "Check MutSpecAnnot standard output for more informations\n"; | |
96 exit; | |
97 } | |
98 else | |
99 { | |
100 my ($filenameOut, $directoriesOut, $suffixOut) = fileparse($output, qr/\.[^.]*/); | |
101 my $nbVariantsOut = `wc -l $output`; | |
102 $nbVariantsOut =~ /(\d+).+/; | |
103 my $nbLineOut = $1; | |
104 if($nbLineOut == 1) | |
105 { | |
106 print STDOUT "Warning message:\n"; | |
107 print STDOUT "\nAll the variants were filtered out for $filenameOut\n"; | |
108 } | |
109 } | |
110 | |
111 ### filter versus additional VCF files if provided. | |
112 if ( scalar(@filters) > 0) { filterAdditionalBED(); } | |
113 | |
114 | |
79 | 115 |
80 | 116 |
81 sub filterAgainstPublicDB | 117 sub filterAgainstPublicDB |
82 { | 118 { |
83 open(FILTER, ">", "$output") or die "$!: $output\n"; | 119 open(FILTER, ">", "$output") or die "$!: $output\n"; |
87 while(<F1>) | 123 while(<F1>) |
88 { | 124 { |
89 $_ =~ s/[\r\n]+$//; | 125 $_ =~ s/[\r\n]+$//; |
90 my @tab = split("\t", $_); | 126 my @tab = split("\t", $_); |
91 | 127 |
92 my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0); | 128 my ($segDupInfo, $espAllInfo, $thgInfo, $exacInfo) = (0, 0 ,0, 0); |
93 | 129 |
94 if($segDup == 1) | 130 if($segDup == 1) |
95 { | 131 { |
96 my $segDup_value = recoverNumCol($input, $segDup_name); | 132 my $segDup_value = recoverNumCol($input, $segDup_name); |
97 $segDupInfo = formatSegDupInfo($tab[$segDup_value]); | 133 $segDupInfo = formatSegDupInfo($tab[$segDup_value]); |
110 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name); | 146 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name); |
111 # Replace NA by 0 for making test on the same type of variable | 147 # Replace NA by 0 for making test on the same type of variable |
112 $thgInfo = $tab[$thousandGenome_value]; | 148 $thgInfo = $tab[$thousandGenome_value]; |
113 $thgInfo =~ s/NA/0/; | 149 $thgInfo =~ s/NA/0/; |
114 } | 150 } |
115 | 151 if($exac == 1) |
116 | 152 { |
117 ############################## | 153 my $exac_value = recoverNumCol($input, $exac_name); |
118 # One Filter # | 154 # Replace NA by 0 for making test on the same type of variable |
119 ############################## | 155 $exacInfo = $tab[$exac_value]; |
120 # Remove all the variants present in dbSNP | 156 $exacInfo =~ s/NA/0/; |
121 if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } } | 157 } |
122 # Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database | 158 |
123 if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9) { print FILTER "$_\n"; } } | 159 my $filter = 0; |
124 # Remove all the variants with greater than 0.001 in Exome sequencing project | 160 if( $dbSNP == 1 && $tab[$dbSNP_value-1] ne "NA" ){ $filter = 1; } |
125 if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) ) { if($espAllInfo <= 0.001) { print FILTER "$_\n"; } } | 161 if( $segDup == 1 && $segDupInfo >= 0.9) { $filter = 1; } |
126 # Remove all the variants with greater than 0.001 in 1000 genome database | 162 if( $esp == 1 && $espAllInfo > 0.001) { $filter = 1; } |
127 if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) ) { if($thgInfo <= 0.001) { print FILTER "$_\n"; } } | 163 if( $thG == 1 && $thgInfo > 0.001) { $filter = 1; } |
128 | 164 if( $thG == 1 && $exacInfo > 0.001) { $filter = 1; } |
129 | 165 |
130 ############################# | 166 if (!$filter) { print FILTER "$_\n"; } |
131 # Two Filter # | |
132 ############################## | |
133 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) ) { print FILTER "$_\n"; } } | |
134 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
135 if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
136 | |
137 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
138 if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
139 | |
140 if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
141 | |
142 | |
143 ############################# | |
144 # Three Filter # | |
145 ############################## | |
146 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) | |
147 { print FILTER "$_\n"; } } | |
148 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) | |
149 { print FILTER "$_\n"; } } | |
150 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
151 { print FILTER "$_\n"; } } | |
152 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
153 { print FILTER "$_\n"; } } | |
154 | |
155 | |
156 ############################# | |
157 # FOUR Filter # | |
158 ############################## | |
159 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
160 { print FILTER "$_\n"; } } | |
161 | 167 |
162 } | 168 } |
163 close F1; close FILTER; | 169 close F1; close FILTER; |
164 } | 170 } |
165 | 171 |
189 | 195 |
190 $_ =~ s/[\r\n]+$//; | 196 $_ =~ s/[\r\n]+$//; |
191 my @tab = split("\t", $_); | 197 my @tab = split("\t", $_); |
192 | 198 |
193 # db name like refGenome_dbName.txt | 199 # db name like refGenome_dbName.txt |
194 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) ) | 200 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) && ($tab[0] !~ /exac/i) ) |
195 { | 201 { |
196 my $temp = $1; | 202 my $temp = $1; |
197 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; } | 203 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; } |
198 } | 204 } |
199 # 1000 genome | 205 # 1000 genome |
216 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; | 222 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; |
217 my $AVdbName_final = $1."_".$2; | 223 my $AVdbName_final = $1."_".$2; |
218 | 224 |
219 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; } | 225 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; } |
220 } | 226 } |
227 # EXAC | |
228 if($tab[0] =~ /exac/i) | |
229 { | |
230 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; | |
231 my $AVdbName_final = "ExAC_ALL"; | |
232 | |
233 $$refS_protocol .= $AVdbName_final.","; | |
234 } | |
235 | |
221 } | 236 } |
222 close F1; | 237 close F1; |
223 | 238 |
224 sub ConvertMonth | 239 sub ConvertMonth |
225 { | 240 { |
235 elsif($$refS_month == 8) { $$refS_month = "aug"; } | 250 elsif($$refS_month == 8) { $$refS_month = "aug"; } |
236 elsif($$refS_month == 9) { $$refS_month = "sept"; } | 251 elsif($$refS_month == 9) { $$refS_month = "sept"; } |
237 elsif($$refS_month == 10) { $$refS_month = "oct"; } | 252 elsif($$refS_month == 10) { $$refS_month = "oct"; } |
238 elsif($$refS_month == 11) { $$refS_month = "nov"; } | 253 elsif($$refS_month == 11) { $$refS_month = "nov"; } |
239 elsif($$refS_month == 12) { $$refS_month = "dec"; } | 254 elsif($$refS_month == 12) { $$refS_month = "dec"; } |
240 else { print STDERR "Month number don't considered\n"; exit; } | |
241 } | 255 } |
242 } | 256 } |
243 | 257 |
244 | 258 |
245 sub recoverNumCol | 259 sub recoverNumCol |
264 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; } | 278 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; } |
265 } | 279 } |
266 if($name_of_column_NB eq "toto") { next; } | 280 if($name_of_column_NB eq "toto") { next; } |
267 else { return $name_of_column_NB; } | 281 else { return $name_of_column_NB; } |
268 } | 282 } |
269 if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } | 283 if($name_of_column eq "toto") |
284 { | |
285 print STDERR "Error message:\n"; | |
286 print STDERR "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; | |
287 exit; | |
288 } | |
270 } | 289 } |
271 # Only one name is pass | 290 # Only one name is pass |
272 else | 291 else |
273 { | 292 { |
274 open(FT,$input) or die "$!: $input\n"; | 293 open(FT,$input) or die "$!: $input\n"; |
279 my $name_of_column_NB = "toto"; | 298 my $name_of_column_NB = "toto"; |
280 for(my $i=0; $i<=$#tab_search_header; $i++) | 299 for(my $i=0; $i<=$#tab_search_header; $i++) |
281 { | 300 { |
282 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; } | 301 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; } |
283 } | 302 } |
284 if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } | 303 if($name_of_column_NB eq "toto") |
285 else { return $name_of_column_NB; } | 304 { |
286 } | 305 print STDERR "Error message:\n"; |
287 } | 306 print STDERR "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; |
307 exit; | |
308 } | |
309 else | |
310 { | |
311 return $name_of_column_NB; | |
312 } | |
313 } | |
314 } | |
315 | |
316 | |
317 sub filterAdditionalBED{ | |
318 | |
319 #create bed | |
320 open(TABLE, "$output") or die "$!: $output\n"; | |
321 open(F1, ">bed") or die "cannot create bed file"; | |
322 my $NL=1; | |
323 my $headers=<TABLE>; | |
324 while(<TABLE>) | |
325 { | |
326 $NL++; | |
327 my @line=split("\t", $_); | |
328 print F1 "$line[0]\t$line[1]\t$line[2]\t$NL\n"; | |
329 } | |
330 close F1; | |
331 close TABLE; | |
332 #and sort it | |
333 `sort -k1,1 -k2,2n bed > sorted`; | |
334 | |
335 foreach my $filter (@filters){ | |
336 | |
337 my ($filename, $directories, $suffix) = fileparse($filter, qr/\.[^.]*/); | |
338 | |
339 print STDOUT "\tFilter against BED: $filename\n"; | |
340 | |
341 #find intersect | |
342 `sort -k1,1 -k2,2n $filter > ref`; | |
343 `bedtools intersect -a sorted -b ref -v -sorted > bed`; | |
344 `sort -k1,1 -k2,2n bed > sorted`; | |
345 } | |
346 | |
347 #generate new output | |
348 `sort -k4n sorted > bed`; | |
349 `cp $output table`; | |
350 | |
351 open(F1, "bed") or die "error no sorted file"; | |
352 open(F2, "table") or die "error no table file"; | |
353 open(OUT, ">$output") or die "error cannot open output file"; | |
354 print OUT $headers; | |
355 $NL=1; | |
356 my $line = <F2>; | |
357 while(<F1>) | |
358 { | |
359 my @NR=split("\t", $_); | |
360 while( $NL < $NR[3]){ $line = <F2>; $NL++; } | |
361 print OUT $line; | |
362 } | |
363 close F1; | |
364 close F2; | |
365 close OUT; | |
366 | |
367 } | |
368 | |
369 | |
288 | 370 |
289 =head1 NAME | 371 =head1 NAME |
290 | 372 |
291 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) | 373 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) |
292 | 374 |
298 | 380 |
299 Arguments: | 381 Arguments: |
300 -h, --help print help message | 382 -h, --help print help message |
301 -m, --man print complete documentation | 383 -m, --man print complete documentation |
302 -v, --verbose use verbose output | 384 -v, --verbose use verbose output |
303 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file | 385 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file (start to count from 1) |
304 --segDup filter against genomic duplicate database | 386 --segDup filter against genomic duplicate database |
305 --esp filter against Exome Sequencing Project database (only for human) | 387 --esp filter against Exome Sequencing Project database (only for human) |
306 --thG filter against 1000 genome database (onyl for human) | 388 --thG filter against 1000 genome database (onyl for human) |
307 -o, --outfile <string> name of output file | 389 -o, --outfile <string> path to output file |
308 --refGenome reference genome to use | 390 --refGenome reference genome to use |
309 --pathAVDBList path to the list of Annovar databases installed | 391 --pathAVDBList path to the list of Annovar databases installed |
392 --filter path to a bed file | |
310 | 393 |
311 | 394 |
312 Function: Filter out variants present in public databases | 395 Function: Filter out variants present in public databases |
313 | 396 |
314 Example: # Filter against dbSNP | 397 Example: # Filter against dbSNP |
315 mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input | 398 mutspecFilter.pl --dbSNP col_number (start to count from 1) --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input |
316 | 399 |
317 # Filter against the four databases | 400 # Filter against all Annovar databases |
318 mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input | 401 mutspecFilter.pl --dbSNP col_number (start to count from 1) --segDup --esp --thG --exac --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input |
319 | 402 |
320 | 403 # Filter against additional databases in BED format |
321 Version: 03-2016 (March 2016) | 404 mutspecFilter.pl --filter path_to_bed --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input |
405 | |
406 | |
407 Version: 02-2017 (February 2017) | |
322 | 408 |
323 | 409 |
324 =head1 OPTIONS | 410 =head1 OPTIONS |
325 | 411 |
326 =over 8 | 412 =over 8 |
338 use verbose output. | 424 use verbose output. |
339 | 425 |
340 =item B<--dbSNP> | 426 =item B<--dbSNP> |
341 | 427 |
342 Remove all the variants presents in the dbSNP databases | 428 Remove all the variants presents in the dbSNP databases |
343 Specify the number of column containing the annotation | 429 Specify the number of the dbSNP column in the file (start to count from 1) |
344 For human and mouse genome | 430 For human and mouse genome |
345 | 431 |
346 =item B<--segDup> | 432 =item B<--segDup> |
347 | 433 |
348 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database | 434 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database |
355 | 441 |
356 =item B<--thG> | 442 =item B<--thG> |
357 | 443 |
358 Remove all the variants with a frequency greater than 0.001 in 1000 genome database | 444 Remove all the variants with a frequency greater than 0.001 in 1000 genome database |
359 | 445 |
446 | |
447 =item B<--exac> | |
448 | |
449 Remove all the variants with a frequency greater than 0.001 in ExAC database | |
450 | |
451 | |
452 =item B<--filter> | |
453 | |
454 Remove all variants present in the BED file | |
455 | |
456 | |
360 =item B<--refGenome> | 457 =item B<--refGenome> |
361 | 458 |
362 the reference genome to use, could be hg19 or mm9. | 459 The reference genome to use. |
363 | 460 |
364 =item B<--outfile> | 461 =item B<--outfile> |
365 | 462 |
366 the name of the output file | 463 path to output file |
367 | 464 |
368 =item B<--pathAVDBList> | 465 =item B<--pathAVDBList> |
369 | 466 |
370 the path to a texte file containing the list of the Annovar databases installed. | 467 the path to a texte file containing the list of the Annovar databases installed. |
371 | 468 |
372 =back | 469 =back |
373 | 470 |
374 =head1 DESCRIPTION | 471 =head1 DESCRIPTION |
375 | 472 |
376 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) | 473 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. |
474 Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome, exac obtained from Annovar) will be removed from the input file (with frequency limits described above). | |
475 Additionally, using the --filter option, any variants present in a specified bed file will be removed from the input file. | |
377 | 476 |
378 =cut | 477 =cut |