annotate mutspecFilter.pl @ 7:eda59b985b1c draft default tip

Uploaded
author iarc
date Mon, 13 Mar 2017 08:21:19 -0400
parents 46a10309dfe2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1 # !/usr/bin/perl
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
2
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
3 #-----------------------------------#
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
4 # Author: Maude / Vincent #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
5 # Script: mutspecFilter.pl #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
6 # Last update: 01/02/17 #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
7 #-----------------------------------#
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
8
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
9 use strict;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
10 use warnings;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
11 use Getopt::Long;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
12 use Pod::Usage;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
14 use File::Path;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
15
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
16 #########################################################################################################################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
17 # Filter a file annotated with Annovar #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
18 #########################################################################################################################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
19
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
# ---------------------------------------------------------------------------
# Command-line state.
# These are package globals ("our") because the subs defined further down in
# this file read them directly instead of receiving them as arguments.
# ---------------------------------------------------------------------------
our ($verbose, $man, $help) = (0, 0, 0);                           # Verbosity and documentation flags.
our ($dbSNP_value, $segDup, $esp, $thG, $exac) = (0, 0, 0, 0, 0);  # Filters: dbSNP column number (1-based), genomic duplicate segments, Exome Sequencing Project, 1000 genome, ExAC.
our ($output, $refGenome) = ("", "");                              # The path for saving the result; the reference genome to use.
our ($listAVDB) = "empty";                                         # Text file with the list of Annovar databases ("empty" = not given).
our ($dir) = "";                                                   # Directory containing the script + the text file with the list of Annovar databases.
our (@filters);                                                    # Path to BED file(s) to filter against.

# Parse options and print usage if there is a syntax error.
GetOptions(
    'dir|d=s'        => \$dir,
    'verbose|v'      => \$verbose,
    'help|h'         => \$help,
    'man|m'          => \$man,
    'dbSNP=i'        => \$dbSNP_value,
    'segDup'         => \$segDup,
    'esp'            => \$esp,
    'thG'            => \$thG,
    'exac'           => \$exac,
    'outfile|o=s'    => \$output,
    'refGenome=s'    => \$refGenome,
    'pathAVDBList=s' => \$listAVDB,
    'filter=s'       => \@filters,
) or pod2usage(2);

# What is left in @ARGV after option parsing is the input file.
our ($input) = @ARGV;

pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if (@ARGV == 0); # No argument was passed on the command line: print the usage of the script.
# Exactly one positional argument (the input) is expected. The previous test
# (@ARGV == 2) let three or more arguments through unnoticed.
pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if (@ARGV > 1);


# --dbSNP carries the (1-based) number of the dbSNP column; any value greater
# than zero switches the dbSNP filter on.
our $dbSNP = ($dbSNP_value > 0) ? 1 : 0;


############ Check flags ############
# Default location of the Annovar database list when --pathAVDBList is not given.
if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }

# Refuse to run when neither a database nor a BED file was selected.
if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) && ($exac == 0) && (scalar(@filters) == 0) )
{
    print STDERR "Error message:\n";
    print STDERR "There is no databases selected for filtering against!!!\n";
    print STDERR "Please chose at least one between dbSNP, SegDup (only for human and mouse genomes), ESP (only for human genome), 1000 genome (only for human genome) or ExAC (only for human genome)\n";
    print STDERR "Or specify a BED file\n";
    exit;
}
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
58 ############ Recover the name of the databases to filter against ############
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
# Column names of the selected databases, as they appear in the annotated file.
# They are file-scoped lexicals because filterAgainstPublicDB() reads them.
my ($segDup_name, $espAll_name, $thousandGenome_name, $exac_name) = ("", "", "", "");
my @tab_protocol = ();

# The database list only has to be read when at least one Annovar database
# filter was requested.
if( $segDup == 1 || $esp == 1 || $thG == 1 || $exac == 1 )
{
    # ExtractAVDBName() fills $protocol with a comma separated list of names.
    my $protocol = "";
    ExtractAVDBName($listAVDB, \$protocol);
    @tab_protocol = split(",", $protocol);

    # Sort every recovered name into the variable of the database it belongs to.
    foreach my $dbColumn (@tab_protocol)
    {
        if   ($dbColumn =~ /genomicSuperDups/) { $segDup_name         = $dbColumn; }
        elsif($dbColumn =~ /1000g/)            { $thousandGenome_name = $dbColumn; }
        elsif($dbColumn =~ /esp/)              { $espAll_name         = $dbColumn; }
        elsif($dbColumn =~ /exac/i)            { $exac_name           = $dbColumn; }
    }
}
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
77
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
78
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
79 ############ Filter the file ############
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
# Write the filtered table ($output) from the annotated input ($input).
filterAgainstPublicDB();


print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\tEXac = ". $exac . "\n";


### Write a message if the input file contains zero variants or if all the variants are filtered out
my ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/);

# Lines are counted in Perl rather than with `wc -l $input`: the path is no
# longer interpolated into a shell command, and a last line without a trailing
# newline is counted too.
my $nbLineIn = countLines($input);
if($nbLineIn <= 1)
{
    # Only the header (or nothing at all) is present in the input.
    print STDERR "Error message:\n";
    print STDERR "\nThere is no variant to be filtered for $filename\n";
    print STDERR "Check MutSpecAnnot standard output for more informations\n";
    exit;
}

### Filter versus additional BED files if provided.
# This runs before the output is counted so that the warning below also covers
# variants removed by the BED files.
if ( scalar(@filters) > 0) { filterAdditionalBED(); }

my ($filenameOut, $directoriesOut, $suffixOut) = fileparse($output, qr/\.[^.]*/);
my $nbLineOut = countLines($output);
if($nbLineOut <= 1)
{
    print STDOUT "Warning message:\n";
    print STDOUT "\nAll the variants were filtered out for $filenameOut\n";
}


# Count the lines of a text file.
#   $path : path of the file to read
# Returns the number of lines (a final line without newline is counted).
# Dies when the file cannot be opened.
sub countLines
{
    my ($path) = @_;

    open(my $fh, "<", $path) or die "$!: $path\n";
    my $nbLines = 0;
    while(my $line = <$fh>) { $nbLines++; }
    close $fh;

    return $nbLines;
}
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
113
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
114
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
115
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
116
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
# Filter the annotated input table against the selected public databases.
#
# Takes no argument: it reads the file-level settings ($input, $output, $dbSNP,
# $dbSNP_value, $segDup, $esp, $thG, $exac) and the database column names
# ($segDup_name, $espAll_name, $thousandGenome_name, $exac_name).
#
# The header line is copied unchanged to $output. A data line is dropped when
# at least one selected filter matches:
#   - dbSNP : the dbSNP column is not "NA"
#   - segDup: the segmental duplication score is >= 0.9
#   - ESP / 1000 genome / ExAC: the frequency is > 0.001
# Dies when the input or the output file cannot be opened or written.
sub filterAgainstPublicDB
{
    open(my $out, ">", $output) or die "$!: $output\n";
    open(my $in,  "<", $input)  or die "$!: $input\n";

    # An empty input has no header: nothing to copy in that case.
    my $header = <$in>;
    print $out $header if defined $header;

    # Column numbers are looked up the first time a data line needs them and
    # then cached: recoverNumCol() re-reads the header of $input on each call,
    # which used to happen once per database and per data line.
    my %columnIndex;
    my $columnOf = sub
    {
        my ($name) = @_;
        $columnIndex{$name} = recoverNumCol($input, $name) unless exists $columnIndex{$name};
        return $columnIndex{$name};
    };

    while(my $line = <$in>)
    {
        $line =~ s/[\r\n]+$//;
        my @tab = split("\t", $line);

        my ($segDupInfo, $espAllInfo, $thgInfo, $exacInfo) = (0, 0 ,0, 0);

        if($segDup == 1)
        {
            $segDupInfo = formatSegDupInfo($tab[$columnOf->($segDup_name)]);
            # Replace NA by 0 for making test on the same type of variable
            $segDupInfo =~ s/NA/0/;
        }
        if($esp == 1)
        {
            $espAllInfo = $tab[$columnOf->($espAll_name)];
            # Replace NA by 0 for making test on the same type of variable
            $espAllInfo =~ s/NA/0/;
        }
        if($thG == 1)
        {
            $thgInfo = $tab[$columnOf->($thousandGenome_name)];
            # Replace NA by 0 for making test on the same type of variable
            $thgInfo =~ s/NA/0/;
        }
        if($exac == 1)
        {
            $exacInfo = $tab[$columnOf->($exac_name)];
            # Replace NA by 0 for making test on the same type of variable
            $exacInfo =~ s/NA/0/;
        }

        my $filter = 0;
        # $dbSNP_value is the 1-based number of the dbSNP column.
        if( $dbSNP  == 1 && $tab[$dbSNP_value-1] ne "NA" ) { $filter = 1; }
        if( $segDup == 1 && $segDupInfo >= 0.9 )           { $filter = 1; }
        if( $esp    == 1 && $espAllInfo > 0.001 )          { $filter = 1; }
        if( $thG    == 1 && $thgInfo > 0.001 )             { $filter = 1; }
        # The ExAC test was guarded by $thG, so --exac alone never filtered.
        if( $exac   == 1 && $exacInfo > 0.001 )            { $filter = 1; }

        if (!$filter) { print $out "$line\n"; }
    }
    close $in;
    # Buffered write errors only show up when the handle is closed.
    close $out or die "$!: $output\n";
}
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
171
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
172
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
# Extract the score from a genomicSuperDups annotation.
#   $segDup_info : content of the column, e.g. "Score=0.907883;Name=chr9:36302931"
#                  or "NA" when the variant is not in a duplicated segment
# Returns the text following "Score=" in the first ";"-separated field, or "NA"
# when the value is "NA", undefined or carries no score.
sub formatSegDupInfo
{
    my ($segDup_info) = @_;

    return "NA" unless defined $segDup_info;
    return $segDup_info if $segDup_info eq "NA";

    # Score=0.907883;Name=chr9:36302931
    my ($scoreField) = split(";", $segDup_info);

    # $1 is only trusted after a successful match: after a failed match it still
    # holds the capture of an earlier, unrelated regex.
    if(defined $scoreField && $scoreField =~ /Score=(.+)/) { return $1; }

    return "NA";
}
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
185
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
186
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
# Build the list of Annovar database names usable for filtering.
#   $listAVDB      : path to the text file listing the installed Annovar
#                    databases; the first tab-separated field of each line is
#                    a database file name, lines starting with "#" are ignored
#   $refS_protocol : reference to a string; every retained name is appended to
#                    it, followed by a comma
# Retained names:
#   - genomicSuperDups        from refGenome_genomicSuperDups.txt
#   - 1000g<year><month>_all  from refGenome_ALL.sites.<year>_<month>.txt
#   - <esp name>_all          from refGenome_<esp name>_all.txt
#   - ExAC_ALL                from any file name containing "exac"
# Dies when the list cannot be opened.
sub ExtractAVDBName
{
    my ($listAVDB, $refS_protocol) = @_;

    open(my $list, "<", $listAVDB) or die "$!: $listAVDB\n";
    while(my $line = <$list>)
    {
        if ($line =~ /^#/) { next; }

        $line =~ s/[\r\n]+$//;
        my ($dbFile) = split("\t", $line);
        next unless defined $dbFile;    # blank line

        # db name like refGenome_dbName.txt
        if( ($dbFile !~ /sites|esp|sift|pp2/) && ($dbFile !~ /exac/i) && ($dbFile =~ /\w+_(\w+)\.txt/) )
        {
            my $dbName = $1;
            if($dbName =~ /genomicSuperDups/) { $$refS_protocol .= $dbName.","; }
        }
        # 1000 genome
        # The captures are only used when the full pattern matched.
        if($dbFile =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/)
        {
            my ($dbName, $year, $month) = ($1, $2, $3);
            $dbName =~ tr/A-Z/a-z/;

            # convert the month number into the month name
            ConvertMonth(\$month);

            my $AVdbName_final = "1000g".$year.$month."_".$dbName;

            if($dbName eq "all") { $$refS_protocol .= $AVdbName_final.","; }
        }
        # ESP
        if( ($dbFile =~ /esp/) && ($dbFile =~ /\w+_(\w+)_(\w+)\.txt/) )
        {
            my ($espName, $population) = ($1, $2);

            if($population eq "all") { $$refS_protocol .= $espName."_".$population.","; }
        }
        # EXAC
        if($dbFile =~ /exac/i)
        {
            $$refS_protocol .= "ExAC_ALL,";
        }
    }
    close $list;
}

# Replace a month number by its three-letter abbreviation.
#   $refS_month : reference to a month number ("1".."12", leading zero allowed);
#                 it is overwritten with "jan", "feb", ... "dec"
# Any other value is left untouched.
# Annovar names its 1000 genome databases with three-letter months (for
# instance 1000g2014sep_all), hence "jan" and "sep" rather than the former
# "janv" and "sept".
sub ConvertMonth
{
    my ($refS_month) = @_;

    my @monthNames = qw(jan feb mar apr may jun jul aug sep oct nov dec);

    my $month = $$refS_month;
    if( defined $month && $month =~ /^\d+$/ && $month >= 1 && $month <= 12 )
    {
        $$refS_month = $monthNames[$month - 1];
    }
}
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
257
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
258
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
# Find the position of a column in the header of a tab-separated file.
#   $input          : path to the file; only its first line is read
#   $name_of_column : the column name, or a reference to an array of candidate
#                     names (Annovar updates renamed some databases); the first
#                     candidate present in the header is used
# Returns the 0-based index of the column. When the header contains the same
# name several times, the last occurrence is returned.
# When no name is found an error is printed on STDERR and the script exits.
# Dies when the file cannot be opened.
sub recoverNumCol
{
    my ($input, $name_of_column) = @_;

    # Read the header once, whatever the number of candidate names.
    open(my $fh, "<", $input) or die "$!: $input\n";
    my $search_header = <$fh>;
    close $fh;
    $search_header = "" unless defined $search_header;    # empty file
    $search_header =~ s/[\r\n]+$//;
    my @tab_search_header = split("\t", $search_header);

    # Name -> index. A later duplicate overwrites an earlier one, which keeps
    # the historical "last occurrence wins" behaviour.
    my %indexOf;
    for my $i (0 .. $#tab_search_header) { $indexOf{$tab_search_header[$i]} = $i; }

    my @candidates = (ref($name_of_column) eq "ARRAY") ? @$name_of_column : ($name_of_column);
    foreach my $candidate (@candidates)
    {
        if(exists $indexOf{$candidate}) { return $indexOf{$candidate}; }
    }

    # Nothing matched. The array form used to compare the array reference with
    # a sentinel string here, so the error was never reported and the sub
    # silently returned no value.
    my $wanted = join(", ", @candidates);
    print STDERR "Error message:\n";
    print STDERR "Error recoverNumCol: the column named $wanted doesn't exits in the input file $input!!!!!\n";
    exit;
}
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
315
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
316
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
# Remove from $output the variants overlapping any of the BED files in @filters.
#
# Takes no argument: it reads the file-level $output and @filters, and rewrites
# $output in place. The first three columns of $output are used as
# chromosome / start / end, and each variant is tagged with its line number so
# the surviving lines can be copied back in their original order.
#
# Requires the external commands sort, cp and bedtools. Four scratch files
# (bed, sorted, ref, table) are created in the current directory and removed at
# the end. Dies when a file cannot be opened or an external command fails.
sub filterAdditionalBED{

    my ($bedFile, $sortedFile, $refFile, $tableFile) = ("bed", "sorted", "ref", "table");

    # Run an external command and stop on failure. A failure used to be
    # ignored, which left an output reduced to its header.
    my $run = sub
    {
        my @command = @_;
        system(@command) == 0 or die "command failed (@command): status $?\n";
    };

    #create bed
    open(my $table, "<", $output) or die "$!: $output\n";
    open(my $bed, ">", $bedFile) or die "cannot create bed file";
    my $NL=1;                       # line number in $output, the header being line 1
    my $headers=<$table>;
    while(my $row = <$table>)
    {
        $NL++;
        $row =~ s/[\r\n]+$//;       # keeps a line break out of the third column of short rows
        my @line=split("\t", $row);
        print $bed "$line[0]\t$line[1]\t$line[2]\t$NL\n";
    }
    close $bed or die "cannot write bed file";
    close $table;
    #and sort it
    # List form of system(): file names never go through a shell.
    $run->("sort", "-k1,1", "-k2,2n", "-o", $sortedFile, $bedFile);

    foreach my $filter (@filters){

        my ($filename, $directories, $suffix) = fileparse($filter, qr/\.[^.]*/);

        print STDOUT "\tFilter against BED: $filename\n";

        #find intersect
        $run->("sort", "-k1,1", "-k2,2n", "-o", $refFile, $filter);
        # -v keeps the variants without overlap in the BED file. A shell is only
        # needed for the redirection, and only fixed file names are involved.
        $run->("bedtools intersect -a $sortedFile -b $refFile -v -sorted > $bedFile");
        $run->("sort", "-k1,1", "-k2,2n", "-o", $sortedFile, $bedFile);
    }

    #generate new output
    # Back to the original line order, then work from a copy of $output since
    # $output itself is rewritten below.
    $run->("sort", "-k4n", "-o", $bedFile, $sortedFile);
    $run->("cp", $output, $tableFile);

    open(my $kept, "<", $bedFile) or die "error no sorted file";
    open(my $source, "<", $tableFile) or die "error no table file";
    open(my $out, ">", $output) or die "error cannot open output file";
    print $out $headers if defined $headers;
    $NL=1;
    my $line = <$source>;           # line 1 of the copy, i.e. the header
    while(my $keptRow = <$kept>)
    {
        $keptRow =~ s/[\r\n]+$//;
        my @NR=split("\t", $keptRow);
        # Advance in the copy up to the line number stored in the 4th column.
        while( $NL < $NR[3] )
        {
            $line = <$source>;
            last unless defined $line;    # copy shorter than expected
            $NL++;
        }
        print $out $line if defined $line;
    }
    close $kept;
    close $source;
    close $out or die "error cannot write output file";

    # Scratch files are no longer needed.
    unlink($bedFile, $sortedFile, $refFile, $tableFile);

}
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
368
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
369
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
370
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
371 =head1 NAME
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
372
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
373 mutspecFilter - Filter a file annotated with the MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described below).
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
374
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
375 =head1 SYNOPSIS
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
376
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
377 mutspecFilter.pl [arguments] <query-file>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
378
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
379 <query-file> an annotated file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
380
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
381 Arguments:
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
382 -h, --help print help message
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
383 -m, --man print complete documentation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
384 -v, --verbose use verbose output
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
385 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file (start to count from 1)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
386 --segDup filter against genomic duplicate database
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
387 --esp filter against Exome Sequencing Project database (only for human)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
388 --thG filter against 1000 genome database (only for human)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
389 -o, --outfile <string> path to output file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
390 --refGenome reference genome to use
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
391 --pathAVDBList path to the list of Annovar databases installed
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
392 --filter path to a bed file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
393
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
394
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
395 Function: Filter out variants present in public databases
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
396
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
397 Example: # Filter against dbSNP
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
398 mutspecFilter.pl --dbSNP col_number (start to count from 1) --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
399
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
400 # Filter against all Annovar databases
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
401 mutspecFilter.pl --dbSNP col_number (start to count from 1) --segDup --esp --thG --exac --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
402
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
403 # Filter against additional databases in BED format
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
404 mutspecFilter.pl --filter path_to_bed --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
405
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
406
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
407 Version: 02-2017 (February 2017)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
408
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
409
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
410 =head1 OPTIONS
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
411
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
412 =over 8
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
413
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
414 =item B<--help>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
415
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
416 print a brief usage message and detailed explanation of options.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
417
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
418 =item B<--man>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
419
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
420 print the complete manual of the program.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
421
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
422 =item B<--verbose>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
423
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
424 use verbose output.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
425
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
426 =item B<--dbSNP>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
427
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
428 Remove all the variants present in the dbSNP database
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
429 Specify the number of the dbSNP column in the file (start to count from 1)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
430 For human and mouse genome
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
431
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
432 =item B<--segDup>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
433
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
434 Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
435 For human and mouse genome
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
436
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
437 =item B<--esp>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
438
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
439 Remove all the variants with a frequency greater than 0.001 in Exome sequencing project
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
440 For human genome only
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
441
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
442 =item B<--thG>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
443
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
444 Remove all the variants with a frequency greater than 0.001 in 1000 genome database
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
445
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
446
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
447 =item B<--exac>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
448
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
449 Remove all the variants with a frequency greater than 0.001 in ExAC database
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
450
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
451
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
452 =item B<--filter>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
453
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
454 Remove all variants present in the BED file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
455
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
456
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
457 =item B<--refGenome>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
458
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
459 The reference genome to use.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
460
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
461 =item B<--outfile>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
462
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
463 path to output file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
464
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
465 =item B<--pathAVDBList>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
466
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
467 the path to a text file containing the list of the Annovar databases installed.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
468
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
469 =back
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
470
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
471 =head1 DESCRIPTION
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
472
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
473 mutspecFilter - Filter a file annotated with MutSpec-Annot tool.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
474 Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome, exac obtained from Annovar) will be removed from the input file (with frequency limits described above).
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
475 Additionally, using the --filter option, any variants present in a specified bed file will be removed from the input file.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
476
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
477 =cut