Mercurial > repos > iarc > mutspec
comparison mutspecFilter.pl @ 0:8c682b3a7c5b draft
Uploaded
| author | iarc |
|---|---|
| date | Tue, 19 Apr 2016 03:07:11 -0400 |
| parents | |
| children | 916846f73e25 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8c682b3a7c5b |
|---|---|
| 1 # !/usr/bin/perl | |
| 2 | |
| 3 #-----------------------------------# | |
| 4 # Author: Maude # | |
| 5 # Script: mutspecFilter.pl # | |
| 6 # Last update: 18/03/16 # | |
| 7 #-----------------------------------# | |
| 8 | |
| 9 use strict; | |
| 10 use warnings; | |
| 11 use Getopt::Long; | |
| 12 use Pod::Usage; | |
| 13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/); | |
| 14 use File::Path; | |
| 15 | |
| 16 ################################################################################################################################################################################ | |
| 17 # Filter an Annotaed file with Annovar # | |
| 18 ################################################################################################################################################################################ | |
| 19 | |
| 20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested. | |
| 21 our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome. | |
| 22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use. | |
| 23 our ($listAVDB) = "empty"; # Text file with the list Annovar databases. | |
| 24 our ($dir) = ""; | |
| 25 | |
| 26 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2); | |
| 27 | |
| 28 our ($input) = @ARGV; | |
| 29 | |
| 30 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help); | |
| 31 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man); | |
| 32 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script | |
| 33 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input) | |
| 34 | |
| 35 | |
| 36 | |
| 37 # If the dbSNP value is not equal to zero filter using the dbSNP column specify | |
| 38 our $dbSNP = 0; | |
| 39 if($dbSNP_value > 0) { $dbSNP = 1; } | |
| 40 | |
| 41 | |
| 42 ############ Check flags ############ | |
| 43 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" } | |
| 44 | |
| 45 # Zero databases is specified | |
| 46 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) ) | |
| 47 { | |
| 48 print STDERR "There is no databases selected for filtering against!!!\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n"; | |
| 49 exit; | |
| 50 } | |
| 51 | |
| 52 | |
| 53 | |
| 54 ############ Recover the name of the databases to filter against ############ | |
| 55 my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", ""); | |
| 56 my @tab_protocol = (); | |
| 57 | |
| 58 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) ) | |
| 59 { | |
| 60 ### Recover the name of the column | |
| 61 my $protocol = ""; | |
| 62 ExtractAVDBName($listAVDB, \$protocol); | |
| 63 @tab_protocol = split(",", $protocol); | |
| 64 | |
| 65 for(my $i=0; $i<=$#tab_protocol; $i++) | |
| 66 { | |
| 67 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; } | |
| 68 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; } | |
| 69 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; } | |
| 70 } | |
| 71 } | |
| 72 | |
| 73 | |
| 74 ############ Filter the file ############ | |
| 75 filterAgainstPublicDB(); | |
| 76 | |
| 77 | |
| 78 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n"; | |
| 79 | |
| 80 | |
| 81 sub filterAgainstPublicDB | |
| 82 { | |
| 83 open(FILTER, ">", "$output") or die "$!: $output\n"; | |
| 84 | |
| 85 open(F1, $input) or die "$!: $input\n"; | |
| 86 my $header = <F1>; print FILTER $header; | |
| 87 while(<F1>) | |
| 88 { | |
| 89 $_ =~ s/[\r\n]+$//; | |
| 90 my @tab = split("\t", $_); | |
| 91 | |
| 92 my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0); | |
| 93 | |
| 94 if($segDup == 1) | |
| 95 { | |
| 96 my $segDup_value = recoverNumCol($input, $segDup_name); | |
| 97 $segDupInfo = formatSegDupInfo($tab[$segDup_value]); | |
| 98 # Replace NA by 0 for making test on the same type of variable | |
| 99 $segDupInfo =~ s/NA/0/; | |
| 100 } | |
| 101 if($esp == 1) | |
| 102 { | |
| 103 my $espAll_value = recoverNumCol($input, $espAll_name); | |
| 104 $espAllInfo = $tab[$espAll_value]; | |
| 105 # Replace NA by 0 for making test on the same type of variable | |
| 106 $espAllInfo =~ s/NA/0/; | |
| 107 } | |
| 108 if($thG == 1) | |
| 109 { | |
| 110 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name); | |
| 111 # Replace NA by 0 for making test on the same type of variable | |
| 112 $thgInfo = $tab[$thousandGenome_value]; | |
| 113 $thgInfo =~ s/NA/0/; | |
| 114 } | |
| 115 | |
| 116 | |
| 117 ############################## | |
| 118 # One Filter # | |
| 119 ############################## | |
| 120 # Remove all the variants present in dbSNP | |
| 121 if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } } | |
| 122 # Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database | |
| 123 if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9) { print FILTER "$_\n"; } } | |
| 124 # Remove all the variants with greater than 0.001 in Exome sequencing project | |
| 125 if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) ) { if($espAllInfo <= 0.001) { print FILTER "$_\n"; } } | |
| 126 # Remove all the variants with greater than 0.001 in 1000 genome database | |
| 127 if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) ) { if($thgInfo <= 0.001) { print FILTER "$_\n"; } } | |
| 128 | |
| 129 | |
| 130 ############################# | |
| 131 # Two Filter # | |
| 132 ############################## | |
| 133 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) ) { print FILTER "$_\n"; } } | |
| 134 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
| 135 if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
| 136 | |
| 137 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
| 138 if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
| 139 | |
| 140 if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } } | |
| 141 | |
| 142 | |
| 143 ############################# | |
| 144 # Three Filter # | |
| 145 ############################## | |
| 146 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) | |
| 147 { print FILTER "$_\n"; } } | |
| 148 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) | |
| 149 { print FILTER "$_\n"; } } | |
| 150 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
| 151 { print FILTER "$_\n"; } } | |
| 152 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
| 153 { print FILTER "$_\n"; } } | |
| 154 | |
| 155 | |
| 156 ############################# | |
| 157 # FOUR Filter # | |
| 158 ############################## | |
| 159 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) | |
| 160 { print FILTER "$_\n"; } } | |
| 161 | |
| 162 } | |
| 163 close F1; close FILTER; | |
| 164 } | |
| 165 | |
| 166 | |
| 167 sub formatSegDupInfo | |
| 168 { | |
| 169 my ($segDup_info) = @_; | |
| 170 | |
| 171 if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931 | |
| 172 { | |
| 173 my @segDup = split(";", $segDup_info); | |
| 174 $segDup[0] =~ /Score=(.+)/; | |
| 175 return $1; | |
| 176 } | |
| 177 else { return $segDup_info; } | |
| 178 } | |
| 179 | |
| 180 | |
| 181 sub ExtractAVDBName | |
| 182 { | |
| 183 my ($listAVDB, $refS_protocol) = @_; | |
| 184 | |
| 185 open(F1, $listAVDB) or die "$!: $listAVDB\n"; | |
| 186 while(<F1>) | |
| 187 { | |
| 188 if ($_ =~ /^#/) { next; } | |
| 189 | |
| 190 $_ =~ s/[\r\n]+$//; | |
| 191 my @tab = split("\t", $_); | |
| 192 | |
| 193 # db name like refGenome_dbName.txt | |
| 194 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) ) | |
| 195 { | |
| 196 my $temp = $1; | |
| 197 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; } | |
| 198 } | |
| 199 # 1000 genome | |
| 200 if($tab[0] =~ /sites/) | |
| 201 { | |
| 202 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/; | |
| 203 my ($dbName, $year, $month) = ($1, $2, $3); | |
| 204 $dbName =~ tr/A-Z/a-z/; | |
| 205 | |
| 206 # convert the month number into the month name | |
| 207 ConvertMonth(\$month); | |
| 208 | |
| 209 my $AVdbName_final = "1000g".$year.$month."_".$dbName; | |
| 210 | |
| 211 if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; } | |
| 212 } | |
| 213 # ESP | |
| 214 if($tab[0] =~ /esp/) | |
| 215 { | |
| 216 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/; | |
| 217 my $AVdbName_final = $1."_".$2; | |
| 218 | |
| 219 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; } | |
| 220 } | |
| 221 } | |
| 222 close F1; | |
| 223 | |
| 224 sub ConvertMonth | |
| 225 { | |
| 226 my ($refS_month) = @_; | |
| 227 | |
| 228 if($$refS_month == 1) { $$refS_month = "janv"; } | |
| 229 elsif($$refS_month == 2) { $$refS_month = "feb"; } | |
| 230 elsif($$refS_month == 3) { $$refS_month = "mar"; } | |
| 231 elsif($$refS_month == 4) { $$refS_month = "apr"; } | |
| 232 elsif($$refS_month == 5) { $$refS_month = "may"; } | |
| 233 elsif($$refS_month == 6) { $$refS_month = "jun"; } | |
| 234 elsif($$refS_month == 7) { $$refS_month = "jul"; } | |
| 235 elsif($$refS_month == 8) { $$refS_month = "aug"; } | |
| 236 elsif($$refS_month == 9) { $$refS_month = "sept"; } | |
| 237 elsif($$refS_month == 10) { $$refS_month = "oct"; } | |
| 238 elsif($$refS_month == 11) { $$refS_month = "nov"; } | |
| 239 elsif($$refS_month == 12) { $$refS_month = "dec"; } | |
| 240 else { print STDERR "Month number don't considered\n"; exit; } | |
| 241 } | |
| 242 } | |
| 243 | |
| 244 | |
| 245 sub recoverNumCol | |
| 246 { | |
| 247 my ($input, $name_of_column) = @_; | |
| 248 | |
| 249 # With Annovar updates the databases name changed and are present in an array | |
| 250 if( ref($name_of_column) eq "ARRAY" ) | |
| 251 { | |
| 252 my $test = ""; | |
| 253 my @tab = @$name_of_column; | |
| 254 foreach (@tab) | |
| 255 { | |
| 256 open(F1,$input) or die "$!: $input\n"; | |
| 257 # For having the name of the columns | |
| 258 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); | |
| 259 close F1; | |
| 260 # The number of the column | |
| 261 my $name_of_column_NB = "toto"; | |
| 262 for(my $i=0; $i<=$#tab_search_header; $i++) | |
| 263 { | |
| 264 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; } | |
| 265 } | |
| 266 if($name_of_column_NB eq "toto") { next; } | |
| 267 else { return $name_of_column_NB; } | |
| 268 } | |
| 269 if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } | |
| 270 } | |
| 271 # Only one name is pass | |
| 272 else | |
| 273 { | |
| 274 open(FT,$input) or die "$!: $input\n"; | |
| 275 # For having the name of the columns | |
| 276 my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header); | |
| 277 close FT; | |
| 278 # The number of the column | |
| 279 my $name_of_column_NB = "toto"; | |
| 280 for(my $i=0; $i<=$#tab_search_header; $i++) | |
| 281 { | |
| 282 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; } | |
| 283 } | |
| 284 if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; } | |
| 285 else { return $name_of_column_NB; } | |
| 286 } | |
| 287 } | |
| 288 | |
| 289 =head1 NAME | |
| 290 | |
| 291 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) | |
| 292 | |
| 293 =head1 SYNOPSIS | |
| 294 | |
| 295 mutspecFilter.pl [arguments] <query-file> | |
| 296 | |
| 297 <query-file> an annotated file | |
| 298 | |
| 299 Arguments: | |
| 300 -h, --help print help message | |
| 301 -m, --man print complete documentation | |
| 302 -v, --verbose use verbose output | |
| 303 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file | |
| 304 --segDup filter against genomic duplicate database | |
| 305 --esp filter against Exome Sequencing Project database (only for human) | |
| 306 --thG filter against 1000 genome database (onyl for human) | |
| 307 -o, --outfile <string> name of output file | |
| 308 --refGenome reference genome to use | |
| 309 --pathAVDBList path to the list of Annovar databases installed | |
| 310 | |
| 311 | |
| 312 Function: Filter out variants present in public databases | |
| 313 | |
| 314 Example: # Filter against dbSNP | |
| 315 mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input | |
| 316 | |
| 317 # Filter against the four databases | |
| 318 mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input | |
| 319 | |
| 320 | |
| 321 Version: 03-2016 (March 2016) | |
| 322 | |
| 323 | |
| 324 =head1 OPTIONS | |
| 325 | |
| 326 =over 8 | |
| 327 | |
| 328 =item B<--help> | |
| 329 | |
| 330 print a brief usage message and detailed explanation of options. | |
| 331 | |
| 332 =item B<--man> | |
| 333 | |
| 334 print the complete manual of the program. | |
| 335 | |
| 336 =item B<--verbose> | |
| 337 | |
| 338 use verbose output. | |
| 339 | |
| 340 =item B<--dbSNP> | |
| 341 | |
| 342 Remove all the variants presents in the dbSNP databases | |
| 343 Specify the number of column containing the annotation | |
| 344 For human and mouse genome | |
| 345 | |
| 346 =item B<--segDup> | |
| 347 | |
| 348 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database | |
| 349 For human and mouse genome | |
| 350 | |
| 351 =item B<--esp> | |
| 352 | |
| 353 Remove all the variants with a frequency greater than 0.001 in Exome sequencing project | |
| 354 For human genome only | |
| 355 | |
| 356 =item B<--thG> | |
| 357 | |
| 358 Remove all the variants with a frequency greater than 0.001 in 1000 genome database | |
| 359 | |
| 360 =item B<--refGenome> | |
| 361 | |
| 362 the reference genome to use, could be hg19 or mm9. | |
| 363 | |
| 364 =item B<--outfile> | |
| 365 | |
| 366 the name of the output file | |
| 367 | |
| 368 =item B<--pathAVDBList> | |
| 369 | |
| 370 the path to a texte file containing the list of the Annovar databases installed. | |
| 371 | |
| 372 =back | |
| 373 | |
| 374 =head1 DESCRIPTION | |
| 375 | |
| 376 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above) | |
| 377 | |
| 378 =cut |
