annotate mutspecFilter.pl @ 0:8c682b3a7c5b draft

Uploaded
author iarc
date Tue, 19 Apr 2016 03:07:11 -0400
parents
children 916846f73e25
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
1 # !/usr/bin/perl
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
2
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
3 #-----------------------------------#
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
4 # Author: Maude #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
5 # Script: mutspecFilter.pl #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
6 # Last update: 18/03/16 #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
7 #-----------------------------------#
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
8
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
9 use strict;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
10 use warnings;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
11 use Getopt::Long;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
12 use Pod::Usage;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
14 use File::Path;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
15
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
16 ################################################################################################################################################################################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
17 # Filter an Annotaed file with Annovar #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
18 ################################################################################################################################################################################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
19
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
21 our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
23 our ($listAVDB) = "empty"; # Text file with the list Annovar databases.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
24 our ($dir) = "";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
25
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
26 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
27
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
28 our ($input) = @ARGV;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
29
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
30 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
31 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
32 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
33 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
34
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
35
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
36
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
37 # If the dbSNP value is not equal to zero filter using the dbSNP column specify
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
38 our $dbSNP = 0;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
39 if($dbSNP_value > 0) { $dbSNP = 1; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
40
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
41
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
42 ############ Check flags ############
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
43 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
44
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
45 # Zero databases is specified
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
46 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
47 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
48 print STDERR "There is no databases selected for filtering against!!!\nPlease choose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
49 exit;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
50 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
51
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
52
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
53
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
54 ############ Recover the name of the databases to filter against ############
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
55 my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", "");
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
56 my @tab_protocol = ();
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
57
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
58 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
59 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
60 ### Recover the name of the column
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
61 my $protocol = "";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
62 ExtractAVDBName($listAVDB, \$protocol);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
63 @tab_protocol = split(",", $protocol);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
64
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
65 for(my $i=0; $i<=$#tab_protocol; $i++)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
66 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
67 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
68 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
69 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
70 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
71 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
72
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
73
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
74 ############ Filter the file ############
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
75 filterAgainstPublicDB();
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
76
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
77
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
78 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
79
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
80
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
81 sub filterAgainstPublicDB
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
82 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
83 open(FILTER, ">", "$output") or die "$!: $output\n";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
84
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
85 open(F1, $input) or die "$!: $input\n";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
86 my $header = <F1>; print FILTER $header;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
87 while(<F1>)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
88 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
89 $_ =~ s/[\r\n]+$//;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
90 my @tab = split("\t", $_);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
91
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
92 my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
93
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
94 if($segDup == 1)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
95 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
96 my $segDup_value = recoverNumCol($input, $segDup_name);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
97 $segDupInfo = formatSegDupInfo($tab[$segDup_value]);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
98 # Replace NA by 0 for making test on the same type of variable
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
99 $segDupInfo =~ s/NA/0/;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
100 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
101 if($esp == 1)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
102 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
103 my $espAll_value = recoverNumCol($input, $espAll_name);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
104 $espAllInfo = $tab[$espAll_value];
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
105 # Replace NA by 0 for making test on the same type of variable
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
106 $espAllInfo =~ s/NA/0/;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
107 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
108 if($thG == 1)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
109 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
110 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
111 # Replace NA by 0 for making test on the same type of variable
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
112 $thgInfo = $tab[$thousandGenome_value];
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
113 $thgInfo =~ s/NA/0/;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
114 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
115
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
116
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
117 ##############################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
118 # One Filter #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
119 ##############################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
120 # Remove all the variants present in dbSNP
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
121 if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
122 # Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
123 if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
124 # Remove all the variants with greater than 0.001 in Exome sequencing project
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
125 if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) ) { if($espAllInfo <= 0.001) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
126 # Remove all the variants with greater than 0.001 in 1000 genome database
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
127 if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) ) { if($thgInfo <= 0.001) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
128
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
129
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
130 #############################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
131 # Two Filter #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
132 ##############################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
133 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) ) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
134 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
135 if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
136
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
137 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
138 if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
139
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
140 if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
141
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
142
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
143 #############################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
144 # Three Filter #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
145 ##############################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
146 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
147 { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
148 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
149 { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
150 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
151 { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
152 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
153 { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
154
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
155
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
156 #############################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
157 # FOUR Filter #
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
158 ##############################
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
159 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
160 { print FILTER "$_\n"; } }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
161
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
162 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
163 close F1; close FILTER;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
164 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
165
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
166
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
167 sub formatSegDupInfo
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
168 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
169 my ($segDup_info) = @_;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
170
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
171 if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
172 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
173 my @segDup = split(";", $segDup_info);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
174 $segDup[0] =~ /Score=(.+)/;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
175 return $1;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
176 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
177 else { return $segDup_info; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
178 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
179
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
180
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
181 sub ExtractAVDBName
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
182 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
183 my ($listAVDB, $refS_protocol) = @_;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
184
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
185 open(F1, $listAVDB) or die "$!: $listAVDB\n";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
186 while(<F1>)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
187 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
188 if ($_ =~ /^#/) { next; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
189
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
190 $_ =~ s/[\r\n]+$//;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
191 my @tab = split("\t", $_);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
192
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
193 # db name like refGenome_dbName.txt
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
194 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
195 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
196 my $temp = $1;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
197 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
198 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
199 # 1000 genome
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
200 if($tab[0] =~ /sites/)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
201 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
202 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
203 my ($dbName, $year, $month) = ($1, $2, $3);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
204 $dbName =~ tr/A-Z/a-z/;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
205
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
206 # convert the month number into the month name
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
207 ConvertMonth(\$month);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
208
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
209 my $AVdbName_final = "1000g".$year.$month."_".$dbName;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
210
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
211 if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
212 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
213 # ESP
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
214 if($tab[0] =~ /esp/)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
215 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
216 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
217 my $AVdbName_final = $1."_".$2;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
218
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
219 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
220 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
221 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
222 close F1;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
223
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
224 sub ConvertMonth
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
225 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
226 my ($refS_month) = @_;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
227
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
228 if($$refS_month == 1) { $$refS_month = "janv"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
229 elsif($$refS_month == 2) { $$refS_month = "feb"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
230 elsif($$refS_month == 3) { $$refS_month = "mar"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
231 elsif($$refS_month == 4) { $$refS_month = "apr"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
232 elsif($$refS_month == 5) { $$refS_month = "may"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
233 elsif($$refS_month == 6) { $$refS_month = "jun"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
234 elsif($$refS_month == 7) { $$refS_month = "jul"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
235 elsif($$refS_month == 8) { $$refS_month = "aug"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
236 elsif($$refS_month == 9) { $$refS_month = "sept"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
237 elsif($$refS_month == 10) { $$refS_month = "oct"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
238 elsif($$refS_month == 11) { $$refS_month = "nov"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
239 elsif($$refS_month == 12) { $$refS_month = "dec"; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
240 else { print STDERR "Month number don't considered\n"; exit; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
241 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
242 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
243
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
244
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
245 sub recoverNumCol
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
246 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
247 my ($input, $name_of_column) = @_;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
248
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
249 # With Annovar updates the databases name changed and are present in an array
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
250 if( ref($name_of_column) eq "ARRAY" )
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
251 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
252 my $test = "";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
253 my @tab = @$name_of_column;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
254 foreach (@tab)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
255 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
256 open(F1,$input) or die "$!: $input\n";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
257 # For having the name of the columns
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
258 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
259 close F1;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
260 # The number of the column
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
261 my $name_of_column_NB = "toto";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
262 for(my $i=0; $i<=$#tab_search_header; $i++)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
263 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
264 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
265 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
266 if($name_of_column_NB eq "toto") { next; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
267 else { return $name_of_column_NB; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
268 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
269 if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
270 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
271 # Only one name is pass
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
272 else
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
273 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
274 open(FT,$input) or die "$!: $input\n";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
275 # For having the name of the columns
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
276 my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
277 close FT;
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
278 # The number of the column
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
279 my $name_of_column_NB = "toto";
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
280 for(my $i=0; $i<=$#tab_search_header; $i++)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
281 {
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
282 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
283 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
284 if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
285 else { return $name_of_column_NB; }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
286 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
287 }
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
288
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
289 =head1 NAME
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
290
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
291 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
292
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
293 =head1 SYNOPSIS
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
294
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
295 mutspecFilter.pl [arguments] <query-file>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
296
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
297 <query-file> an annotated file
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
298
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
299 Arguments:
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
300 -h, --help print help message
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
301 -m, --man print complete documentation
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
302 -v, --verbose use verbose output
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
303 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
304 --segDup filter against genomic duplicate database
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
305 --esp filter against Exome Sequencing Project database (only for human)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
306 --thG filter against 1000 genome database (onyl for human)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
307 -o, --outfile <string> name of output file
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
308 --refGenome reference genome to use
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
309 --pathAVDBList path to the list of Annovar databases installed
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
310
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
311
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
312 Function: Filter out variants present in public databases
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
313
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
314 Example: # Filter against dbSNP
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
315 mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
316
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
317 # Filter against the four databases
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
318 mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
319
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
320
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
321 Version: 03-2016 (March 2016)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
322
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
323
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
324 =head1 OPTIONS
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
325
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
326 =over 8
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
327
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
328 =item B<--help>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
329
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
330 print a brief usage message and detailed explanation of options.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
331
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
332 =item B<--man>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
333
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
334 print the complete manual of the program.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
335
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
336 =item B<--verbose>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
337
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
338 use verbose output.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
339
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
340 =item B<--dbSNP>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
341
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
342 Remove all the variants presents in the dbSNP databases
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
343 Specify the number of column containing the annotation
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
344 For human and mouse genome
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
345
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
346 =item B<--segDup>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
347
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
348 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
349 For human and mouse genome
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
350
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
351 =item B<--esp>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
352
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
353 Remove all the variants with a frequency greater than 0.001 in Exome sequencing project
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
354 For human genome only
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
355
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
356 =item B<--thG>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
357
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
358 Remove all the variants with a frequency greater than 0.001 in 1000 genome database
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
359
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
360 =item B<--refGenome>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
361
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
362 the reference genome to use, could be hg19 or mm9.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
363
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
364 =item B<--outfile>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
365
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
366 the name of the output file
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
367
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
368 =item B<--pathAVDBList>
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
369
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
370 the path to a texte file containing the list of the Annovar databases installed.
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
371
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
372 =back
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
373
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
374 =head1 DESCRIPTION
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
375
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
376 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
377
8c682b3a7c5b Uploaded
iarc
parents:
diff changeset
378 =cut