annotate mutspecFilter.pl @ 6:46a10309dfe2 draft

Uploaded
author iarc
date Tue, 28 Jun 2016 02:59:32 -0400
parents 097ae310ced0
children eda59b985b1c
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
1 # !/usr/bin/perl
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
2
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
3 #-----------------------------------#
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
4 # Author: Maude #
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
5 # Script: mutspecFilter.pl #
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
6 # Last update: 18/03/16 #
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
7 #-----------------------------------#
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
8
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
9 use strict;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
10 use warnings;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
11 use Getopt::Long;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
12 use Pod::Usage;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
14 use File::Path;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
15
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
16 ################################################################################################################################################################################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
17 # Filter an Annotaed file with Annovar #
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
18 ################################################################################################################################################################################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
19
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
20 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
21 our ($dbSNP_value, $segDup, $esp, $thG) = (0, 0, 0, 0); # For filtering agains the databases dbSNP, genomic duplicate segments, Exome Sequencing Project and 1000 genome.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
22 our ($output, $refGenome) = ("", ""); # The path for saving the result; The reference genome to use.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
23 our ($listAVDB) = "empty"; # Text file with the list Annovar databases.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
24 our ($dir) = "";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
25
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
26 GetOptions('dir|d=s'=>\$dir,'verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'dbSNP=i'=>\$dbSNP_value, 'segDup'=>\$segDup, 'esp'=>\$esp, 'thG'=>\$thG, 'outfile|o=s' => \$output, 'refGenome=s'=>\$refGenome, 'pathAVDBList=s' => \$listAVDB) or pod2usage(2);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
27
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
28 our ($input) = @ARGV;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
29
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
30 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
31 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
32 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
33 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
34
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
35
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
36
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
37 # If the dbSNP value is not equal to zero filter using the dbSNP column specify
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
38 our $dbSNP = 0;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
39 if($dbSNP_value > 0) { $dbSNP = 1; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
40
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
41
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
42 ############ Check flags ############
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
43 if($listAVDB eq "empty") { $listAVDB = "$dir/${refGenome}_listAVDB.txt" }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
44
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
45 # Zero databases is specified
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
46 if( ($dbSNP == 0) && ($segDup == 0) && ($esp == 0) && ($thG == 0) )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
47 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
48 print STDERR "There is no databases selected for filtering against!!!\nPlease chose at least one between dbSNP, SegDup, ESP (only for human genome) or 1000 genome (only for human genome)\n";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
49 exit;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
50 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
51
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
52
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
53
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
54 ############ Recover the name of the databases to filter against ############
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
55 my ($segDup_name, $espAll_name, $thousandGenome_name) = ("", "", "");
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
56 my @tab_protocol = ();
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
57
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
58 if( ($segDup == 1) || ($esp == 1) || ($thG == 1) )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
59 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
60 ### Recover the name of the column
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
61 my $protocol = "";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
62 ExtractAVDBName($listAVDB, \$protocol);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
63 @tab_protocol = split(",", $protocol);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
64
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
65 for(my $i=0; $i<=$#tab_protocol; $i++)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
66 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
67 if($tab_protocol[$i] =~ /genomicSuperDups/) { $segDup_name = $tab_protocol[$i]; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
68 elsif($tab_protocol[$i] =~ /1000g/) { $thousandGenome_name = $tab_protocol[$i]; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
69 elsif($tab_protocol[$i] =~ /esp/) { $espAll_name = $tab_protocol[$i]; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
70 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
71 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
72
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
73
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
74 ############ Filter the file ############
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
75 filterAgainstPublicDB();
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
76
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
77
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
78 print STDOUT "\tFilter selected\tdbSNP = ".$dbSNP."\tsegDup = ".$segDup."\tesp = ".$esp."\tthG = ".$thG."\n";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
79
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
80
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
81 sub filterAgainstPublicDB
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
82 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
83 open(FILTER, ">", "$output") or die "$!: $output\n";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
84
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
85 open(F1, $input) or die "$!: $input\n";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
86 my $header = <F1>; print FILTER $header;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
87 while(<F1>)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
88 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
89 $_ =~ s/[\r\n]+$//;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
90 my @tab = split("\t", $_);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
91
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
92 my ($segDupInfo, $espAllInfo, $thgInfo) = (0, 0 ,0);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
93
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
94 if($segDup == 1)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
95 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
96 my $segDup_value = recoverNumCol($input, $segDup_name);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
97 $segDupInfo = formatSegDupInfo($tab[$segDup_value]);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
98 # Replace NA by 0 for making test on the same type of variable
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
99 $segDupInfo =~ s/NA/0/;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
100 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
101 if($esp == 1)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
102 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
103 my $espAll_value = recoverNumCol($input, $espAll_name);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
104 $espAllInfo = $tab[$espAll_value];
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
105 # Replace NA by 0 for making test on the same type of variable
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
106 $espAllInfo =~ s/NA/0/;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
107 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
108 if($thG == 1)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
109 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
110 my $thousandGenome_value = recoverNumCol($input, $thousandGenome_name);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
111 # Replace NA by 0 for making test on the same type of variable
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
112 $thgInfo = $tab[$thousandGenome_value];
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
113 $thgInfo =~ s/NA/0/;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
114 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
115
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
116
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
117 ##############################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
118 # One Filter #
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
119 ##############################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
120 # Remove all the variants present in dbSNP
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
121 if( ($dbSNP == 1) && ($segDup==0) && ($esp==0) && ($thG==0) ) { if($tab[$dbSNP_value-1] eq "NA") { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
122 # Remove all the variants with a frequency greater than or equal to 0.9 in genomic duplicate segments database
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
123 if( ($dbSNP==0) && ($segDup == 1) && ($esp==0) && ($thG==0) ) { if($segDupInfo < 0.9) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
124 # Remove all the variants with greater than 0.001 in Exome sequencing project
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
125 if( ($dbSNP==0) && ($segDup==0) && ($esp == 1) && ($thG==0) ) { if($espAllInfo <= 0.001) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
126 # Remove all the variants with greater than 0.001 in 1000 genome database
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
127 if( ($dbSNP==0) && ($segDup==0) && ($esp==0) && ($thG == 1) ) { if($thgInfo <= 0.001) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
128
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
129
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
130 #############################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
131 # Two Filter #
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
132 ##############################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
133 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG== 0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) ) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
134 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
135 if( ($dbSNP==1) && ($segDup==0) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
136
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
137 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) ) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
138 if( ($dbSNP==0) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
139
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
140 if( ($dbSNP==0) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) ) { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
141
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
142
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
143 #############################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
144 # Three Filter #
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
145 ##############################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
146 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==0) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
147 { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
148 if( ($dbSNP==1) && ($segDup==1) && ($esp==0) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($thgInfo <= 0.001) )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
149 { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
150 if( ($dbSNP==1) && ($segDup==0) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
151 { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
152 if( ($dbSNP==0) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
153 { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
154
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
155
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
156 #############################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
157 # FOUR Filter #
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
158 ##############################
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
159 if( ($dbSNP==1) && ($segDup==1) && ($esp==1) && ($thG==1) ) { if( ($tab[$dbSNP_value-1] eq "NA") && ($segDupInfo < 0.9) && ($espAllInfo <= 0.001) && ($thgInfo <= 0.001) )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
160 { print FILTER "$_\n"; } }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
161
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
162 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
163 close F1; close FILTER;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
164 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
165
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
166
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
167 sub formatSegDupInfo
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
168 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
169 my ($segDup_info) = @_;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
170
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
171 if($segDup_info ne "NA") # Score=0.907883;Name=chr9:36302931
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
172 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
173 my @segDup = split(";", $segDup_info);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
174 $segDup[0] =~ /Score=(.+)/;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
175 return $1;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
176 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
177 else { return $segDup_info; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
178 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
179
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
180
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
181 sub ExtractAVDBName
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
182 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
183 my ($listAVDB, $refS_protocol) = @_;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
184
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
185 open(F1, $listAVDB) or die "$!: $listAVDB\n";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
186 while(<F1>)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
187 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
188 if ($_ =~ /^#/) { next; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
189
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
190 $_ =~ s/[\r\n]+$//;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
191 my @tab = split("\t", $_);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
192
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
193 # db name like refGenome_dbName.txt
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
194 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /sift/) && ($tab[0] !~ /pp2/) )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
195 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
196 my $temp = $1;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
197 if($temp =~ /genomicSuperDups/) { $$refS_protocol .= $temp.","; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
198 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
199 # 1000 genome
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
200 if($tab[0] =~ /sites/)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
201 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
202 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
203 my ($dbName, $year, $month) = ($1, $2, $3);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
204 $dbName =~ tr/A-Z/a-z/;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
205
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
206 # convert the month number into the month name
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
207 ConvertMonth(\$month);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
208
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
209 my $AVdbName_final = "1000g".$year.$month."_".$dbName;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
210
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
211 if($dbName eq "all") { $$refS_protocol .=$AVdbName_final.","; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
212 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
213 # ESP
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
214 if($tab[0] =~ /esp/)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
215 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
216 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
217 my $AVdbName_final = $1."_".$2;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
218
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
219 if($2 eq "all") { $$refS_protocol .=$AVdbName_final.","; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
220 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
221 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
222 close F1;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
223
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
224 sub ConvertMonth
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
225 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
226 my ($refS_month) = @_;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
227
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
228 if($$refS_month == 1) { $$refS_month = "janv"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
229 elsif($$refS_month == 2) { $$refS_month = "feb"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
230 elsif($$refS_month == 3) { $$refS_month = "mar"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
231 elsif($$refS_month == 4) { $$refS_month = "apr"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
232 elsif($$refS_month == 5) { $$refS_month = "may"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
233 elsif($$refS_month == 6) { $$refS_month = "jun"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
234 elsif($$refS_month == 7) { $$refS_month = "jul"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
235 elsif($$refS_month == 8) { $$refS_month = "aug"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
236 elsif($$refS_month == 9) { $$refS_month = "sept"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
237 elsif($$refS_month == 10) { $$refS_month = "oct"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
238 elsif($$refS_month == 11) { $$refS_month = "nov"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
239 elsif($$refS_month == 12) { $$refS_month = "dec"; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
240 else { print STDERR "Month number don't considered\n"; exit; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
241 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
242 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
243
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
244
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
245 sub recoverNumCol
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
246 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
247 my ($input, $name_of_column) = @_;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
248
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
249 # With Annovar updates the databases name changed and are present in an array
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
250 if( ref($name_of_column) eq "ARRAY" )
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
251 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
252 my $test = "";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
253 my @tab = @$name_of_column;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
254 foreach (@tab)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
255 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
256 open(F1,$input) or die "$!: $input\n";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
257 # For having the name of the columns
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
258 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
259 close F1;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
260 # The number of the column
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
261 my $name_of_column_NB = "toto";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
262 for(my $i=0; $i<=$#tab_search_header; $i++)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
263 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
264 if($tab_search_header[$i] eq $_) { $name_of_column_NB = $i; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
265 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
266 if($name_of_column_NB eq "toto") { next; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
267 else { return $name_of_column_NB; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
268 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
269 if($name_of_column eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
270 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
271 # Only one name is pass
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
272 else
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
273 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
274 open(FT,$input) or die "$!: $input\n";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
275 # For having the name of the columns
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
276 my $search_header = <FT>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
277 close FT;
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
278 # The number of the column
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
279 my $name_of_column_NB = "toto";
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
280 for(my $i=0; $i<=$#tab_search_header; $i++)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
281 {
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
282 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
283 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
284 if($name_of_column_NB eq "toto") { print "Error recoverNumCol: the column named $name_of_column doesn't exits in the input file $input!!!!!\n"; exit; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
285 else { return $name_of_column_NB; }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
286 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
287 }
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
288
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
289 =head1 NAME
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
290
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
291 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
292
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
293 =head1 SYNOPSIS
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
294
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
295 mutspecFilter.pl [arguments] <query-file>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
296
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
297 <query-file> an annotated file
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
298
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
299 Arguments:
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
300 -h, --help print help message
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
301 -m, --man print complete documentation
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
302 -v, --verbose use verbose output
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
303 --dbSNP <value> filter against dbSNP database. Specify the number of the dbSNP column in the file
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
304 --segDup filter against genomic duplicate database
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
305 --esp filter against Exome Sequencing Project database (only for human)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
306 --thG filter against 1000 genome database (onyl for human)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
307 -o, --outfile <string> name of output file
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
308 --refGenome reference genome to use
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
309 --pathAVDBList path to the list of Annovar databases installed
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
310
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
311
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
312 Function: Filter out variants present in public databases
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
313
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
314 Example: # Filter against dbSNP
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
315 mutspecFilter.pl --dbSNP col_number --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
316
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
317 # Filter against the four databases
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
318 mutspecFilter.pl --dbSNP col_number --segDup --esp --thG --refGenome hg19 --pathAVDBList path_to_the_list_of_annovar_DB --outfile output_filename input
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
319
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
320
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
321 Version: 03-2016 (March 2016)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
322
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
323
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
324 =head1 OPTIONS
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
325
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
326 =over 8
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
327
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
328 =item B<--help>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
329
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
330 print a brief usage message and detailed explanation of options.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
331
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
332 =item B<--man>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
333
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
334 print the complete manual of the program.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
335
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
336 =item B<--verbose>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
337
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
338 use verbose output.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
339
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
340 =item B<--dbSNP>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
341
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
342 Remove all the variants presents in the dbSNP databases
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
343 Specify the number of column containing the annotation
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
344 For human and mouse genome
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
345
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
346 =item B<--segDup>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
347
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
348 Remove all the variants with a frequency greater or equal to 0.9 in genomic duplicate segments database
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
349 For human and mouse genome
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
350
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
351 =item B<--esp>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
352
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
353 Remove all the variants with a frequency greater than 0.001 in Exome sequencing project
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
354 For human genome only
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
355
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
356 =item B<--thG>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
357
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
358 Remove all the variants with a frequency greater than 0.001 in 1000 genome database
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
359
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
360 =item B<--refGenome>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
361
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
362 the reference genome to use, could be hg19 or mm9.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
363
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
364 =item B<--outfile>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
365
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
366 the name of the output file
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
367
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
368 =item B<--pathAVDBList>
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
369
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
370 the path to a texte file containing the list of the Annovar databases installed.
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
371
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
372 =back
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
373
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
374 =head1 DESCRIPTION
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
375
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
376 mutspecFilter - Filter a file annotated with MutSpec-Annot tool. Variants present in public databases (dbSNP, SegDup, ESP, 1000 genome obtained from Annovar) will be removed from the input file (with frequency limits described above)
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
377
46a10309dfe2 Uploaded
iarc
parents: 5
diff changeset
378 =cut