annotate mutspecAnnot.pl @ 7:eda59b985b1c draft default tip

Uploaded
author iarc
date Mon, 13 Mar 2017 08:21:19 -0400
parents 46a10309dfe2
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
7
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1 #!/usr/bin/env perl
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
2
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
3 #-----------------------------------#
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
4 # Author: Maude #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
5 # Script: mutspecAnnot.pl #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
6 # Last update: 02/03/17 #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
7 #-----------------------------------#
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
8
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
9 use strict;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
10 use warnings;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
11 use Getopt::Long;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
12 use Pod::Usage;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
13 use File::Basename; # my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
14 use File::Path;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
15 use Parallel::ForkManager;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
16
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
17
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
18 our ($verbose, $man, $help) = (0, 0, 0); # Parse options and print usage if there is a syntax error, or if usage was explicitly requested.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
19 our ($refGenome, $output, $path_AVDB, $pathAVDBList, $folder_temp) = ("empty", "empty", "empty", "empty", "empty"); # The reference genome to use; The path for saving the result; The path to Annovar database; Text file with the list of the databases for Annovar; the path for saving the temporary files
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
20 our ($intervalEnd) = (10); # Number of bases for the flanking region for the sequence context.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
21 our ($fullAVDB) = "yes"; # Add an option for using all Annovar databases for the annotation or only refGene + strand + context for having a quicker annotation (for large file with million of lines)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
22
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
23
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
24 #########################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
25 ### SPECIFY THE NUMBER OF CPU ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
26 #########################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
27 our ($max_cpu) = 8; # Max number of CPU to use for the annotation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
28
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
29
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
30
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
31
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
32 GetOptions('verbose|v'=>\$verbose, 'help|h'=>\$help, 'man|m'=>\$man, 'refGenome=s'=>\$refGenome, 'interval=i' => \$intervalEnd, 'fullAnnotation=s' => \$fullAVDB, 'outfile|o=s' => \$output, 'pathAnnovarDB|AVDB=s' => \$path_AVDB, 'pathAVDBList=s' => \$pathAVDBList, 'pathTemporary|temp=s' => \$folder_temp, 'max_cpu=i' => \$max_cpu) or pod2usage(2);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
33
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
34 our ($input) = @ARGV;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
35
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
36 pod2usage(-verbose=>1, -exitval=>1, -output=>\*STDERR) if ($help);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
37 pod2usage(-verbose=>2, -exitval=>1, -output=>\*STDERR) if ($man);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
38 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 0); # No argument is pass to the command line print the usage of the script
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
39 pod2usage(-verbose=>0, -exitval=>1, -output=>\*STDERR) if(@ARGV == 2); # Only one argument is expected to be pass to @ARGV (the input)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
40
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
41
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
42
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
43 ######################################################################################################################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
44 # GLOBAL VARIABLES #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
45 ######################################################################################################################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
46
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
47 # Recover the current path
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
48 our $pwd = `pwd`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
49 chomp($pwd);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
50 # Recover the filename and the input directory
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
51 our ($filename, $directories, $suffix) = fileparse($input, qr/\.[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
52 # Output directories
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
53 our ($folderMutAnalysis, $folderAnnovar) = ("", "");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
54 # File with the list of Annovar databases to use
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
55 our $listAVDB = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
56 # Initialisation of chromosome, position, ref and alt values
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
57 our ($chrValue, $positionValue, $refValue, $altValue) = ("c", "s", "r", "a");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
58
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
59
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
60
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
61 ######################################################################################################################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
62 # MAIN #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
63 ######################################################################################################################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
64 ## Check the presence of the flags and create the output and temp directories
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
65 CheckFlags();
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
66
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
67 ## Format the file correctly:
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
68 ## 1) Check the length of the filename (must be <= 31 characters)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
69 ## 2) Recover the input format. If MuTect output consider only "KEEP" variants
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
70 ## 3) Recover the column number for chr, start, ref and alt
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
71 FormatingInputFile();
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
72
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
73 # Annotate the file with Annovar, add the strand orientation and the sequence context
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
74 FullAnnotation();
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
75
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
76 ######################################################################################################################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
77 # FUNCTIONS #
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
78 ######################################################################################################################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
79
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
80 ## Check the presence of the flags and create the output and temp directories
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
81 sub CheckFlags
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
82 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
83 # Check the reference genome
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
84 if($refGenome eq "empty")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
85 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
86 print STDERR "Missing flag !\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
87 print STDERR "You forget to specify the name for the reference genome!!!\nPlease specify it with the flag --refGenome\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
88 exit;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
89 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
90 if($intervalEnd eq "empty")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
91 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
92 print STDERR "Missing flag !\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
93 print STDERR "You forget to specify the length for the sequence context!!!\nPlease specify it with the flag --intervalEnd\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
94 exit;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
95 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
96 # If no output is specified write the result as the same place as the input file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
97 if($output eq "empty")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
98 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
99 my $directory = dirname( $input );
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
100
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
101 $folderMutAnalysis = "$directory/Mutational_Analysis";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
102 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
103 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
104 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
105 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
106 if(!-e $output) { mkdir($output) or die "$!: $output\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
107
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
108 $folderMutAnalysis = "$output/Mutational_Analysis";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
109 if(!-e $folderMutAnalysis) { mkdir($folderMutAnalysis) or die "$!: $folderMutAnalysis\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
110 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
111 # Create the output folder for Annovar
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
112 $folderAnnovar = "$folderMutAnalysis/Annovar";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
113 if(!-e $folderAnnovar) { mkdir($folderAnnovar) or die "$!: $folderAnnovar\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
114
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
115 # Verify the access to Annovar databases
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
116 if($path_AVDB eq "empty")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
117 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
118 print STDERR "Missing flag !\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
119 print STDERR "You forget to specify the path to Annovar databases!!!\nPlease specify it with the flag --pathAnnovarDB\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
120 exit;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
121 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
122 elsif(!-e $path_AVDB)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
123 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
124 print STDERR "Error message:\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
125 print STDERR"\nCan't access Annovar databases!\nPlease check the access to the disk\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
126 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
127
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
128 # Check the file list AV DB
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
129 if($pathAVDBList eq "empty")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
130 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
131 print STDERR "Missing flag !\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
132 print STDERR "You forget to specify the path to the list of Annovar databases!!!\nPlease specify it with the flag --pathAVDBList\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
133 exit;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
134 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
135 else { $listAVDB = "$pathAVDBList/${refGenome}_listAVDB.txt" }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
136
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
137 # If no temp folder is specified write the result in the current path
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
138 if($folder_temp eq "empty") { $folder_temp = "$pwd/TEMP_MutationalAnalysis_$filename"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
139 if(!-e $folder_temp) { mkdir($folder_temp) or die "$!: $folder_temp\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
140
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
141 # Verify listAVDB is not empty
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
142 if($listAVDB eq "")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
143 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
144 print STDERR "Path to the text file containing the list of Annovar databases installed is not specified !!!\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
145 exit;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
146 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
147 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
148
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
149 ## Format the file in the correct format if they are vcf or MuTect output and recover the column positions
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
150 sub FormatingInputFile
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
151 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
152 # The input is a folder
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
153 if(-d $input)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
154 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
155 foreach my $file (`ls $input`)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
156 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
157 my $headerOriginalFile = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
158 chomp($file);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
159
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
160 CheckLengthFilename("$input/$file");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
161
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
162 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
163 ### Recover the input format ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
164 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
165 RecoverInputFormat("$input/$file", \$headerOriginalFile);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
166 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
167 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
168 # The input is one file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
169 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
170 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
171 my $headerOriginalFile = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
172
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
173 CheckLengthFilename($input);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
174
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
175 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
176 ### Recover the input format ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
177 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
178 RecoverInputFormat($input, \$headerOriginalFile);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
179 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
180 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
181
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
182 # The name for the Excel sheet can't be longer than 31 characters
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
183 sub CheckLengthFilename
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
184 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
185 my ($inputFile) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
186
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
187 my ($filenameInputFile, $directoriesInputFile, $suffixInputFile) = fileparse($inputFile, qr/\.[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
188
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
189 if(length($filenameInputFile) > 32)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
190 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
191 print STDERR "Error message:\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
192 print STDERR "The file: $inputFile must be <= 31 chars\nPlease modify it before running the script\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
193 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
194 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
195
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
196 # Recover the input format. If MuTect output consider only "KEEP" variants
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
197 sub RecoverInputFormat
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
198 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
199 my ($file, $refS_headerOriginalFile) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
200
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
201 my ($filename, $directories, $suffix) = fileparse($file, qr/\.[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
202
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
203 my $inputFormat = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
204
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
205 open(F1, $file) or die "$!: $file\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
206 my $header = <F1>;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
207 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
208
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
209 ### VCF and MuTect files have in their first line the type of the file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
210 if($header =~ /fileformat=VCF/i) { $inputFormat = "vcf"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
211 elsif($header =~ /mutect/i) { $inputFormat = "mutect"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
212 else { $inputFormat = "unknown"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
213
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
214
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
215 ### VCF files
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
216 if($inputFormat eq "vcf")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
217 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
218 ### MuTect2 output VCFs
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
219 my $testVC = `grep MuTect2 $file`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
220 if($testVC =~ /MuTect2/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
221 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
222 # Keep only the variants passing MuTect2 filters
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
223 `grep PASS $file > $folder_temp/$filename-PASS.txt`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
224
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
225 # Recover the header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
226 $$refS_headerOriginalFile = `grep '#CHROM' $file`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
227
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
228 # Add the header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
229 # Sed command doesn't work... sed 's/^/some text\n/' file > res
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
230 open(OUT, ">", "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
231 print OUT $$refS_headerOriginalFile;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
232 open(F1, "$folder_temp/$filename-PASS.txt") or die "$!: $folder_temp/$filename-PASS.txt\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
233 while(<F1>) { print OUT $_; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
234 close F1; close OUT;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
235
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
236 `rm $folder_temp/$filename-PASS.txt`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
237
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
238 # Check if there if no empty column
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
239 CheckEmptyColumn("$folder_temp/$filename-KEEP.txt");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
240 `rm $folder_temp/$filename-KEEP.txt`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
241
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
242 # Set the col number for the chr,start,ref and alt
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
243 ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
244 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
245 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
246 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
247 open(F1, $file) or die "$!: $file\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
248 open(OUT, ">", "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
249 while (<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
250 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
251 $_ =~ s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
252 my @tab = split("\t", $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
253 # Print the VCF header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
254 if($tab[0] eq "#CHROM")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
255 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
256 $tab[0] =~ /#(.+)/;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
257 print OUT "$1";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
258 for(my $i=1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
259 print OUT "\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
260 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
261 elsif($tab[0] !~ /##/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
262 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
263 # Don't consider chromosome random, GL and MT
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
264 if( ($tab[0] =~ /random/) || ($tab[0] =~ /GL/i) ) { next; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
265 print OUT "$_\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
266 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
267 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
268 close F1; close OUT;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
269
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
270 ## Recover the header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
271 open(F1, "$folder_temp/$filename.txt") or die "$!: $folder_temp/$filename.txt\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
272 $$refS_headerOriginalFile = <F1>;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
273 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
274
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
275 # Check if there if no empty column
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
276 CheckEmptyColumn("$folder_temp/$filename.txt");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
277 `rm $folder_temp/$filename.txt`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
278
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
279
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
280 # Set the col number for the chr,start,ref and alt
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
281 ($chrValue, $positionValue, $refValue, $altValue) = (0, 1, 3, 4);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
282 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
283 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
284 ### MuTect files
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
285 elsif($inputFormat eq "mutect")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
286 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
287 `sed '1d' $file > $folder_temp/$filename-HeaderOK`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
288 # Keep only the SNVs of good quality
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
289 `grep -v REJECT $folder_temp/$filename-HeaderOK > $folder_temp/$filename-KEEP.txt`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
290 `rm $folder_temp/$filename-HeaderOK`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
291
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
292 # Recover the header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
293 open(F1, "$folder_temp/$filename-KEEP.txt") or die "$!: $folder_temp/$filename-KEEP.txt\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
294 $$refS_headerOriginalFile = <F1>;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
295 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
296
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
297 # Check if there if no empty column
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
298 CheckEmptyColumn("$folder_temp/$filename-KEEP.txt");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
299 `rm $folder_temp/$filename-KEEP.txt`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
300
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
301 # Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
302 # Use the dictionary for recover the names and the position of the columns
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
303 RecoverColNameAuto("$folder_temp/$filename-KEEPColumnCorrect.txt", $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
304 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
305 ### Unknown type
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
306 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
307 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
308 ## Recover the header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
309 open(F1, $file) or die "$!: $file\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
310 $$refS_headerOriginalFile = <F1>;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
311 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
312
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
313 # Check if there if no empty column
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
314 CheckEmptyColumn($file);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
315
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
316 ## Recover the name and the number of the columns that contain the chromosome number, the start position, the ref and alt alleles.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
317 # Use the dictionary for recover the names and the position of the columns
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
318 RecoverColNameAuto($file, $$refS_headerOriginalFile, \$chrValue, \$positionValue, \$refValue, \$altValue);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
319 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
320 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
321
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
322 # Some files can have empty column with no information
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
323 sub CheckEmptyColumn
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
324 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
325 my ($inputFile) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
326
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
327 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
328
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
329 if($filename =~ /(.+)-KEEP/) { $filename = $1; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
330
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
331 open(OUT, ">", "$folder_temp/$filename-ColumnCorrect.txt") or die "$!: $folder_temp/$filename-ColumnCorrect.txt\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
332
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
333 open(F1, $inputFile) or die "$!: $inputFile\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
334 my $header = <F1>; $header =~ s/[\r\n]+$//; my @tabHeader = split("\t", $header);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
335 print OUT $header, "\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
336 while(<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
337 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
338 $_ =~ s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
339 my @tab = split("\t", $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
340
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
341 if(scalar(@tab) != scalar(@tabHeader))
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
342 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
343 print OUT $tab[0];
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
344 for(my $i=1; $i<=$#tabHeader; $i++)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
345 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
346 if(defined($tab[$i])) { print OUT "\t$tab[$i]"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
347 else { print OUT "\tNA"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
348 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
349 print OUT "\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
350 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
351 else { print OUT "$_\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
352 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
353 close F1; close OUT;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
354 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
355
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
356 # Dictionnary for extracting the name and number of columns for the chromosome, start position, ref and alt alleles.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
357 sub RecoverColNameAuto
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
358 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
359 our ($inputFile, $header, $ref_chrValue, $ref_positionValue, $ref_refValue, $ref_altValue) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
360
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
361 $header =~ s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
362
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
363 ## Name of the columns
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
364 my @mutect = qw(contig position ref_allele alt_allele);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
365 my @vcf = qw(CHROM POS REF ALT);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
366 my @cosmic = qw(Mutation_GRCh37_chromosome_number Mutation_GRCh37_genome_position Description_Ref_Genomic Description_Alt_Genomic);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
367 my @icgc = qw(chromosome chromosome_start reference_genome_allele mutated_to_allele);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
368 my @tcga = qw(Chromosome Start_position Reference_Allele Tumor_Seq_Allele2);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
369 my @ionTorrent = qw(chr Position Ref Alt);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
370 my @proton = qw(Chrom Position Ref Variant);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
371 my @varScan2 = qw(Chrom Position Ref VarAllele);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
372 my @varScan2_somatic = qw(chrom position ref var);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
373 my @annovar = qw(Chr Start Ref Obs);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
374 my @custom = qw(Chromosome Start Wild_Type Mutant);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
375
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
376 my @allTab = (\@mutect, \@vcf, \@cosmic, \@icgc, \@tcga, \@ionTorrent, \@proton, \@varScan2, \@varScan2_somatic, \@annovar, \@custom);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
377 my $timer = 0; # For controlling if the names are present on the dictionnary or not
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
378
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
379 foreach my $refTab (@allTab)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
380 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
381 my @tab = @$refTab;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
382
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
383 SearchCol(\@tab);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
384
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
385 # The columns names were find
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
386 if( ($$ref_chrValue ne "c") && ($$ref_positionValue ne "s") && ($$ref_refValue ne "r") && ($$ref_altValue ne "a") ) { last; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
387 # The names of the columns are not present in the dictionnary
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
388 else { $timer++; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
389 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
390
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
391 if($timer == scalar(@allTab))
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
392 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
393 print STDERR "Error message:\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
394 print STDERR "The columns name are not in the dictionnary please change them before running the tool again\nFile concerning: $inputFile\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
395 print STDERR "TIP: Use one of the columns names proposed in the section Input formats of the tool\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
396 exit;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
397 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
398
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
399 # Extract the number of the column that contain the information
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
400 sub SearchCol
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
401 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
402 my ($refTab) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
403
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
404 my @tabNames = @$refTab;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
405 my @tabHeader = split("\t", $header);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
406
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
407 # For VCF
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
408 if($tabHeader[0] eq "#CHROM") { ($$ref_chrValue, $$ref_positionValue, $$ref_refValue, $$ref_altValue) = (0, 1, 3, 4); }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
409 # For tabular files
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
410 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
411 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
412 for(my $i=0; $i<=$#tabNames; $i++)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
413 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
414 for(my $j=0; $j<=$#tabHeader; $j++)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
415 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
416 if($tabHeader[$j] eq $tabNames[$i])
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
417 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
418 if($i == 0) { $$ref_chrValue = $j; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
419 if($i == 1) { $$ref_positionValue = $j; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
420 if($i == 2) { $$ref_refValue = $j; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
421 if($i == 3) { $$ref_altValue = $j; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
422 last; # Once find pass to the next name
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
423 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
424 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
425 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
426 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
427 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
428 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
429
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
430 # Annotate the file with Annovar, add the strand orientation and the sequence context
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
431 sub FullAnnotation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
432 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
433 print "-----------------------------------------------------------------\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
434 print "---------------------------Annotation----------------------------\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
435 print "-----------------------------------------------------------------\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
436
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
437
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
438 # If the input is a folder
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
439 if(-d $input)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
440 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
441 foreach my $file (`ls $folder_temp/*.txt`)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
442 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
443 chomp($file);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
444
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
445 # For recover the name of the file without extension, the directory where the file is and the extension of the file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
446 my ($filename, $directories, $suffix) = fileparse("$folder_temp/$file", qr/\.[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
447 my $filenameOK = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
448 # For removing the ColumnCorrect for txt files
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
449 if($filename =~ /(.+)-ColumnCorrect/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
450 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
451 if($filename =~ /(.+)-VariantListVCF-ColumnCorrect/) { $filenameOK = $1; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
452 else { $filenameOK = $1; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
453 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
454 else { print STDERR "Error message:\n"; print STDERR "Case not considered for $filename!!!\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
455
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
456
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
457 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
458 ### Cut the files in n part ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
459 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
460 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
461 my $cpu = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
462 # Keep the original header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
463 my $headerOriginalFile = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
464 # Save the numer of lines
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
465 my $nbLine = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
466 splitInputFile($file, \$cpu, \$headerOriginalFile, $filenameOK, \$nbLine);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
467
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
468
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
469 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
470 ### Annotate the n part ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
471 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
472 annotateFile($cpu, $filenameOK, $headerOriginalFile);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
473
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
474
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
475 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
476 ### Paste the file together ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
477 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
478 createOutput($filenameOK, $headerOriginalFile, $nbLine);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
479 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
480 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
481 # The input file is one file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
482 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
483 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
484 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
485 ### Cut the files in n part ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
486 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
487 # Cut the file in n part depending on the number of lines and set the number of CPU to use for the annotation depending of the number of n parts
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
488 my $cpu = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
489 # Keep the original header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
490 my $headerOriginalFile = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
491 # Save the numer of lines
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
492 my $nbLine = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
493 splitInputFile("$folder_temp/$filename-ColumnCorrect.txt", \$cpu, \$headerOriginalFile, $filename, \$nbLine);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
494
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
495
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
496 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
497 ### Annotate the n part ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
498 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
499 annotateFile($cpu, $filename, $headerOriginalFile);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
500
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
501
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
502 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
503 ### Paste the file together ###
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
504 #################################################
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
505 createOutput($filename, $headerOriginalFile, $nbLine);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
506 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
507 # Remove the temporary directory
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
508 rmtree($folder_temp);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
509 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
510
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
511
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
512
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
513 sub splitInputFile
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
514 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
515 my ($inputFile, $ref_cpu, $ref_header, $filename, $ref_nbLine) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
516
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
517 my $nbVariants = `wc -l $inputFile`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
518 $nbVariants =~ /(\d+).+/;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
519 $$ref_nbLine = $1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
520
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
521 if($$ref_nbLine-1 <= 5000) { $$ref_cpu = 1; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
522 elsif( ($$ref_nbLine-1 > 5000) && ($$ref_nbLine-1 < 25000) ) { $$ref_cpu = 2; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
523 elsif( ($$ref_nbLine-1 >= 25000) && ($$ref_nbLine-1 < 100000) ) { $$ref_cpu = 8; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
524 else { $$ref_cpu = $max_cpu; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
525
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
526 # If the number predefined can't be used on the machine use the maximum number specify by the administrator
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
527 if($$ref_cpu > $max_cpu) { $$ref_cpu = $max_cpu }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
528
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
529
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
530 ## Recover the header
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
531 open(F1, $inputFile) or die "$!: $inputFile\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
532 $$ref_header= <F1>;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
533 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
534
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
535 ## Remove the first line of the file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
536 my $fileNoHeader = "$folder_temp/${filename}-NoHeader";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
537 `sed 1d $inputFile > $fileNoHeader`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
538
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
539 if(!-e "$folder_temp/$filename") { mkdir("$folder_temp/$filename") or die "Can't create the directory $folder_temp/$filename\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
540 my $lines_per_temp = int(1+($1 / $$ref_cpu)); # +1 in case of the div == 0
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
541 `split -l $lines_per_temp $fileNoHeader $folder_temp/$filename/$filename-`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
542
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
543 if($$ref_header eq "") { print STDERR "Error message:\n"; print STDERR "No header for the file $inputFile!!!\nPlease check the format of your file\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
544 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
545
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
546 sub annotateFile
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
547 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
548 my ($cpu, $filename, $headerOriginalFile) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
549
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
550 my $pm = Parallel::ForkManager->new($cpu);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
551
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
552 foreach my $tempFile (`ls $folder_temp/$filename/$filename-*`)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
553 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
554 chomp($tempFile);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
555
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
556 # Forks and returns the pid for the child:
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
557 my $pid = $pm->start and next;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
558
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
559 # Convert the file in a correct format for Annovar: Chr Start End Ref Alt Otherinfo
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
560 my ($filenameTempFile, $directoriesTempFile, $suffixTempFile) = fileparse($tempFile, qr/\-[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
561 my $outFilenameTemp = $filenameTempFile.$suffixTempFile;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
562 Convert2AV($tempFile, $chrValue, $positionValue, $refValue, $altValue, "$folder_temp/$outFilenameTemp-AVInput");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
563
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
564 # Annotate the file with Annovar
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
565 my $tempFileName_AVOutput = $filename.$suffixTempFile.".".${refGenome}."_multianno.txt";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
566 if($fullAVDB eq "yes") { AnnotateAV("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
567 else { annotateAV_min("$folder_temp/$outFilenameTemp-AVInput", "$folder_temp/$outFilenameTemp"); }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
568
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
569 # Check if the annotations worked
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
570 open(F1, "$folderMutAnalysis/log_annovar.txt") or die "$!: $folderMutAnalysis/log_annovar.txt\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
571 while(<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
572 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
573 if($_ =~ /ERROR/i)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
574 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
575 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
576 print STDERR $_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
577 print STDERR "\n\n\t\tANNOVAR LOG FILE\n\n\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
578 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
579 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
580 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
581
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
582 # Recover the strand orientation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
583 my $length_AVheader = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
584 RecoverStrand("$folder_temp/$tempFileName_AVOutput", $headerOriginalFile, $path_AVDB, $refGenome, "$folder_temp/$outFilenameTemp-Strand", \$length_AVheader);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
585
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
586 # Recover the sequence context
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
587 RecoverGenomicSequence("$folder_temp/$outFilenameTemp-Strand", $length_AVheader, $intervalEnd, $refGenome, $path_AVDB, "$folder_temp/$filename/$outFilenameTemp".".".${refGenome}."_multianno.txt");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
588
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
589 $pm->finish; # Terminates the child process
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
590 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
591 # Wait all the child process
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
592 $pm->wait_all_children;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
593 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
594
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
595 sub createOutput
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
596 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
597 my ($filename, $headerOriginalFile, $nbLine) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
598
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
599 ## For MuTect and MuTect2 calling only variants passing MuTect filters are kept and sometines there is no variant passing these filters making error in Galaxy when using "collection".
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
600 if($nbLine == 1)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
601 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
602 print STDOUT "\nThe sample $filename didn't pass MuTect filters\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
603
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
604 ### Print Annovar minimal header + the original header of the input file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
605 my $outputFile = "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
606 open(OUT, ">", $outputFile) or die "$!: $outputFile\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
607
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
608 if($fullAVDB eq "no")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
609 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
610 print OUT "Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tStrand\tcontext";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
611 print OUT "\t".$headerOriginalFile;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
612 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
613 ### Print complete Annovar header (using the database name present in the file listAVDB) + the original header of the input file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
614 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
615 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
616 print OUT "Chr\tStart\tEnd\tRef\tAlt";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
617 open(F1, $listAVDB) or die "$!: $listAVDB\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
618 while(<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
619 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
620 if($_ =~ /^#/) { next; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
621
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
622 my @tab = split("\t", $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
623 $tab[0] =~ /$refGenome\_(.+)\.txt/;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
624 my $dbName = $1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
625
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
626 if($dbName =~ /refGene|knownGene|ensGene/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
627 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
628 print OUT "\t"."Func.$dbName\tGene.$dbName\tGeneDetail.$dbName\tExonicFunc.$dbName\tAAChange.$dbName";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
629 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
630 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
631 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
632 print OUT "\t".$dbName;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
633 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
634 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
635 print OUT "\tStrand\tcontext\t".$headerOriginalFile;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
636
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
637 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
638 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
639 close OUT;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
640 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
641 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
642 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
643 CombinedTempFile("$folder_temp/$filename", "$folderAnnovar/$filename".".".${refGenome}."_multianno.txt");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
644 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
645 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
646
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
647 sub Convert2AV
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
648 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
649 my ($inputFile, $chr_value, $start_value, $ref_value, $alt_value, $output) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
650
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
651 my ($filename, $directories, $suffix) = fileparse($inputFile, qr/\.[^.]*/);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
652
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
653 open(F1, $inputFile) or die "$!: $inputFile\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
654
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
655 open(OUT, ">", $output) or die "$!: $output\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
656 while(<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
657 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
658 $_ =~ s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
659 my @tab = split("\t", $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
660 my $chr = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
661
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
662 # Replace chr23 or chr24 by X or Y
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
663 if($tab[$chr_value] =~ /23/) { $chr = "chrX"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
664 elsif($tab[$chr_value] =~ /24/) { $chr = "chrY"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
665 elsif($tab[$chr_value] =~ /chr/) { $chr = $tab[$chr_value]; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
666 else { $chr = "chr".$tab[$chr_value]; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
667
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
668
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
669 ### Consider only "normal" chromosomes for the annotation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
670 if( ($chr !~ /chr\d{1,2}$|chrX|chrY/) ) { next; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
671
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
672
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
673 ### Don't consider variants with two or more alt bases
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
674 if($tab[$alt_value] =~ /\,/) { next; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
675
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
676 ### Reformat the Indels for Annovar
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
677 # chr1 85642631 C CT => chr1 85642631 85642631 - T (mm10)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
678 # chr5 26085724 ACTT A => chr5 26085725 26085727 CTT - (mm10)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
679 if( ((length($tab[$ref_value]) != 1) || (length($tab[$alt_value]) != 1)) || (($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") ) )
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
680 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
681 ### First check if the indels in the file are not already correctly formated
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
682 if( ($tab[$ref_value] eq "-") || ($tab[$alt_value] eq "-") )
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
683 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
684 # For indels count the number of bases deleted or inserted for modifying the end position (if start + end is the same the annotations are not retrieved for indels)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
685 # Insertion: start = start & end = start
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
686 if($tab[$ref_value] =~ /\-/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
687 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
688 print OUT "$chr\t$tab[$start_value]\t$tab[$start_value]\t$tab[$ref_value]\t$tab[$alt_value]";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
689 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
690 ## Deletion: start = start & end = start + length(del) -1
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
691 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
692 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
693 my $end = $tab[$start_value] + (length($tab[$ref_value]) - 1);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
694 print OUT "$chr\t$tab[$start_value]\t$end\t$tab[$ref_value]\t$tab[$alt_value]";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
695 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
696 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
697 ### Indels not correctly formated for Annovar
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
698 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
699 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
700 my @tabRef = split("", $tab[$ref_value]);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
701 my @tabAlt = split("", $tab[$alt_value]);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
702
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
703 # Remove the first base
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
704 my $ref2 = join("", @tabRef[1 .. $#tabRef]);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
705 my $alt2 = join("", @tabAlt[1 .. $#tabAlt]);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
706
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
707 if(length($alt2) == 0)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
708 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
709 my $altOK = "-";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
710 my $startOK = $tab[$start_value] + 1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
711 my $stopOK = $startOK + length($ref2) - 1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
712 print OUT $chr."\t".$startOK."\t".$stopOK."\t".$ref2."\t".$altOK;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
713 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
714
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
715 if(length($ref2) == 0)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
716 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
717 my $refOK = "-";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
718 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$refOK."\t".$alt2;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
719 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
720 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
721 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
722 ### SBS
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
723 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
724 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
725 print OUT $chr."\t".$tab[$start_value]."\t".$tab[$start_value]."\t".$tab[$ref_value]."\t".$tab[$alt_value];
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
726 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
727
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
728 ## Print the original file at the end
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
729 foreach (@tab) { print OUT "\t$_"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
730 print OUT "\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
731 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
732 close F1; close OUT;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
733 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
734
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
735 sub AnnotateAV
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
736 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
737 my ($inputFile, $output) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
738
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
739 if(!-e $path_AVDB)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
740 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
741 print STDERR "Error message:\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
742 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
743 print STDERR "Please install the database for this genome before running Annovar\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
744 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
745
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
746 # Extract the name of the databases
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
747 my $protocol = ""; my $operation = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
748 ExtractAVDBName($listAVDB, \$protocol, \$operation);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
749
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
750 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
751
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
752 sub ExtractAVDBName
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
753 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
754 my ($listAVDB, $refS_protocol, $refS_operation) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
755
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
756 open(F1, $listAVDB) or die "$!: $listAVDB\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
757 while(<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
758 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
759 if ($_ =~ /^#/) { next; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
760
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
761 $_ =~ s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
762 my @tab = split("\t", $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
763
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
764 # db name like refGenome_dbName.txt
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
765 if( ($tab[0] =~ /\w+_(\w+)\.txt/) && ($tab[0] !~ /sites/) && ($tab[0] !~ /esp/) && ($tab[0] !~ /ljb26/) )
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
766 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
767 $$refS_protocol .= $1.","; $$refS_operation .= $tab[1].",";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
768 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
769 # 1000 genome
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
770 if($tab[0] =~ /sites/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
771 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
772 $tab[0] =~ /\w+_(\w+)\.sites.(\d+)_(\d+)\.txt/;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
773 my ($dbName, $year, $month) = ($1, $2, $3);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
774 $dbName =~ tr/A-Z/a-z/;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
775
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
776 # convert the month number into the month name
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
777 ConvertMonth(\$month);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
778
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
779 my $AVdbName_final = "1000g".$year.$month."_".$dbName;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
780 $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].",";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
781 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
782 # ESP
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
783 if( ($tab[0] =~ /esp/) || ($tab[0] =~ /ljb26/) )
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
784 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
785 $tab[0] =~ /\w+_(\w+)_(\w+)\.txt/;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
786 my $AVdbName_final = $1."_".$2;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
787 $$refS_protocol .=$AVdbName_final.","; $$refS_operation .= $tab[1].",";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
788 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
789 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
790 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
791
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
792 sub ConvertMonth
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
793 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
794 my ($refS_month) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
795
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
796 if($$refS_month == 1) { $$refS_month = "janv"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
797 elsif($$refS_month == 2) { $$refS_month = "feb"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
798 elsif($$refS_month == 3) { $$refS_month = "mar"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
799 elsif($$refS_month == 4) { $$refS_month = "apr"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
800 elsif($$refS_month == 5) { $$refS_month = "may"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
801 elsif($$refS_month == 6) { $$refS_month = "jun"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
802 elsif($$refS_month == 7) { $$refS_month = "jul"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
803 elsif($$refS_month == 8) { $$refS_month = "aug"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
804 elsif($$refS_month == 9) { $$refS_month = "sept"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
805 elsif($$refS_month == 10) { $$refS_month = "oct"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
806 elsif($$refS_month == 11) { $$refS_month = "nov"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
807 elsif($$refS_month == 12) { $$refS_month = "dec"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
808 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
809 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
810 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
811
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
812 ### Add the minimum of annotations (refGene + strand + context)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
813 sub annotateAV_min
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
814 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
815 my ($inputFile, $output) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
816
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
817 if(!-e $path_AVDB)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
818 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
819 print STDERR "Error message:\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
820 print STDERR "The Annovar database doesn't exists for the reference genome $refGenome!!!\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
821 print STDERR "Please install the database for this genome before running Annovar\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
822 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
823
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
824 # Extract the name of the databases
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
825 my ($protocol, $operation) = ("refGene", "g");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
826
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
827 `table_annovar.pl $inputFile $path_AVDB -buildver $refGenome -protocol $protocol -operation $operation -remove -nastring NA -otherinfo -outfile $output > $folderMutAnalysis/log_annovar.txt 2>&1`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
828 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
829
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
830 sub RecoverStrand
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
831 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
832 my ($input, $headerOriginalFile, $pathDB, $refGenome, $output, $refS_lengthAVheader) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
833
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
834 my ($chr_value, $start_value, $ref_value, $alt_value, $func_value, $geneSymbol_value) = ("", "", "", "", "", "", "", "");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
835
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
836 $chr_value = recoverNumCol($input, "Chr");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
837 $start_value = recoverNumCol($input, "Start");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
838 $ref_value = recoverNumCol($input, "Ref");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
839 $alt_value = recoverNumCol($input, "Alt");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
840 $func_value = recoverNumCol($input, "Func.refGene");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
841 $geneSymbol_value = recoverNumCol($input, "Gene.refGene");
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
842
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
843 #################### Convert the input file into a hash table
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
844 my %h_inputFile = ();
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
845 open(F1, $input) or die "$!: $input\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
846 my $annovar_header = <F1>;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
847
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
848 while(<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
849 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
850 $_ =~ s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
851 my @tab = split("\t", $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
852
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
853 # In COSMIC the chromosome X and Y are annotated 23 and 24
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
854 my $chr = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
855 if($tab[$chr_value] eq "chr23") { $chr = "chrX"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
856 elsif($tab[$chr_value] eq "chr24") { $chr = "chrY"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
857 elsif($tab[$chr_value] eq "chr25") { $chr = "chrM"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
858 else { $chr = $tab[$chr_value]; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
859
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
860 # Verify if the element exists
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
861 if($chr eq "") { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The chromosome value is nor defined for $_\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
862 if(! exists $tab[$start_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The start value is nor defined for $_\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
863 if(! exists $tab[$ref_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The reference value is nor defined for $_\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
864 if(! exists $tab[$alt_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The alternate value is nor defined for $_\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
865 if(! exists $tab[$func_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The functional value is nor defined for $_\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
866 if(! exists $tab[$geneSymbol_value]) { print STDERR "Error message:\n"; print STDERR "Error RecoverStrand: The gene symbol value is nor defined for $_\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
867
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
868 my $geneSymbol = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
869 ######## For the splicing annotation we separate the gene symbol from the aa change
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
870 if($tab[$func_value] eq "splicing")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
871 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
872 if($tab[$geneSymbol_value] =~ /(.+)\((.+)\)/) { $geneSymbol = $1; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
873 else { $geneSymbol = $tab[$geneSymbol_value]; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
874 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
875 else { $geneSymbol = $tab[$geneSymbol_value]; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
876
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
877 push(@{$h_inputFile{"$chr:$tab[$start_value]:$tab[$start_value]:$tab[$ref_value]:$tab[$alt_value]:$geneSymbol"}}, $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
878 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
879 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
880
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
881 # print "\t\tRecoverStrand: $input\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
882
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
883 #################### Convert the database file into a hash table
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
884 my %h_database = ();
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
885 my ($db_geneSymbol_value, $db_strandInfo_value, $db_chr_value) = (12, 3, 2);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
886
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
887 my $folderNameDB = $refGenome."db";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
888 my $fileNameDB = $refGenome."_refGene.txt";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
889
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
890 open(F1, "$pathDB/$fileNameDB") or die "$!: $pathDB/$fileNameDB\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
891 while(<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
892 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
893 $_ =~ s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
894 my @tab = split("\t", $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
895 my $strand = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
896 $strand = $tab[$db_strandInfo_value];
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
897 if($strand eq "") { print STDERR "Error message:\n"; print STDERR "Error: the strand orientation is not specify in the database refGene\n$_\n"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
898 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
899 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
900 # Some genes have several strand orientation, keep the first in the database
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
901 if(! exists $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"}) { $h_database{"$tab[$db_geneSymbol_value]:$tab[$db_chr_value]"} = $strand; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
902 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
903 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
904 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
905
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
906 #################### Parse the two hash tables for recover the strand information
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
907 open(OUT, ">", $output) or die "$!: $output\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
908
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
909
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
910 ## Add the header only for the firts part of the files
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
911 if($input =~ /\-aa/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
912 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
913 my @tabHeaderInput = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
914 $annovar_header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $annovar_header);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
915 # Save the length of the Annovar header for the next function (RecoverGenomicSequence)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
916 $$refS_lengthAVheader = $#tabHeaderInput;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
917
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
918 # Print the Annovar header until the column before OtherInfo
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
919 print OUT "$tabHeaderInput[0]";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
920 for(my $i=1; $i<$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
921 print OUT "\tStrand";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
922 print OUT "\t",$headerOriginalFile;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
923 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
924
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
925
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
926 # Timer for comparing the number of SNVs present in the hash table
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
927 my $timerUniqueSNVs = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
928 # Timer for comparing the number of SNVs with the strand orientation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
929 my $timerSNVsStrand = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
930
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
931 foreach my $kFile (sort keys %h_inputFile)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
932 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
933 my $test = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
934 my @tab = split(":", $kFile);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
935
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
936 # Sometimes the line is not printed correctely !!!!! :@
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
937 my @tHeaderInput = split("\t", $annovar_header); my @lengthLine = split("\t", $h_inputFile{$kFile}[0]);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
938 my @tHeaderOriginalFile = split("\t", $headerOriginalFile);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
939 my $lengthHeader = @tHeaderInput + (scalar(@tHeaderOriginalFile)-1) ; my $lengthLine = @lengthLine;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
940
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
941 # Save the length of the Annovar header for the next function (RecoverGenomicSequence)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
942 $$refS_lengthAVheader = $#tHeaderInput;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
943
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
944 foreach my $kDB (sort keys %h_database)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
945 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
946 if("$tab[5]:$tab[0]" eq $kDB)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
947 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
948 if($lengthHeader != $lengthLine)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
949 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
950 print STDERR "Error message:\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
951 print STDERR "Error Recover Strand the length of the current line is not valid!!!!!\nExpected length: $lengthHeader\tlength of the line: $lengthLine\n$h_inputFile{$kFile}[0]\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
952 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
953
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
954 foreach my $line (@{$h_inputFile{$kFile}})
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
955 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
956 my @tab = split("\t", $line);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
957 my $j = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
958
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
959 for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
960 print OUT $h_database{$kDB};
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
961 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
962 print OUT "\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
963 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
964 $timerSNVsStrand++;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
965 $test = 1; last;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
966 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
967 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
968 # The strand orientation isn't defined
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
969 if($test == 0)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
970 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
971 my @tHeaderInput = split("\t", $annovar_header);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
972 foreach my $line (@{$h_inputFile{$kFile}})
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
973 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
974 my @tab = split("\t", $line);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
975 my $j = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
976 for(my $i=0; $i<$#tHeaderInput; $i++) { print OUT $tab[$i],"\t"; $j=$i }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
977 print OUT "NA";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
978 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
979 print OUT "\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
980 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
981 $timerSNVsStrand++;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
982 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
983 $timerUniqueSNVs++;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
984 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
985 close OUT;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
986
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
987 # print "Strand orientation recover for $timerSNVsStrand SNVs out of $timerUniqueSNVs uniques\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
988 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
989
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
990 sub RecoverGenomicSequence
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
991 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
992 my ($inputFile, $length_AVheader, $intervalEnd, $referenceGenome, $pathToRefSeq, $output) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
993
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
994 ############ 1) Transform the input file in a hash table: one for recover the sequence context and one for keeping the original file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
995 my %h_inputFileForSeqContext = (); my %h_inputFile = ();
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
996 my $header = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
997 CreateHashTable_from_InputFile($inputFile, $length_AVheader, \$header, $intervalEnd, \%h_inputFileForSeqContext, \%h_inputFile);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
998
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
999 sub CreateHashTable_from_InputFile
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1000 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1001 my ($input, $length_AVheader, $refS_header, $intervalEnd, $refH_inputFileForSeqContext, $refH_inputFile) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1002
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1003 my ($chr_value, $start_value, $strand_value) = (0, 1, $length_AVheader);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1004
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1005 my $countregion = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1006 my %allchr = ();
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1007
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1008 open(F1, $input) or die "$!: $input\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1009 if($input =~ /\-aa/) { $$refS_header = <F1>; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1010
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1011 while(<F1>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1012 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1013 $_ =~ s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1014 my @tab = split("\t", $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1015
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1016 my $name = "$tab[$chr_value]:$tab[$start_value]";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1017 my $start = $tab[$start_value] - $intervalEnd;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1018 my $end = $tab[$start_value] + $intervalEnd;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1019
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1020 $start--; #make zero-start coordinate, to be consistent with UCSC
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1021 my $exonpos = "$tab[$chr_value]:$start";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1022
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1023 push @{$refH_inputFileForSeqContext->{$tab[$chr_value]}}, [$name, $start, $end, $tab[$strand_value], $exonpos];
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1024 push(@{$refH_inputFile->{"$tab[$chr_value]\t$start\t$end"}}, $_);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1025 $countregion++;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1026 $allchr{$tab[$chr_value]}++;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1027 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1028 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1029 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1030
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1031 ############ 2) Extract the sequence context from the hash table
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1032 my %h_allRegionSeqContext = ();
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1033 my $refSeq = $pathToRefSeq;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1034 Extract_SequenceContext(\%h_inputFileForSeqContext, $referenceGenome, $refSeq, \%h_allRegionSeqContext);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1035
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1036 sub Extract_SequenceContext
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1037 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1038 my ($refH_allRegion, $referenceGenome, $refSeq, $refH_allRegionSeqContext) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1039
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1040 my $folderDB = $referenceGenome."db";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1041 my $folderSeq = $referenceGenome."_seq";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1042 my $seqdir = "$refSeq/$folderSeq";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1043
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1044 my %seqhash = (); #database sequence for each chromosome
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1045 my %name_seq = (); #sequence for each region
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1046 my (%seqlen, %discordlen, %badorf); #store the length of each sequence, and the ID of sequences with discordant length, ORF contains stop codon
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1047 my ($count_success, @failure) = (0);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1048
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1049 for my $curchr (sort keys %{$refH_allRegion})
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1050 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1051 my ($seqid, $curseq) = ('', '');
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1052 my $fastafile = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1053 if ($curchr =~ m/^chr/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1054 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1055 %seqhash = (); #clear the seqhash storage
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1056 $fastafile = "$seqdir/$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1057 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1058 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1059 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1060 %seqhash = (); #clear the seqhash storage
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1061 $fastafile = "$seqdir/chr$curchr.fa"; #by default, all FASTA files should be saved at fastadir, with the same name
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1062 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1063 if (not -e $fastafile) { #to handle cases where no "chr" prefix is given
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1064 print "WARNING: the FASTA file $curchr.fa cannot be retrieved from the specified directory $seqdir. Sequences in this chromosome will not be processed\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1065 next;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1066 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1067
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1068 if (not %seqhash)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1069 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1070 open (FASTA, $fastafile) or print "WARNING: cannot read from FASTA file $fastafile so sequences in $curchr will not be processed: $!\n" and next;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1071 while (<FASTA>)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1072 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1073 if (m/^>(\S+)/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1074 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1075 $seqid and $seqhash{$seqid} = $curseq; #finish reading the sequence for seqid and save it
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1076 $seqid = $1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1077 $curseq = '';
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1078 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1079 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1080 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1081 s/[\r\n]+$//;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1082 $curseq .= uc $_; #only use upper case characters
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1083 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1084 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1085 close FASTA;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1086 $seqhash{$seqid} = $curseq;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1087 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1088 if (not $seqhash{$curchr})
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1089 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1090 #this chromosome just do not have FASTA sequences (maybe users used a wrong seqdir
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1091 print "WARNING: Unable to retrieve regions at $curchr due to lack of sequence information\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1092 next;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1093 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1094
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1095 for my $i (0 .. @{$refH_allRegion->{$curchr}}-1)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1096 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1097 my ($name, $start, $end, $strand, $exonpos) = @{$refH_allRegion->{$curchr}[$i]};
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1098 my @start = split (/,/, $start);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1099 my @end = split (/,/, $end);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1100 my $seq;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1101 for my $i (0..@start-1)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1102 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1103 if ($start[$i] >= length ($seqhash{$curchr}))
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1104 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1105 #here there must be an annotation error in user-specified gene/region definition file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1106 print "WARNING: Ignoring the start position start=$start[$i] since it is longer than the $curchr sequence (length=" , length($seqhash{$curchr}), ")\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1107 undef $seq;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1108 last;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1109 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1110 $seq .= substr ($seqhash{$curchr}, $start[$i], $end[$i]-$start[$i]);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1111 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1112
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1113 if (defined $seq)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1114 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1115 if (defined $seqlen{$name})
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1116 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1117 $seqlen{$name} != length ($seq) and warn "WARNING: the sequence $name was found more than once with different sequence lengths\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1118 $seqlen{$name} != length ($seq) and $discordlen{$name}++;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1119 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1120 else { $seqlen{$name} = length ($seq); }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1121
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1122 $name_seq{$name, $exonpos} = $seq;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1123 $count_success++;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1124
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1125 # Put the sequence context in a hash table for Write the result after
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1126 ## Some sequence context are NNNNNN or empty
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1127 if( ($seq ne "NA") && ($seq =~ /N/i) ) { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = "NA"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1128 else { $refH_allRegionSeqContext->{"$curchr\t$start\t$end"} = $seq; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1129 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1130 else
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1131 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1132 print "WARNING: DNA sequence for $name cannot be inferred\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1133 push @failure, $name;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1134 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1135 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1136 } # End for $curchr
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1137 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1138
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1139 ############ 3) Create a file with the sequence context
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1140 WriteFile_SeqContext($inputFile, $length_AVheader, \%h_inputFile, $header, \%h_allRegionSeqContext, $output);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1141
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1142 sub WriteFile_SeqContext
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1143 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1144 my ($inputFile, $length_AVheader, $refH_InputFile, $header, $refH_allRegionSeqContext, $output) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1145
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1146 open(OUT, ">", $output) or die "$!: $output\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1147
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1148 ## Add the header only for the firts part of the files
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1149 if($inputFile =~ /\-aa/)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1150 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1151 my @tabHeaderInput = "";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1152
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1153 $header =~ s/[\r\n]+$//; @tabHeaderInput = split("\t", $header);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1154 # Print the Annovar header until the column before OtherInfo
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1155 print OUT "$tabHeaderInput[0]";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1156 my $j = 0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1157 for(my $i=1; $i<$length_AVheader+1; $i++) { print OUT "\t$tabHeaderInput[$i]"; $j=$i; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1158 print OUT "\tcontext\ttrinucleotide_context";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1159 for(my $i=$j+1; $i<=$#tabHeaderInput; $i++) { print OUT "\t$tabHeaderInput[$i]"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1160 print OUT "\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1161 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1162
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1163 foreach my $k_hFile (sort keys %{$refH_InputFile})
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1164 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1165 foreach my $k_allRegonSeqContext (sort keys %{$refH_allRegionSeqContext})
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1166 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1167 if($k_hFile eq $k_allRegonSeqContext)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1168 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1169 my $j=0;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1170
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1171 for(my $k=0; $k<=$#{$refH_InputFile->{$k_hFile}};$k++)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1172 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1173 my @tab = split("\t", ${$refH_InputFile->{$k_hFile}}[$k]);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1174
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1175 # Write Annovar annotation + strand orientation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1176 for(my $i=0; $i<$length_AVheader+1; $i++) { print OUT $tab[$i],"\t"; $j=$i; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1177 # Write the sequence context with the length defined by the user (default is 10)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1178 print OUT $refH_allRegionSeqContext->{$k_allRegonSeqContext};
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1179 # Write the trinucleotide context
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1180 my $contextSequence = $refH_allRegionSeqContext->{$k_allRegonSeqContext}; $contextSequence =~ tr/a-z/A-Z/;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1181 my @tempContextSequence = split("", $contextSequence);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1182 my $midlle_totalNbBaseContext = (scalar(@tempContextSequence)-1)/2; # For having the middle of the sequence
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1183 print OUT "\t".$tempContextSequence[$midlle_totalNbBaseContext-1]."x".$tempContextSequence[$midlle_totalNbBaseContext+1];
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1184 # Write the original columns
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1185 for(my $i=$j+1; $i<=$#tab; $i++) { print OUT "\t$tab[$i]"; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1186 print OUT "\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1187 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1188 last;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1189 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1190 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1191 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1192 close OUT;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1193 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1194 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1195
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1196 sub CombinedTempFile
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1197 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1198 my ($folderTempFile, $output) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1199
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1200 my $cmd_cat_mt_results = "cat ";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1201
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1202 foreach my $file (`ls $folderTempFile/*.txt`)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1203 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1204 chomp($file);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1205 $cmd_cat_mt_results = $cmd_cat_mt_results." $file";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1206 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1207 $cmd_cat_mt_results = $cmd_cat_mt_results." > $output";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1208 `$cmd_cat_mt_results`;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1209 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1210
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1211
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1212 sub recoverNumCol
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1213 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1214 my ($input, $name_of_column) = @_;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1215
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1216 open(F1,$input) or die "recoverNumCol: $!: $input\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1217 # For having the name of the columns
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1218 my $search_header = <F1>; $search_header =~ s/[\r\n]+$//; my @tab_search_header = split("\t",$search_header);
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1219 close F1;
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1220 # The number of the column
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1221 my $name_of_column_NB = "toto";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1222 for(my $i=0; $i<=$#tab_search_header; $i++)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1223 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1224 if($tab_search_header[$i] eq $name_of_column) { $name_of_column_NB = $i; last; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1225 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1226 if($name_of_column_NB eq "toto")
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1227 {
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1228 print STDERR "Error message:\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1229 print STDERR "Error recoverNumCol(): the column named $name_of_column doesn't exits in the input file $input!!!!!\n";
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1230 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1231 else { return $name_of_column_NB; }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1232 }
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1233
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1234
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1235
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1236
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1237 =head1 NAME
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1238
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1239 mutspec-Annot
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1240
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1241 =head1 SYNOPSIS
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1242
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1243 mutspecannot.pl [arguments] <query-file>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1244
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1245 <query-file> can be a folder with multiple VCF or a single VCF
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1246
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1247 Arguments:
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1248 -h, --help print help message
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1249 -m, --man print complete documentation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1250 -v, --verbose use verbose output
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1251 --refGenome the reference genome to use
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1252 --interval <interger> the number of bases for the sequence context
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1253 -o, --outfile <string> output directory for the result. If none is specify the result will be write in the same directory as the input file
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1254 -AVDB --pathAnnovarDB <string> the path to Annovar database and the files with the chromosome size
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1255 --pathAVDBList the path to a text file containing the list of Annovar databases installed
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1256 -temp --pathTemporary <string> the path for saving the temporary files
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1257 --fullAnnotation <string> recover all Annovar annotations (yes) or only the minimum for MutSpec-Stat (no)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1258 --max_cpu <integer> number of CPUs to be used for the annotation
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1259
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1260
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1261 Function: automatically run a pipeline on a list of variants and annote them using Annovar
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1262
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1263 Example: # Annotation only
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1264 mutspecannot.pl --refGenome hg19 --interval 10 --outfile output_directory --pathAnnovarDB path_to_annovar_database --pathAVDBList path_to_the_list_of_annovar_DB --temp path_to_temporary_directory --fullAnnotation yes|no input
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1265
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1266
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1267 Version: 03-2017 (March 2017)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1268
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1269
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1270 =head1 OPTIONS
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1271
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1272 =over 8
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1273
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1274 =item B<--help>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1275
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1276 print a brief usage message and detailed explanation of options.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1277
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1278 =item B<--man>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1279
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1280 print the complete manual of the program.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1281
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1282 =item B<--verbose>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1283
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1284 use verbose output.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1285
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1286 =item B<--refGenome>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1287
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1288 the reference genome to use, could be hg19 or mm9.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1289
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1290 =item B<--interval>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1291
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1292 the number of bases surrounding the mutated bases, for the sequence context analysis.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1293
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1294 =item B<--outfile>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1295
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1296 the directory of output file names. If it is nor specify the same directory as the input file is used.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1297
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1298 =item B<--pathAnnovarDB>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1299
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1300 the path to the directory containing the Annovar databases and the files with the chromosome size.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1301
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1302 =item B<--pathAVDBList>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1303
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1304 the path to a text file containing the list of Annovar databases installed.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1305
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1306 =item B<--pathTemporary>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1307
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1308 the path for saving temporary files generated by the script.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1309 If any is specify a temporary folder is created in the same directory where the script is running.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1310 Deleted when the script is finish
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1311
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1312 =item B<--fullAnnotation>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1313
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1314 Use all Annovar databases for the annotation (set to yes) or only refGene + strand + context (set to no) for having a quicker annotation (for large file with million of lines)
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1315
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1316 =item B<--max_cpu>
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1317
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1318 Specify the number of CPUs to be used. This number is used for spliting the file in n part and running the annotations in each part in parallel.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1319
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1320
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1321 =head1 DESCRIPTION
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1322
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1323 MutSpec-Annot is a perl script for added annotations on a list of genetic variants generated with NGS.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1324 Functional annotations are added using ANNOVAR software. Strand transcript orientation is added using RefSeq database and the sequence context for x bases flanking the variant positions is also added.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1325 A text tab delimited file is produced.
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1326
eda59b985b1c Uploaded
iarc
parents: 6
diff changeset
1327 =cut