Mercurial > repos > dereeper > pangenome_explorer
comparison COG/bac-genomics-scripts/ecoli_mlst/ecoli_mlst.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ####### | |
4 # POD # | |
5 ####### | |
6 | |
7 =pod | |
8 | |
9 =head1 NAME | |
10 | |
11 ecoli_mlst.pl 25-10-2011 | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 C<perl ecoli_mlst.pl -a fas -g fasta> | |
16 | |
17 =head1 DESCRIPTION | |
18 | |
19 The script searches for multilocus sequence type (MLST) alleles in | |
20 I<E. coli> genomes according to Mark Achtman's scheme with seven | |
21 house-keeping genes (I<adk>, I<fumC>, I<gyrB>, I<icd>, I<mdh>, | |
22 I<purA>, and I<recA>) [Wirth et al., 2006]. I<NUCmer> from the | |
23 L<I<MUMmer package>|http://mummer.sourceforge.net/> is used to | |
24 compare the given allele sequences to bacterial genomes via | |
25 nucleotide alignments. | |
26 | |
27 Download the allele files (adk.fas ...) and the sequence type file | |
28 ('publicSTs.txt') from this website: | |
29 http://mlst.warwick.ac.uk/mlst/dbs/Ecoli | |
30 | |
31 To run C<ecoli_mlst.pl> include all I<E. coli> genome files (file | |
32 extension e.g. 'fasta'), all allele sequence files (file extension | |
33 'fas') and 'publicSTs.txt' in the current working directory. The | |
34 allele profiles are parsed from the created *.coord files and written | |
35 to a result file, plus additional information from the file | |
36 'publicSTs.txt'. Also, the corresponding allele sequences (obtained | |
37 from the allele input files) are concatenated for each I<E. coli> | |
38 genome into a result multi-fasta file. Option B<-c> can be used to | |
39 initiate an alignment for this multi-fasta file with | |
40 L<I<ClustalW>|http://www.clustal.org/clustal2/> (standard alignment | |
41 parameters; has to be in the C<$PATH> or change variable | |
42 C<$clustal_call>). The alignment fasta output file can be used | |
43 directly for L<I<RAxML>|http://sco.h-its.org/exelixis/web/software/raxml/index.html>. CAREFUL the Phylip alignment format from | |
44 I<ClustalW> allows only 10 characters per strain ID. | |
45 | |
46 C<ecoli_mlst.pl> works with complete and draft genomes. However, | |
47 several genomes cannot be included in a single input file! | |
48 | |
49 Obviously, only for those genomes whose allele sequences have been | |
50 deposited in Achtman's allele database results can be obtained. If an | |
51 allele is not found in a genome it is marked by a '?' in the result | |
52 profile file and a place holder 'XXX' in the result fasta file. For | |
53 these cases a manual I<NUCmer> or I<BLASTN> might be useful to fill the | |
54 gaps and L<C<run_sub_seq.pl>|/run_sub_seq> to get the corresponding | |
55 'new' allele sequences. | |
56 | |
57 Non-NCBI fasta headers for the genome files have to have a unique ID | |
58 directly following the '>' (e.g. 'Sakai', '55989' ...). | |
59 | |
60 =head1 OPTIONS | |
61 | |
62 =head2 Mandatory options | |
63 | |
64 =over 22 | |
65 | |
66 =item B<-a>=I<str>, B<-alleles>=I<str> | |
67 | |
68 File extension of the MLST allele fasta files, e.g. 'fas' (<=> B<-g>). | |
69 | |
70 =item B<-g>=I<str>, B<-genomes>=I<str> | |
71 | |
72 File extension of the I<E. coli> genome fasta files, e.g. 'fasta' (<=> B<-a>). | |
73 | |
74 =back | |
75 | |
76 =head2 Optional options | |
77 | |
78 =over | |
79 | |
80 =item B<-h>, B<-help> | |
81 | |
82 Help (perldoc POD) | |
83 | |
84 =item B<-c>, B<-clustalw> | |
85 | |
86 Call L<I<ClustalW>|http://www.clustal.org/clustal2/> for alignment | |
87 | |
88 =back | |
89 | |
90 =head1 OUTPUT | |
91 | |
92 =over 17 | |
93 | |
94 =item F<ecoli_mlst_profile.txt> | |
95 | |
96 Tab-separated allele profiles for the I<E. coli> genomes, plus additional info from 'publicSTs.txt' | |
97 | |
98 =item F<ecoli_mlst_seq.fasta> | |
99 | |
100 Multi-fasta file of all concatenated allele sequences for each genome | |
101 | |
102 =item F<*.coord> | |
103 | |
104 Text files that contain the coordinates of the I<NUCmer> hits for each genome and allele | |
105 | |
106 =item (F<errors.txt>) | |
107 | |
108 Error file, summarizing number of not found alleles or unclear I<NUCmer> hits | |
109 | |
110 =item (F<ecoli_mlst_seq_aln.fasta>) | |
111 | |
112 Optional, L<I<ClustalW>|http://www.clustal.org/clustal2/> alignment in Phylip format | |
113 | |
114 =item (F<ecoli_mlst_seq_aln.dnd>) | |
115 | |
116 Optional, I<ClustalW> alignment guide tree | |
117 | |
118 =back | |
119 | |
120 =head1 EXAMPLE | |
121 | |
122 =over | |
123 | |
124 =item C<perl ecoli_mlst.pl -a fas -g fasta -c> | |
125 | |
126 =back | |
127 | |
128 =head1 VERSION | |
129 | |
130 0.3 update: 30-01-2013 | |
131 | |
132 =head1 AUTHOR | |
133 | |
134 Andreas Leimbach aleimba[at]gmx[dot]de | |
135 | |
136 =head1 LICENSE | |
137 | |
138 This program is free software: you can redistribute it and/or modify | |
139 it under the terms of the GNU General Public License as published by | |
140 the Free Software Foundation; either version 3 (GPLv3) of the License, | |
141 or (at your option) any later version. | |
142 | |
143 This program is distributed in the hope that it will be useful, but | |
144 WITHOUT ANY WARRANTY; without even the implied warranty of | |
145 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
146 General Public License for more details. | |
147 | |
148 You should have received a copy of the GNU General Public License | |
149 along with this program. If not, see L<http://www.gnu.org/licenses/>. | |
150 | |
151 =cut | |
152 | |
153 | |
154 ######## | |
155 # MAIN # | |
156 ######## | |
157 | |
158 use strict; | |
159 use warnings; | |
160 use Getopt::Long; # module to get options from the command line | |
161 | |
162 | |
163 ### Get the options with Getopt::Long, works also abbreviated and with two "--": -g, --g, -genomes ... | |
164 my $usage = "perldoc $0"; | |
165 die system($usage) unless @ARGV; | |
166 my $allele_ext = ''; # file extension of allele fasta files (<=> $genome_ext) | |
167 my $genome_ext = ''; # file extension of E. coli genome fasta files (<=> allele_ext) | |
168 my $clustalw = ''; # optionally, call ClustalW for alignment | |
169 my $help = ''; # run perldoc on the POD | |
170 GetOptions ('alleles=s' => \$allele_ext, 'genomes=s' => \$genome_ext, 'clustalw' => \$clustalw, 'help' => \$help); | |
171 | |
172 | |
173 ### Run perldoc on POD if no arguments or help | |
174 if (!$genome_ext || !$allele_ext) { | |
175 die system($usage); | |
176 } elsif ($help) { | |
177 die system($usage); | |
178 } | |
179 | |
180 | |
181 ### Check if result files already exist, overwrite or exit script after user question | |
182 my $ecoli_allele_seq = 'ecoli_mlst_seq.fasta'; # multi-fasta result file with concatenated allele seqs for each E. coli genome (sequences are from the MLST allele files) | |
183 my $clustal_aln = 'ecoli_mlst_seq_aln.fasta'; # fasta alignment file from optional clustalW alignment | |
184 if (-e $ecoli_allele_seq) { | |
185 print "A previous analysis exists, overwrite the old result files [y|n]? "; | |
186 my $stdin = <STDIN>; | |
187 chomp $stdin; | |
188 if ($stdin =~ /n/i) { | |
189 die "Script abborted!\n\n"; | |
190 } elsif ($stdin =~ /y/i) { | |
191 if (-e $clustal_aln) { # get rid of the optional clustalW alignment fasta-file before program run | |
192 unlink $clustal_aln; | |
193 } | |
194 } | |
195 } | |
196 | |
197 | |
198 ### The MLST alleles from the Achtman scheme | |
199 my @alleles = ('adk', 'fumC', 'gyrB', 'icd', 'mdh', 'purA', 'recA'); | |
200 | |
201 | |
202 ### Read directory and look for corresponding files | |
203 my $dirname = "."; | |
204 my @genome_files; # array to save all the fasta genome files | |
205 my %allele_files; # hash to save all the multi-fasta allele files | |
206 opendir(DIR, $dirname) or die "Can't opendir $dirname: $!\n"; | |
207 while (defined(my $file = readdir(DIR))) { # go through each file in the directory | |
208 if ($file eq 'ecoli_mlst_seq.fasta') { # don't use the result file from a previous analysis, the allele multi fasta file 'ecoli_mlst_multi.fasta' (see below)! | |
209 next; | |
210 } elsif ($file =~ /^(\S+)\.$genome_ext$/ && (-s $file < 9000000)) { # don't use files > 10 MB, E. coli fasta file shouldn't be too big (normally around 5 MB), otherwise probably wrong fasta file in folder | |
211 push (@genome_files, $file); | |
212 } | |
213 foreach my $allele (@alleles) { | |
214 if ($file =~ /^$allele\S*\.$allele_ext$/i) { | |
215 $allele_files{$allele} = $file; | |
216 } | |
217 } | |
218 } | |
219 closedir(DIR); | |
220 die "No E. coli genome fasta-files were found!\n" unless scalar @genome_files; # empty array in scalar context returns 0 | |
221 die "No allele fasta-files were found!\n" unless scalar %allele_files; # empty hash in scalar context returns 0 | |
222 | |
223 | |
224 ### Look for multi-fasta genome files and concatenate them to a single-fasta entry (e.g. draft genomes). Subsequently concatenate all E. coli genome sequence files to one multi-fasta/-genome file for the subsequent nucmer run. Additionally parse and associate the E. coli accession#s/unique IDs and descriptions (strain names ...) from the genomes | |
225 my $genome_file = 'ecoli_multi.fasta'; # multi-genome/-fasta file for the nucmer run, used as a temp file, will be deleted at the end of the script | |
226 open(GENOMES, ">$genome_file") or die "Failed to create file '$genome_file': $!\n"; | |
227 my $genome_number = 0; # used to control if lines in *.coords files correspond to the overall genome count | |
228 my %genome_desc; # hash to associate accession#/unique ID with genome descriptions | |
229 foreach my $file (@genome_files) { | |
230 $genome_number++; | |
231 open(MULTI, "<$file") or die "Failed to open E. coli genome file '$file': $!\n"; | |
232 my @multi = <MULTI>; # read the whole genome file (potential multi-fasta file) | |
233 my @IDs = grep(/^>/, @multi); # get ID lines in fasta file | |
234 if (scalar @IDs > 1) { # multi-fasta files | |
235 my $new_id; # new ID line for the multi-fasta file | |
236 foreach (@IDs) { # discard plasmid ID lines in complete genomes for new file (name with chromosome ID line) | |
237 if (!/plasmid/) { | |
238 chomp; | |
239 $new_id = $_; | |
240 last; # jump out of the loop if a non-plasmid ID found | |
241 } | |
242 } | |
243 if (!defined($new_id)) { | |
244 die "The file '$file' only contains plasmids, program exits!\n"; | |
245 } | |
246 my ($acc, $desc) = acc_desc($new_id); # subroutine to fill hash %genome_desc with acc#/unique ID and genome description | |
247 print GENOMES ">$acc $desc; artificial genome\n"; # print the now shortened ID-line for the new single-fasta entry in the result multi-genome file | |
248 foreach (@multi) { # print the rest of the new single-fasta file | |
249 if (/^>/) { # skip ID lines of the old multi-fasta file | |
250 next; | |
251 } | |
252 print GENOMES; | |
253 } | |
254 } elsif (scalar @IDs == 1) { # non-multi-fasta files | |
255 my ($acc, $desc) = acc_desc($IDs[0]); | |
256 print GENOMES ">$acc $desc\n"; # print the new shortened ID-line | |
257 foreach (@multi) { | |
258 if (/^>/) { # skip ID line, since new one already printed | |
259 next; | |
260 } | |
261 print GENOMES; | |
262 } | |
263 } else { # wrong fasta files | |
264 die "File '$file' does not include a fasta ID line, program exits!\n"; | |
265 } | |
266 print GENOMES "\n"; | |
267 close MULTI; | |
268 } | |
269 close GENOMES; | |
270 my @delete; # files to be deleted after program run | |
271 if (-e $genome_file) { | |
272 push (@delete, $genome_file); | |
273 } else { | |
274 die "The multi-fasta E. coli genome file \'$genome_file\' could not be created for the NUCmer run: $!\n"; | |
275 } | |
276 | |
277 | |
278 ### Parse file 'publicSTs.txt' to get additional info for each sequence type | |
279 my $st_file = 'publicSTs.txt'; | |
280 open(ST, "<$st_file") or die "Failed to open sequence type file '$st_file': $!\n"; | |
281 my %seq_type; # hash to save ST info to each allele profile | |
282 my $header = <ST>; # get rid of file header | |
283 while (<ST>) { | |
284 chomp; | |
285 my @st = split (/\t/, $_); | |
286 my $profile = "$st[3] $st[4] $st[5] $st[6] $st[7] $st[8] $st[9]"; # allele profile | |
287 my $info = "$st[0]\t$st[1]\t$st[2]\t$st[10]\t$st[11]"; # associated info to allele profile | |
288 # $st[0] = ST, 1 = ST_COMPLEX, 2 = ANCESTRAL_GROUP, 3-9 = adk-recA alleles, 10 = SOURCE, 11 = REFSTRAIN | |
289 $seq_type{$profile} = $info; | |
290 } | |
291 close ST; | |
292 | |
293 | |
294 ### Run nucmer with allele sequences as REFERENCES and the created 'ecoli_multi.fasta' file as QUERY | |
295 my @created_files; # used to print out files that are created at the end of the script | |
296 my %coord_files; | |
297 foreach (keys %allele_files) { | |
298 system ("nucmer --prefix $_-all_ecoli $allele_files{$_} $genome_file"); # system call; seperate args not possible, probably because nucmer not a system command?! | |
299 system ("show-coords -lT $_-all_ecoli.delta > $_-all_ecoli.coords"); # '-l' include seq length in output, '-T' tab-separated output | |
300 if (-e "$_-all_ecoli.coords") { | |
301 $coord_files{$_} = "$_-all_ecoli.coords"; | |
302 push (@delete, "$_-all_ecoli.delta"); | |
303 push (@created_files, "\t\t\t$_-all_ecoli.coords\n"); | |
304 } else { | |
305 die "Coord file '$_-all_ecoli.coords' could not be created: $!\n"; | |
306 } | |
307 } | |
308 | |
309 | |
310 ### Parse *.coords nucmer result files for MLST alleles and corresponding accession#s | |
311 my @summary_err; # array to store error summary for error file | |
312 my @detail_err; # array to store the more detailed errors for error file | |
313 my $error = 0; # switches to '1' if an error is detected | |
314 my %strain_allele; # declare hash, that's subsequently used as 'hash in hash' in sub parse_coords to associate MLST alleles and accession#s | |
315 foreach (sort keys %coord_files) { | |
316 parse_coord($coord_files{$_}, $_); # subroutine to fill %strain_allele | |
317 } | |
318 | |
319 | |
320 ### Print the corresponding allele sequences and allele profile (plus additional info) for each E. coli genome | |
321 open(SEQ, ">$ecoli_allele_seq") or die "Failed to create file '$ecoli_allele_seq': $!\n"; | |
322 my $ecoli_allele_profile = 'ecoli_mlst_profile.txt'; # tab-separated result file for the allele profile of each E. coli genome plus additional info from file 'publicSTs.txt' from the Achtman MLST DB | |
323 open(PROFILE, ">$ecoli_allele_profile") or die "Failed to create file '$ecoli_allele_profile': $!\n"; | |
324 print PROFILE "Strain"; # header for profile file | |
325 foreach (sort @alleles) { | |
326 print PROFILE "\t$_"; | |
327 } | |
328 print PROFILE "\tST\tST_COMPLEX\tANCESTRAL_GROUP\tSOURCE\tREFSTRAIN\n"; # info from file 'publicSTs.txt' | |
329 foreach my $acc (sort {lc $genome_desc{$a} cmp lc $genome_desc{$b}} keys %genome_desc) { # call hash %genome_desc by keys (acc#s), but sort by values (genome desc.s) of the hash case-insensitively (all in lowercase, because Perl sorts lowercase after uppercase!) | |
330 print SEQ ">$genome_desc{$acc} $acc\n"; # print fasta ID line for 'ecoli_mlst_seq.fasta' | |
331 print PROFILE "$genome_desc{$acc}"; # print genome desc for strain in first column for 'ecoli_mlst_profile.txt' | |
332 my $profile = ''; # allele profile to extract info from %seq_type | |
333 foreach my $allele (sort keys %strain_allele) { | |
334 if (defined $strain_allele{$allele}->{$acc}) { | |
335 open(ALLELE, "<$allele_files{$allele}") or die "Failed to open allele file $allele_files{$allele}: $!\n"; | |
336 while (my $line = <ALLELE>) { # search for corresponding allele in multi-fasta allele file | |
337 chomp $line; | |
338 if ($line =~ /^>$strain_allele{$allele}->{$acc}$/i) { | |
339 $line = <ALLELE>; # don't need the ID line but the following seq lines | |
340 while ($line !~ /^>/) { # print sequence in result file until another '>' is found | |
341 chomp $line; | |
342 print SEQ "$line"; | |
343 $line = <ALLELE>; | |
344 } | |
345 } | |
346 } | |
347 close ALLELE; | |
348 $strain_allele{$allele}->{$acc} =~ s/^(\D+)(\d+)$/$2/; # only keep allele number | |
349 print PROFILE "\t$strain_allele{$allele}->{$acc}"; | |
350 $profile .= "$strain_allele{$allele}->{$acc} "; # concat allele numbers in $profile to extract info from %seq_type | |
351 } else { | |
352 print SEQ "XXX"; # place holder if allele of genome not determined | |
353 print PROFILE "\t?"; # place holder as well | |
354 unshift (@detail_err, "$genome_desc{$acc} didn't give a hit with $allele, marked by \'XXX\' in allele sequences and \'?\' in allele profile!\n"); | |
355 next; | |
356 } | |
357 } | |
358 print SEQ "\n\n"; # blank line in front of each ID line | |
359 chop $profile; # get rid of the last space, so it can be used as the key in %seq_type | |
360 if (defined $seq_type{$profile}) { # print ST and additional info from file 'publicST.txt' in 'ecoli_mlst_profile.txt' | |
361 print PROFILE "\t$seq_type{$profile}\n"; | |
362 } else { | |
363 print PROFILE "\t?\t?\t?\t?\t?\n"; | |
364 } | |
365 } | |
366 close SEQ; | |
367 close PROFILE; | |
368 if (-e $ecoli_allele_seq && $ecoli_allele_profile) { # push new created files in array for print out | |
369 push (@created_files, "\n\tResult files:\n\t\t\t$ecoli_allele_seq\t-> Allele sequences!\n", "\t\t\t$ecoli_allele_profile\t-> Allele profile plus info from 'publicSTs.txt'!\n"); | |
370 } else { | |
371 die "The result files $ecoli_allele_seq and $ecoli_allele_profile could not be created: $!\n"; | |
372 } | |
373 | |
374 | |
375 ### Print errors in error file, | |
376 if ($error == 1) { # error(s) occurred | |
377 my $err_file = 'errors.txt'; | |
378 open(ERR, ">$err_file") or die "Failed to create file '$err_file': $!\n"; | |
379 print ERR "Error summary:\n"; | |
380 print ERR @summary_err; # filled in sub 'parse_coords' | |
381 print ERR "\nDetailed error output:\n"; | |
382 print ERR @detail_err; | |
383 push (@created_files, "\t\t\t$err_file\t\t-> Error file!\n"); | |
384 close ERR; | |
385 } | |
386 | |
387 | |
388 ### Delete unneeded files, temp file 'ecoli_multi.fasta' (see above) and the *.delta files from the NUCmer run | |
389 foreach my $goners (@delete) { | |
390 unlink $goners or warn "Could not remove file '$goners': $!"; | |
391 } | |
392 | |
393 | |
394 ### Print newly created files | |
395 print "\n###########################################################################\n"; | |
396 print "The following files were created:\n"; | |
397 print "\tCoordinates of MLST alleles in each genome:\n"; | |
398 print @created_files; | |
399 print "\n###########################################################################\n"; | |
400 | |
401 | |
402 ### Align with ClustalW if option '-c' is given | |
403 if ($clustalw) { | |
404 print "\nStarting ClustalW alignment with file $ecoli_allele_seq ..."; | |
405 my $out = $ecoli_allele_seq; | |
406 $out =~ s/\.fasta$//; | |
407 my $clustal_call = "clustalw -infile=$ecoli_allele_seq -outfile=$clustal_aln -align -type=DNA -output=phylip -tree -newtree=$out\_aln.dnd -outputtree=phylip"; | |
408 system ($clustal_call); | |
409 } | |
410 | |
411 exit; | |
412 | |
413 | |
414 ############### | |
415 # Subroutines # | |
416 ############### | |
417 | |
418 ### Subroutine to associate the acc# to the genome description in hash %genome_desc (see above) | |
419 sub acc_desc { | |
420 my $ID = shift; | |
421 chomp $ID; | |
422 if ($ID =~ /^>gi\|\d+\|(emb|gb|dbj|ref)\|(\w+\d+\.\d)\|\s(.+)$/) { # NCBI fasta header | |
423 # e.g.: >gi|387605479|ref|NC_017626.1| Escherichia coli 042, complete genome | |
424 # $1 = DB (embl, genbank, ddbj, refseq), $2 = acc#, $3 = genome description | |
425 my $desc = shorten_desc($3); # subroutine to shorten the genome description | |
426 $genome_desc{$2} = $desc; # associate accession# with genome description | |
427 return ($2, $desc); | |
428 } elsif ($ID =~ /^>(\S+)\s*(\S*)\s*(.*)$/) { # headers after EMBOSS's union of multi-fasta files, and other headers with a unique ID after '>' | |
429 # e.g.: >NC_011748.1 NC_011748.1 Escherichia coli 55989 chromosome, complete genome | |
430 # $1 = acc#, $2 = genome desc for drafts|duplicated acc# for complete genomes, $3 = genome desc for complete genomes | |
431 my $desc = $2 . ' ' . $3; | |
432 if ($1 eq $2) { # complete genomes have acc# double after EMBOSS's union (see above) | |
433 $desc = $3; | |
434 } | |
435 $desc = shorten_desc($desc); | |
436 $genome_desc{$1} = $desc; | |
437 return ($1, $desc); | |
438 } | |
439 return 0; | |
440 } | |
441 | |
442 | |
443 ### Subroutine to parse *.coord nucmer result files for MLST alleles and corresponding accession#s | |
444 sub parse_coord { | |
445 my ($coord_file, $allele) = @_; | |
446 my $lines = 0; # lines of respective *.coords file (without header); control if hit is missing or insensitive hits are present in comparison to $genome_number for the error file | |
447 my $discarded = 0; # number of discarded hits/lines in coord file for error file | |
448 open (COORD, "$coord_file") or die "Failed to open file '$coord_file': $!\n"; | |
449 while (<COORD>) { | |
450 chomp; | |
451 if ( /\d+\t\d+\t\d+\t\d+\t(\d+)\t(\d+)\t(\d+\.\d+)\t(\d+)\t\d+\t(\D+\d+)\t(\w*\d+\.\d)$/ ) { | |
452 # since the ID lines of the fastas were shortened in temp file 'ecoli_multi.fasta', the acc#s should be the last element of each line in the coords file | |
453 # $1 = length of ref alignment, $2 = length of query alignment, $3 = identity percentage, | |
454 # $4 = length of ref seq, $5 = allele-nr. (e.g. ADK15), $6 = accession-nr. | |
455 $lines++; # after 'if' to skip headers | |
456 if ($1 != $4) { # write error to @detail_err for error file | |
457 push (@detail_err, "$genome_desc{$6}, $5: Reference (allele) alignment doesn't have a correct length, therefore allele is not included in output!\n"); | |
458 $discarded++; | |
459 next; | |
460 } elsif ($2 != $4) { | |
461 push (@detail_err, "$genome_desc{$6}, $5: Query (genome) alignment doesn't have a correct length, therefore allele is not included in output!\n"); | |
462 $discarded++; | |
463 next; | |
464 } elsif ($3 != '100.00') { | |
465 push (@detail_err, "$genome_desc{$6}, $5: Identity is not 100\%, therefore is not included in output!\n"); | |
466 $discarded++; | |
467 next; | |
468 } | |
469 $strain_allele{$allele}{$6} = $5; # associate allele# with acc# and store as hash in hash in %strain_allele | |
470 } | |
471 } | |
472 close COORD; | |
473 # error identifications for later print out in error.txt | |
474 if ($discarded == 0) { | |
475 if ($lines < $genome_number) { | |
476 $error = 1; # switch $error from 0 to 1 to indicate error was found | |
477 push (@summary_err, $genome_number - $lines, " $allele allele(s) missing!\n"); | |
478 } | |
479 } elsif ($discarded > 0) { | |
480 $error = 1; | |
481 if (($lines - $discarded) == $genome_number) { | |
482 push (@summary_err, "Total number of specific assigned $allele alleles is correct, but $discarded non-specific hit(s) discarded!\n"); | |
483 } elsif (($lines - $discarded) < $genome_number) { | |
484 push (@summary_err, $genome_number - ($lines - $discarded), " $allele allele(s) missing and $discarded non-specific hit(s)!\n"); | |
485 } | |
486 } | |
487 return 1; | |
488 } | |
489 | |
490 | |
491 ### Shorten the genome descriptions of the ID headers | |
492 sub shorten_desc { | |
493 my $desc = shift; | |
494 $desc =~ s/Escherichia coli/Ecoli/; | |
495 $desc =~ s/\'//g; | |
496 $desc =~ s/( DNA, complete genome| chromosome, complete genome|, complete genome| chromosome, complete sequence| complete genome|, complete sequence|, strain (\S+)|, whole genome shotgun sequence)$//; | |
497 $desc =~ s/\s/_/g; | |
498 return $desc; | |
499 } |