comparison COG/bac-genomics-scripts/ecoli_mlst/ecoli_mlst.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 #######
4 # POD #
5 #######
6
7 =pod
8
9 =head1 NAME
10
11 ecoli_mlst.pl 25-10-2011
12
13 =head1 SYNOPSIS
14
15 C<perl ecoli_mlst.pl -a fas -g fasta>
16
17 =head1 DESCRIPTION
18
19 The script searches for multilocus sequence type (MLST) alleles in
20 I<E. coli> genomes according to Mark Achtman's scheme with seven
21 house-keeping genes (I<adk>, I<fumC>, I<gyrB>, I<icd>, I<mdh>,
22 I<purA>, and I<recA>) [Wirth et al., 2006]. I<NUCmer> from the
23 L<I<MUMmer package>|http://mummer.sourceforge.net/> is used to
24 compare the given allele sequences to bacterial genomes via
25 nucleotide alignments.
26
27 Download the allele files (adk.fas ...) and the sequence type file
28 ('publicSTs.txt') from this website:
29 http://mlst.warwick.ac.uk/mlst/dbs/Ecoli
30
31 To run C<ecoli_mlst.pl> include all I<E. coli> genome files (file
32 extension e.g. 'fasta'), all allele sequence files (file extension
33 'fas') and 'publicSTs.txt' in the current working directory. The
34 allele profiles are parsed from the created *.coord files and written
35 to a result file, plus additional information from the file
36 'publicSTs.txt'. Also, the corresponding allele sequences (obtained
37 from the allele input files) are concatenated for each I<E. coli>
38 genome into a result multi-fasta file. Option B<-c> can be used to
39 initiate an alignment for this multi-fasta file with
40 L<I<ClustalW>|http://www.clustal.org/clustal2/> (standard alignment
41 parameters; has to be in the C<$PATH> or change variable
42 C<$clustal_call>). The alignment fasta output file can be used
43 directly for L<I<RAxML>|http://sco.h-its.org/exelixis/web/software/raxml/index.html>. CAREFUL the Phylip alignment format from
44 I<ClustalW> allows only 10 characters per strain ID.
45
46 C<ecoli_mlst.pl> works with complete and draft genomes. However,
47 several genomes cannot be included in a single input file!
48
49 Obviously, only for those genomes whose allele sequences have been
50 deposited in Achtman's allele database results can be obtained. If an
51 allele is not found in a genome it is marked by a '?' in the result
52 profile file and a place holder 'XXX' in the result fasta file. For
53 these cases a manual I<NUCmer> or I<BLASTN> might be useful to fill the
54 gaps and L<C<run_sub_seq.pl>|/run_sub_seq> to get the corresponding
55 'new' allele sequences.
56
57 Non-NCBI fasta headers for the genome files have to have a unique ID
58 directly following the '>' (e.g. 'Sakai', '55989' ...).
59
60 =head1 OPTIONS
61
62 =head2 Mandatory options
63
64 =over 22
65
66 =item B<-a>=I<str>, B<-alleles>=I<str>
67
68 File extension of the MLST allele fasta files, e.g. 'fas' (<=> B<-g>).
69
70 =item B<-g>=I<str>, B<-genomes>=I<str>
71
72 File extension of the I<E. coli> genome fasta files, e.g. 'fasta' (<=> B<-a>).
73
74 =back
75
76 =head2 Optional options
77
78 =over
79
80 =item B<-h>, B<-help>
81
82 Help (perldoc POD)
83
84 =item B<-c>, B<-clustalw>
85
86 Call L<I<ClustalW>|http://www.clustal.org/clustal2/> for alignment
87
88 =back
89
90 =head1 OUTPUT
91
92 =over 17
93
94 =item F<ecoli_mlst_profile.txt>
95
96 Tab-separated allele profiles for the I<E. coli> genomes, plus additional info from 'publicSTs.txt'
97
98 =item F<ecoli_mlst_seq.fasta>
99
100 Multi-fasta file of all concatenated allele sequences for each genome
101
102 =item F<*.coord>
103
104 Text files that contain the coordinates of the I<NUCmer> hits for each genome and allele
105
106 =item (F<errors.txt>)
107
108 Error file, summarizing number of not found alleles or unclear I<NUCmer> hits
109
110 =item (F<ecoli_mlst_seq_aln.fasta>)
111
112 Optional, L<I<ClustalW>|http://www.clustal.org/clustal2/> alignment in Phylip format
113
114 =item (F<ecoli_mlst_seq_aln.dnd>)
115
116 Optional, I<ClustalW> alignment guide tree
117
118 =back
119
120 =head1 EXAMPLE
121
122 =over
123
124 =item C<perl ecoli_mlst.pl -a fas -g fasta -c>
125
126 =back
127
128 =head1 VERSION
129
130 0.3 update: 30-01-2013
131
132 =head1 AUTHOR
133
134 Andreas Leimbach aleimba[at]gmx[dot]de
135
136 =head1 LICENSE
137
138 This program is free software: you can redistribute it and/or modify
139 it under the terms of the GNU General Public License as published by
140 the Free Software Foundation; either version 3 (GPLv3) of the License,
141 or (at your option) any later version.
142
143 This program is distributed in the hope that it will be useful, but
144 WITHOUT ANY WARRANTY; without even the implied warranty of
145 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
146 General Public License for more details.
147
148 You should have received a copy of the GNU General Public License
149 along with this program. If not, see L<http://www.gnu.org/licenses/>.
150
151 =cut
152
153
154 ########
155 # MAIN #
156 ########
157
158 use strict;
159 use warnings;
160 use Getopt::Long; # module to get options from the command line
161
162
163 ### Get the options with Getopt::Long, works also abbreviated and with two "--": -g, --g, -genomes ...
164 my $usage = "perldoc $0";
165 die system($usage) unless @ARGV;
166 my $allele_ext = ''; # file extension of allele fasta files (<=> $genome_ext)
167 my $genome_ext = ''; # file extension of E. coli genome fasta files (<=> allele_ext)
168 my $clustalw = ''; # optionally, call ClustalW for alignment
169 my $help = ''; # run perldoc on the POD
170 GetOptions ('alleles=s' => \$allele_ext, 'genomes=s' => \$genome_ext, 'clustalw' => \$clustalw, 'help' => \$help);
171
172
173 ### Run perldoc on POD if no arguments or help
174 if (!$genome_ext || !$allele_ext) {
175 die system($usage);
176 } elsif ($help) {
177 die system($usage);
178 }
179
180
181 ### Check if result files already exist, overwrite or exit script after user question
182 my $ecoli_allele_seq = 'ecoli_mlst_seq.fasta'; # multi-fasta result file with concatenated allele seqs for each E. coli genome (sequences are from the MLST allele files)
183 my $clustal_aln = 'ecoli_mlst_seq_aln.fasta'; # fasta alignment file from optional clustalW alignment
184 if (-e $ecoli_allele_seq) {
185 print "A previous analysis exists, overwrite the old result files [y|n]? ";
186 my $stdin = <STDIN>;
187 chomp $stdin;
188 if ($stdin =~ /n/i) {
189 die "Script abborted!\n\n";
190 } elsif ($stdin =~ /y/i) {
191 if (-e $clustal_aln) { # get rid of the optional clustalW alignment fasta-file before program run
192 unlink $clustal_aln;
193 }
194 }
195 }
196
197
198 ### The MLST alleles from the Achtman scheme
199 my @alleles = ('adk', 'fumC', 'gyrB', 'icd', 'mdh', 'purA', 'recA');
200
201
202 ### Read directory and look for corresponding files
203 my $dirname = ".";
204 my @genome_files; # array to save all the fasta genome files
205 my %allele_files; # hash to save all the multi-fasta allele files
206 opendir(DIR, $dirname) or die "Can't opendir $dirname: $!\n";
207 while (defined(my $file = readdir(DIR))) { # go through each file in the directory
208 if ($file eq 'ecoli_mlst_seq.fasta') { # don't use the result file from a previous analysis, the allele multi fasta file 'ecoli_mlst_multi.fasta' (see below)!
209 next;
210 } elsif ($file =~ /^(\S+)\.$genome_ext$/ && (-s $file < 9000000)) { # don't use files > 10 MB, E. coli fasta file shouldn't be too big (normally around 5 MB), otherwise probably wrong fasta file in folder
211 push (@genome_files, $file);
212 }
213 foreach my $allele (@alleles) {
214 if ($file =~ /^$allele\S*\.$allele_ext$/i) {
215 $allele_files{$allele} = $file;
216 }
217 }
218 }
219 closedir(DIR);
220 die "No E. coli genome fasta-files were found!\n" unless scalar @genome_files; # empty array in scalar context returns 0
221 die "No allele fasta-files were found!\n" unless scalar %allele_files; # empty hash in scalar context returns 0
222
223
224 ### Look for multi-fasta genome files and concatenate them to a single-fasta entry (e.g. draft genomes). Subsequently concatenate all E. coli genome sequence files to one multi-fasta/-genome file for the subsequent nucmer run. Additionally parse and associate the E. coli accession#s/unique IDs and descriptions (strain names ...) from the genomes
225 my $genome_file = 'ecoli_multi.fasta'; # multi-genome/-fasta file for the nucmer run, used as a temp file, will be deleted at the end of the script
226 open(GENOMES, ">$genome_file") or die "Failed to create file '$genome_file': $!\n";
227 my $genome_number = 0; # used to control if lines in *.coords files correspond to the overall genome count
228 my %genome_desc; # hash to associate accession#/unique ID with genome descriptions
229 foreach my $file (@genome_files) {
230 $genome_number++;
231 open(MULTI, "<$file") or die "Failed to open E. coli genome file '$file': $!\n";
232 my @multi = <MULTI>; # read the whole genome file (potential multi-fasta file)
233 my @IDs = grep(/^>/, @multi); # get ID lines in fasta file
234 if (scalar @IDs > 1) { # multi-fasta files
235 my $new_id; # new ID line for the multi-fasta file
236 foreach (@IDs) { # discard plasmid ID lines in complete genomes for new file (name with chromosome ID line)
237 if (!/plasmid/) {
238 chomp;
239 $new_id = $_;
240 last; # jump out of the loop if a non-plasmid ID found
241 }
242 }
243 if (!defined($new_id)) {
244 die "The file '$file' only contains plasmids, program exits!\n";
245 }
246 my ($acc, $desc) = acc_desc($new_id); # subroutine to fill hash %genome_desc with acc#/unique ID and genome description
247 print GENOMES ">$acc $desc; artificial genome\n"; # print the now shortened ID-line for the new single-fasta entry in the result multi-genome file
248 foreach (@multi) { # print the rest of the new single-fasta file
249 if (/^>/) { # skip ID lines of the old multi-fasta file
250 next;
251 }
252 print GENOMES;
253 }
254 } elsif (scalar @IDs == 1) { # non-multi-fasta files
255 my ($acc, $desc) = acc_desc($IDs[0]);
256 print GENOMES ">$acc $desc\n"; # print the new shortened ID-line
257 foreach (@multi) {
258 if (/^>/) { # skip ID line, since new one already printed
259 next;
260 }
261 print GENOMES;
262 }
263 } else { # wrong fasta files
264 die "File '$file' does not include a fasta ID line, program exits!\n";
265 }
266 print GENOMES "\n";
267 close MULTI;
268 }
269 close GENOMES;
270 my @delete; # files to be deleted after program run
271 if (-e $genome_file) {
272 push (@delete, $genome_file);
273 } else {
274 die "The multi-fasta E. coli genome file \'$genome_file\' could not be created for the NUCmer run: $!\n";
275 }
276
277
278 ### Parse file 'publicSTs.txt' to get additional info for each sequence type
279 my $st_file = 'publicSTs.txt';
280 open(ST, "<$st_file") or die "Failed to open sequence type file '$st_file': $!\n";
281 my %seq_type; # hash to save ST info to each allele profile
282 my $header = <ST>; # get rid of file header
283 while (<ST>) {
284 chomp;
285 my @st = split (/\t/, $_);
286 my $profile = "$st[3] $st[4] $st[5] $st[6] $st[7] $st[8] $st[9]"; # allele profile
287 my $info = "$st[0]\t$st[1]\t$st[2]\t$st[10]\t$st[11]"; # associated info to allele profile
288 # $st[0] = ST, 1 = ST_COMPLEX, 2 = ANCESTRAL_GROUP, 3-9 = adk-recA alleles, 10 = SOURCE, 11 = REFSTRAIN
289 $seq_type{$profile} = $info;
290 }
291 close ST;
292
293
294 ### Run nucmer with allele sequences as REFERENCES and the created 'ecoli_multi.fasta' file as QUERY
295 my @created_files; # used to print out files that are created at the end of the script
296 my %coord_files;
297 foreach (keys %allele_files) {
298 system ("nucmer --prefix $_-all_ecoli $allele_files{$_} $genome_file"); # system call; seperate args not possible, probably because nucmer not a system command?!
299 system ("show-coords -lT $_-all_ecoli.delta > $_-all_ecoli.coords"); # '-l' include seq length in output, '-T' tab-separated output
300 if (-e "$_-all_ecoli.coords") {
301 $coord_files{$_} = "$_-all_ecoli.coords";
302 push (@delete, "$_-all_ecoli.delta");
303 push (@created_files, "\t\t\t$_-all_ecoli.coords\n");
304 } else {
305 die "Coord file '$_-all_ecoli.coords' could not be created: $!\n";
306 }
307 }
308
309
310 ### Parse *.coords nucmer result files for MLST alleles and corresponding accession#s
311 my @summary_err; # array to store error summary for error file
312 my @detail_err; # array to store the more detailed errors for error file
313 my $error = 0; # switches to '1' if an error is detected
314 my %strain_allele; # declare hash, that's subsequently used as 'hash in hash' in sub parse_coords to associate MLST alleles and accession#s
315 foreach (sort keys %coord_files) {
316 parse_coord($coord_files{$_}, $_); # subroutine to fill %strain_allele
317 }
318
319
320 ### Print the corresponding allele sequences and allele profile (plus additional info) for each E. coli genome
321 open(SEQ, ">$ecoli_allele_seq") or die "Failed to create file '$ecoli_allele_seq': $!\n";
322 my $ecoli_allele_profile = 'ecoli_mlst_profile.txt'; # tab-separated result file for the allele profile of each E. coli genome plus additional info from file 'publicSTs.txt' from the Achtman MLST DB
323 open(PROFILE, ">$ecoli_allele_profile") or die "Failed to create file '$ecoli_allele_profile': $!\n";
324 print PROFILE "Strain"; # header for profile file
325 foreach (sort @alleles) {
326 print PROFILE "\t$_";
327 }
328 print PROFILE "\tST\tST_COMPLEX\tANCESTRAL_GROUP\tSOURCE\tREFSTRAIN\n"; # info from file 'publicSTs.txt'
329 foreach my $acc (sort {lc $genome_desc{$a} cmp lc $genome_desc{$b}} keys %genome_desc) { # call hash %genome_desc by keys (acc#s), but sort by values (genome desc.s) of the hash case-insensitively (all in lowercase, because Perl sorts lowercase after uppercase!)
330 print SEQ ">$genome_desc{$acc} $acc\n"; # print fasta ID line for 'ecoli_mlst_seq.fasta'
331 print PROFILE "$genome_desc{$acc}"; # print genome desc for strain in first column for 'ecoli_mlst_profile.txt'
332 my $profile = ''; # allele profile to extract info from %seq_type
333 foreach my $allele (sort keys %strain_allele) {
334 if (defined $strain_allele{$allele}->{$acc}) {
335 open(ALLELE, "<$allele_files{$allele}") or die "Failed to open allele file $allele_files{$allele}: $!\n";
336 while (my $line = <ALLELE>) { # search for corresponding allele in multi-fasta allele file
337 chomp $line;
338 if ($line =~ /^>$strain_allele{$allele}->{$acc}$/i) {
339 $line = <ALLELE>; # don't need the ID line but the following seq lines
340 while ($line !~ /^>/) { # print sequence in result file until another '>' is found
341 chomp $line;
342 print SEQ "$line";
343 $line = <ALLELE>;
344 }
345 }
346 }
347 close ALLELE;
348 $strain_allele{$allele}->{$acc} =~ s/^(\D+)(\d+)$/$2/; # only keep allele number
349 print PROFILE "\t$strain_allele{$allele}->{$acc}";
350 $profile .= "$strain_allele{$allele}->{$acc} "; # concat allele numbers in $profile to extract info from %seq_type
351 } else {
352 print SEQ "XXX"; # place holder if allele of genome not determined
353 print PROFILE "\t?"; # place holder as well
354 unshift (@detail_err, "$genome_desc{$acc} didn't give a hit with $allele, marked by \'XXX\' in allele sequences and \'?\' in allele profile!\n");
355 next;
356 }
357 }
358 print SEQ "\n\n"; # blank line in front of each ID line
359 chop $profile; # get rid of the last space, so it can be used as the key in %seq_type
360 if (defined $seq_type{$profile}) { # print ST and additional info from file 'publicST.txt' in 'ecoli_mlst_profile.txt'
361 print PROFILE "\t$seq_type{$profile}\n";
362 } else {
363 print PROFILE "\t?\t?\t?\t?\t?\n";
364 }
365 }
366 close SEQ;
367 close PROFILE;
368 if (-e $ecoli_allele_seq && $ecoli_allele_profile) { # push new created files in array for print out
369 push (@created_files, "\n\tResult files:\n\t\t\t$ecoli_allele_seq\t-> Allele sequences!\n", "\t\t\t$ecoli_allele_profile\t-> Allele profile plus info from 'publicSTs.txt'!\n");
370 } else {
371 die "The result files $ecoli_allele_seq and $ecoli_allele_profile could not be created: $!\n";
372 }
373
374
375 ### Print errors in error file,
376 if ($error == 1) { # error(s) occurred
377 my $err_file = 'errors.txt';
378 open(ERR, ">$err_file") or die "Failed to create file '$err_file': $!\n";
379 print ERR "Error summary:\n";
380 print ERR @summary_err; # filled in sub 'parse_coords'
381 print ERR "\nDetailed error output:\n";
382 print ERR @detail_err;
383 push (@created_files, "\t\t\t$err_file\t\t-> Error file!\n");
384 close ERR;
385 }
386
387
388 ### Delete unneeded files, temp file 'ecoli_multi.fasta' (see above) and the *.delta files from the NUCmer run
389 foreach my $goners (@delete) {
390 unlink $goners or warn "Could not remove file '$goners': $!";
391 }
392
393
394 ### Print newly created files
395 print "\n###########################################################################\n";
396 print "The following files were created:\n";
397 print "\tCoordinates of MLST alleles in each genome:\n";
398 print @created_files;
399 print "\n###########################################################################\n";
400
401
402 ### Align with ClustalW if option '-c' is given
403 if ($clustalw) {
404 print "\nStarting ClustalW alignment with file $ecoli_allele_seq ...";
405 my $out = $ecoli_allele_seq;
406 $out =~ s/\.fasta$//;
407 my $clustal_call = "clustalw -infile=$ecoli_allele_seq -outfile=$clustal_aln -align -type=DNA -output=phylip -tree -newtree=$out\_aln.dnd -outputtree=phylip";
408 system ($clustal_call);
409 }
410
411 exit;
412
413
414 ###############
415 # Subroutines #
416 ###############
417
418 ### Subroutine to associate the acc# to the genome description in hash %genome_desc (see above)
419 sub acc_desc {
420 my $ID = shift;
421 chomp $ID;
422 if ($ID =~ /^>gi\|\d+\|(emb|gb|dbj|ref)\|(\w+\d+\.\d)\|\s(.+)$/) { # NCBI fasta header
423 # e.g.: >gi|387605479|ref|NC_017626.1| Escherichia coli 042, complete genome
424 # $1 = DB (embl, genbank, ddbj, refseq), $2 = acc#, $3 = genome description
425 my $desc = shorten_desc($3); # subroutine to shorten the genome description
426 $genome_desc{$2} = $desc; # associate accession# with genome description
427 return ($2, $desc);
428 } elsif ($ID =~ /^>(\S+)\s*(\S*)\s*(.*)$/) { # headers after EMBOSS's union of multi-fasta files, and other headers with a unique ID after '>'
429 # e.g.: >NC_011748.1 NC_011748.1 Escherichia coli 55989 chromosome, complete genome
430 # $1 = acc#, $2 = genome desc for drafts|duplicated acc# for complete genomes, $3 = genome desc for complete genomes
431 my $desc = $2 . ' ' . $3;
432 if ($1 eq $2) { # complete genomes have acc# double after EMBOSS's union (see above)
433 $desc = $3;
434 }
435 $desc = shorten_desc($desc);
436 $genome_desc{$1} = $desc;
437 return ($1, $desc);
438 }
439 return 0;
440 }
441
442
443 ### Subroutine to parse *.coord nucmer result files for MLST alleles and corresponding accession#s
444 sub parse_coord {
445 my ($coord_file, $allele) = @_;
446 my $lines = 0; # lines of respective *.coords file (without header); control if hit is missing or insensitive hits are present in comparison to $genome_number for the error file
447 my $discarded = 0; # number of discarded hits/lines in coord file for error file
448 open (COORD, "$coord_file") or die "Failed to open file '$coord_file': $!\n";
449 while (<COORD>) {
450 chomp;
451 if ( /\d+\t\d+\t\d+\t\d+\t(\d+)\t(\d+)\t(\d+\.\d+)\t(\d+)\t\d+\t(\D+\d+)\t(\w*\d+\.\d)$/ ) {
452 # since the ID lines of the fastas were shortened in temp file 'ecoli_multi.fasta', the acc#s should be the last element of each line in the coords file
453 # $1 = length of ref alignment, $2 = length of query alignment, $3 = identity percentage,
454 # $4 = length of ref seq, $5 = allele-nr. (e.g. ADK15), $6 = accession-nr.
455 $lines++; # after 'if' to skip headers
456 if ($1 != $4) { # write error to @detail_err for error file
457 push (@detail_err, "$genome_desc{$6}, $5: Reference (allele) alignment doesn't have a correct length, therefore allele is not included in output!\n");
458 $discarded++;
459 next;
460 } elsif ($2 != $4) {
461 push (@detail_err, "$genome_desc{$6}, $5: Query (genome) alignment doesn't have a correct length, therefore allele is not included in output!\n");
462 $discarded++;
463 next;
464 } elsif ($3 != '100.00') {
465 push (@detail_err, "$genome_desc{$6}, $5: Identity is not 100\%, therefore is not included in output!\n");
466 $discarded++;
467 next;
468 }
469 $strain_allele{$allele}{$6} = $5; # associate allele# with acc# and store as hash in hash in %strain_allele
470 }
471 }
472 close COORD;
473 # error identifications for later print out in error.txt
474 if ($discarded == 0) {
475 if ($lines < $genome_number) {
476 $error = 1; # switch $error from 0 to 1 to indicate error was found
477 push (@summary_err, $genome_number - $lines, " $allele allele(s) missing!\n");
478 }
479 } elsif ($discarded > 0) {
480 $error = 1;
481 if (($lines - $discarded) == $genome_number) {
482 push (@summary_err, "Total number of specific assigned $allele alleles is correct, but $discarded non-specific hit(s) discarded!\n");
483 } elsif (($lines - $discarded) < $genome_number) {
484 push (@summary_err, $genome_number - ($lines - $discarded), " $allele allele(s) missing and $discarded non-specific hit(s)!\n");
485 }
486 }
487 return 1;
488 }
489
490
491 ### Shorten the genome descriptions of the ID headers
492 sub shorten_desc {
493 my $desc = shift;
494 $desc =~ s/Escherichia coli/Ecoli/;
495 $desc =~ s/\'//g;
496 $desc =~ s/( DNA, complete genome| chromosome, complete genome|, complete genome| chromosome, complete sequence| complete genome|, complete sequence|, strain (\S+)|, whole genome shotgun sequence)$//;
497 $desc =~ s/\s/_/g;
498 return $desc;
499 }