3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<po2anno.pl> - create an annotation comparison matrix from Proteinortho5 output
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl po2anno.pl -i matrix.proteinortho -d genome_fasta_dir/ -l -a E<gt> annotation_comparison.tsv>
|
|
16
|
|
17 =head1 DESCRIPTION
|
|
18
|
|
19 Supplement an ortholog/paralog output matrix from a
|
|
20 L<B<Proteinortho5>|http://www.bioinf.uni-leipzig.de/Software/proteinortho/>
|
|
21 calculation with annotation information. The resulting tab-separated
|
|
22 annotation comparison matrix (ACM) is mainly intended for the
|
|
23 transfer of high quality annotations from reference genomes to
|
|
24 homologs (orthologs and co-orthologs/paralogs) in a query genome
|
|
25 (e.g. in conjunction with L<C<tbl2tab.pl>|/tbl2tab>). But of course
|
|
26 it can also be used to have a quick glance at the annotation of
|
|
27 genes present only in a couple of input genomes in comparison to the
|
|
28 others.
|
|
29
|
|
30 Annotation is retrieved from multi-FASTA files created with
|
|
31 L<C<cds_extractor.pl>|/cds_extractor>. See
|
|
32 L<C<cds_extractor.pl>|/cds_extractor> for a description of the
|
|
33 format. These files are used as input for the PO analysis and option
|
|
34 B<-d> for C<po2anno.pl>.
|
|
35
|
|
36 B<Proteinortho5> (PO) has to be run with option B<-singles> to include
|
|
37 also genes without orthologs, so-called singletons/ORFans, for each
|
|
38 genome in the PO matrix (see the
|
|
39 L<PO manual|http://www.bioinf.uni-leipzig.de/Software/proteinortho/manual.html>).
|
|
40 Additionally, option B<-selfblast> is recommended to enhance paralog
|
|
41 detection by PO.
|
|
42
|
|
43 Each orthologous group (OG) is listed in a row of the resulting ACM,
|
|
44 the first column holds the OG numbers from the PO input matrix (i.e.
|
|
45 line number minus one). The following columns specify the
|
|
46 orthologous CDS for each input genome. For each CDS the ID,
|
|
47 optionally the length in bp (option B<-l>), gene, EC number(s), and
|
|
48 product are shown depending on their presence in the CDS's
|
|
49 annotation. The ID is in most cases the locus tag (see
|
|
50 L<C<cds_extractor.pl>|/cds_extractor>). If several EC numbers exist
|
|
51 for a single CDS they're separated by ';'. If an OG includes
|
|
52 paralogs, i.e. co-orthologs from a single genome, these will be
|
|
53 printed in the following row(s) B<without> a new OG number in the
|
|
54 first column. The order of paralogous CDSs within an OG is
|
|
55 arbitrarily.
|
|
56
|
|
57 The OGs are sorted numerically via the query ID (see option B<-q>).
|
|
58 If option B<-a> is set, the non-query OGs are appended to the output
|
|
59 after the query OGs, sorted numerically via OG number.
|
|
60
|
|
61 =head1 OPTIONS
|
|
62
|
|
63 =head2 Mandatory options
|
|
64
|
|
65 =over 20
|
|
66
|
|
67 =item B<-i>=I<str>, B<-input>=I<str>
|
|
68
|
|
69 Proteinortho (PO) result matrix (*.proteinortho or *.poff), or piped C<STDIN> (-)
|
|
70
|
|
71 =item B<-d>=I<str>, B<-dir_genome>=I<str>
|
|
72
|
|
73 Path to the directory including the genome multi-FASTA PO input
|
|
74 files (*.faa or *.ffn), created with L<C<cds_extractor.pl>|/cds_extractor>
|
|
75
|
|
76 =back
|
|
77
|
|
78 =head2 Optional options
|
|
79
|
|
80 =over 20
|
|
81
|
|
82 =item B<-h>, B<-help>
|
|
83
|
|
84 Help (perldoc POD)
|
|
85
|
|
86 =item B<-q>=I<str>, B<-query>=I<str>
|
|
87
|
|
88 Query genome (has to be identical to the string in the PO matrix)
|
|
89 [default = first one in alphabetical order]
|
|
90
|
|
91 =item B<-l>, B<-length>
|
|
92
|
|
93 Include length of each CDS in bp
|
|
94
|
|
95 =item B<-a>, B<-all>
|
|
96
|
|
97 Append non-query orthologous groups (OGs) to the output
|
|
98
|
|
99 =item B<-v>, B<-version>
|
|
100
|
|
101 Print version number to C<STDERR>
|
|
102
|
|
103 =back
|
|
104
|
|
105 =head1 OUTPUT
|
|
106
|
|
107 =over 20
|
|
108
|
|
109 =item C<STDOUT>
|
|
110
|
|
111 The resulting tab-delimited ACM is printed to C<STDOUT>. Redirect or
|
|
112 pipe into another tool as needed (e.g. C<cut>, C<grep>, C<head>, or
|
|
113 C<tail>).
|
|
114
|
|
115 =back
|
|
116
|
|
117 =head1 EXAMPLES
|
|
118
|
|
119 =head2 L<C<cds_extractor.pl>|/cds_extractor>
|
|
120
|
|
121 =over
|
|
122
|
|
123 =item C<for i in *.[gbk|embl]; do perl cds_extractor.pl -i $i [-p|-n]; done>
|
|
124
|
|
125 =back
|
|
126
|
|
127 =head2 L<B<Proteinortho5>|http://www.bioinf.uni-leipzig.de/Software/proteinortho/>
|
|
128
|
|
129 =over
|
|
130
|
|
131 =item C<proteinortho5.pl -graph [-synteny] -cpus=# -selfblast -singles -identity=50 -cov=50 -blastParameters='-use_sw_tback [-seg no|-dust no]' *.[faa|ffn]>
|
|
132
|
|
133 =back
|
|
134
|
|
135 =head2 C<po2anno.pl>
|
|
136
|
|
137 =over
|
|
138
|
|
139 =item C<perl po2anno.pl -i matrix.[proteinortho|poff] -d genome_fasta_dir/ -q query.[faa|ffn] -l -a E<gt> annotation_comparison.tsv>
|
|
140
|
|
141 =back
|
|
142
|
|
143 =head1 VERSION
|
|
144
|
|
145 0.2.2 update: 23-10-2015
|
|
146 0.1 18-12-2014
|
|
147
|
|
148 =head1 AUTHOR
|
|
149
|
|
150 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
151
|
|
152 =head1 LICENSE
|
|
153
|
|
154 This program is free software: you can redistribute it and/or modify
|
|
155 it under the terms of the GNU General Public License as published by
|
|
156 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
157 License, or (at your option) any later version.
|
|
158
|
|
159 This program is distributed in the hope that it will be useful, but
|
|
160 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
161 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
162 General Public License for more details.
|
|
163
|
|
164 You should have received a copy of the GNU General Public License
|
|
165 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
166
|
|
167 =cut
|
|
168
|
|
169
|
|
170 ########
|
|
171 # MAIN #
|
|
172 ########
|
|
173
|
|
174 use strict;
|
|
175 use warnings;
|
|
176 use autodie;
|
|
177 use Getopt::Long;
|
|
178 use Pod::Usage;
|
|
179
|
|
180 my $Cmdline = "$0 @ARGV"; # used call command
|
|
181
|
|
182 ### Get the options with Getopt::Long
|
|
183 my $PO_Matrix_File; # PO result matrix (*.proteinortho or *.poff)
|
|
184 my $Genome_Dir; # directory with genome multi-FASTAs (PO input)
|
|
185 my $Query; # query genome (first column in output)
|
|
186 my $Opt_Length; # include CDS nucleotide lengths in output
|
|
187 my $Opt_All_OGs; # print also all non-query OGs to output
|
|
188 my $VERSION = '0.2.2';
|
|
189 my ($Opt_Version, $Opt_Help);
|
|
190 GetOptions ('input=s' => \$PO_Matrix_File,
|
|
191 'dir_genome=s' => \$Genome_Dir,
|
|
192 'query=s' => \$Query,
|
|
193 'length' => \$Opt_Length,
|
|
194 'all' => \$Opt_All_OGs,
|
|
195 'version' => \$Opt_Version,
|
|
196 'help|?' => \$Opt_Help);
|
|
197
|
|
198
|
|
199
|
|
200 ### Run perldoc on POD
|
|
201 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
202 die "$0 $VERSION\n" if ($Opt_Version);
|
|
203
|
|
204 if (!$PO_Matrix_File || !$Genome_Dir) {
|
|
205 my $warning = "\n### Fatal error: Mandatory options '-i' or '-d' or their arguments are missing!\n";
|
|
206 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
|
|
207 }
|
|
208 die "\n### Fatal error: Proteinortho matrix file '$PO_Matrix_File' does not exist: $!\n" if (!-e $PO_Matrix_File);
|
|
209 $Genome_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Genome_Dir path
|
|
210 die "\n### Fatal error: Genome directory '$Genome_Dir' does not exist: $!\n" if (!-d $Genome_Dir);
|
|
211
|
|
212 print STDERR "Script call command: $Cmdline\n"; # print call command after '-h|-v'
|
|
213
|
|
214
|
|
215
|
|
216 ### Pipe input from STDIN or open input file
|
|
217 my $PO_Matrix_Fh;
|
|
218 if ($PO_Matrix_File eq '-') { # file input via STDIN
|
|
219 $PO_Matrix_Fh = *STDIN; # capture typeglob of STDIN
|
|
220 } else { # input via input file
|
|
221 open ($PO_Matrix_Fh, "<", "$PO_Matrix_File");
|
|
222 }
|
|
223
|
|
224
|
|
225
|
|
226 ### Parse OGs in input PO matrix
|
|
227 print STDERR "Parsing Proteinortho input matrix '$PO_Matrix_File' ...\n"; # run status of script
|
|
228 my @Genome_Files; # store genome filename column order
|
|
229 my %Ortho_Groups; # hash of hash with OG# as key, internal hash with genome filename as key and IDs (e.g. locus tags) in anonymous array
|
|
230 while (<$PO_Matrix_Fh>) {
|
|
231 chomp;
|
|
232
|
|
233 # check PO input file header and get genome file names
|
|
234 if ($. == 1) { # header of PO matrix file (first line)
|
|
235 die "\n### Fatal error:\nProteinortho input matrix '$PO_Matrix_File' does not have the mandatory header line, which starts with the first three tab-separated mandatory columns:\n# Species\tGenes\tAlg.-Conn.\n" if (!/^# Species\tGenes\tAlg\.-Conn\./);
|
|
236
|
|
237 # get input genomes from PO matrix and check $Query existence
|
|
238 my $query_present = 0 if ($Query);
|
|
239 foreach (split(/\t/, $_)) {
|
|
240 next if (/# Species|Genes|Alg\.-Conn\./); # non-genome header columns
|
|
241 $query_present = 1 if ($Query && $Query eq $_);
|
|
242 push (@Genome_Files, $_); # store genome in array
|
|
243 }
|
|
244 die "\n### Fatal error:\nGiven query '$Query' not found in the header of the input Proteinortho matrix '$PO_Matrix_File'! Make sure the string corresponds exactly (also lower/uppercase) to the header of the matrix and the input file in '$Genome_Dir'!\n" if ($Query && !$query_present);
|
|
245 next; # skip header line of PO matrix for subsequent code
|
|
246 }
|
|
247
|
|
248 # parse PO ortholog matrix
|
|
249 my ($organisms, $genes, $alg_conn, @og) = split(/\t/); # $alg_conn not used
|
|
250 my $organism_count = 0; # count number of organisms in OG to compare to '$organisms' (should be ok from PO)
|
|
251 my $gene_count = 0; # count number of genes in OG to compare to '$genes' (should be ok from PO)
|
|
252 my $column = -1; # column counter to associate column to correct genome file in OG (@Genome_Files array zero-based, thus start with '-1')
|
|
253 foreach (@og) {
|
|
254 $column++;
|
|
255 next if (/^\*$/); # skip empty columns in PO matrix (indicated by '*')
|
|
256 $organism_count++;
|
|
257 my @paralogs = split(',');
|
|
258 foreach (sort {$a cmp $b} @paralogs) {
|
|
259 $gene_count++;
|
|
260 push(@{ $Ortho_Groups{$.-1}->{$Genome_Files[$column]} }, $_); # push into anonymous array in hash of hash ($.-1 = OG number, because of header)
|
|
261 }
|
|
262 }
|
|
263 die "\n### Fatal error:\nThe indicated number of species ($organisms) does not fit to the counted number of organisms ($organism_count) in the orthologous group of line '$.' of the Proteinortho input matrix '$PO_Matrix_File'!\n" if ($organism_count != $organisms); # to check PO matrix
|
|
264 die "\n### Fatal error:\nThe indicated number of genes ($genes) does not fit to the counted number of orthologs/paralogs ($gene_count) in the orthologous group of line '$.' of the Proteinortho input matrix '$PO_Matrix_File'!\n" if ($gene_count != $genes); # to check PO matrix
|
|
265 }
|
|
266 close $PO_Matrix_Fh;
|
|
267
|
|
268
|
|
269
|
|
270 ### Sort @Genome_Files and set $Query as first element (not possible above because has to follow $column!)
|
|
271 @Genome_Files = sort {lc($a) cmp lc($b)} @Genome_Files; # sort regardless of upper/lowercase
|
|
272 if ($Query) {
|
|
273 @Genome_Files = grep ($_ ne $Query, @Genome_Files); # remove $Query from @Genome_Files
|
|
274 unshift (@Genome_Files, $Query); # set as first element
|
|
275 } elsif (!$Query) {
|
|
276 $Query = $Genome_Files[0]; # $Query first element in sorted array, when not given with option '-q'
|
|
277 }
|
|
278
|
|
279
|
|
280
|
|
281 ### Parse annotations in genome multi-FASTAs
|
|
282 print STDERR "Parsing annotation from multi-FASTA CDS genome files in directory '$Genome_Dir' ...\n";
|
|
283 my %Annotation; # hash of hash to store the annotation of the genome files
|
|
284 my %Anno_Features; # hash of hash to store which annotation features are present overall in each individual genome multi-FASTA (optional are 'gene/g=', 'product/p=' and 'EC_number/ec=' tags)
|
|
285 foreach my $genome (@Genome_Files) {
|
|
286 my $genome_file_path = "$Genome_Dir/$genome";
|
|
287 check_file_exist($genome_file_path); # subroutine
|
|
288 open (my $genome_fh, "<", "$genome_file_path");
|
|
289
|
|
290 # parse anno in current genome file
|
|
291 while (my $anno_line = <$genome_fh>) {
|
|
292 chomp $anno_line;
|
|
293 die "\n### Fatal error:\n'$genome_file_path' is not a FASTA input file. First line of the file should be a FASTA ID/header line and start with a '>':\n$anno_line\n" if ($anno_line !~ /^>/ && $. == 1);
|
|
294 next if ($anno_line !~ /^>/); # skip non-ID FASTA lines
|
|
295
|
|
296 my ($id, $gene, $product, $length, $organism, $ec); # $organism not used
|
|
297 foreach (split(/\s/, $anno_line)) {
|
|
298 if (/^>(.+)$/) {
|
|
299 $id = $1;
|
|
300 } elsif (/g=(.*)$/) {
|
|
301 $gene = $1;
|
|
302 } elsif (/p=(.*)$/) {
|
|
303 warn "\n### Warning:\nNo product annotation (p=) in file '$genome_file_path' on line:\n$anno_line\nProceeding ...\n" if (!$1);
|
|
304 $product = $1;
|
|
305 $product =~ s/\_/ /g;
|
|
306 } elsif (/l=(.*)$/) {
|
|
307 my ($start, $stop) = split(/\.\./, $1);
|
|
308 $length = abs($stop - $start) + 1;
|
|
309 } elsif (/o=(.*)$/) { # $organism not used
|
|
310 $organism = $1;
|
|
311 $organism =~ s/\_/ /g;
|
|
312 } elsif (/ec=(.*)$/) {
|
|
313 $ec = $1;
|
|
314 } else {
|
|
315 die "\n### Fatal error:\nAnnotation in file '$genome_file_path' is not recognized on the following line. Please use 'cds_extractor.pl' to create your multi-FASTA protein genome files.\n$anno_line\n";
|
|
316 }
|
|
317 }
|
|
318 die "\n### Fatal error:\nThe following FASTA ID of file '$genome_file_path' is not unique but has to be considering ALL input genome files (actually Proteinortho should have complained already). Please modify all repetitive occurences.\n$id\n" if ($Annotation{$id});
|
|
319
|
|
320 # fill annotation hash of hash
|
|
321 $Annotation{$id} = {'genome' => $genome,
|
|
322 'gene' => $gene,
|
|
323 'product' => $product,
|
|
324 'length' => $length,
|
|
325 'ec' => $ec};
|
|
326
|
|
327 # store which 'optional' annotation features are present overall in a genome
|
|
328 # (used to have lots of greps on %Annotation for this purpose below, but lets save performance)
|
|
329 $Anno_Features{$genome}{'gene'} = 1 if ($gene && !$Anno_Features{$genome}->{'gene'});
|
|
330 $Anno_Features{$genome}{'product'} = 1 if ($product && !$Anno_Features{$genome}->{'product'});
|
|
331 $Anno_Features{$genome}{'ec'} = 1 if ($ec && !$Anno_Features{$genome}->{'ec'});
|
|
332 }
|
|
333 close $genome_fh;
|
|
334 }
|
|
335
|
|
336
|
|
337
|
|
338 ### Check if CDS counts in multi-FASTA genome files and PO matrix are equal and print header of output
|
|
339 print STDERR "Comparing annotation CDS counts to orthologous group CDS counts and printing header for output matrix ...\n";
|
|
340 my $Query_CDS_Count; # store number of query CDS for stat report at end
|
|
341 print "# OG"; # first column of header
|
|
342 foreach my $genome (@Genome_Files) {
|
|
343 my $ortho_cds_count = map ($Ortho_Groups{$_}->{$genome} ? @{ $Ortho_Groups{$_}->{$genome} } : (), keys %Ortho_Groups); # de-reference anonymous array in hash of hash
|
|
344 # map evaluates BLOCK or EXPR in list context and returns the LIST value composed of the results of each such evaluation (each element of LIST may produce zero, one, or more elements in the returned value <=> grep returns only one value). In scalar context, returns the total number of elements so generated.
|
|
345
|
|
346 # the map line above is a fancy way of writing
|
|
347 #foreach (keys %Ortho_Groups) {
|
|
348 #$ortho_cds_count += @{ $Ortho_Groups{$_}->{$genome} } if ($Ortho_Groups{$_}->{$genome});
|
|
349 #}
|
|
350
|
|
351 my $anno_cds_count = grep ($Annotation{$_}->{'genome'} eq $genome, keys %Annotation);
|
|
352 $Query_CDS_Count = $anno_cds_count if ($Query eq $genome);
|
|
353 die "\n### Fatal error:\nThere are more CDSs in file '$genome' than for this genome in the Proteinortho matrix '$PO_Matrix_File', but counts have to be equal. Please run Proteinortho5 with option '-singles' to include also genes without orthologs, so-called singletons/ORFans (recommended is also option '-selfblast' to enhance paralog detection).\n" if ($ortho_cds_count < $anno_cds_count);
|
|
354 die "\n### Fatal error:\nThere are less CDSs in file '$genome' than for this genome in the Proteinortho matrix '$PO_Matrix_File', but counts have to be equal. Please check if the files are correct.\n" if ($ortho_cds_count > $anno_cds_count); # overlaps with ID-specific check in sub 'print_matrix'
|
|
355
|
|
356 # print header fields for output
|
|
357 print "\t$genome";
|
|
358 print "\tlength [bp]" if ($Opt_Length);
|
|
359 print "\tgene" if ($Anno_Features{$genome}->{'gene'});
|
|
360 print "\tec" if ($Anno_Features{$genome}->{'ec'});
|
|
361 print "\tproduct" if ($Anno_Features{$genome}->{'product'}); # very unlikely that there's a whole genome without product annotation ...
|
|
362 }
|
|
363 print "\n";
|
|
364
|
|
365
|
|
366
|
|
367 ### Print annotation comparison matrix
|
|
368 print STDERR "Printing output annotation comparison matrix for query OGs ...\n";
|
|
369 my %Query_ID_Seen; # query IDs already processed for output (because of paralogous CDSs in the same OGs)
|
|
370 my %Query_OGs; # OGs containing query CDSs, to skip for non-query OGs (option '-a') below
|
|
371 my $Query_Specific_OGs = 0; # count number of OGs including only query CDSs (not including CDSs of the other genomes)
|
|
372 my $Query_Singletons = 0; # count query singletons/ORFans (different to '$Query_Singleton_OGs' because of paralogous CDSs)
|
|
373 foreach my $id (sort { $a cmp $b } grep ($Annotation{$_}->{'genome'} eq $Query, keys %Annotation)) { # get IDs only for the query genome, sorted
|
|
374 next if ($Query_ID_Seen{$id});
|
|
375
|
|
376 # get OG for current query $id (I'm sure this can also be resolved by a fancy map/grep construction, but I didn't get it to work)
|
|
377 my $id_og; # OG containing the current query ID
|
|
378 foreach my $og (keys %Ortho_Groups) {
|
|
379 foreach (@{ $Ortho_Groups{$og}->{$Query} }) {
|
|
380 if ($_ eq $id) {
|
|
381 $id_og = $og;
|
|
382 last;
|
|
383 }
|
|
384 }
|
|
385 last if ($id_og);
|
|
386 }
|
|
387 $Query_OGs{$id_og} = 1;
|
|
388
|
|
389 print_matrix($id_og, \%Query_ID_Seen); # subroutine to print the annotation comparison matrix for the current OG to STDOUT
|
|
390 }
|
|
391
|
|
392
|
|
393
|
|
394 ### Optionally, print also non-query OGs/singletons
|
|
395 if ($Opt_All_OGs) {
|
|
396 print STDERR "Printing output annotation comparison matrix for non-query OGs (option '-all') ...\n";
|
|
397 foreach my $og (sort {$a <=> $b} keys %Ortho_Groups) { # sort from smallest to largest OG
|
|
398 next if ($Query_OGs{$og});
|
|
399 print_matrix($og); # subroutine
|
|
400 }
|
|
401 }
|
|
402
|
|
403
|
|
404
|
|
405 ### Print some final stats
|
|
406 print STDERR "\nFinished creating Proteinortho annotation comparison matrix with query genome '$Query':\n";
|
|
407 print STDERR "Total genomes: ", scalar @Genome_Files, "\n";
|
|
408 print STDERR "Total CDSs: ", scalar keys %Annotation, "\n";
|
|
409 print STDERR "Total query CDSs: $Query_CDS_Count\n";
|
|
410 print STDERR "Total OGs: ", scalar keys %Ortho_Groups, "\n";
|
|
411 print STDERR "OGs including query CDSs: ", scalar keys %Query_OGs, "\n"; # 'scalar grep($Ortho_Groups{$_}->{$Query} , keys %Ortho_Groups)' should actually be the same, but somehow doesn't work ... (what about the fancy map in the CDS count check above?)
|
|
412 print STDERR "Query-specific OGs (not including CDSs of the other genomes): $Query_Specific_OGs\n";
|
|
413 print STDERR "Total query singletons/ORFans: $Query_Singletons\n";
|
|
414
|
|
415
|
|
416 exit;
|
|
417
|
|
418
|
|
419 #############
|
|
420 #Subroutines#
|
|
421 #############
|
|
422
|
|
423 ### Subroutine to test for file existence
|
|
424 sub check_file_exist {
|
|
425 my $file = shift;
|
|
426 die "\n### Fatal error:\nGenome file '$file' is listed in the Proteinortho matrix, but does not exist in directory '$Genome_Dir': $!\n" if (!-e $file);
|
|
427 return 1;
|
|
428 }
|
|
429
|
|
430
|
|
431
|
|
432 ### Subroutine to print resulting annotation comparison matrix to STDOUT
|
|
433 sub print_matrix {
|
|
434 my ($og, $query_id_seen) = @_; # $query_id_seen hash-ref to %Query_ID_Seen ($query_id_seen not given/defined if called above for non-query OGs, option '-a')
|
|
435
|
|
436 # get max count of paralogs/co-orthologs over all genomes in current OG for row printing below
|
|
437 my $max_paralog_count = 0;
|
|
438 foreach my $genome (keys %{$Ortho_Groups{$og}}) {
|
|
439 my $paralog_count = 0;
|
|
440 foreach (@{ $Ortho_Groups{$og}->{$genome} }) { # only needed for genomes in the CURRENT $og <=> ALL genomes needed below
|
|
441 $query_id_seen->{$_} = 1 if ($query_id_seen && $genome eq $Query); # store query IDs that have been processed
|
|
442 $paralog_count++;
|
|
443 }
|
|
444 if ($genome eq $Query && keys %{$Ortho_Groups{$og}} == 1) { # query-specific OGs (not including CDSs of the other genomes)
|
|
445 $Query_Specific_OGs++;
|
|
446 $Query_Singletons += $paralog_count;
|
|
447 }
|
|
448 $max_paralog_count = $paralog_count if ($paralog_count > $max_paralog_count);
|
|
449 }
|
|
450
|
|
451 # print output matrix for current OG
|
|
452 print "$og";
|
|
453 for (my $i = 0; $i < $max_paralog_count; $i++) { # '<' because array zero-based
|
|
454 print "\t"; # tab after OG-nr
|
|
455
|
|
456 foreach my $genome (@Genome_Files) { # for ALL genomes
|
|
457
|
|
458 # CDS(s) of genome present in current $og and within $max_paralog_count
|
|
459 if ($Ortho_Groups{$og}->{$genome}->[$i]) {
|
|
460 die "\n### Fatal error:\nID '$Ortho_Groups{$og}->{$genome}->[$i]' present in the Proteinortho matrix '$PO_Matrix_File' in OG '$og' but not present in the corresponding genome '$genome' multi-FASTA file. However, all IDs in the matrix and the genome files in directory '$Genome_Dir' need to be correspondent to each other. Make sure you chose the correct directory with the input genome files for the current Proteinortho matrix!\n" unless ($Annotation{$Ortho_Groups{$og}->{$genome}->[$i]}); # overlaps with check during the CDS count comparison of FASTA files and PO matrix (but here not all IDs are controlled without option '-a')
|
|
461
|
|
462 print "$Ortho_Groups{$og}->{$genome}->[$i]\t"; # print CDS ID
|
|
463 print $Annotation{$Ortho_Groups{$og}->{$genome}->[$i]}->{'length'}, "\t" if ($Opt_Length);
|
|
464 if ($Annotation{$Ortho_Groups{$og}->{$genome}->[$i]}->{'gene'}) {
|
|
465 print $Annotation{$Ortho_Groups{$og}->{$genome}->[$i]}->{'gene'}, "\t";
|
|
466 } elsif ($Anno_Features{$genome}->{'gene'}) { # tab only if this genome has a 'g=/gene' annotation at all (stored in hash of hash %Anno_Features)
|
|
467 print "\t";
|
|
468 }
|
|
469 if ($Annotation{$Ortho_Groups{$og}->{$genome}->[$i]}->{'ec'}) {
|
|
470 print $Annotation{$Ortho_Groups{$og}->{$genome}->[$i]}->{'ec'}, "\t";
|
|
471 } elsif ($Anno_Features{$genome}->{'ec'}) {
|
|
472 print "\t";
|
|
473 }
|
|
474 if ($Annotation{$Ortho_Groups{$og}->{$genome}->[$i]}->{'product'}) {
|
|
475 print $Annotation{$Ortho_Groups{$og}->{$genome}->[$i]}->{'product'}, "\t";
|
|
476 } elsif ($Anno_Features{$genome}->{'product'}) {
|
|
477 warn "\n### Warning:\nNo product annotation (p=) in file '$genome' for ID:\n$Ortho_Groups{$og}->{$genome}->[$i]\nProceeding ...\n"; # overlaps with 'warn' for missing product annotation in parsing the annotations of the genomes
|
|
478 print "\t";
|
|
479 }
|
|
480
|
|
481 } else { # genome without a CDS in current $og or less paralogous CDSs than $max_paralog_count
|
|
482 my $times = 5;
|
|
483 $times -= 1 if (!$Opt_Length);
|
|
484 $times -= 1 if (!$Anno_Features{$genome}->{'gene'}); # genome doesn't have a 'g=/gene' annotation at all
|
|
485 $times -= 1 if (!$Anno_Features{$genome}->{'ec'});
|
|
486 $times -= 1 if (!$Anno_Features{$genome}->{'product'}); # very unlikely that there's a whole genome without product annotation ...
|
|
487 print "\t" x $times;
|
|
488 }
|
|
489 }
|
|
490 print "\n";
|
|
491 }
|
|
492
|
|
493 return 1;
|
|
494 }
|
|
495
|