3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 blast_rod_finder.pl 07-11-2011
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl blast_rod_finder.pl -q query.embl -r blastn.out -m 2000>
|
|
16
|
|
17 =head1 DESCRIPTION
|
|
18
|
|
19 This script is intended to identify region of differences (RODs)
|
|
20 between a nucleotide query and a nucleotide subject/reference sequence.
|
|
21 In order to do so, a BLASTN (L<http://blast.ncbi.nlm.nih.gov/Blast.cgi>)
|
|
22 needs to be performed beforehand with the query and the subject sequences.
|
|
23 I<blast_rod_finder.pl> is mainly designed to work with bacterial genomes,
|
|
24 while a query genome can be blasted against several subject sequences to
|
|
25 detect RODs over a number of references. Although the results are
|
|
26 optimized towards a complete query genome, both the reference(s) as well
|
|
27 as the query can be used in draft form. To create artificial genomes use
|
|
28 I<cat_seq.pl> or the EMBOSS application union (L<http://emboss.sourceforge.net/>).
|
|
29
|
|
30 The BLASTN report file, the query sequence file (preferably in embl or
|
|
31 genbank format with annotation, see below) and a minimum size for ROD
|
|
32 detection have to be provided for I<blast_rod_finder.pl> to run.
|
|
33 Subsequently, RODs are summarized in a tab-separated result file, a gff3
|
|
34 (usable e.g. in Artemis/DNAPlotter,
|
|
35 L<http://www.sanger.ac.uk/resources/software/artemis/>) and a BRIG
|
|
36 (BLAST Ring Image Generator, L<http://brig.sourceforge.net/>) output file.
|
|
37 Nucleotide sequences of each ROD are written to a multi-fasta file.
|
|
38
|
|
39 The query sequence can be provided in embl or genbank format, but has to
|
|
40 correspond to the fasta file used in querying the BLAST database (the
|
|
41 accession numbers have to correspond to the fasta headers). Use
|
|
42 I<seq_format-converter.pl> to create a corresponding fasta file from
|
|
43 embl|genbank files for BLASTN if needed. With annotated query files
|
|
44 additional info is given in the result summary and the amino acid
|
|
45 sequences of all non-pseudo CDSs, which are contained or overlap a ROD,
|
|
46 are written to a result file. Furthermore, all detected RODs are saved in
|
|
47 individual sequence files in the corresponding query sequence format.
|
|
48
|
|
49 Run BLASTN and the script I<blast_rod_finder.pl> manually or use the bash
|
|
50 shell scripts I<blast_rod_finder*.sh> (see examples below) to perform the
|
|
51 pipeline consecutively in one folder.
|
|
52
|
|
53 The Perl script runs on BioPerl (L<http://www.bioperl.org>).
|
|
54
|
|
55 =head1 OPTIONS
|
|
56
|
|
57 =head2 Mandatory options
|
|
58
|
|
59 =over 21
|
|
60
|
|
61 =item B<-m>=I<int>, B<-min>=I<int> Minimum size of RODs that are reported
|
|
62
|
|
63 =item B<-q>=I<str>, B<-query>=I<str>
|
|
64
|
|
65 Query sequence file [fasta, embl, or genbank format]
|
|
66
|
|
67 =item B<-r>=I<str>, B<-report>=I<str> BLASTN report/output file
|
|
68
|
|
69 =back
|
|
70
|
|
71 =head2 Optional options
|
|
72
|
|
73 =over
|
|
74
|
|
75 =item B<-h>, B<-help> Help (perldoc POD)
|
|
76
|
|
77 =back
|
|
78
|
|
79 =head1 OUTPUT
|
|
80
|
|
81 =over 17
|
|
82
|
|
83 =item F<./results>
|
|
84
|
|
85 All output files are stored in this result folder
|
|
86
|
|
87 =item F<rod_summary.txt>
|
|
88
|
|
89 Summary of detected ROD regions (for embl/genbank queries includes annotation), tab-separated
|
|
90
|
|
91 =item F<rod.gff>
|
|
92
|
|
93 GFF3 file with ROD coordinates to use in Artemis/DNAPlotter etc.
|
|
94
|
|
95 =item F<rod_BRIG.txt>
|
|
96
|
|
97 ROD coordinates to use in BRIG (BLAST Ring Image Generator), tab-separated
|
|
98
|
|
99 =item F<rod_seq.fasta>
|
|
100
|
|
101 Nucleotide sequences of ROD regions (>ROD# size start..stop), multi-fasta
|
|
102
|
|
103 =item (F<rod_aa_fasta.txt>)
|
|
104
|
|
105 Only present if query is given with annotation, i.e. embl|genbank format
|
|
106
|
|
107 Amino acid sequences of all CDSs that are contained in or overlap a ROD region
|
|
108 in multi-fasta format (>locus_tag gene product). RODs are seperated in the file via
|
|
109 '~~' (~~ROD# size start..stop).
|
|
110
|
|
111 =item (F<ROD#.embl|gbk>)
|
|
112
|
|
113 Only present if query is given with annotation, i.e. embl|genbank format.
|
|
114
|
|
115 Each identified ROD is written to an individual sequence file (in the same format
|
|
116 as the query).
|
|
117
|
|
118 =back
|
|
119
|
|
120 =head1 EXAMPLES
|
|
121
|
|
122 =head2 Legacy BLASTN
|
|
123
|
|
124 =over
|
|
125
|
|
126 =item C<formatdb -p F -i subject.fasta -n blast_db>
|
|
127
|
|
128 =item C<blastall -p blastn -d blast_db -i query.fasta -o blastn.out -e 2e-11 -F F>
|
|
129
|
|
130 =back
|
|
131
|
|
132 =head2 BLASTN+
|
|
133
|
|
134 =over
|
|
135
|
|
136 =item C<makeblastdb -in subject.fasta -input_type fasta -dbtype nucl -out blast_db>
|
|
137
|
|
138 =item C<blastn -db blast_db -query query.fasta -out blastn.out -evalue 2e-11 -soft_masking false>
|
|
139
|
|
140 =back
|
|
141
|
|
142 =head2 blast_rod_finder.pl
|
|
143
|
|
144 =over
|
|
145
|
|
146 =item C<perl blast_rod_finder.pl -q query.embl|gbk|fasta -r blastn.out -m 5000>
|
|
147
|
|
148 =back
|
|
149
|
|
150 =head2 All-in-one with unix bash-shell scripts
|
|
151
|
|
152 =over
|
|
153
|
|
154 =item C<./blast_rod_finder_legacy.sh subject.fasta query.fasta query.embl|gbk|fasta 5000>
|
|
155
|
|
156 =item C<./blast_rod_finder_plus.sh subject.fasta query.fasta query.embl|gbk|fasta 5000>
|
|
157
|
|
158 =back
|
|
159
|
|
160 =head1 VERSION
|
|
161
|
|
162 0.4 update: 13-02-2013
|
|
163
|
|
164 =head1 AUTHORS
|
|
165
|
|
166 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
167 David Studholme D[dot]J[dot]Studholme[at]exeter[dot]ac[dot]uk
|
|
168
|
|
169 =head1 LICENSE
|
|
170
|
|
171 This program is free software: you can redistribute it and/or modify
|
|
172 it under the terms of the GNU General Public License as published by
|
|
173 the Free Software Foundation; either version 3 (GPLv3) of the License,
|
|
174 or (at your option) any later version.
|
|
175
|
|
176 This program is distributed in the hope that it will be useful, but
|
|
177 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
178 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
179 General Public License for more details.
|
|
180
|
|
181 You should have received a copy of the GNU General Public License
|
|
182 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
183
|
|
184 =cut
|
|
185
|
|
186
|
|
187 ########
|
|
188 # MAIN #
|
|
189 ########
|
|
190
|
|
191 use strict;
|
|
192 use warnings;
|
|
193 use Getopt::Long; # module to get options from the command line
|
|
194 use Bio::SearchIO; # bioperl module to handle blast reports
|
|
195 use Bio::SeqIO; # bioperl module to handle sequence input/output
|
|
196 use Bio::Seq; # bioperl module to handle sequences with features
|
|
197 use Bio::SeqFeatureI; # bioperl module to handle features in a sequence
|
|
198 use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects (e.g. revcom, truncate, concatenate)
|
|
199
|
|
200
|
|
201 ### Get the options with Getopt::Long, works also abbreviated and with two "--": -r, --r, -report ...
|
|
202 my $usage = "perldoc $0";
|
|
203 my $blast_report = ''; # blastn report/output file
|
|
204 my $q_file = ''; # query sequence file to get the RODs and possible including annotations
|
|
205 my $minimum_size = ''; # minimum size of uncovered regions (RODs) that are reported
|
|
206 my $help = ''; # run perldoc on the POD
|
|
207 GetOptions ('report=s' => \$blast_report, 'query=s' => \$q_file, 'min=s' => \$minimum_size, 'help' => \$help);
|
|
208
|
|
209
|
|
210 ### Run perldoc on POD if -h|--h|--help is given as argument or arguments are not given
|
|
211 if (!$blast_report || !$q_file || !$minimum_size) {
|
|
212 die system($usage);
|
|
213 } elsif ($help) {
|
|
214 die system($usage);
|
|
215 }
|
|
216
|
|
217
|
|
218 ### Parse the blast report/output file
|
|
219 my %q_covered; # stores for each seq position (key) of a query accession if it falls within a hsp hit (value set to 1)
|
|
220 my @blast_q_acc; # store query accessions to test if identical in blast-parse and query-seq (seqio below)
|
|
221 my $blast_db; # add to GFF3 print out
|
|
222 my $parser = new Bio::SearchIO(-file => "<$blast_report", -format => 'blast'); # Bio::SearchIO object
|
|
223 $|++; # turn on autoflush, forces STDOUT to flush right away
|
|
224 print "\nParsing blast report file."; # status display (with autoflush)
|
|
225 while(my $result = $parser->next_result) { # several query sequences possible (result = entire analysis for a single query seq)
|
|
226 my $query_acc = $result->query_accession;
|
|
227 $query_acc =~ s/\.$//; # rm a '.' if present at the end of the string (for non-NCBI fasta headers)
|
|
228 $query_acc =~ s/\.\d$//; # rm version nr. from NCBI query acc (to fit it to the acc.-nr. from Bio::SeqIO below)
|
|
229 push(@blast_q_acc, $query_acc);
|
|
230 $blast_db = $result->database_name;
|
|
231 chop $blast_db; # always an empty character after the name?
|
|
232 while (my $hit = $result->next_hit) { # several subject sequences in the database might have hits to a single query
|
|
233 print '.'; # status display
|
|
234 while(my $hsp = $hit->next_hsp) { # each hit might have one or more hsps (the alignments shown in a blast report)
|
|
235 my ($query_start, $query_end) = $hsp->range('query'); # query hsp start/stop coords
|
|
236 foreach my $i ($query_start .. $query_end) {
|
|
237 $q_covered{$query_acc}{$i}++; # changes for each hsp hit the value from 'undef' to '1'
|
|
238 }
|
|
239 }
|
|
240 }
|
|
241 }
|
|
242
|
|
243
|
|
244 ### Read the query sequence file into RAM, for Bio::Seq::RichSeq files (e.g. embl/genbank) also read features
|
|
245 my %q_seqobj; # combine all seqobj (values; for multi-fasta/embl/gbk queries) to acc.-nr.s (keys )
|
|
246 my %features; # stores all features of a query RichSeq file
|
|
247 my $multi = 0; # test if multi-seq query (multi-fasta/embl/genbank) >= 2; and counter
|
|
248 my $seqio_obj = Bio::SeqIO->new(-file => "<$q_file"); # no '-format' to leave it to bioperl guessing
|
|
249 print "\nParsing query sequence file and potential annotation."; # status display (with autoflush)
|
|
250 while (my $seq_obj = $seqio_obj->next_seq) { # Bio::Seq object, query might be multi-fasta/embl/genbank
|
|
251 print '.'; # status display
|
|
252 my $query_acc;
|
|
253 if (ref($seqio_obj) =~ m/\:\:fasta$/i) { # $seqio_obj blessed, thus ref returns package name (here= Bio::SeqIO::fasta) from bioperl guessing
|
|
254 $query_acc = $seq_obj->display_id; # method '->accession_number' doesn't work with fasta files
|
|
255 $query_acc =~ s/gi\|\d+\|(emb|gb|dbj|ref)\|(.+)\|/$2/; # get acc. nr. from NCBI fasta ID lines
|
|
256 $query_acc =~ s/\.\d$//; # rm version nr., to fit it to blast acc. nr. above
|
|
257 } else { # Bio::Seq::RichSeq file (embl, genbank) inherited from Bio::SeqIO::embl/genbank
|
|
258 $query_acc = $seq_obj->accession_number;
|
|
259 @{$features{$query_acc}} = $seq_obj->get_all_SeqFeatures; # store all Bio::SeqFeatureI objects in anonymous array
|
|
260 }
|
|
261 if ($query_acc ne $blast_q_acc[$multi]) { # check if acc.s fit; SAME order of acc.s in blast-report and seq-obj
|
|
262 die "\n\n###Fatal error:\nBlast query accession \'$blast_q_acc[$multi]\' doesn't fit to query sequence accession \'$query_acc\'!\n\n";
|
|
263 }
|
|
264 $q_seqobj{$query_acc} = $seq_obj;
|
|
265 $multi++;
|
|
266 }
|
|
267
|
|
268
|
|
269 ### Get the contiguous unaligned regions of the query seq(s)
|
|
270 my %uncovered_regions; # stores coords of uncovered regions (RODs)
|
|
271 print "\nLooking for RODs."; # status display (with autoflush)
|
|
272 foreach my $acc (keys %q_seqobj) {
|
|
273 print "."; # status display
|
|
274 my $start;
|
|
275 my $seq = $q_seqobj{$acc}->seq; # Bio::SeqIO object method 'seq'
|
|
276 my $previous_state = 1; # begin at base #1 of query seq
|
|
277 foreach my $i (1 .. length($seq)) { # loop through the whole query seq and look for undef positions
|
|
278 my $current_state = $q_covered{$acc}{$i};
|
|
279 if (!defined $current_state and defined $previous_state) {
|
|
280 # warn "We have just entered an unaligned region $acc: $i\n";
|
|
281 $start = $i;
|
|
282 } elsif (defined $current_state and !defined $previous_state) {
|
|
283 # warn "We have just exited an unaligned region $acc: ", $i-1, "\n";
|
|
284 my $end = $i - 1; # position before current_state was undef
|
|
285 my $length = $end - $start + 1;
|
|
286 push @{$uncovered_regions{$length}{$acc}}, $start; # anonymous array in hash-in-hash data structure
|
|
287 undef $start;
|
|
288 } elsif ($i == length($seq) and !defined $previous_state) {
|
|
289 # warn "We have just exited an unaligned region at end of contig $acc: $i\n";
|
|
290 my $end = $i;
|
|
291 my $length = $end - $start + 1;
|
|
292 push @{$uncovered_regions{$length}{$acc}}, $start;
|
|
293 undef $start;
|
|
294 }
|
|
295 $previous_state = $current_state;
|
|
296 }
|
|
297 }
|
|
298
|
|
299
|
|
300 ### Create results directory, where output files are written to
|
|
301 my $out_dir = './results/';
|
|
302 if (-e $out_dir) {
|
|
303 print "\n\n###Directory \'$out_dir\' already exists! Replace the directory and all its contents? [y|n] ";
|
|
304 my $user_ask = <STDIN>;
|
|
305 if ($user_ask =~ /y/i) {
|
|
306 unlink glob $out_dir."*"; # remove all files in results directory
|
|
307 rmdir $out_dir; # remove the empty directory
|
|
308 } else {
|
|
309 die "Script abborted!\n";
|
|
310 }
|
|
311 }
|
|
312 mkdir $out_dir or die "Can't create directory \"$out_dir\": $!\n";
|
|
313
|
|
314
|
|
315 ### List the longest uncovered regions and print results in output files
|
|
316 my $rod_result = 'rod_summary.txt'; # overview of parsed ROD results and possible ROD annotations
|
|
317 open(ROD, ">$out_dir"."$rod_result") or die "Failed to create $rod_result file: $!\n";
|
|
318 my $seq_out = 'rod_seq.fasta'; # multi-fasta nucleotide file of ROD seqs
|
|
319 open(SEQ, ">$out_dir"."$seq_out") or die "Failed to create $seq_out file: $!\n";
|
|
320 my $gff_out = 'rod.gff'; # GFF3 file of ROD regions for use in artemis/DNAPlotter etc.
|
|
321 open(GFF, ">$out_dir"."$gff_out") or die "Failed to create $gff_out file: $!\n";
|
|
322 my $brig_out = 'rod_BRIG.txt'; # tab-separated ROD file to load into BRIG
|
|
323 open (BRIG, ">$out_dir"."$brig_out") or die "Failed to create $brig_out file: $!\n";
|
|
324 print ROD "$q_file regions with no blast hits (RODs) in $blast_report\n";
|
|
325 print ROD "Rank\t";
|
|
326 if ($multi >=2) { # for multi-seq query files include acc. nr. column
|
|
327 print ROD "Accession\t";
|
|
328 }
|
|
329 print ROD "Length\tROD position";
|
|
330 my $cds_out; # only for RichSeq query files (embl/genbank)
|
|
331 if (ref($seqio_obj) !~ m/\:\:fasta$/i) { # query NOT a fasta file
|
|
332 print ROD "\tFeature primary_tag\tFeature locus_tag\tgene\tFeature product\tFeature position";
|
|
333 $cds_out = 'rod_aa_fasta.txt'; # CDS amino acid sequence output file
|
|
334 open(CDS, ">$out_dir"."$cds_out") or die "Failed to create $cds_out file: $!\n";
|
|
335 }
|
|
336 print ROD "\n";
|
|
337 print GFF "##gff-version 3\n"; # header, shows it's a gff version 3 file
|
|
338 print GFF "#$q_file regions with no blast hits (RODs) in $blast_report\n"; # comment line for description
|
|
339 print BRIG "#Start\tStop\tLabel\n"; # column headers (commented)
|
|
340
|
|
341 print "\nPreparing results."; # status display (with autoflush)
|
|
342 my $rank = 0; # number of ROD regions
|
|
343 foreach my $length (sort {$b<=>$a} keys %uncovered_regions) { # sort RODs from large length to small length
|
|
344 if ($length >= $minimum_size) {
|
|
345 print "."; # status display
|
|
346 foreach my $acc (sort keys %{$uncovered_regions{$length}}) { # de-reference hash-in-hash, sort by acc. nr.
|
|
347 foreach my $start (sort @{$uncovered_regions{$length}{$acc}}) { # de-reference anonymous array
|
|
348 my $end = $start + $length - 1;
|
|
349 my $pos = "$start..$end";
|
|
350 $rank++;
|
|
351 print GFF "$acc\tBLASTN\tsequence_difference\t$start\t$end\t.\t+\t.\tName=ROD$rank;Target=$blast_db;color=2\n";
|
|
352 print BRIG "$start\t$end\tROD$rank\n";
|
|
353 print ROD "$rank\t";
|
|
354 if ($multi >=2) { # for multi-seq query files include acc. nr.
|
|
355 print ROD "$acc\t";
|
|
356 }
|
|
357 print ROD "$length\t$pos";
|
|
358 print SEQ ">ROD$rank $length $pos\n";
|
|
359 print SEQ $q_seqobj{$acc}->subseq($start,$end), "\n\n"; # subseq method of Bio::Seq object
|
|
360
|
|
361 ### fill ROD feature columns in 'rod_result_summary.txt', print CDS aa seqs to 'rod_aa_fasta.txt' for RichSeq files,
|
|
362 ### and print each ROD as a separate sequence file (with Bio::Sequtils->'trunc_with_features')
|
|
363 if (ref($seqio_obj) !~ m/\:\:fasta$/i) { # RichSeq query file
|
|
364 if ($rank >1) { # blank line in front of the next ROD aa seq block, if it's not the first block
|
|
365 print CDS "\n";
|
|
366 }
|
|
367 my $truncseq = Bio::SeqUtils->trunc_with_features($q_seqobj{$acc}, $start, $end); # truncate current Bio::Seq object to coordinates
|
|
368 my $seqout; # variable to hold Bio::SeqIO object
|
|
369 if (ref($seqio_obj) =~ /genbank/) { # print the truncated ROD sequences in the same '-format' as $q_file; Bio::SeqIO::genbank object
|
|
370 $seqout = Bio::SeqIO->new(-file => ">$out_dir"."ROD$rank".".gbk", -format => 'genbank');
|
|
371 } elsif (ref($seqio_obj) =~ /embl/) { # Bio::SeqIO::embl object
|
|
372 $seqout = Bio::SeqIO->new(-file => ">$out_dir"."ROD$rank".".embl", -format => 'embl');
|
|
373 }
|
|
374 $seqout->write_seq($truncseq); # write truncated sequence object
|
|
375 my $loop = 0; # indicates that more than one feature in the ROD range
|
|
376 print CDS "~~ROD$rank $length $pos\n";
|
|
377 foreach my $feature (@{$features{$acc}}) { # de-reference anonymous array in hash
|
|
378 if ($feature->location->start >= $start && $feature->location->end <= $end) { # features that are fully within ROD region
|
|
379 $loop = print_CDSs($feature, $loop); # subroutine 'print_CDSs'
|
|
380 } elsif (($feature->location->start <= $start && ($feature->location->end > $start && $feature->location->end <= $end)) || (($feature->location->start >= $start && $feature->location->start < $end) && $feature->location->end > $end)) { # features that overlap the ROD region
|
|
381 $loop = print_CDSs($feature, $loop, 1); # 1 defines $overlap in subroutine
|
|
382 }
|
|
383 }
|
|
384 if ($loop == 0) { # print "\n" if no features exist in the ROD
|
|
385 print ROD "\n";
|
|
386 }
|
|
387 } else { # fasta files don't contain feature information
|
|
388 print ROD "\n";
|
|
389 }
|
|
390 }
|
|
391 }
|
|
392 }
|
|
393 }
|
|
394 close ROD;
|
|
395 close SEQ;
|
|
396 close GFF;
|
|
397 close BRIG;
|
|
398 if (ref($seqio_obj) !~ m/\:\:fasta$/i) {
|
|
399 close CDS;
|
|
400 }
|
|
401 $| = 0; # turn off autoflush
|
|
402
|
|
403
|
|
404 ### Print which files have been created!
|
|
405 print "\n\nThe following files were created in the \"./results\" directory:\n";
|
|
406 print "- $rod_result: Tab-separated summary of the found regions of difference (RODs)\n";
|
|
407 print "- $gff_out: GFF3 file with ROD coordinates to use in Artemis/DNAPlotter ...\n";
|
|
408 print "- $brig_out: Tab-separated file for BRIG (BLAST Ring Image Generator)\n";
|
|
409 print "- $seq_out: Nucleotide sequences of ROD regions in multi-fasta format\n";
|
|
410 if (!(ref($seqio_obj) =~ m/\:\:fasta$/i)) {
|
|
411 print "- $cds_out: Amino acid sequences of CDSs in ROD regions\n";
|
|
412 print "- ROD#.format: Each ROD as a separate sequence file in the query file format\n";
|
|
413 }
|
|
414 print "\n";
|
|
415
|
|
416 exit;
|
|
417
|
|
418
|
|
419 ###############
|
|
420 # Subroutines #
|
|
421 ###############
|
|
422
|
|
423 ### Subroutine to print the position of features according to leading and lagging strand
|
|
424 sub position {
|
|
425 my $feature = shift;
|
|
426 if ($feature->strand == 1) { # leading strand
|
|
427 print ROD "\t", $feature->location->start, "..", $feature->location->end;
|
|
428 } elsif ($feature->strand == -1) { # lagging strand
|
|
429 print ROD "\t", $feature->location->end, "..", $feature->location->start;
|
|
430 }
|
|
431 return 1;
|
|
432 }
|
|
433
|
|
434
|
|
435 ### Subroutine that prints feature information from RichSeq files
|
|
436 sub print_CDSs {
|
|
437 my $feature = shift;
|
|
438 my $loop = shift;
|
|
439 my $overlap = shift; # defined if feature overlaps ROD edge
|
|
440 my $primary_tag = $feature->primary_tag;
|
|
441 if (($primary_tag eq 'CDS') || ($primary_tag eq 'misc_RNA') || ($primary_tag eq 'ncRNA') || ($primary_tag eq 'rRNA') || ($primary_tag eq 'tRNA') || ($primary_tag eq 'tmRNA') || ($primary_tag eq 'misc_feature') || ($primary_tag eq 'repeat_region')) {
|
|
442 skip_columns($loop); # sub to skip to correct column in print out
|
|
443 print ROD "\t$primary_tag";
|
|
444 if ($feature->has_tag('locus_tag')) { # TRUE if tag 'locus_tag' exists for this feature
|
|
445 print ROD "\t", $feature->get_tag_values('locus_tag');
|
|
446 print CDS ">", $feature->get_tag_values('locus_tag') if ($primary_tag eq 'CDS' && !$feature->has_tag('pseudo')); # include only CDSs with "/translation" in file 'rod_aa_fasta.txt' (exclude pseudo-genes, RNAs ...)
|
|
447 } else {
|
|
448 print ROD "\t";
|
|
449 print CDS ">" if ($primary_tag eq 'CDS' && !$feature->has_tag('pseudo'));
|
|
450 }
|
|
451 if ($feature->has_tag('gene')) {
|
|
452 print ROD "\t", $feature->get_tag_values('gene');
|
|
453 print CDS " ", $feature->get_tag_values('gene') if ($primary_tag eq 'CDS' && !$feature->has_tag('pseudo'));
|
|
454 } else {
|
|
455 print ROD "\t";
|
|
456 print CDS " " if ($primary_tag eq 'CDS' && !$feature->has_tag('pseudo'));
|
|
457 }
|
|
458 if ($feature->has_tag('product')) {
|
|
459 my ($product) = $feature->get_tag_values('product');
|
|
460 print ROD "\t", $product;
|
|
461 print CDS " ", replace_white($product) if ($primary_tag eq 'CDS' && !$feature->has_tag('pseudo')); # subroutine 'replace_white'
|
|
462 } elsif ($feature->has_tag('rpt_family')) { # don't include feature 'rpt_family' in the aa output
|
|
463 print ROD "\t", $feature->get_tag_values('rpt_family');
|
|
464 } elsif ($feature->has_tag('note')) { # only include feature 'note' if 'product' doesn't exist
|
|
465 my ($note) = $feature->get_tag_values('note');
|
|
466 print ROD "\t", $note;
|
|
467 print CDS " ", replace_white($note) if ($primary_tag eq 'CDS' && !$feature->has_tag('pseudo'));
|
|
468 }
|
|
469 position($feature); # subroutine 'position' to print coordinates of features
|
|
470 if (defined $overlap) { # include '(overlap)' if feature overlaps edge of ROD
|
|
471 print ROD " (overlap)\n";
|
|
472 print CDS " (overlap)\n" if ($primary_tag eq 'CDS' && !$feature->has_tag('pseudo'));
|
|
473 } else {
|
|
474 print ROD "\n";
|
|
475 print CDS "\n" if ($primary_tag eq 'CDS' && !$feature->has_tag('pseudo'));
|
|
476 }
|
|
477 if ($primary_tag eq 'CDS' && $feature->has_tag('translation')) { # print translations of CDSs
|
|
478 print CDS $feature->get_tag_values('translation'), "\n";
|
|
479 }
|
|
480 $loop++; # Indicates that more than one feature in the ROD range
|
|
481 } elsif ($primary_tag eq 'mobile_element') { # include 'mobile_element' features only in ROD
|
|
482 skip_columns($loop);
|
|
483 print ROD "\t$primary_tag";
|
|
484 if ($feature->has_tag('mobile_element_type')) {
|
|
485 print ROD "\t\t\t", $feature->get_tag_values('mobile_element_type');
|
|
486 } else {
|
|
487 print ROD "\t\t\t";
|
|
488 }
|
|
489 position($feature); # sub to print positions of features
|
|
490 if (defined $overlap) {
|
|
491 print ROD " (overlap)\n";
|
|
492 } else {
|
|
493 print ROD "\n";
|
|
494 }
|
|
495 $loop++;
|
|
496 }
|
|
497 return $loop;
|
|
498 }
|
|
499
|
|
500
|
|
501 ### Subroutine to replace whitespaces in a string
|
|
502 sub replace_white {
|
|
503 my $string = shift;
|
|
504 $string =~ s/\s/_/g;
|
|
505 return $string;
|
|
506 }
|
|
507
|
|
508
|
|
509 ### Subroutine to skip to the correct column for features in tab-separated result file 'rod_result_summary.txt'
|
|
510 sub skip_columns {
|
|
511 my $loop = shift; # $multi is in the same scope ... (actually also $loop)
|
|
512 if ($loop >= 1) { # for the features after the first ROD line skip to the 'feature primary_tag' column in the tab-separated summary output
|
|
513 print ROD "\t\t";
|
|
514 if ($multi >=2) { # multi-seq query files include acc. nr. column
|
|
515 print ROD "\t";
|
|
516 }
|
|
517 }
|
|
518 return 1;
|
|
519 }
|