annotate COG/bac-genomics-scripts/cds_extractor/cds_extractor.pl @ 13:152d7c43478b draft default tip

Uploaded
author dereeper
date Thu, 30 May 2024 20:07:55 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 # POD #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 =pod
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 =head1 NAME
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 C<cds_extractor.pl> - extract protein or DNA sequences from CDS features
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 =head1 SYNOPSIS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 C<perl cds_extractor.pl -i seq_file.[embl|gbk] -p>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 =head1 DESCRIPTION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 Extracts protein or DNA sequences of CDS features from a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 (multi)-RichSeq file (e.g. EMBL or GENBANK format) and writes them to a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21 multi-FASTA file. The FASTA headers for each CDS include either the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 locus tag, if that's not available, protein ID, gene, or an internal
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23 CDS counter as identifier (in this order). The organism info
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 includes also possible plasmid names. Pseudogenes (tagged by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 B</pseudo>) are not included (except in the CDS counter).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 In addition to the identifier, FASTA headers include gene (B<g=>),
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 product (B<p=>), organism (B<o=>), and EC numbers (B<ec=>), if these
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 are present for a CDS. Individual EC numbers are separated by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 B<semicolons>. The location/position (B<l=>start..stop) of a CDS will
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 always be included. If gene is used as FASTA header ID
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 'B<g=>gene' will only be included with option B<-f>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 Fuzzy locations in the feature table of a sequence file are not
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 taken into consideration for B<l=>. If you set options B<-u>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 and/or B<-d> and the feature location overlaps a B<circular>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 replicon boundary, positions are marked with '<' or '>' in the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 direction of the exceeded boundary. Features with overlapping
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 locations in B<linear> sequences (e.g. contigs) will be skipped and
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 are B<not> included in the output! A CDS feature is on the lagging
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 strand if start > stop in the location. In the special case of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 overlapping circular sequence boundaries this is reversed.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 Of course, the B<l=> positions are separate for each sequence in a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 multi-sequence file. Thus, if you want continuous positions for the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 CDSs run these files first through L<C<cat_seq.pl>|/cat_seq>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 Optionally, a file with locus tags can be given to extract only
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 these CDS features with option B<-l> (each locus tag in a new line).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51 =head1 OPTIONS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53 =head2 Mandatory options
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55 =over 23
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57 =item B<-i>=I<str>, B<-input>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59 Input RichSeq sequence file including CDS annotation (e.g. EMBL or
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 GENBANK format)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 =item B<-p>, B<-protein>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64 Extract B<protein> sequence for each CDS feature, excludes option B<-n>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68 =item B<-n>, B<-nucleotide>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 Extract B<nucleotide> sequence for each CDS feature, excludes option
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 B<-p>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 =head2 Optional options
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 =over 28
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 =item B<-h>, B<-help>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 Help (perldoc POD)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 =item B<-u>=I<int>, B<-upstream>=I<int>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 Include given number of flanking nucleotides upstream of each CDS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86 feature, forces option B<-n>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88 =item B<-d>=I<int>, B<-downstream>=I<int>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90 Include given number of flanking nucleotides downstream of each CDS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 feature, forces option B<-n>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 =item B<-c>=I<str>, B<-cds_prefix>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 Prefix for the internal CDS counter [default = 'CDS']
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97 =item B<-l>=I<str>, B<-locustag_list>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 List of locus tags to extract only those (each locus tag on a new line)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 =item B<-f>, B<-full_header>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 If gene is used as ID include additionally 'B<g=>gene' in FASTA
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 headers, so downstream analyses can recognize the gene tag (e.g.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 L<C<prot_finder.pl>|/prot_finder>).
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 =item B<-v>, B<-version>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 Print version number to C<STDERR>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113 =head1 OUTPUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 =over 23
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 =item F<*.faa>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 Multi-FASTA file of CDS protein sequences
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 =item F<*.ffn>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 Multi-FASTA file of CDS DNA sequences
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 =item (F<no_annotation_err.txt>)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 Lists input files missing CDS annotation, script exited with B<fatal error> i.e. no FASTA output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 =item (F<double_id_err.txt>)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 Lists input files with ambiguous FASTA IDs, script exited with B<fatal error> i.e. no FASTA output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 =item (F<locus_tag_missing_err.txt>)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 Lists CDS features without locus tags
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 =item (F<linear_seq_cds_overlap_err.txt>)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 Lists CDS features overlapping sequence border of a B<linear>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 molecule, which are not included in the result multi-FASTA file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 =head1 EXAMPLES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 =item C<perl cds_extractor.pl -i seq_file.gbk -p -l locus_tags.txt>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 =item C<perl cds_extractor.pl -i seq_file.embl -n -l locus_tags.txt -u 100 -d 20>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154 =item C<perl cds_extractor.pl -i Ecoli_MG1655.gbk -p -f -c MG1655>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 =head1 DEPENDENCIES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 =item B<BioPerl (L<http://www.bioperl.org>)>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 Tested with BioPerl version 1.006923
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 =head1 VERSION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 0.7.1 update: 26-10-2015
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 0.1 24-05-2012
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 =head1 AUTHOR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 Andreas Leimbach aleimba[at]gmx[dot]de
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 =head1 LICENSE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 This program is free software: you can redistribute it and/or modify
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 it under the terms of the GNU General Public License as published by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 the Free Software Foundation; either version 3 (GPLv3) of the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 License, or (at your option) any later version.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 This program is distributed in the hope that it will be useful, but
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 WITHOUT ANY WARRANTY; without even the implied warranty of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 General Public License for more details.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 You should have received a copy of the GNU General Public License
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190 along with this program. If not, see L<http://www.gnu.org/licenses/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192 =cut
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196 # MAIN #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201 use autodie;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202 use Getopt::Long;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203 use Pod::Usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204 use Bio::SeqIO; # bioperl module to handle sequence input/output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 # use Bio::Seq; # bioperl module to play with the sequence and its features ### apparently not needed, methods inherited
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206 # use Bio::SeqFeatureI; # bioperl module to handle features in a sequence ### apparently not needed, methods inherited
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
208
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
209
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
210 ### Get options with Getopt::Long, works also abbreviated and with two "--": -i, --i, -input ...
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
211 my $Seq_File; # RichSeq sequence file including feature annotation
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
212 my $Opt_Protein; # extract protein sequences for each CDS feature; excludes option '-n'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
213 my $Opt_Nucleotide; # extract nucleotide sequences for each CDS feature; excludes option '-p'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
214 my $Upstream = 0; # include given number of flanking nucleotides upstream of each CDS feature; forces option '-n'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
215 my $Downstream = 0; # include given number of flanking nucleotides downstream of each CDS feature; forces option '-n'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
216 my $Prefix; # prefix for the internal CDS counter
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
217 my $Locustag_List; # list of locus_tags to extract only those
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218 my $Opt_Full_Header; # include a full FASTA header for downstream 'prot_finder.pl' analysis (needed to identify /gene feature tag if used as identifier)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219 my $VERSION = '0.7.1';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
220 my ($Opt_Version, $Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
221 GetOptions ('input=s' => \$Seq_File,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
222 'protein' => \$Opt_Protein,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
223 'nucleotide' => \$Opt_Nucleotide,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
224 'upstream:i' => \$Upstream,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
225 'downstream:i' => \$Downstream,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
226 'cds_prefix:s' => \$Prefix,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
227 'locustag_list:s' => \$Locustag_List,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
228 'full_header' => \$Opt_Full_Header,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
229 'version' => \$Opt_Version,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230 'help|?' => \$Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
232
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
233
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
234 ### Run perldoc on POD
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
235 pod2usage(-verbose => 2) if ($Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
236 die "$0 $VERSION\n" if ($Opt_Version);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
237 if (!$Seq_File) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
238 my $warning = "\n### Fatal error: Option '-i' or its argument is missing!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
239 pod2usage(-verbose => 1, -message => $warning, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
240 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
241
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
242
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
243
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
244 ### Enforce mandatory or optional options
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
245 if (($Upstream || $Downstream) && !$Opt_Nucleotide) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
246 warn "Option '-d' and/or '-u' set, but not '-n'. Forcing option '-n'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
247 $Opt_Nucleotide = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
248 undef $Opt_Protein;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
249 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
250 if (!$Opt_Protein && !$Opt_Nucleotide) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
251 die "\n### Fatal error: None of the mandatory options ('-p' or '-n') given! Please choose one of the extraction methods!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
252 } elsif ($Opt_Protein && $Opt_Nucleotide) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
253 die "\n### Fatal error: Both mandatory options ('-p' and '-n') given! Choose only one of the extraction methods!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
254 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
255 $Prefix = 'CDS' if (!$Prefix); # default for internal CDS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
256
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
257
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
258
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
259 ### Create a bioperl Bio::SeqIO object with $Seq_File
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
260 my $seqio_object = Bio::SeqIO->new(-file => "<$Seq_File"); # no '-format' to leave to bioperl guessing
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
261
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
262
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
263
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
264 ### Print the input and output file which are processed and written, respectively
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
265 print "\nInput: $Seq_File\t";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
266 my $Ori_Filename = $Seq_File; # store original filename to give error statements
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
267 $Seq_File =~ s/(.+)\.\w+$/$1/; # strip filename extension
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
268 my $Out_Fh; # filehandle for output file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
269 if ($Opt_Protein) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
270 $Seq_File = $Seq_File.'.faa';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
271 print "Output: $Seq_File\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
272 open ($Out_Fh, ">", "$Seq_File");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
273 } elsif ($Opt_Nucleotide) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
274 $Seq_File = $Seq_File.'.ffn';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
275 print "Output: $Seq_File\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
276 open ($Out_Fh, ">", "$Seq_File");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
277 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
278
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
279
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
280
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
281 ### Get the list of locus_tags if $Locustag_List defined
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
282 my %Locus_Tags; # store locus_tags and indicate if found in $Seq_File
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
283 if ($Locustag_List) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
284 open (my $locus_fh, "<", "$Locustag_List");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
285 while (<$locus_fh>) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
286 chomp;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
287 next if (/^$/); # skip empty lines
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
288 $Locus_Tags{$_} = 0; # changes to 1 if the locus tag was found in $Seq_File, see below
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
289 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
290 close $locus_fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
291 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
292
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
293
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
294
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
295 ### Write output FASTA with respective header lines and either protein seq from /translation feature tag or nucleic acid subseq
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
296 my $Anno_Present = 0; # switches to true if CDS primary features present
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
297 my $Organism; # store /organism tag value to include in each header, include plasmid name
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
298 my $Cds_Count = sprintf("%04d", 0); # internal CDS counter, counts also 'pseudo' CDSs
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
299 my %Double_Id; # control if a locus_tag or protein_id is ambiguous in $Seq_File and die (they should be unique); see subroutine 'control_double'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
300
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
301 my $No_Locus_Tag = 0; # count how many CDS features don't have a locus_tag
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
302 my $Locus_Tag_Miss_Err = 'locus_tag_missing_err.txt'; # error file that contains all alternative IDs (see subroutine 'locus_tag_missing')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
303 my $Locus_Tag_Miss_Err_Fh; # filehandle
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
304
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
305 my $Linear_Seq_Overlap = 0; # count CDS features which overlap linear sequence borders (see subroutine 'linear_overlap')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
306 my $Linear_Overlap_Err = 'linear_seq_cds_overlap_err.txt'; # error file that contains all affected IDs
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
307 my $Linear_Seq_Overlap_Fh; # filehandle
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
308
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
309 while (my $seq_object = $seqio_object->next_seq) { # a Bio::Seq object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
310 my @feat_objects = $seq_object->get_SeqFeatures; # slurp all features from the seq to check CDS primary feature annotation
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
311
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
312 if (grep ($_->primary_tag eq 'CDS', @feat_objects) == 0) { # scalar context
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
313 next; # if no CDS features present skip to next $seq_object in (multi-)seq file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
314 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
315 $Anno_Present = 1; # if not skipped CDS annotation present
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
316
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
317 $Organism = ''; # empty $Organism for each $seq_object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
318 foreach my $feat_object (@feat_objects) { # a Bio::SeqFeatureI object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
319 if ($Locustag_List) { # skip residual FEATURE objects if all locus_tags in $Locustag_List found
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
320 if (grep ($Locus_Tags{$_} == 1, keys %Locus_Tags) == keys %Locus_Tags) { # scalar context
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
321 last;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
322 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
323 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
324
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
325 if ($feat_object->primary_tag eq 'source') {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
326 $Organism = feature_value_eval($feat_object, 'organism'); # subroutine to evaluate existence of the tag and return value
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
327 if ($feat_object->has_tag('plasmid')) { # subroutine 'feature_value_eval' also possible, but method 'has_tag' more elegantly
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
328 my ($plasmid) = $feat_object->get_tag_values('plasmid'); # values always returned as ARRAYS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
329 $plasmid =~ s/\s/_/g;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
330 $Organism = $Organism.'-plasmid_'.$plasmid;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
331 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
332 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
333
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
334 if ($feat_object->primary_tag eq 'CDS') {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
335 $Cds_Count++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
336 if ($feat_object->has_tag('pseudo')) { # skip pseudogenes, they don't include '/translation'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
337 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
338 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
339 my ($start, $stop, $strand, $seq_stop, $location_stop) = get_location($feat_object, $seq_object); # subroutine to get start, stop and strand of feature; $seq_stop needed for sub 'print_seq' and with $location_stop needed for sub 'print_location'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
340
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
341 if ($feat_object->has_tag('locus_tag')) { # LOCUS_TAG-ELSIF
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
342 my ($locus_tag) = $feat_object->get_tag_values('locus_tag');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
343 control_double($locus_tag, 'locus_tag'); # subroutine to control uniqueness of identifier (here value of /locus_tag)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
344
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
345 if ($Locustag_List) { # if '-locustag_list' is set only get those CDSs
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
346 my ($locus) = grep (/$locus_tag/i, keys %Locus_Tags); # case-insensitive for typing mistakes
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
347 if ($locus) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
348 $Locus_Tags{$locus} = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
349 if ($Opt_Nucleotide && ($start eq 'not_circular' || ($location_stop && $location_stop eq 'not_circular'))) { # CDS feature overlaps seq border of linear molecule
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
350 linear_overlap($locus_tag); # subroutine to write overlap CDSs to '$Linear_Overlap_Err'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
351 next; # jump to the next feature object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
352 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
353 print_fasta_ID($feat_object, $locus_tag); # subroutine to print the identifier (here /locus_tag), /gene (g=) and /product (p=) (both if present) to the FASTA header
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
354 print_location($start, $stop, $strand, $seq_stop, $location_stop); # subroutine to print feature location/position (l=) to FASTA header
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
355 print_org_ec($feat_object); # subroutine to print /organism (o=) and /EC_numbers (ec=) (both if present) to FASTA header
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
356 print_seq($feat_object, $start, $stop, $strand, $seq_stop, $seq_object); # subroutine to print the protein or nucleic sequence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
357 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
358 next; # jump to the next feature object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
359 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
360
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
361 ### '$Locustag_List' not defined
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
362 if ($Opt_Nucleotide && ($start eq 'not_circular' || ($location_stop && $location_stop eq 'not_circular'))) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
363 linear_overlap($locus_tag); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
364 next; # jump to the next feature object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
365 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
366 print_fasta_ID($feat_object, $locus_tag); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
367
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
368 } elsif ($Locustag_List) { # in case a list of locus_tags is given, no need to look at CDSs that don't have a /locus_tag
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
369 next; # jump to the next feature object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
370
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
371 } elsif ($feat_object->has_tag('protein_id')) { # PROTEIN_ID-ELSIF
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
372 my ($protein_id) = $feat_object->get_tag_values('protein_id');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
373 control_double($protein_id, 'protein_id'); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
374 locus_tag_missing($protein_id, 'protein_id'); # subroutine to inform no /locus_tag is present for this CDS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
375 $No_Locus_Tag++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
376 if ($Opt_Nucleotide && ($start eq 'not_circular' || ($location_stop && $location_stop eq 'not_circular'))) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
377 linear_overlap($protein_id); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
378 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
379 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
380 print_fasta_ID($feat_object, $protein_id); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
381
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
382 } elsif ($feat_object->has_tag('gene')) { # GENE-ELSIF
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
383 my ($gene) = $feat_object->get_tag_values('gene');
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
384 control_double($gene, 'gene'); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
385 locus_tag_missing($gene, 'gene'); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
386 $No_Locus_Tag++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
387 if ($Opt_Nucleotide && ($start eq 'not_circular' || ($location_stop && $location_stop eq 'not_circular'))) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
388 linear_overlap($gene); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
389 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
390 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
391 print_fasta_ID($feat_object, ''); # subroutine; give empty string for $tag to sub, to be able to determine that the sub call is from GENE-ELSIF and 'g=' should only be included if option '-f' is set
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
392
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
393 } else { # if none of the above tags are existent use the internal CDS counter; CDS_COUNT-ELSE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
394 my $cds_prefix_count = $Prefix.'_'.$Cds_Count;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
395 locus_tag_missing($cds_prefix_count, 'cds-counter'); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
396 $No_Locus_Tag++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
397 if ($Opt_Nucleotide && ($start eq 'not_circular' || ($location_stop && $location_stop eq 'not_circular'))) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
398 linear_overlap($cds_prefix_count); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
399 next;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
400 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
401 print_fasta_ID($feat_object, $cds_prefix_count); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
402 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
403
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
404 ### Print location, organism, EC_number(s), and sequence for all $feat_object if '$Locustag_List' not defined
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
405 print_location($start, $stop, $strand, $seq_stop, $location_stop); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
406 print_org_ec($feat_object); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
407 print_seq($feat_object, $start, $stop, $strand, $seq_stop, $seq_object); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
408 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
409 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
410
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
411 if ($Locustag_List) { # skip ALSO residual SEQUENCE objects if all locus_tags are found
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
412 if (grep ($Locus_Tags{$_} == 1, keys %Locus_Tags) == keys %Locus_Tags) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
413 last;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
414 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
415 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
416 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
417
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
418 close $Out_Fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
419 if ($No_Locus_Tag > 0) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
420 warn "### $No_Locus_Tag CDS feature(s) don't have a locus tag in file '$Ori_Filename'. The respective ID(s) are written to the error file '$Locus_Tag_Miss_Err'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
421 close $Locus_Tag_Miss_Err_Fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
422 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
423 if ($Linear_Seq_Overlap > 0) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
424 warn "### $Linear_Seq_Overlap CDS feature(s) overlap a sequence border of a linear molecule in '$Ori_Filename' because of your settings for '-u' and/or '-d'. These CDS feature(s) are not included in the output file '$Seq_File'! The respective ID(s) are written to the error file '$Linear_Overlap_Err'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
425 close $Linear_Seq_Overlap_Fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
426 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
427
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
428
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
429
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
430 ### Exit with error and remove empty result file if no CDS features/annotation found in (multi-)seq file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
431 if (!$Anno_Present) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
432 my $anno_err = 'no_annotation_err.txt';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
433 err_file_exist($anno_err); # subroutine to test for file existence and give warning to STDERR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
434 open (my $anno_err_fh, ">>", "$anno_err");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
435 print $anno_err_fh "$Ori_Filename\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
436 close $anno_err_fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
437 unlink $Seq_File;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
438 die "\n###Fatal error\nNo CDS annotation in '$Ori_Filename', the respective filename was written to '$anno_err'!\nExiting program!\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
439 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
440
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
441
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
442
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
443 ### Print locus tags that were not found in seq_file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
444 if ($Locustag_List) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
445 my @missed = grep ($Locus_Tags{$_} == 0, keys %Locus_Tags);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
446 if (@missed) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
447 print "### The following locus tags were not found in '$Ori_Filename':\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
448 foreach (sort @missed) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
449 print "$_\t";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
450 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
451 print "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
452 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
453 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
454
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
455
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
456 exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
457
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
458
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
459 ###############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
460 # Subroutines #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
461 ###############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
462
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
463 ### Control if an identifier for the FASTA header is ambiguous in the $Seq_File and die (they should be unique/unambiguous)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
464 sub control_double {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
465 my ($tag, $type) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
466 if ($Double_Id{$tag}) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
467 my $double_id_err = 'double_id_err.txt';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
468 my $file_exist = err_file_exist($double_id_err); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
469 open (my $double_id_err_fh, ">>", "$double_id_err");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
470 print $double_id_err_fh "Original_filename\tID-type\tDouble_ID\n" if (!$file_exist); # print header in file only if file doesn't exist
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
471 print $double_id_err_fh "$Ori_Filename\t$type\t$tag\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
472 close $double_id_err_fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
473 unlink $Seq_File;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
474 die "\n###Fatal error!\n'$tag' of type '$type' exists at least two times in organism '$Organism' of file '$Ori_Filename', but should be unambiguous for downstream analyses! The error is written to the error file '$double_id_err'. Please modify all repetitive occurences.\nExiting program!\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
475 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
476 $Double_Id{$tag} = 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
477 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
478 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
479 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
480
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
481
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
482
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
483 ### Test for error file existence and give warning to STDERR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
484 sub err_file_exist {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
485 my $file = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
486 if (-e $file) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
487 warn "\nThe error file '$file' exists already, the current errors will be appended to the existing file!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
488 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
489 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
490 return 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
491 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
492
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
493
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
494
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
495 ### Return value of a tag (replace whitespaces with '_') or catch error if non-existent to return empty value
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
496 sub feature_value_eval {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
497 my ($feat_object, $tag) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
498 my $value = '';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
499 eval {$value = join(';', $feat_object->get_tag_values($tag));}; # values always returned as ARRAYS; catch error if tag doesn't exist. To seperate /EC_numbers ';' is used, the only feature tag needed with more than one occurrence in a single CDS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
500 $value =~ s/\s/_/g; # replace spaces with '_' (guess only needed for /organism and /product values)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
501 return $value;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
502 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
503
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
504
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
505
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
506 ### Get the position/location of a feature
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
507 sub get_location {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
508 my ($feat_object, $seq_object) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
509 my ($start, $stop, $strand, $seq_stop, $location_stop); # $seq_stop needed for sub 'print_seq' and with $location_stop needed for sub 'print_location'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
510
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
511 $strand = $feat_object->strand; # 1 = feature on leading strand, -1 = feature on lagging strand
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
512 if ($strand == 1) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
513 $start = $feat_object->start - $Upstream;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
514 $stop = $feat_object->end + $Downstream;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
515 } elsif ($strand == -1) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
516 $start = $feat_object->start - $Downstream;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
517 $stop = $feat_object->end + $Upstream;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
518 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
519
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
520 ### Adjust positions if they overlap replicon sequence borders
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
521 if ($start < 0) { # '$seq_obj->subseq|->trunc' (in subroutine 'print_seq') don't work with negative numbers, but with positions > '$seq_obj->length'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
522 if ($seq_object->is_circular) { # true if molecule is circular
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
523 $start = $seq_object->length - abs($start);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
524 } else { # if sequence linear wrong to print overlapping sequences (see sub 'linear_overlap')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
525 $start = 'not_circular';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
526 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
527 $seq_stop = $seq_object->length + $stop;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
528 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
529 if ($stop > $seq_object->length) { # for sub 'print_location' a stop > '$seq_obj->length' is no use and should start again from the seq start
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
530 if ($seq_object->is_circular) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
531 $location_stop = $stop - $seq_object->length;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
532 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
533 $location_stop = 'not_circular';
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
534 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
535 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
536
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
537 return ($start, $stop, $strand, $seq_stop, $location_stop);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
538 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
539
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
540
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
541
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
542 ### Inform CDS locations with options '-u' and/or '-d' are overlapping a sequence border of a linear molecule
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
543 sub linear_overlap {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
544 my $tag = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
545 if ($Linear_Seq_Overlap == 0) { # open error file to append errors only for first overlap error
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
546 my $file_exist = err_file_exist($Linear_Overlap_Err); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
547 open ($Linear_Seq_Overlap_Fh, ">>", "$Linear_Overlap_Err");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
548 print $Linear_Seq_Overlap_Fh "Original_filename\tOrganism\tID\n" if (!$file_exist); # header of error file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
549 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
550 $Linear_Seq_Overlap++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
551 print $Linear_Seq_Overlap_Fh "$Ori_Filename\t$Organism\t$tag\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
552 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
553 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
554
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
555
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
556
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
557 ### Inform if locus_tags are missing
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
558 sub locus_tag_missing {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
559 my ($tag, $type) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
560 if ($No_Locus_Tag == 0) { # open error file to append errors only for first missing locus_tag occurrence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
561 my $file_exist = err_file_exist($Locus_Tag_Miss_Err); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
562 open ($Locus_Tag_Miss_Err_Fh, ">>", "$Locus_Tag_Miss_Err");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
563 print $Locus_Tag_Miss_Err_Fh "Original_filename\tOrganism\tAlternative_ID_type\tAlternative_ID\n" if (!$file_exist); # header of error file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
564 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
565 print $Locus_Tag_Miss_Err_Fh "$Ori_Filename\t$Organism\t$type\t$tag\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
566 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
567 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
568
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
569
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
570
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
571 ### Print identifier, /gene (g=) and /product (p=) (both if existent) of a CDS to FASTA header
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
572 sub print_fasta_ID {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
573 my ($feat_object, $tag) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
574 my $gene = feature_value_eval($feat_object, 'gene'); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
575 my $product = feature_value_eval($feat_object, 'product'); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
576 if ($tag) { # $tag is defined for LOCUS_TAG-ELSIF, PROTEIN_ID-ELSIF, and CDS_COUNT-ELSE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
577 print $Out_Fh ">$tag";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
578 } elsif (!$tag) { # subroutine call from GENE-ELSIF with empty string for $tag (see below 'g=')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
579 print $Out_Fh ">$gene";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
580 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
581 print $Out_Fh " g=$gene" if (($gene && $tag) || ($Opt_Full_Header && !$tag)); # print 'g='-gene only if /gene tag present; additionally, only print 'g=' for GENE-ELSIF if '-f' is set, otherwise can't determine if the identifier is a gene in downstream tools (e.g. 'prot_finder.pl')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
582 print $Out_Fh " p=$product" if ($product);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
583 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
584 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
585
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
586
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
587
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
588 ### Print position/location (l=) of a CDS to FASTA header
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
589 sub print_location {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
590 my ($start, $stop, $strand, $seq_stop, $location_stop) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
591 $stop = $location_stop if ($location_stop); # see sub 'get_location'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
592 print $Out_Fh " l=";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
593 if ($strand == 1) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
594 print $Out_Fh ">" if ($seq_stop); # defined if feature start position overlaps replicon's seq start (with '-u' or '-d'), sub 'get_location'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
595 print $Out_Fh "$start..$stop";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
596 print $Out_Fh "<" if ($location_stop); # defined if feature stop position overlaps replicon's seq end (with '-u' or '-d')
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
597 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
598 } elsif ($strand == -1) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
599 print $Out_Fh "<" if ($location_stop); # switched on lagging strand, because $stop and $start switched
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
600 print $Out_Fh "$stop..$start"; # switch $stop and $start to indicate position on lagging strand
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
601 print $Out_Fh ">" if ($seq_stop);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
602 return -1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
603 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
604 return 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
605 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
606
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
607
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
608
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
609 ### Print organism (o=) and EC numbers (ec=) (both if existent) of a CDS to FASTA header
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
610 sub print_org_ec {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
611 my $feat_object = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
612 print $Out_Fh " o=$Organism" if ($Organism);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
613 my $ec_numbers = feature_value_eval($feat_object, 'EC_number'); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
614 print $Out_Fh " ec=$ec_numbers" if ($ec_numbers);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
615 print $Out_Fh "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
616 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
617 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
618
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
619
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
620
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
621 ### Print the protein or nucleic sequence to the result file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
622 sub print_seq {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
623 my ($feat_object, $start, $stop, $strand, $seq_stop, $seq_object) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
624 if ($Opt_Protein) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
625 print $Out_Fh $feat_object->get_tag_values('translation'), "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
626
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
627 } elsif ($Opt_Nucleotide) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
628 $stop = $seq_stop if ($seq_stop); # see sub 'get_location'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
629 if ($strand == 1) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
630 my $subseq = $seq_object->subseq($start, $stop);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
631 print $Out_Fh substr($subseq, 0, $Upstream); # upstream bases, not possible to include in '$feat_obj->spliced_seq'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
632 print $Out_Fh $feat_object->spliced_seq->seq; # to include also features with "join"/fuzzy locations; '$feat_object->spliced_seq' returns a Bio::Seq object thus call funtion '->seq' for the seq string
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
633 print $Out_Fh substr($subseq, length($subseq) - $Downstream, $Downstream), "\n"; # downstream bases
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
634
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
635 } elsif ($strand == -1) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
636 my $trunc_obj = $seq_object->trunc($start, $stop); # a Bio::Seq object, needed for revcom
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
637 my $rev_obj = $trunc_obj->revcom; # a Bio::Seq object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
638 print $Out_Fh substr($rev_obj->seq, 0, $Upstream);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
639 print $Out_Fh $feat_object->spliced_seq->seq;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
640 print $Out_Fh substr($rev_obj->seq, $rev_obj->length - $Downstream, $Downstream), "\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
641 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
642 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
643 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
644 }