3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<revcom_seq.pl> - reverse complement (multi-)sequence files
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl revcom_seq.pl seq-file.embl E<gt> seq-file_revcom.embl>
|
|
16
|
|
17 B<or>
|
|
18
|
|
19 C<perl cat_seq.pl multi-seq_file.embl | perl revcom_seq.pl -i embl
|
|
20 E<gt> seq_file_cat_revcom.embl>
|
|
21
|
|
22 =head1 DESCRIPTION
|
|
23
|
|
24 This script reverse complements (multi-)sequence files. The
|
|
25 features/annotations in RichSeq files (e.g. EMBL or GENBANK format)
|
|
26 will also be adapted accordingly. Use option B<-o> to specify a
|
|
27 different output sequence format. Input files can be given directly via
|
|
28 C<STDIN> or as a file. If C<STDIN> is used, the input sequence file
|
|
29 format has to be given with option B<-i>. Be careful to set the correct
|
|
30 input format.
|
|
31
|
|
32 =head1 OPTIONS
|
|
33
|
|
34 =over 20
|
|
35
|
|
36 =item B<-h>, B<-help>
|
|
37
|
|
38 Help (perldoc POD)
|
|
39
|
|
40 =item B<-o>=I<str>, B<-outformat>=I<str>
|
|
41
|
|
42 Specify different sequence format for the output [fasta, embl, or gbk]
|
|
43
|
|
44 =item B<-i>=I<str>, B<-informat>=I<str>
|
|
45
|
|
46 Specify the input sequence file format, only needed for C<STDIN> input
|
|
47
|
|
48 =item B<-v>, B<-version>
|
|
49
|
|
50 Print version number to C<STDOUT>
|
|
51
|
|
52 =back
|
|
53
|
|
54 =head1 OUTPUT
|
|
55
|
|
56 =over 20
|
|
57
|
|
58 =item C<STDOUT>
|
|
59
|
|
60 The reverse complemented sequence file is printed to C<STDOUT>.
|
|
61 Redirect or pipe into another tool as needed.
|
|
62
|
|
63 =back
|
|
64
|
|
65 =head1 EXAMPLES
|
|
66
|
|
67 =over
|
|
68
|
|
69 =item C<perl revcom_seq.pl -o gbk seq-file.embl E<gt>
|
|
70 seq-file_revcom.gbk>
|
|
71
|
|
72 =back
|
|
73
|
|
74 B<or>
|
|
75
|
|
76 =over
|
|
77
|
|
78 =item C<for file in *.embl; do perl revcom_seq.pl -o fasta "$file"
|
|
79 E<gt> "${file%.embl}"_revcom.fasta; done>
|
|
80
|
|
81 =back
|
|
82
|
|
83 =head1 DEPENDENCIES
|
|
84
|
|
85 =over
|
|
86
|
|
87 =item B<L<BioPerl|http://www.bioperl.org>>
|
|
88
|
|
89 Tested with BioPerl version 1.007001
|
|
90
|
|
91 =back
|
|
92
|
|
93 =head1 VERSION
|
|
94
|
|
95 0.2 update: 2015-12-10
|
|
96 0.1 2013-08-02
|
|
97
|
|
98 =head1 AUTHOR
|
|
99
|
|
100 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
101
|
|
102 =head1 LICENSE
|
|
103
|
|
104 This program is free software: you can redistribute it and/or modify
|
|
105 it under the terms of the GNU General Public License as published by
|
|
106 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
107 License, or (at your option) any later version.
|
|
108
|
|
109 This program is distributed in the hope that it will be useful, but
|
|
110 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
111 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
112 General Public License for more details.
|
|
113
|
|
114 You should have received a copy of the GNU General Public License
|
|
115 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
116
|
|
117 =cut
|
|
118
|
|
119
|
|
120 ########
|
|
121 # MAIN #
|
|
122 ########
|
|
123
|
|
124 use strict;
|
|
125 use warnings;
|
|
126 use autodie;
|
|
127 use Getopt::Long;
|
|
128 use Pod::Usage;
|
|
129 use Bio::SeqIO; # bioperl module to handle sequence input/output
|
|
130 #use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
|
|
131 #use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited
|
|
132
|
|
133 ### Get options with Getopt::Long
|
|
134 my $In_Format; # input seq file format needed for STDIN
|
|
135 my $Out_Format; # optional different output seq file format
|
|
136 my $VERSION = 0.2;
|
|
137 my ($Opt_Version, $Opt_Help);
|
|
138 GetOptions ('informat=s' => \$In_Format,
|
|
139 'outformat=s' => \$Out_Format,
|
|
140 'version' => \$Opt_Version,
|
|
141 'help|?' => \$Opt_Help)
|
|
142 or pod2usage(-verbose => 1, -exitval => 2);
|
|
143
|
|
144
|
|
145
|
|
146 ### Run perldoc on POD
|
|
147 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
148 if ($Opt_Version) {
|
|
149 print "$0 $VERSION\n";
|
|
150 exit;
|
|
151 }
|
|
152
|
|
153
|
|
154
|
|
155 ### Check input (@ARGV and STDIN)
|
|
156 if (-t STDIN && ! @ARGV) {
|
|
157 my $warning = "\n### Fatal error: No STDIN and no input file given as argument, please supply one of them and/or see help with '-h'!\n";
|
|
158 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
|
|
159 } elsif (!-t STDIN && @ARGV) {
|
|
160 my $warning = "\n### Fatal error: Both STDIN and an input file given as argument, please supply only either one and/or see help with '-h'!\n";
|
|
161 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
|
|
162 }
|
|
163 die "\n### Fatal error: Too many arguments given, only STDIN or one input file allowed as argument! Please see the usage with option '-h' if unclear!\n" if (@ARGV > 1);
|
|
164 die "\n### Fatal error: File '$ARGV[0]' does not exist!\n" if (@ARGV && $ARGV[0] ne '-' && !-e $ARGV[0]);
|
|
165
|
|
166
|
|
167
|
|
168 ### Bio::SeqIO objects for input and output
|
|
169 print STDERR "\nReverse complementing";
|
|
170 my $Seqin; # Bio::SeqIO object
|
|
171 if (-t STDIN) { # input from file
|
|
172 warn "\n### Warning: Ignoring input file format ('-i $In_Format'), because input file given and not STDIN!\n\n" if ($In_Format);
|
|
173 my $seq_file = shift;
|
|
174 $Seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
|
|
175 print STDERR " '$seq_file' ";
|
|
176 } elsif (!-t STDIN) { # input from STDIN
|
|
177 die "\n### Fatal error: Sequence file given as STDIN requires an input file format, please set one with option '-i' and/or see help with '-h'!\n" if (!$In_Format);
|
|
178 $In_Format = 'genbank' if ($In_Format =~ /(gbk|gb)/i); # allow shorter format string for 'genbank'
|
|
179 $Seqin = Bio::SeqIO->new(-fh => \*STDIN, -format => $In_Format); # capture typeglob of STDIN, requires '-format'
|
|
180 print STDERR " input file ";
|
|
181 }
|
|
182 print STDERR "...\n";
|
|
183
|
|
184 my $Seqout; # Bio::SeqIO object
|
|
185 if ($Out_Format) {
|
|
186 $Out_Format = 'genbank' if ($Out_Format =~ /(gbk|gb)/i);
|
|
187 } else { # same format as input file
|
|
188 if (!-t STDIN) {
|
|
189 $Out_Format = $In_Format;
|
|
190 } else {
|
|
191 if (ref($Seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
|
|
192 $Out_Format = $1;
|
|
193 } else {
|
|
194 die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
|
|
195 }
|
|
196 }
|
|
197 }
|
|
198 $Seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $Out_Format); # printing to STDOUT requires '-format'
|
|
199
|
|
200
|
|
201 ### Write reverse complemented sequence (and its features) to STDOUT
|
|
202 while (my $seq_obj = $Seqin->next_seq) { # Bio::Seq object; for multi-seq files
|
|
203 my $revcom = Bio::SeqUtils->revcom_with_features($seq_obj);
|
|
204 $Seqout->write_seq($revcom);
|
|
205 }
|
|
206
|
|
207 exit;
|