annotate COG/bac-genomics-scripts/revcom_seq/revcom_seq.pl @ 10:d103c41b6931 draft

Uploaded
author dereeper
date Thu, 30 May 2024 16:35:22 +0000
parents e42d30da7a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 # POD #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 =pod
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 =head1 NAME
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 C<revcom_seq.pl> - reverse complement (multi-)sequence files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 =head1 SYNOPSIS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 C<perl revcom_seq.pl seq-file.embl E<gt> seq-file_revcom.embl>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19 C<perl cat_seq.pl multi-seq_file.embl | perl revcom_seq.pl -i embl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 E<gt> seq_file_cat_revcom.embl>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 =head1 DESCRIPTION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 This script reverse complements (multi-)sequence files. The
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 features/annotations in RichSeq files (e.g. EMBL or GENBANK format)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 will also be adapted accordingly. Use option B<-o> to specify a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 different output sequence format. Input files can be given directly via
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 C<STDIN> or as a file. If C<STDIN> is used, the input sequence file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 format has to be given with option B<-i>. Be careful to set the correct
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 input format.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 =head1 OPTIONS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 =item B<-h>, B<-help>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38 Help (perldoc POD)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 =item B<-o>=I<str>, B<-outformat>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42 Specify a different sequence format for the output [fasta, embl, or gbk]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 =item B<-i>=I<str>, B<-informat>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46 Specify the input sequence file format, only needed for C<STDIN> input
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 =item B<-v>, B<-version>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 Print version number to C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 =head1 OUTPUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 =item C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 The reverse complemented sequence file is printed to C<STDOUT>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61 Redirect or pipe into another tool as needed.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 =head1 EXAMPLES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 =item C<perl revcom_seq.pl -o gbk seq-file.embl E<gt>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70 seq-file_revcom.gbk>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78 =item C<for file in *.embl; do perl revcom_seq.pl -o fasta "$file"
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 E<gt> "${file%.embl}"_revcom.fasta; done>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 =head1 DEPENDENCIES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 =item B<L<BioPerl|http://www.bioperl.org>>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 Tested with BioPerl version 1.007001
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 =head1 VERSION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95 0.2 update: 2015-12-10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 0.1 2013-08-02
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 =head1 AUTHOR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100 Andreas Leimbach aleimba[at]gmx[dot]de
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102 =head1 LICENSE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104 This program is free software: you can redistribute it and/or modify
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 it under the terms of the GNU General Public License as published by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106 the Free Software Foundation; either version 3 (GPLv3) of the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 License, or (at your option) any later version.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109 This program is distributed in the hope that it will be useful, but
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 WITHOUT ANY WARRANTY; without even the implied warranty of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 General Public License for more details.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 You should have received a copy of the GNU General Public License
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115 along with this program. If not, see L<http://www.gnu.org/licenses/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 =cut
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 # MAIN #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 use autodie;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127 use Getopt::Long;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 use Pod::Usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129 use Bio::SeqIO; # bioperl module to handle sequence input/output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 #use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131 use Bio::SeqUtils; # bioperl module providing revcom_with_features(), which is called below as a class method; loaded explicitly instead of relying on another BioPerl module pulling it in
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133 ### Get options with Getopt::Long
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 my $In_Format; # input seq file format needed for STDIN
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135 my $Out_Format; # optional different output seq file format
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 my $VERSION = 0.2;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137 my ($Opt_Version, $Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 GetOptions ('informat=s' => \$In_Format,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139 'outformat=s' => \$Out_Format,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 'version' => \$Opt_Version,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141 'help|?' => \$Opt_Help)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 or pod2usage(-verbose => 1, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146 ### Run perldoc on POD
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 pod2usage(-verbose => 2) if ($Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148 if ($Opt_Version) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 print "$0 $VERSION\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150 exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155 ### Check input (@ARGV and STDIN)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 if (-t STDIN && ! @ARGV) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 my $warning = "\n### Fatal error: No STDIN and no input file given as argument, please supply one of them and/or see help with '-h'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 } elsif (!-t STDIN && @ARGV) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160 my $warning = "\n### Fatal error: Both STDIN and an input file given as argument, please supply only either one and/or see help with '-h'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163 die "\n### Fatal error: Too many arguments given, only STDIN or one input file allowed as argument! Please see the usage with option '-h' if unclear!\n" if (@ARGV > 1);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 die "\n### Fatal error: File '$ARGV[0]' does not exist!\n" if (@ARGV && $ARGV[0] ne '-' && !-e $ARGV[0]);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 ### Bio::SeqIO objects for input and output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 print STDERR "\nReverse complementing";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170 my $Seqin; # Bio::SeqIO object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 if (-t STDIN) { # input from file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 warn "\n### Warning: Ignoring input file format ('-i $In_Format'), because input file given and not STDIN!\n\n" if ($In_Format);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 my $seq_file = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 $Seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 print STDERR " '$seq_file' ";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 } elsif (!-t STDIN) { # input from STDIN
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 die "\n### Fatal error: Sequence file given as STDIN requires an input file format, please set one with option '-i' and/or see help with '-h'!\n" if (!$In_Format);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 $In_Format = 'genbank' if ($In_Format =~ /\A(?:gbk|gb)\z/i); # allow 'gbk'/'gb' as short aliases for 'genbank'; anchored so that only the whole string matches and other format names containing "gb" (e.g. 'gbxml') are passed to Bio::SeqIO unchanged
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179 $Seqin = Bio::SeqIO->new(-fh => \*STDIN, -format => $In_Format); # capture typeglob of STDIN, requires '-format'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 print STDERR " input file ";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 print STDERR "...\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 my $Seqout; # Bio::SeqIO object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 if ($Out_Format) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 $Out_Format = 'genbank' if ($Out_Format =~ /\A(?:gbk|gb)\z/i); # allow 'gbk'/'gb' as short aliases for 'genbank'; anchored so that only the whole string matches and other format names containing "gb" (e.g. 'gbxml') are passed to Bio::SeqIO unchanged
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 } else { # same format as input file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188 if (!-t STDIN) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 $Out_Format = $In_Format;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191 if (ref($Seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192 $Out_Format = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194 die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198 $Seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $Out_Format); # printing to STDOUT requires '-format'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201 ### Write reverse complemented sequence (and its features) to STDOUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202 while (my $seq_obj = $Seqin->next_seq) { # Bio::Seq object; for multi-seq files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203 my $revcom = Bio::SeqUtils->revcom_with_features($seq_obj);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204 $Seqout->write_seq($revcom);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207 exit;