3
|
1 #!/usr/bin/perl
|
|
2
|
|
3 #######
|
|
4 # POD #
|
|
5 #######
|
|
6
|
|
7 =pod
|
|
8
|
|
9 =head1 NAME
|
|
10
|
|
11 C<trunc_seq.pl> - truncate sequence files
|
|
12
|
|
13 =head1 SYNOPSIS
|
|
14
|
|
15 C<perl trunc_seq.pl 20 3500 seq-file.embl E<gt>
|
|
16 seq-file_trunc_20_3500.embl>
|
|
17
|
|
18 B<or>
|
|
19
|
|
20 C<perl trunc_seq.pl file_of_filenames_and_coords.tsv>
|
|
21
|
|
22 =head1 DESCRIPTION
|
|
23
|
|
24 This script truncates sequence files according to the given
|
|
25 coordinates. The features/annotations in RichSeq files (e.g. EMBL or
|
|
26 GENBANK format) will also be adapted accordingly. Use option B<-o> to
|
|
27 specify a different output sequence format. Input can be given directly
|
|
28 as a file and truncation coordinates to the script, with the start
|
|
29 position as the first argument, stop as the second and (the path to)
|
|
30 the sequence file as the third. In this case the truncated sequence
|
|
31 entry is printed to C<STDOUT>. Input sequence files should contain only
|
|
32 one sequence entry, if a multi-sequence file is used as input only the
|
|
33 B<first> sequence entry is truncated.
|
|
34
|
|
35 Alternatively, a file of filenames (fof) with respective coordinates
|
|
36 and sequence files in the following B<tab-separated> format can be
|
|
37 given to the script (the header is optional):
|
|
38
|
|
39 #start\tstop\tseq-file
|
|
40 300\t9000\t(path/to/)seq-file
|
|
41 50\t1300\t(path/to/)seq-file2
|
|
42
|
|
43 With a fof the resulting truncated sequence files are printed into a
|
|
44 results directory. Use option B<-r> to specify a different results
|
|
45 directory than the default.
|
|
46
|
|
47 It is also possible to truncate a RichSeq sequence file loaded into the
|
|
48 L<Artemis|http://www.sanger.ac.uk/science/tools/artemis> genome browser
|
|
49 from the Sanger Institute: Select a subsequence and then go to Edit
|
|
50 -E<gt> Subsequence (and Features)
|
|
51
|
|
52 =head1 OPTIONS
|
|
53
|
|
54 =over 20
|
|
55
|
|
56 =item B<-h>, B<-help>
|
|
57
|
|
58 Help (perldoc POD)
|
|
59
|
|
60 =item B<-o>=I<str>, B<-outformat>=I<str>
|
|
61
|
|
62 Specify different sequence format for the output (files) [fasta, embl,
|
|
63 or gbk]
|
|
64
|
|
65 =item B<-r>=I<str>, B<-result_dir>=I<str>
|
|
66
|
|
67 Path to result folder for fof input [default = './trunc_seq_results']
|
|
68
|
|
69 =item B<-v>, B<-version>
|
|
70
|
|
71 Print version number to C<STDOUT>
|
|
72
|
|
73 =back
|
|
74
|
|
75 =head1 OUTPUT
|
|
76
|
|
77 =over 20
|
|
78
|
|
79 =item C<STDOUT>
|
|
80
|
|
81 If a single sequence file is given to the script the truncated sequence
|
|
82 file is printed to C<STDOUT>. Redirect or pipe into another tool as
|
|
83 needed.
|
|
84
|
|
85 =back
|
|
86
|
|
87 B<or>
|
|
88
|
|
89 =over 20
|
|
90
|
|
91 =item F<./trunc_seq_results>
|
|
92
|
|
93 If a fof is given to the script, all output files are stored in a
|
|
94 results folder
|
|
95
|
|
96 =item F<./trunc_seq_results/seq-file_trunc_start_stop.format>
|
|
97
|
|
98 Truncated output sequence files are named appended with 'trunc' and the
|
|
99 corresponding start and stop positions
|
|
100
|
|
101 =back
|
|
102
|
|
103 =head1 EXAMPLES
|
|
104
|
|
105 =over
|
|
106
|
|
107 =item C<perl trunc_seq.pl -o gbk 120 30000 seq-file.embl E<gt>
|
|
108 seq-file_trunc_120_3000.gbk>
|
|
109
|
|
110 =back
|
|
111
|
|
112 B<or>
|
|
113
|
|
114 =over
|
|
115
|
|
116 =item C<perl trunc_seq.pl -o fasta 5300 18500 seq-file.gbk | perl
|
|
117 revcom_seq.pl -i fasta E<gt> seq-file_trunc_revcom.fasta>
|
|
118
|
|
119 =back
|
|
120
|
|
121 B<or>
|
|
122
|
|
123 =over
|
|
124
|
|
125 =item C<perl trunc_seq.pl -r path/to/trunc_embl_dir -o embl
|
|
126 file_of_filenames_and_coords.tsv>
|
|
127
|
|
128 =back
|
|
129
|
|
130 =head1 DEPENDENCIES
|
|
131
|
|
132 =over
|
|
133
|
|
134 =item B<L<BioPerl|http://www.bioperl.org>>
|
|
135
|
|
136 Tested with BioPerl version 1.007001
|
|
137
|
|
138 =back
|
|
139
|
|
140 =head1 VERSION
|
|
141
|
|
142 0.2 update: 2015-12-07
|
|
143 0.1 2013-08-02
|
|
144
|
|
145 =head1 AUTHOR
|
|
146
|
|
147 Andreas Leimbach aleimba[at]gmx[dot]de
|
|
148
|
|
149 =head1 LICENSE
|
|
150
|
|
151 This program is free software: you can redistribute it and/or modify
|
|
152 it under the terms of the GNU General Public License as published by
|
|
153 the Free Software Foundation; either version 3 (GPLv3) of the
|
|
154 License, or (at your option) any later version.
|
|
155
|
|
156 This program is distributed in the hope that it will be useful, but
|
|
157 WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
158 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
159 General Public License for more details.
|
|
160
|
|
161 You should have received a copy of the GNU General Public License
|
|
162 along with this program. If not, see L<http://www.gnu.org/licenses/>.
|
|
163
|
|
164 =cut
|
|
165
|
|
166
|
|
167 ########
|
|
168 # MAIN #
|
|
169 ########
|
|
170
|
|
171 use strict;
|
|
172 use warnings;
|
|
173 use autodie;
|
|
174 use Getopt::Long;
|
|
175 use Pod::Usage;
|
|
176 use Bio::SeqIO; # bioperl module to handle sequence input/output
|
|
177 #use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
|
|
178 #use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited
|
|
179
|
|
180 ### Get options with Getopt::Long
|
|
181 my $Out_Format_Opt; # optional different output seq file format
|
|
182 my $Result_Dir = 'trunc_seq_results'; # path to result folder for fof input
|
|
183 my $VERSION = 0.2;
|
|
184 my ($Opt_Version, $Opt_Help);
|
|
185 GetOptions ('outformat=s' => \$Out_Format_Opt,
|
|
186 'result_dir=s' => \$Result_Dir,
|
|
187 'version' => \$Opt_Version,
|
|
188 'help|?' => \$Opt_Help)
|
|
189 or pod2usage(-verbose => 1, -exitval => 2);
|
|
190
|
|
191
|
|
192
|
|
193 ### Run perldoc on POD
|
|
194 pod2usage(-verbose => 2) if ($Opt_Help);
|
|
195 if ($Opt_Version) {
|
|
196 print "$0 $VERSION\n";
|
|
197 exit;
|
|
198 }
|
|
199
|
|
200
|
|
201
|
|
202 ### Check input (@ARGV); didn't include STDIN as input option, too complicated here with fof etc.
|
|
203 my $Fof; # file of filenames (fof) with truncation coords
|
|
204 my $Start;
|
|
205 my $Stop;
|
|
206 my $Seq_File;
|
|
207 if (@ARGV < 1 || @ARGV == 2 || @ARGV > 3) {
|
|
208 my $warning = "\n### Fatal error: Give either three arguments,\n$0\tstart\tstop\tseq-file\nor one file of sequence filenames with truncation coordinates as argument! Please see the usage with option '-h' if unclear!\n";
|
|
209 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
|
|
210 } elsif (@ARGV == 1) { # fof
|
|
211 check_file_exists($ARGV[0]); # subroutine to check for file existence
|
|
212 $Fof = shift;
|
|
213 } elsif (@ARGV == 3) {
|
|
214 check_file_exists($ARGV[2]); # subroutine
|
|
215 if ($ARGV[0] !~ /^\d+$/ || $ARGV[1] !~ /^\d+$/) {
|
|
216 my $warning = "\n### Fatal error: With a single sequence file input the first and second arguments are the start and stop positions for truncation, and need to include ONLY digits:\n$0\tstart\tstop\tseq-file\nPlease see the usage with option '-h' if unclear!\n";
|
|
217 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
|
|
218 }
|
|
219 ($Start, $Stop, $Seq_File) = @ARGV;
|
|
220 }
|
|
221
|
|
222
|
|
223
|
|
224 ### Truncate the sequence and write either to STDOUT for single seq file input or output files for fof
|
|
225 if ($Fof) {
|
|
226 open (my $fof_fh, "<", "$Fof");
|
|
227
|
|
228 # create result folder
|
|
229 $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
|
|
230 if (-e $Result_Dir) {
|
|
231 empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
|
|
232 } else {
|
|
233 mkdir $Result_Dir;
|
|
234 }
|
|
235
|
|
236 while (my $line = <$fof_fh>) {
|
|
237 chomp $line;
|
|
238 next if ($line =~ /^\s*$/ || $line =~ /^#/); # skip empty or comment lines
|
|
239
|
|
240 die "\n### Fatal error: Line '$.' of the '$Fof' file of sequence filenames plus truncation coordinates does not include the mandatory tab-separated two NUMERICAL start and stop truncation positions, and the sequence file (without any other whitespaces):\nstart\tstop\tpath/to/seq-file\n" if ($line !~ /^\d+\t\d+\t\S+$/);
|
|
241 ($Start, $Stop, $Seq_File) = split(/\t/, $line);
|
|
242 check_file_exists($Seq_File); # subroutine
|
|
243
|
|
244 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO input object and truncate the respective Bio::Seq object
|
|
245 my $seqout = seq_out($seqin, $Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO output object, $seqin needed for format guessing, $Start/$Stop/$Seq_File needed for output filenames
|
|
246 $seqout->write_seq($truncseq);
|
|
247 }
|
|
248 close $fof_fh;
|
|
249
|
|
250 } else { # single seq file, @ARGV == 3
|
|
251 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine
|
|
252 my $seqout = seq_out($seqin); # subroutine, without $Start/$Stop/$Seq_file for STDOUT output
|
|
253 $seqout->write_seq($truncseq);
|
|
254 }
|
|
255
|
|
256 exit;
|
|
257
|
|
258
|
|
259
|
|
260 ###############
|
|
261 # Subroutines #
|
|
262 ###############
|
|
263
|
|
264 ### Subroutine to check if file exists
|
|
265 sub check_file_exists {
|
|
266 my $file = shift;
|
|
267 die "\n### Fatal error: File '$file' does not exist: $!\n" if (!-e $file);
|
|
268 }
|
|
269
|
|
270
|
|
271
|
|
272 ### Subroutine to empty a directory with user interaction
|
|
273 sub empty_dir {
|
|
274 my $dir = shift;
|
|
275 print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
|
|
276 my $user_ask = <STDIN>;
|
|
277 if ($user_ask =~ /y/i) {
|
|
278 unlink glob "$dir/*"; # remove all files in results directory
|
|
279 } else {
|
|
280 die "\nScript abborted!\n";
|
|
281 }
|
|
282 return 1;
|
|
283 }
|
|
284
|
|
285
|
|
286
|
|
287 ### Subroutine to create a Bio::SeqIO output object
|
|
288 sub seq_out {
|
|
289 my ($seqin, $start, $stop, $seq_file) = @_;
|
|
290
|
|
291 my $out_format; # need to keep $Out_Format_Opt for several seq files with fof
|
|
292 if ($Out_Format_Opt) {
|
|
293 $Out_Format_Opt = 'genbank' if ($Out_Format_Opt =~ /(gbk|gb)/i); # allow shorter input for GENBANK format
|
|
294 $out_format = $Out_Format_Opt;
|
|
295 } else { # same format as input file
|
|
296 if (ref($seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
|
|
297 $out_format = $1;
|
|
298 } else {
|
|
299 die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
|
|
300 }
|
|
301 }
|
|
302
|
|
303 my $seqout; # Bio::SeqIO object
|
|
304 if ($seq_file) { # fof
|
|
305 $seq_file =~ s/\S+(\/|\\)//; # remove input filepaths, aka 'basename' ('/' for Unix and '\' for Windows)
|
|
306 my $file_ext;
|
|
307 if ($out_format eq 'genbank') {
|
|
308 $file_ext = 'gbk'; # back to shorter file extension for GENBANK format
|
|
309 } else {
|
|
310 $file_ext = $out_format;
|
|
311 }
|
|
312 $seq_file =~ s/^(\S+)\.\w+$/$Result_Dir\/$1\_trunc_$start\_$stop\.$file_ext/; # append also result directory to output filename
|
|
313 $seqout = Bio::SeqIO->new(-file => ">$seq_file", -format => $out_format);
|
|
314
|
|
315 } else { # single seq file input
|
|
316 $seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $out_format); # printing to STDOUT requires '-format'
|
|
317 }
|
|
318
|
|
319 return $seqout;
|
|
320 }
|
|
321
|
|
322
|
|
323
|
|
324 ### Subroutine create a Bio::SeqIO input object and truncate the respective Bio::Seq object
|
|
325 sub trunc_seq {
|
|
326 my ($start, $stop, $seq_file) = @_;
|
|
327 print STDERR "\nTruncating \"$seq_file\" to coordinates $start..$stop ...\n";
|
|
328 my $seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
|
|
329 my $count = 0;
|
|
330 my $truncseq;
|
|
331 while (my $seq_obj = $seqin->next_seq) { # Bio::Seq object
|
|
332 $count++;
|
|
333 if ($count > 1) {
|
|
334 warn "\n### Warning: More than one sequence entry in sequence file '$seq_file', but only the FIRST sequence entry will be truncated and printed to STDOUT or a result file!\n\n";
|
|
335 last;
|
|
336 }
|
|
337 $truncseq = Bio::SeqUtils->trunc_with_features($seq_obj, $start, $stop);
|
|
338 }
|
|
339 return ($seqin, $truncseq); # $seqin needed for outformat guessing in subroutine seqout
|
|
340 }
|