comparison COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
comparison
equal deleted inserted replaced
2:97e4e3e818b6 3:e42d30da7a74
1 #!/usr/bin/perl
2
3 #######
4 # POD #
5 #######
6
7 =pod
8
9 =head1 NAME
10
11 C<trunc_seq.pl> - truncate sequence files
12
13 =head1 SYNOPSIS
14
15 C<perl trunc_seq.pl 20 3500 seq-file.embl E<gt>
16 seq-file_trunc_20_3500.embl>
17
18 B<or>
19
20 C<perl trunc_seq.pl file_of_filenames_and_coords.tsv>
21
22 =head1 DESCRIPTION
23
24 This script truncates sequence files according to the given
25 coordinates. The features/annotations in RichSeq files (e.g. EMBL or
26 GENBANK format) will also be adapted accordingly. Use option B<-o> to
27 specify a different output sequence format. Input can be given directly
28 as a file and truncation coordinates to the script, with the start
29 position as the first argument, stop as the second and (the path to)
30 the sequence file as the third. In this case the truncated sequence
31 entry is printed to C<STDOUT>. Input sequence files should contain only
32 one sequence entry, if a multi-sequence file is used as input only the
33 B<first> sequence entry is truncated.
34
35 Alternatively, a file of filenames (fof) with respective coordinates
36 and sequence files in the following B<tab-separated> format can be
37 given to the script (the header is optional):
38
39 #start\tstop\tseq-file
40 300\t9000\t(path/to/)seq-file
41 50\t1300\t(path/to/)seq-file2
42
43 With a fof the resulting truncated sequence files are printed into a
44 results directory. Use option B<-r> to specify a different results
45 directory than the default.
46
47 It is also possible to truncate a RichSeq sequence file loaded into the
48 L<Artemis|http://www.sanger.ac.uk/science/tools/artemis> genome browser
49 from the Sanger Institute: Select a subsequence and then go to Edit
50 -E<gt> Subsequence (and Features)
51
52 =head1 OPTIONS
53
54 =over 20
55
56 =item B<-h>, B<-help>
57
58 Help (perldoc POD)
59
60 =item B<-o>=I<str>, B<-outformat>=I<str>
61
62 Specify different sequence format for the output (files) [fasta, embl,
63 or gbk]
64
65 =item B<-r>=I<str>, B<-result_dir>=I<str>
66
67 Path to result folder for fof input [default = './trunc_seq_results']
68
69 =item B<-v>, B<-version>
70
71 Print version number to C<STDOUT>
72
73 =back
74
75 =head1 OUTPUT
76
77 =over 20
78
79 =item C<STDOUT>
80
81 If a single sequence file is given to the script the truncated sequence
82 file is printed to C<STDOUT>. Redirect or pipe into another tool as
83 needed.
84
85 =back
86
87 B<or>
88
89 =over 20
90
91 =item F<./trunc_seq_results>
92
93 If a fof is given to the script, all output files are stored in a
94 results folder
95
96 =item F<./trunc_seq_results/seq-file_trunc_start_stop.format>
97
98 Truncated output sequence files are named appended with 'trunc' and the
99 corresponding start and stop positions
100
101 =back
102
103 =head1 EXAMPLES
104
105 =over
106
107 =item C<perl trunc_seq.pl -o gbk 120 30000 seq-file.embl E<gt>
108 seq-file_trunc_120_3000.gbk>
109
110 =back
111
112 B<or>
113
114 =over
115
116 =item C<perl trunc_seq.pl -o fasta 5300 18500 seq-file.gbk | perl
117 revcom_seq.pl -i fasta E<gt> seq-file_trunc_revcom.fasta>
118
119 =back
120
121 B<or>
122
123 =over
124
125 =item C<perl trunc_seq.pl -r path/to/trunc_embl_dir -o embl
126 file_of_filenames_and_coords.tsv>
127
128 =back
129
130 =head1 DEPENDENCIES
131
132 =over
133
134 =item B<L<BioPerl|http://www.bioperl.org>>
135
136 Tested with BioPerl version 1.007001
137
138 =back
139
140 =head1 VERSION
141
142 0.2 update: 2015-12-07
143 0.1 2013-08-02
144
145 =head1 AUTHOR
146
147 Andreas Leimbach aleimba[at]gmx[dot]de
148
149 =head1 LICENSE
150
151 This program is free software: you can redistribute it and/or modify
152 it under the terms of the GNU General Public License as published by
153 the Free Software Foundation; either version 3 (GPLv3) of the
154 License, or (at your option) any later version.
155
156 This program is distributed in the hope that it will be useful, but
157 WITHOUT ANY WARRANTY; without even the implied warranty of
158 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
159 General Public License for more details.
160
161 You should have received a copy of the GNU General Public License
162 along with this program. If not, see L<http://www.gnu.org/licenses/>.
163
164 =cut
165
166
167 ########
168 # MAIN #
169 ########
170
171 use strict;
172 use warnings;
173 use autodie;
174 use Getopt::Long;
175 use Pod::Usage;
176 use Bio::SeqIO; # bioperl module to handle sequence input/output
177 #use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
178 #use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited
179
180 ### Get options with Getopt::Long
181 my $Out_Format_Opt; # optional different output seq file format
182 my $Result_Dir = 'trunc_seq_results'; # path to result folder for fof input
183 my $VERSION = 0.2;
184 my ($Opt_Version, $Opt_Help);
185 GetOptions ('outformat=s' => \$Out_Format_Opt,
186 'result_dir=s' => \$Result_Dir,
187 'version' => \$Opt_Version,
188 'help|?' => \$Opt_Help)
189 or pod2usage(-verbose => 1, -exitval => 2);
190
191
192
193 ### Run perldoc on POD
194 pod2usage(-verbose => 2) if ($Opt_Help);
195 if ($Opt_Version) {
196 print "$0 $VERSION\n";
197 exit;
198 }
199
200
201
202 ### Check input (@ARGV); didn't include STDIN as input option, too complicated here with fof etc.
203 my $Fof; # file of filenames (fof) with truncation coords
204 my $Start;
205 my $Stop;
206 my $Seq_File;
207 if (@ARGV < 1 || @ARGV == 2 || @ARGV > 3) {
208 my $warning = "\n### Fatal error: Give either three arguments,\n$0\tstart\tstop\tseq-file\nor one file of sequence filenames with truncation coordinates as argument! Please see the usage with option '-h' if unclear!\n";
209 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
210 } elsif (@ARGV == 1) { # fof
211 check_file_exists($ARGV[0]); # subroutine to check for file existence
212 $Fof = shift;
213 } elsif (@ARGV == 3) {
214 check_file_exists($ARGV[2]); # subroutine
215 if ($ARGV[0] !~ /^\d+$/ || $ARGV[1] !~ /^\d+$/) {
216 my $warning = "\n### Fatal error: With a single sequence file input the first and second arguments are the start and stop positions for truncation, and need to include ONLY digits:\n$0\tstart\tstop\tseq-file\nPlease see the usage with option '-h' if unclear!\n";
217 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
218 }
219 ($Start, $Stop, $Seq_File) = @ARGV;
220 }
221
222
223
224 ### Truncate the sequence and write either to STDOUT for single seq file input or output files for fof
225 if ($Fof) {
226 open (my $fof_fh, "<", "$Fof");
227
228 # create result folder
229 $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
230 if (-e $Result_Dir) {
231 empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
232 } else {
233 mkdir $Result_Dir;
234 }
235
236 while (my $line = <$fof_fh>) {
237 chomp $line;
238 next if ($line =~ /^\s*$/ || $line =~ /^#/); # skip empty or comment lines
239
240 die "\n### Fatal error: Line '$.' of the '$Fof' file of sequence filenames plus truncation coordinates does not include the mandatory tab-separated two NUMERICAL start and stop truncation positions, and the sequence file (without any other whitespaces):\nstart\tstop\tpath/to/seq-file\n" if ($line !~ /^\d+\t\d+\t\S+$/);
241 ($Start, $Stop, $Seq_File) = split(/\t/, $line);
242 check_file_exists($Seq_File); # subroutine
243
244 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO input object and truncate the respective Bio::Seq object
245 my $seqout = seq_out($seqin, $Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO output object, $seqin needed for format guessing, $Start/$Stop/$Seq_File needed for output filenames
246 $seqout->write_seq($truncseq);
247 }
248 close $fof_fh;
249
250 } else { # single seq file, @ARGV == 3
251 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine
252 my $seqout = seq_out($seqin); # subroutine, without $Start/$Stop/$Seq_file for STDOUT output
253 $seqout->write_seq($truncseq);
254 }
255
256 exit;
257
258
259
260 ###############
261 # Subroutines #
262 ###############
263
264 ### Subroutine to check if file exists
265 sub check_file_exists {
266 my $file = shift;
267 die "\n### Fatal error: File '$file' does not exist: $!\n" if (!-e $file);
268 }
269
270
271
272 ### Subroutine to empty a directory with user interaction
273 sub empty_dir {
274 my $dir = shift;
275 print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
276 my $user_ask = <STDIN>;
277 if ($user_ask =~ /y/i) {
278 unlink glob "$dir/*"; # remove all files in results directory
279 } else {
280 die "\nScript abborted!\n";
281 }
282 return 1;
283 }
284
285
286
287 ### Subroutine to create a Bio::SeqIO output object
288 sub seq_out {
289 my ($seqin, $start, $stop, $seq_file) = @_;
290
291 my $out_format; # need to keep $Out_Format_Opt for several seq files with fof
292 if ($Out_Format_Opt) {
293 $Out_Format_Opt = 'genbank' if ($Out_Format_Opt =~ /(gbk|gb)/i); # allow shorter input for GENBANK format
294 $out_format = $Out_Format_Opt;
295 } else { # same format as input file
296 if (ref($seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
297 $out_format = $1;
298 } else {
299 die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
300 }
301 }
302
303 my $seqout; # Bio::SeqIO object
304 if ($seq_file) { # fof
305 $seq_file =~ s/\S+(\/|\\)//; # remove input filepaths, aka 'basename' ('/' for Unix and '\' for Windows)
306 my $file_ext;
307 if ($out_format eq 'genbank') {
308 $file_ext = 'gbk'; # back to shorter file extension for GENBANK format
309 } else {
310 $file_ext = $out_format;
311 }
312 $seq_file =~ s/^(\S+)\.\w+$/$Result_Dir\/$1\_trunc_$start\_$stop\.$file_ext/; # append also result directory to output filename
313 $seqout = Bio::SeqIO->new(-file => ">$seq_file", -format => $out_format);
314
315 } else { # single seq file input
316 $seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $out_format); # printing to STDOUT requires '-format'
317 }
318
319 return $seqout;
320 }
321
322
323
324 ### Subroutine create a Bio::SeqIO input object and truncate the respective Bio::Seq object
325 sub trunc_seq {
326 my ($start, $stop, $seq_file) = @_;
327 print STDERR "\nTruncating \"$seq_file\" to coordinates $start..$stop ...\n";
328 my $seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
329 my $count = 0;
330 my $truncseq;
331 while (my $seq_obj = $seqin->next_seq) { # Bio::Seq object
332 $count++;
333 if ($count > 1) {
334 warn "\n### Warning: More than one sequence entry in sequence file '$seq_file', but only the FIRST sequence entry will be truncated and printed to STDOUT or a result file!\n\n";
335 last;
336 }
337 $truncseq = Bio::SeqUtils->trunc_with_features($seq_obj, $start, $stop);
338 }
339 return ($seqin, $truncseq); # $seqin needed for outformat guessing in subroutine seqout
340 }