annotate COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl @ 3:e42d30da7a74 draft

Uploaded
author dereeper
date Thu, 30 May 2024 11:52:25 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
1 #!/usr/bin/perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
3 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
4 # POD #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
5 #######
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
6
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
7 =pod
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
8
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
9 =head1 NAME
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
10
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
11 C<trunc_seq.pl> - truncate sequence files
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
12
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
13 =head1 SYNOPSIS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
14
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
15 C<perl trunc_seq.pl 20 3500 seq-file.embl E<gt>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
16 seq-file_trunc_20_3500.embl>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
17
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
18 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
19
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
20 C<perl trunc_seq.pl file_of_filenames_and_coords.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
21
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
22 =head1 DESCRIPTION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
23
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
24 This script truncates sequence files according to the given
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
25 coordinates. The features/annotations in RichSeq files (e.g. EMBL or
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
26 GENBANK format) will also be adapted accordingly. Use option B<-o> to
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
27 specify a different output sequence format. Input can be given directly
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
28 as a file and truncation coordinates to the script, with the start
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
29 position as the first argument, stop as the second and (the path to)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
30 the sequence file as the third. In this case the truncated sequence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
31 entry is printed to C<STDOUT>. Input sequence files should contain only
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
32 one sequence entry, if a multi-sequence file is used as input only the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
33 B<first> sequence entry is truncated.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
34
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
35 Alternatively, a file of filenames (fof) with respective coordinates
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
36 and sequence files in the following B<tab-separated> format can be
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
37 given to the script (the header is optional):
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
38
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
39 #start\tstop\tseq-file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
40 300\t9000\t(path/to/)seq-file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
41 50\t1300\t(path/to/)seq-file2
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
42
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
43 With a fof the resulting truncated sequence files are printed into a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
44 results directory. Use option B<-r> to specify a different results
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
45 directory than the default.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
46
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
47 It is also possible to truncate a RichSeq sequence file loaded into the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
48 L<Artemis|http://www.sanger.ac.uk/science/tools/artemis> genome browser
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
49 from the Sanger Institute: Select a subsequence and then go to Edit
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
50 -E<gt> Subsequence (and Features)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
51
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
52 =head1 OPTIONS
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
53
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
54 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
55
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
56 =item B<-h>, B<-help>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
57
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
58 Help (perldoc POD)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
59
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
60 =item B<-o>=I<str>, B<-outformat>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
61
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
62 Specify different sequence format for the output (files) [fasta, embl,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
63 or gbk]
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
64
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
65 =item B<-r>=I<str>, B<-result_dir>=I<str>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
66
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
67 Path to result folder for fof input [default = './trunc_seq_results']
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
68
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
69 =item B<-v>, B<-version>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
70
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
71 Print version number to C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
72
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
73 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
74
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
75 =head1 OUTPUT
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
76
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
77 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
78
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
79 =item C<STDOUT>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
80
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
81 If a single sequence file is given to the script the truncated sequence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
82 file is printed to C<STDOUT>. Redirect or pipe into another tool as
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
83 needed.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
84
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
85 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
86
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
87 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
88
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
89 =over 20
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
90
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
91 =item F<./trunc_seq_results>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
92
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
93 If a fof is given to the script, all output files are stored in a
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
94 results folder
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
95
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
96 =item F<./trunc_seq_results/seq-file_trunc_start_stop.format>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
97
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
98 Truncated output sequence files are named appended with 'trunc' and the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
99 corresponding start and stop positions
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
100
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
101 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
102
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
103 =head1 EXAMPLES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
104
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
105 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
106
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
107 =item C<perl trunc_seq.pl -o gbk 120 30000 seq-file.embl E<gt>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
108 seq-file_trunc_120_3000.gbk>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
109
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
110 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
111
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
112 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
113
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
114 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
115
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
116 =item C<perl trunc_seq.pl -o fasta 5300 18500 seq-file.gbk | perl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
117 revcom_seq.pl -i fasta E<gt> seq-file_trunc_revcom.fasta>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
118
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
119 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
120
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
121 B<or>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
122
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
123 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
124
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
125 =item C<perl trunc_seq.pl -r path/to/trunc_embl_dir -o embl
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
126 file_of_filenames_and_coords.tsv>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
127
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
128 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
129
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
130 =head1 DEPENDENCIES
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
131
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
132 =over
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
133
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
134 =item B<L<BioPerl|http://www.bioperl.org>>
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
135
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
136 Tested with BioPerl version 1.007001
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
137
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
138 =back
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
139
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
140 =head1 VERSION
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
141
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
142 0.2 update: 2015-12-07
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
143 0.1 2013-08-02
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
144
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
145 =head1 AUTHOR
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
146
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
147 Andreas Leimbach aleimba[at]gmx[dot]de
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
148
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
149 =head1 LICENSE
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
150
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
151 This program is free software: you can redistribute it and/or modify
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
152 it under the terms of the GNU General Public License as published by
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
153 the Free Software Foundation; either version 3 (GPLv3) of the
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
154 License, or (at your option) any later version.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
155
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
156 This program is distributed in the hope that it will be useful, but
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
157 WITHOUT ANY WARRANTY; without even the implied warranty of
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
158 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
159 General Public License for more details.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
160
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
161 You should have received a copy of the GNU General Public License
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
162 along with this program. If not, see L<http://www.gnu.org/licenses/>.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
163
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
164 =cut
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
165
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
166
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
167 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
168 # MAIN #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
169 ########
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
170
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
171 use strict;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
172 use warnings;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
173 use autodie;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
174 use Getopt::Long;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
175 use Pod::Usage;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
176 use Bio::SeqIO; # bioperl module to handle sequence input/output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
177 #use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
178 #use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
179
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
180 ### Get options with Getopt::Long
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
181 my $Out_Format_Opt; # optional different output seq file format
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
182 my $Result_Dir = 'trunc_seq_results'; # path to result folder for fof input
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
183 my $VERSION = 0.2;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
184 my ($Opt_Version, $Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
185 GetOptions ('outformat=s' => \$Out_Format_Opt,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
186 'result_dir=s' => \$Result_Dir,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
187 'version' => \$Opt_Version,
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
188 'help|?' => \$Opt_Help)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
189 or pod2usage(-verbose => 1, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
190
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
191
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
192
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
193 ### Run perldoc on POD
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
194 pod2usage(-verbose => 2) if ($Opt_Help);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
195 if ($Opt_Version) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
196 print "$0 $VERSION\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
197 exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
198 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
199
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
200
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
201
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
202 ### Check input (@ARGV); didn't include STDIN as input option, too complicated here with fof etc.
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
203 my $Fof; # file of filenames (fof) with truncation coords
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
204 my $Start;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
205 my $Stop;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
206 my $Seq_File;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
207 if (@ARGV < 1 || @ARGV == 2 || @ARGV > 3) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
208 my $warning = "\n### Fatal error: Give either three arguments,\n$0\tstart\tstop\tseq-file\nor one file of sequence filenames with truncation coordinates as argument! Please see the usage with option '-h' if unclear!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
209 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
210 } elsif (@ARGV == 1) { # fof
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
211 check_file_exists($ARGV[0]); # subroutine to check for file existence
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
212 $Fof = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
213 } elsif (@ARGV == 3) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
214 check_file_exists($ARGV[2]); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
215 if ($ARGV[0] !~ /^\d+$/ || $ARGV[1] !~ /^\d+$/) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
216 my $warning = "\n### Fatal error: With a single sequence file input the first and second arguments are the start and stop positions for truncation, and need to include ONLY digits:\n$0\tstart\tstop\tseq-file\nPlease see the usage with option '-h' if unclear!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
217 pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
218 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
219 ($Start, $Stop, $Seq_File) = @ARGV;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
220 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
221
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
222
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
223
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
224 ### Truncate the sequence and write either to STDOUT for single seq file input or output files for fof
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
225 if ($Fof) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
226 open (my $fof_fh, "<", "$Fof");
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
227
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
228 # create result folder
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
229 $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
230 if (-e $Result_Dir) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
231 empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
232 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
233 mkdir $Result_Dir;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
234 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
235
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
236 while (my $line = <$fof_fh>) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
237 chomp $line;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
238 next if ($line =~ /^\s*$/ || $line =~ /^#/); # skip empty or comment lines
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
239
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
240 die "\n### Fatal error: Line '$.' of the '$Fof' file of sequence filenames plus truncation coordinates does not include the mandatory tab-separated two NUMERICAL start and stop truncation positions, and the sequence file (without any other whitespaces):\nstart\tstop\tpath/to/seq-file\n" if ($line !~ /^\d+\t\d+\t\S+$/);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
241 ($Start, $Stop, $Seq_File) = split(/\t/, $line);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
242 check_file_exists($Seq_File); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
243
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
244 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO input object and truncate the respective Bio::Seq object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
245 my $seqout = seq_out($seqin, $Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO output object, $seqin needed for format guessing, $Start/$Stop/$Seq_File needed for output filenames
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
246 $seqout->write_seq($truncseq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
247 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
248 close $fof_fh;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
249
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
250 } else { # single seq file, @ARGV == 3
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
251 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
252 my $seqout = seq_out($seqin); # subroutine, without $Start/$Stop/$Seq_file for STDOUT output
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
253 $seqout->write_seq($truncseq);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
254 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
255
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
256 exit;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
257
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
258
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
259
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
260 ###############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
261 # Subroutines #
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
262 ###############
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
263
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
264 ### Subroutine to check if file exists
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
265 sub check_file_exists {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
266 my $file = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
267 die "\n### Fatal error: File '$file' does not exist: $!\n" if (!-e $file);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
268 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
269
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
270
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
271
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
272 ### Subroutine to empty a directory with user interaction
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
273 sub empty_dir {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
274 my $dir = shift;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
275 print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
276 my $user_ask = <STDIN>;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
277 if ($user_ask =~ /y/i) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
278 unlink glob "$dir/*"; # remove all files in results directory
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
279 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
280 die "\nScript abborted!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
281 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
282 return 1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
283 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
284
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
285
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
286
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
287 ### Subroutine to create a Bio::SeqIO output object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
288 sub seq_out {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
289 my ($seqin, $start, $stop, $seq_file) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
290
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
291 my $out_format; # need to keep $Out_Format_Opt for several seq files with fof
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
292 if ($Out_Format_Opt) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
293 $Out_Format_Opt = 'genbank' if ($Out_Format_Opt =~ /(gbk|gb)/i); # allow shorter input for GENBANK format
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
294 $out_format = $Out_Format_Opt;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
295 } else { # same format as input file
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
296 if (ref($seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
297 $out_format = $1;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
298 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
299 die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
300 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
301 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
302
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
303 my $seqout; # Bio::SeqIO object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
304 if ($seq_file) { # fof
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
305 $seq_file =~ s/\S+(\/|\\)//; # remove input filepaths, aka 'basename' ('/' for Unix and '\' for Windows)
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
306 my $file_ext;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
307 if ($out_format eq 'genbank') {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
308 $file_ext = 'gbk'; # back to shorter file extension for GENBANK format
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
309 } else {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
310 $file_ext = $out_format;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
311 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
312 $seq_file =~ s/^(\S+)\.\w+$/$Result_Dir\/$1\_trunc_$start\_$stop\.$file_ext/; # append also result directory to output filename
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
313 $seqout = Bio::SeqIO->new(-file => ">$seq_file", -format => $out_format);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
314
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
315 } else { # single seq file input
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
316 $seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $out_format); # printing to STDOUT requires '-format'
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
317 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
318
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
319 return $seqout;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
320 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
321
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
322
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
323
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
324 ### Subroutine create a Bio::SeqIO input object and truncate the respective Bio::Seq object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
325 sub trunc_seq {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
326 my ($start, $stop, $seq_file) = @_;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
327 print STDERR "\nTruncating \"$seq_file\" to coordinates $start..$stop ...\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
328 my $seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
329 my $count = 0;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
330 my $truncseq;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
331 while (my $seq_obj = $seqin->next_seq) { # Bio::Seq object
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
332 $count++;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
333 if ($count > 1) {
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
334 warn "\n### Warning: More than one sequence entry in sequence file '$seq_file', but only the FIRST sequence entry will be truncated and printed to STDOUT or a result file!\n\n";
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
335 last;
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
336 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
337 $truncseq = Bio::SeqUtils->trunc_with_features($seq_obj, $start, $stop);
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
338 }
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
339 return ($seqin, $truncseq); # $seqin needed for outformat guessing in subroutine seqout
e42d30da7a74 Uploaded
dereeper
parents:
diff changeset
340 }