Mercurial > repos > dereeper > pangenome_explorer
comparison COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl @ 3:e42d30da7a74 draft
Uploaded
author | dereeper |
---|---|
date | Thu, 30 May 2024 11:52:25 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:97e4e3e818b6 | 3:e42d30da7a74 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 ####### | |
4 # POD # | |
5 ####### | |
6 | |
7 =pod | |
8 | |
9 =head1 NAME | |
10 | |
11 C<trunc_seq.pl> - truncate sequence files | |
12 | |
13 =head1 SYNOPSIS | |
14 | |
15 C<perl trunc_seq.pl 20 3500 seq-file.embl E<gt> | |
16 seq-file_trunc_20_3500.embl> | |
17 | |
18 B<or> | |
19 | |
20 C<perl trunc_seq.pl file_of_filenames_and_coords.tsv> | |
21 | |
22 =head1 DESCRIPTION | |
23 | |
24 This script truncates sequence files according to the given | |
25 coordinates. The features/annotations in RichSeq files (e.g. EMBL or | |
26 GENBANK format) will also be adapted accordingly. Use option B<-o> to | |
27 specify a different output sequence format. Input can be given directly | |
28 as a file and truncation coordinates to the script, with the start | |
29 position as the first argument, stop as the second and (the path to) | |
30 the sequence file as the third. In this case the truncated sequence | |
31 entry is printed to C<STDOUT>. Input sequence files should contain only | |
32 one sequence entry, if a multi-sequence file is used as input only the | |
33 B<first> sequence entry is truncated. | |
34 | |
35 Alternatively, a file of filenames (fof) with respective coordinates | |
36 and sequence files in the following B<tab-separated> format can be | |
37 given to the script (the header is optional): | |
38 | |
39 #start\tstop\tseq-file | |
40 300\t9000\t(path/to/)seq-file | |
41 50\t1300\t(path/to/)seq-file2 | |
42 | |
43 With a fof the resulting truncated sequence files are printed into a | |
44 results directory. Use option B<-r> to specify a different results | |
45 directory than the default. | |
46 | |
47 It is also possible to truncate a RichSeq sequence file loaded into the | |
48 L<Artemis|http://www.sanger.ac.uk/science/tools/artemis> genome browser | |
49 from the Sanger Institute: Select a subsequence and then go to Edit | |
50 -E<gt> Subsequence (and Features) | |
51 | |
52 =head1 OPTIONS | |
53 | |
54 =over 20 | |
55 | |
56 =item B<-h>, B<-help> | |
57 | |
58 Help (perldoc POD) | |
59 | |
60 =item B<-o>=I<str>, B<-outformat>=I<str> | |
61 | |
62 Specify different sequence format for the output (files) [fasta, embl, | |
63 or gbk] | |
64 | |
65 =item B<-r>=I<str>, B<-result_dir>=I<str> | |
66 | |
67 Path to result folder for fof input [default = './trunc_seq_results'] | |
68 | |
69 =item B<-v>, B<-version> | |
70 | |
71 Print version number to C<STDOUT> | |
72 | |
73 =back | |
74 | |
75 =head1 OUTPUT | |
76 | |
77 =over 20 | |
78 | |
79 =item C<STDOUT> | |
80 | |
81 If a single sequence file is given to the script the truncated sequence | |
82 file is printed to C<STDOUT>. Redirect or pipe into another tool as | |
83 needed. | |
84 | |
85 =back | |
86 | |
87 B<or> | |
88 | |
89 =over 20 | |
90 | |
91 =item F<./trunc_seq_results> | |
92 | |
93 If a fof is given to the script, all output files are stored in a | |
94 results folder | |
95 | |
96 =item F<./trunc_seq_results/seq-file_trunc_start_stop.format> | |
97 | |
98 Truncated output sequence files are named appended with 'trunc' and the | |
99 corresponding start and stop positions | |
100 | |
101 =back | |
102 | |
103 =head1 EXAMPLES | |
104 | |
105 =over | |
106 | |
107 =item C<perl trunc_seq.pl -o gbk 120 30000 seq-file.embl E<gt> | |
108 seq-file_trunc_120_3000.gbk> | |
109 | |
110 =back | |
111 | |
112 B<or> | |
113 | |
114 =over | |
115 | |
116 =item C<perl trunc_seq.pl -o fasta 5300 18500 seq-file.gbk | perl | |
117 revcom_seq.pl -i fasta E<gt> seq-file_trunc_revcom.fasta> | |
118 | |
119 =back | |
120 | |
121 B<or> | |
122 | |
123 =over | |
124 | |
125 =item C<perl trunc_seq.pl -r path/to/trunc_embl_dir -o embl | |
126 file_of_filenames_and_coords.tsv> | |
127 | |
128 =back | |
129 | |
130 =head1 DEPENDENCIES | |
131 | |
132 =over | |
133 | |
134 =item B<L<BioPerl|http://www.bioperl.org>> | |
135 | |
136 Tested with BioPerl version 1.007001 | |
137 | |
138 =back | |
139 | |
140 =head1 VERSION | |
141 | |
142 0.2 update: 2015-12-07 | |
143 0.1 2013-08-02 | |
144 | |
145 =head1 AUTHOR | |
146 | |
147 Andreas Leimbach aleimba[at]gmx[dot]de | |
148 | |
149 =head1 LICENSE | |
150 | |
151 This program is free software: you can redistribute it and/or modify | |
152 it under the terms of the GNU General Public License as published by | |
153 the Free Software Foundation; either version 3 (GPLv3) of the | |
154 License, or (at your option) any later version. | |
155 | |
156 This program is distributed in the hope that it will be useful, but | |
157 WITHOUT ANY WARRANTY; without even the implied warranty of | |
158 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
159 General Public License for more details. | |
160 | |
161 You should have received a copy of the GNU General Public License | |
162 along with this program. If not, see L<http://www.gnu.org/licenses/>. | |
163 | |
164 =cut | |
165 | |
166 | |
167 ######## | |
168 # MAIN # | |
169 ######## | |
170 | |
171 use strict; | |
172 use warnings; | |
173 use autodie; | |
174 use Getopt::Long; | |
175 use Pod::Usage; | |
176 use Bio::SeqIO; # bioperl module to handle sequence input/output | |
177 #use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited | |
178 #use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited | |
179 | |
180 ### Get options with Getopt::Long | |
181 my $Out_Format_Opt; # optional different output seq file format | |
182 my $Result_Dir = 'trunc_seq_results'; # path to result folder for fof input | |
183 my $VERSION = 0.2; | |
184 my ($Opt_Version, $Opt_Help); | |
185 GetOptions ('outformat=s' => \$Out_Format_Opt, | |
186 'result_dir=s' => \$Result_Dir, | |
187 'version' => \$Opt_Version, | |
188 'help|?' => \$Opt_Help) | |
189 or pod2usage(-verbose => 1, -exitval => 2); | |
190 | |
191 | |
192 | |
193 ### Run perldoc on POD | |
194 pod2usage(-verbose => 2) if ($Opt_Help); | |
195 if ($Opt_Version) { | |
196 print "$0 $VERSION\n"; | |
197 exit; | |
198 } | |
199 | |
200 | |
201 | |
202 ### Check input (@ARGV); didn't include STDIN as input option, too complicated here with fof etc. | |
203 my $Fof; # file of filenames (fof) with truncation coords | |
204 my $Start; | |
205 my $Stop; | |
206 my $Seq_File; | |
207 if (@ARGV < 1 || @ARGV == 2 || @ARGV > 3) { | |
208 my $warning = "\n### Fatal error: Give either three arguments,\n$0\tstart\tstop\tseq-file\nor one file of sequence filenames with truncation coordinates as argument! Please see the usage with option '-h' if unclear!\n"; | |
209 pod2usage(-verbose => 0, -message => $warning, -exitval => 2); | |
210 } elsif (@ARGV == 1) { # fof | |
211 check_file_exists($ARGV[0]); # subroutine to check for file existence | |
212 $Fof = shift; | |
213 } elsif (@ARGV == 3) { | |
214 check_file_exists($ARGV[2]); # subroutine | |
215 if ($ARGV[0] !~ /^\d+$/ || $ARGV[1] !~ /^\d+$/) { | |
216 my $warning = "\n### Fatal error: With a single sequence file input the first and second arguments are the start and stop positions for truncation, and need to include ONLY digits:\n$0\tstart\tstop\tseq-file\nPlease see the usage with option '-h' if unclear!\n"; | |
217 pod2usage(-verbose => 0, -message => $warning, -exitval => 2); | |
218 } | |
219 ($Start, $Stop, $Seq_File) = @ARGV; | |
220 } | |
221 | |
222 | |
223 | |
224 ### Truncate the sequence and write either to STDOUT for single seq file input or output files for fof | |
225 if ($Fof) { | |
226 open (my $fof_fh, "<", "$Fof"); | |
227 | |
228 # create result folder | |
229 $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path | |
230 if (-e $Result_Dir) { | |
231 empty_dir($Result_Dir); # subroutine to empty a directory with user interaction | |
232 } else { | |
233 mkdir $Result_Dir; | |
234 } | |
235 | |
236 while (my $line = <$fof_fh>) { | |
237 chomp $line; | |
238 next if ($line =~ /^\s*$/ || $line =~ /^#/); # skip empty or comment lines | |
239 | |
240 die "\n### Fatal error: Line '$.' of the '$Fof' file of sequence filenames plus truncation coordinates does not include the mandatory tab-separated two NUMERICAL start and stop truncation positions, and the sequence file (without any other whitespaces):\nstart\tstop\tpath/to/seq-file\n" if ($line !~ /^\d+\t\d+\t\S+$/); | |
241 ($Start, $Stop, $Seq_File) = split(/\t/, $line); | |
242 check_file_exists($Seq_File); # subroutine | |
243 | |
244 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO input object and truncate the respective Bio::Seq object | |
245 my $seqout = seq_out($seqin, $Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO output object, $seqin needed for format guessing, $Start/$Stop/$Seq_File needed for output filenames | |
246 $seqout->write_seq($truncseq); | |
247 } | |
248 close $fof_fh; | |
249 | |
250 } else { # single seq file, @ARGV == 3 | |
251 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine | |
252 my $seqout = seq_out($seqin); # subroutine, without $Start/$Stop/$Seq_file for STDOUT output | |
253 $seqout->write_seq($truncseq); | |
254 } | |
255 | |
256 exit; | |
257 | |
258 | |
259 | |
260 ############### | |
261 # Subroutines # | |
262 ############### | |
263 | |
264 ### Subroutine to check if file exists | |
265 sub check_file_exists { | |
266 my $file = shift; | |
267 die "\n### Fatal error: File '$file' does not exist: $!\n" if (!-e $file); | |
268 } | |
269 | |
270 | |
271 | |
272 ### Subroutine to empty a directory with user interaction | |
273 sub empty_dir { | |
274 my $dir = shift; | |
275 print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? "; | |
276 my $user_ask = <STDIN>; | |
277 if ($user_ask =~ /y/i) { | |
278 unlink glob "$dir/*"; # remove all files in results directory | |
279 } else { | |
280 die "\nScript abborted!\n"; | |
281 } | |
282 return 1; | |
283 } | |
284 | |
285 | |
286 | |
287 ### Subroutine to create a Bio::SeqIO output object | |
288 sub seq_out { | |
289 my ($seqin, $start, $stop, $seq_file) = @_; | |
290 | |
291 my $out_format; # need to keep $Out_Format_Opt for several seq files with fof | |
292 if ($Out_Format_Opt) { | |
293 $Out_Format_Opt = 'genbank' if ($Out_Format_Opt =~ /(gbk|gb)/i); # allow shorter input for GENBANK format | |
294 $out_format = $Out_Format_Opt; | |
295 } else { # same format as input file | |
296 if (ref($seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing | |
297 $out_format = $1; | |
298 } else { | |
299 die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n"; | |
300 } | |
301 } | |
302 | |
303 my $seqout; # Bio::SeqIO object | |
304 if ($seq_file) { # fof | |
305 $seq_file =~ s/\S+(\/|\\)//; # remove input filepaths, aka 'basename' ('/' for Unix and '\' for Windows) | |
306 my $file_ext; | |
307 if ($out_format eq 'genbank') { | |
308 $file_ext = 'gbk'; # back to shorter file extension for GENBANK format | |
309 } else { | |
310 $file_ext = $out_format; | |
311 } | |
312 $seq_file =~ s/^(\S+)\.\w+$/$Result_Dir\/$1\_trunc_$start\_$stop\.$file_ext/; # append also result directory to output filename | |
313 $seqout = Bio::SeqIO->new(-file => ">$seq_file", -format => $out_format); | |
314 | |
315 } else { # single seq file input | |
316 $seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $out_format); # printing to STDOUT requires '-format' | |
317 } | |
318 | |
319 return $seqout; | |
320 } | |
321 | |
322 | |
323 | |
324 ### Subroutine create a Bio::SeqIO input object and truncate the respective Bio::Seq object | |
325 sub trunc_seq { | |
326 my ($start, $stop, $seq_file) = @_; | |
327 print STDERR "\nTruncating \"$seq_file\" to coordinates $start..$stop ...\n"; | |
328 my $seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing | |
329 my $count = 0; | |
330 my $truncseq; | |
331 while (my $seq_obj = $seqin->next_seq) { # Bio::Seq object | |
332 $count++; | |
333 if ($count > 1) { | |
334 warn "\n### Warning: More than one sequence entry in sequence file '$seq_file', but only the FIRST sequence entry will be truncated and printed to STDOUT or a result file!\n\n"; | |
335 last; | |
336 } | |
337 $truncseq = Bio::SeqUtils->trunc_with_features($seq_obj, $start, $stop); | |
338 } | |
339 return ($seqin, $truncseq); # $seqin needed for outformat guessing in subroutine seqout | |
340 } |