pangenome_explorer: COG/bac-genomics-scripts/trunc_seq/trunc

comparison COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl @ 3:e42d30da7a74 draft

Uploaded

author	dereeper
date	Thu, 30 May 2024 11:52:25 +0000
parents
children

comparison

equal deleted inserted replaced

-:97e4e3e818b6
+:e42d30da7a74
+#!/usr/bin/perl
+#######
+# POD #
+#######
+=pod
+=head1 NAME
+C<trunc_seq.pl> - truncate sequence files
+=head1 SYNOPSIS
+C<perl trunc_seq.pl 20 3500 seq-file.embl E<gt>
+seq-file_trunc_20_3500.embl>
+B<or>
+C<perl trunc_seq.pl file_of_filenames_and_coords.tsv>
+=head1 DESCRIPTION
+This script truncates sequence files according to the given
+coordinates. The features/annotations in RichSeq files (e.g. EMBL or
+GENBANK format) will also be adapted accordingly. Use option B<-o> to
+specify a different output sequence format. Input can be given directly
+as a file and truncation coordinates to the script, with the start
+position as the first argument, stop as the second and (the path to)
+the sequence file as the third. In this case the truncated sequence
+entry is printed to C<STDOUT>. Input sequence files should contain only
+one sequence entry, if a multi-sequence file is used as input only the
+B<first> sequence entry is truncated.
+Alternatively, a file of filenames (fof) with respective coordinates
+and sequence files in the following B<tab-separated> format can be
+given to the script (the header is optional):
+#start\tstop\tseq-file
+300\t9000\t(path/to/)seq-file
+50\t1300\t(path/to/)seq-file2
+With a fof the resulting truncated sequence files are printed into a
+results directory. Use option B<-r> to specify a different results
+directory than the default.
+It is also possible to truncate a RichSeq sequence file loaded into the
+L<Artemis|http://www.sanger.ac.uk/science/tools/artemis> genome browser
+from the Sanger Institute: Select a subsequence and then go to Edit
+-E<gt> Subsequence (and Features)
+=head1 OPTIONS
+=over 20
+=item B<-h>, B<-help>
+Help (perldoc POD)
+=item B<-o>=I<str>, B<-outformat>=I<str>
+Specify different sequence format for the output (files) [fasta, embl,
+or gbk]
+=item B<-r>=I<str>, B<-result_dir>=I<str>
+Path to result folder for fof input [default = './trunc_seq_results']
+=item B<-v>, B<-version>
+Print version number to C<STDOUT>
+=back
+=head1 OUTPUT
+=over 20
+=item C<STDOUT>
+If a single sequence file is given to the script the truncated sequence
+file is printed to C<STDOUT>. Redirect or pipe into another tool as
+needed.
+=back
+B<or>
+=over 20
+=item F<./trunc_seq_results>
+If a fof is given to the script, all output files are stored in a
+results folder
+=item F<./trunc_seq_results/seq-file_trunc_start_stop.format>
+Truncated output sequence files are named appended with 'trunc' and the
+corresponding start and stop positions
+=back
+=head1 EXAMPLES
+=over
+=item C<perl trunc_seq.pl -o gbk 120 30000 seq-file.embl E<gt>
+seq-file_trunc_120_3000.gbk>
+=back
+B<or>
+=over
+=item C<perl trunc_seq.pl -o fasta 5300 18500 seq-file.gbk | perl
+revcom_seq.pl -i fasta E<gt> seq-file_trunc_revcom.fasta>
+=back
+B<or>
+=over
+=item C<perl trunc_seq.pl -r path/to/trunc_embl_dir -o embl
+file_of_filenames_and_coords.tsv>
+=back
+=head1 DEPENDENCIES
+=over
+=item B<L<BioPerl|http://www.bioperl.org>>
+Tested with BioPerl version 1.007001
+=back
+=head1 VERSION
+0.2                                               update: 2015-12-07
+0.1                                                       2013-08-02
+=head1 AUTHOR
+Andreas Leimbach                               aleimba[at]gmx[dot]de
+=head1 LICENSE
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 (GPLv3) of the
+License, or (at your option) any later version.
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+You should have received a copy of the GNU General Public License
+along with this program. If not, see L<http://www.gnu.org/licenses/>.
+=cut
+########
+# MAIN #
+########
+use strict;
+use warnings;
+use autodie;
+use Getopt::Long;
+use Pod::Usage;
+use Bio::SeqIO; # bioperl module to handle sequence input/output
+#use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited
+#use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited
+### Get options with Getopt::Long
+my $Out_Format_Opt; # optional different output seq file format
+my $Result_Dir = 'trunc_seq_results'; # path to result folder for fof input
+my $VERSION = 0.2;
+my ($Opt_Version, $Opt_Help);
+GetOptions ('outformat=s' => \$Out_Format_Opt,
+'result_dir=s' => \$Result_Dir,
+'version' => \$Opt_Version,
+'help|?' => \$Opt_Help)
+or pod2usage(-verbose => 1, -exitval => 2);
+### Run perldoc on POD
+pod2usage(-verbose => 2) if ($Opt_Help);
+if ($Opt_Version) {
+print "$0 $VERSION\n";
+exit;
+}
+### Check input (@ARGV); didn't include STDIN as input option, too complicated here with fof etc.
+my $Fof; # file of filenames (fof) with truncation coords
+my $Start;
+my $Stop;
+my $Seq_File;
+if (@ARGV < 1 || @ARGV == 2 || @ARGV > 3) {
+my $warning = "\n### Fatal error: Give either three arguments,\n$0\tstart\tstop\tseq-file\nor one file of sequence filenames with truncation coordinates as argument! Please see the usage with option '-h' if unclear!\n";
+pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
+} elsif (@ARGV == 1) { # fof
+check_file_exists($ARGV[0]); # subroutine to check for file existence
+$Fof = shift;
+} elsif (@ARGV == 3) {
+check_file_exists($ARGV[2]); # subroutine
+if ($ARGV[0] !~ /^\d+$/ || $ARGV[1] !~ /^\d+$/) {
+my $warning = "\n### Fatal error: With a single sequence file input the first and second arguments are the start and stop positions for truncation, and need to include ONLY digits:\n$0\tstart\tstop\tseq-file\nPlease see the usage with option '-h' if unclear!\n";
+pod2usage(-verbose => 0, -message => $warning, -exitval => 2);
+}
+($Start, $Stop, $Seq_File) = @ARGV;
+}
+### Truncate the sequence and write either to STDOUT for single seq file input or output files for fof
+if ($Fof) {
+open (my $fof_fh, "<", "$Fof");
+# create result folder
+$Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path
+if (-e $Result_Dir) {
+empty_dir($Result_Dir); # subroutine to empty a directory with user interaction
+} else {
+mkdir $Result_Dir;
+}
+while (my $line = <$fof_fh>) {
+chomp $line;
+next if ($line =~ /^\s*$/ || $line =~ /^#/); # skip empty or comment lines
+die "\n### Fatal error: Line '$.' of the '$Fof' file of sequence filenames plus truncation coordinates does not include the mandatory tab-separated two NUMERICAL start and stop truncation positions, and the sequence file (without any other whitespaces):\nstart\tstop\tpath/to/seq-file\n" if ($line !~ /^\d+\t\d+\t\S+$/);
+($Start, $Stop, $Seq_File) = split(/\t/, $line);
+check_file_exists($Seq_File); # subroutine
+my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO input object and truncate the respective Bio::Seq object
+my $seqout = seq_out($seqin, $Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO output object, $seqin needed for format guessing, $Start/$Stop/$Seq_File needed for output filenames
+$seqout->write_seq($truncseq);
+}
+close $fof_fh;
+} else { # single seq file, @ARGV == 3
+my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine
+my $seqout = seq_out($seqin); # subroutine, without $Start/$Stop/$Seq_file for STDOUT output
+$seqout->write_seq($truncseq);
+}
+exit;
+###############
+# Subroutines #
+###############
+### Subroutine to check if file exists
+sub check_file_exists {
+my $file = shift;
+die "\n### Fatal error: File '$file' does not exist: $!\n" if (!-e $file);
+}
+### Subroutine to empty a directory with user interaction
+sub empty_dir {
+my $dir = shift;
+print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? ";
+my $user_ask = <STDIN>;
+if ($user_ask =~ /y/i) {
+unlink glob "$dir/*"; # remove all files in results directory
+} else {
+die "\nScript abborted!\n";
+}
+return 1;
+}
+### Subroutine to create a Bio::SeqIO output object
+sub seq_out {
+my ($seqin, $start, $stop, $seq_file) = @_;
+my $out_format; # need to keep $Out_Format_Opt for several seq files with fof
+if ($Out_Format_Opt) {
+$Out_Format_Opt = 'genbank' if ($Out_Format_Opt =~ /(gbk|gb)/i); # allow shorter input for GENBANK format
+$out_format = $Out_Format_Opt;
+} else { # same format as input file
+if (ref($seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing
+$out_format = $1;
+} else {
+die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n";
+}
+}
+my $seqout; # Bio::SeqIO object
+if ($seq_file) { # fof
+$seq_file =~ s/\S+(\/|\\)//; # remove input filepaths, aka 'basename' ('/' for Unix and '\' for Windows)
+my $file_ext;
+if ($out_format eq 'genbank') {
+$file_ext = 'gbk'; # back to shorter file extension for GENBANK format
+} else {
+$file_ext = $out_format;
+}
+$seq_file =~ s/^(\S+)\.\w+$/$Result_Dir\/$1\_trunc_$start\_$stop\.$file_ext/; # append also result directory to output filename
+$seqout = Bio::SeqIO->new(-file => ">$seq_file", -format => $out_format);
+} else { # single seq file input
+$seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $out_format); # printing to STDOUT requires '-format'
+}
+return $seqout;
+}
+### Subroutine create a Bio::SeqIO input object and truncate the respective Bio::Seq object
+sub trunc_seq {
+my ($start, $stop, $seq_file) = @_;
+print STDERR "\nTruncating \"$seq_file\" to coordinates $start..$stop ...\n";
+my $seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing
+my $count = 0;
+my $truncseq;
+while (my $seq_obj = $seqin->next_seq) { # Bio::Seq object
+$count++;
+if ($count > 1) {
+warn "\n### Warning: More than one sequence entry in sequence file '$seq_file', but only the FIRST sequence entry will be truncated and printed to STDOUT or a result file!\n\n";
+last;
+}
+$truncseq = Bio::SeqUtils->trunc_with_features($seq_obj, $start, $stop);
+}
+return ($seqin, $truncseq); # $seqin needed for outformat guessing in subroutine seqout
+}

Mercurial > repos > dereeper > pangenome_explorer

comparison COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl @ 3:e42d30da7a74 draft