Mercurial > repos > dereeper > pangenome_explorer
view COG/bac-genomics-scripts/trunc_seq/trunc_seq.pl @ 14:5a5c9a6b047b draft
Uploaded
author | dereeper |
---|---|
date | Tue, 10 Dec 2024 16:20:53 +0000 |
parents | e42d30da7a74 |
children |
line wrap: on
line source
#!/usr/bin/perl ####### # POD # ####### =pod =head1 NAME C<trunc_seq.pl> - truncate sequence files =head1 SYNOPSIS C<perl trunc_seq.pl 20 3500 seq-file.embl E<gt> seq-file_trunc_20_3500.embl> B<or> C<perl trunc_seq.pl file_of_filenames_and_coords.tsv> =head1 DESCRIPTION This script truncates sequence files according to the given coordinates. The features/annotations in RichSeq files (e.g. EMBL or GENBANK format) will also be adapted accordingly. Use option B<-o> to specify a different output sequence format. Input can be given directly as a file and truncation coordinates to the script, with the start position as the first argument, stop as the second and (the path to) the sequence file as the third. In this case the truncated sequence entry is printed to C<STDOUT>. Input sequence files should contain only one sequence entry, if a multi-sequence file is used as input only the B<first> sequence entry is truncated. Alternatively, a file of filenames (fof) with respective coordinates and sequence files in the following B<tab-separated> format can be given to the script (the header is optional): #start\tstop\tseq-file 300\t9000\t(path/to/)seq-file 50\t1300\t(path/to/)seq-file2 With a fof the resulting truncated sequence files are printed into a results directory. Use option B<-r> to specify a different results directory than the default. It is also possible to truncate a RichSeq sequence file loaded into the L<Artemis|http://www.sanger.ac.uk/science/tools/artemis> genome browser from the Sanger Institute: Select a subsequence and then go to Edit -E<gt> Subsequence (and Features) =head1 OPTIONS =over 20 =item B<-h>, B<-help> Help (perldoc POD) =item B<-o>=I<str>, B<-outformat>=I<str> Specify different sequence format for the output (files) [fasta, embl, or gbk] =item B<-r>=I<str>, B<-result_dir>=I<str> Path to result folder for fof input [default = './trunc_seq_results'] =item B<-v>, B<-version> Print version number to C<STDOUT> =back =head1 OUTPUT =over 20 =item C<STDOUT> If a single sequence file is given to the script the truncated sequence file is printed to C<STDOUT>. Redirect or pipe into another tool as needed. =back B<or> =over 20 =item F<./trunc_seq_results> If a fof is given to the script, all output files are stored in a results folder =item F<./trunc_seq_results/seq-file_trunc_start_stop.format> Truncated output sequence files are named appended with 'trunc' and the corresponding start and stop positions =back =head1 EXAMPLES =over =item C<perl trunc_seq.pl -o gbk 120 30000 seq-file.embl E<gt> seq-file_trunc_120_3000.gbk> =back B<or> =over =item C<perl trunc_seq.pl -o fasta 5300 18500 seq-file.gbk | perl revcom_seq.pl -i fasta E<gt> seq-file_trunc_revcom.fasta> =back B<or> =over =item C<perl trunc_seq.pl -r path/to/trunc_embl_dir -o embl file_of_filenames_and_coords.tsv> =back =head1 DEPENDENCIES =over =item B<L<BioPerl|http://www.bioperl.org>> Tested with BioPerl version 1.007001 =back =head1 VERSION 0.2 update: 2015-12-07 0.1 2013-08-02 =head1 AUTHOR Andreas Leimbach aleimba[at]gmx[dot]de =head1 LICENSE This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 (GPLv3) of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see L<http://www.gnu.org/licenses/>. =cut ######## # MAIN # ######## use strict; use warnings; use autodie; use Getopt::Long; use Pod::Usage; use Bio::SeqIO; # bioperl module to handle sequence input/output #use Bio::Seq; # bioperl module to handle sequences with features ### apparently not needed, methods inherited #use Bio::SeqUtils; # bioperl module with additional methods (including features) for Bio::Seq objects ### apparently not needed, methods inherited ### Get options with Getopt::Long my $Out_Format_Opt; # optional different output seq file format my $Result_Dir = 'trunc_seq_results'; # path to result folder for fof input my $VERSION = 0.2; my ($Opt_Version, $Opt_Help); GetOptions ('outformat=s' => \$Out_Format_Opt, 'result_dir=s' => \$Result_Dir, 'version' => \$Opt_Version, 'help|?' => \$Opt_Help) or pod2usage(-verbose => 1, -exitval => 2); ### Run perldoc on POD pod2usage(-verbose => 2) if ($Opt_Help); if ($Opt_Version) { print "$0 $VERSION\n"; exit; } ### Check input (@ARGV); didn't include STDIN as input option, too complicated here with fof etc. my $Fof; # file of filenames (fof) with truncation coords my $Start; my $Stop; my $Seq_File; if (@ARGV < 1 || @ARGV == 2 || @ARGV > 3) { my $warning = "\n### Fatal error: Give either three arguments,\n$0\tstart\tstop\tseq-file\nor one file of sequence filenames with truncation coordinates as argument! Please see the usage with option '-h' if unclear!\n"; pod2usage(-verbose => 0, -message => $warning, -exitval => 2); } elsif (@ARGV == 1) { # fof check_file_exists($ARGV[0]); # subroutine to check for file existence $Fof = shift; } elsif (@ARGV == 3) { check_file_exists($ARGV[2]); # subroutine if ($ARGV[0] !~ /^\d+$/ || $ARGV[1] !~ /^\d+$/) { my $warning = "\n### Fatal error: With a single sequence file input the first and second arguments are the start and stop positions for truncation, and need to include ONLY digits:\n$0\tstart\tstop\tseq-file\nPlease see the usage with option '-h' if unclear!\n"; pod2usage(-verbose => 0, -message => $warning, -exitval => 2); } ($Start, $Stop, $Seq_File) = @ARGV; } ### Truncate the sequence and write either to STDOUT for single seq file input or output files for fof if ($Fof) { open (my $fof_fh, "<", "$Fof"); # create result folder $Result_Dir =~ s/\/$//; # get rid of a potential '/' at the end of $Result_Dir path if (-e $Result_Dir) { empty_dir($Result_Dir); # subroutine to empty a directory with user interaction } else { mkdir $Result_Dir; } while (my $line = <$fof_fh>) { chomp $line; next if ($line =~ /^\s*$/ || $line =~ /^#/); # skip empty or comment lines die "\n### Fatal error: Line '$.' of the '$Fof' file of sequence filenames plus truncation coordinates does not include the mandatory tab-separated two NUMERICAL start and stop truncation positions, and the sequence file (without any other whitespaces):\nstart\tstop\tpath/to/seq-file\n" if ($line !~ /^\d+\t\d+\t\S+$/); ($Start, $Stop, $Seq_File) = split(/\t/, $line); check_file_exists($Seq_File); # subroutine my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO input object and truncate the respective Bio::Seq object my $seqout = seq_out($seqin, $Start, $Stop, $Seq_File); # subroutine to create a Bio::SeqIO output object, $seqin needed for format guessing, $Start/$Stop/$Seq_File needed for output filenames $seqout->write_seq($truncseq); } close $fof_fh; } else { # single seq file, @ARGV == 3 my ($seqin, $truncseq) = trunc_seq($Start, $Stop, $Seq_File); # subroutine my $seqout = seq_out($seqin); # subroutine, without $Start/$Stop/$Seq_file for STDOUT output $seqout->write_seq($truncseq); } exit; ############### # Subroutines # ############### ### Subroutine to check if file exists sub check_file_exists { my $file = shift; die "\n### Fatal error: File '$file' does not exist: $!\n" if (!-e $file); } ### Subroutine to empty a directory with user interaction sub empty_dir { my $dir = shift; print STDERR "\nDirectory '$dir' already exists! You can use either option '-r' to set a different output result directory name, or do you want to replace the directory and all its contents [y|n]? "; my $user_ask = <STDIN>; if ($user_ask =~ /y/i) { unlink glob "$dir/*"; # remove all files in results directory } else { die "\nScript abborted!\n"; } return 1; } ### Subroutine to create a Bio::SeqIO output object sub seq_out { my ($seqin, $start, $stop, $seq_file) = @_; my $out_format; # need to keep $Out_Format_Opt for several seq files with fof if ($Out_Format_Opt) { $Out_Format_Opt = 'genbank' if ($Out_Format_Opt =~ /(gbk|gb)/i); # allow shorter input for GENBANK format $out_format = $Out_Format_Opt; } else { # same format as input file if (ref($seqin) =~ /Bio::SeqIO::(genbank|embl|fasta)/) { # from bioperl guessing $out_format = $1; } else { die "\n### Fatal error: Could not determine input file format, please set an output file format with option '-o'!\n"; } } my $seqout; # Bio::SeqIO object if ($seq_file) { # fof $seq_file =~ s/\S+(\/|\\)//; # remove input filepaths, aka 'basename' ('/' for Unix and '\' for Windows) my $file_ext; if ($out_format eq 'genbank') { $file_ext = 'gbk'; # back to shorter file extension for GENBANK format } else { $file_ext = $out_format; } $seq_file =~ s/^(\S+)\.\w+$/$Result_Dir\/$1\_trunc_$start\_$stop\.$file_ext/; # append also result directory to output filename $seqout = Bio::SeqIO->new(-file => ">$seq_file", -format => $out_format); } else { # single seq file input $seqout = Bio::SeqIO->new(-fh => \*STDOUT, -format => $out_format); # printing to STDOUT requires '-format' } return $seqout; } ### Subroutine create a Bio::SeqIO input object and truncate the respective Bio::Seq object sub trunc_seq { my ($start, $stop, $seq_file) = @_; print STDERR "\nTruncating \"$seq_file\" to coordinates $start..$stop ...\n"; my $seqin = Bio::SeqIO->new(-file => "<$seq_file"); # Bio::SeqIO object; no '-format' given, leave it to bioperl guessing my $count = 0; my $truncseq; while (my $seq_obj = $seqin->next_seq) { # Bio::Seq object $count++; if ($count > 1) { warn "\n### Warning: More than one sequence entry in sequence file '$seq_file', but only the FIRST sequence entry will be truncated and printed to STDOUT or a result file!\n\n"; last; } $truncseq = Bio::SeqUtils->trunc_with_features($seq_obj, $start, $stop); } return ($seqin, $truncseq); # $seqin needed for outformat guessing in subroutine seqout }