Mercurial > repos > galaxyp > nbic_fasta
diff ConvertFastaHeaders.pl @ 0:163892325845 draft default tip
Initial commit.
author | galaxyp |
---|---|
date | Fri, 10 May 2013 17:15:08 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ConvertFastaHeaders.pl Fri May 10 17:15:08 2013 -0400 @@ -0,0 +1,669 @@ +#!/usr/bin/perl + +# +# convertFastaHeaders.pl +# +# $Id: ConvertFastaHeaders.pl 44 2010-10-18 12:58:41Z pieter.neerincx@gmail.com $ +# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ConvertFastaHeaders.pl $ +# $LastChangedDate: 2010-10-18 07:58:41 -0500 (Mon, 18 Oct 2010) $ +# $LastChangedRevision: 44 $ +# $LastChangedBy: pieter.neerincx@gmail.com $ +# +# Converts sequence header of FASTA files (in various customisable ways). +# + +# +# Initialize evironment +# +use strict; +use Getopt::Std; +use Log::Log4perl qw(:easy); + +my %log_levels = ( + 'ALL' => $ALL, + 'TRACE' => $TRACE, + 'DEBUG' => $DEBUG, + 'INFO' => $INFO, + 'WARN' => $WARN, + 'ERROR' => $ERROR, + 'FATAL' => $FATAL, + 'OFF' => $OFF, +); + +# +# Get options. +# +my %opts; +Getopt::Std::getopts('i:o:l:e:f:n:a:p:', \%opts); +my $input = $opts{'i'}; +my $output = $opts{'o'}; +my $log_level = $opts{'l'}; +my $extension = $opts{'e'}; +my @x_fixes_array = split(/\s+/, $opts{'f'}); +my $new_x_fix = $opts{'n'}; +my $action = $opts{'a'}; +my $position = $opts{'p'}; +my %ids_to_delete; +my @new_id_order; + +# +# Configure logging. +# +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'WARN'); +# Reset log level to default if user specified illegal log level. +$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'WARN'}); +#Log::Log4perl->init('log4perl.properties'); +Log::Log4perl->easy_init( + #{ level => $log_level, + # file => ">>ConvertFastaHeaders.log", + # layout => '%F{1}-%L-%M: %m%n' }, + { level => $log_level, + file => "STDERR", + layout => '%d L:%L %p> %m%n' }, +); +my $logger = Log::Log4perl::get_logger(); + +# +# Start the conversion process. +# +$logger->info("Starting..."); + +# +# Check user input. +# + +# Provides default if user did not specify action: +$action = (defined($action) ? $action : 'add'); + +# Check for valid action and action specific options. +if ($action eq 'add' || $action eq 'strip' || $action eq 'replace') { + + unless (scalar(@x_fixes_array) > 0) { + $logger->fatal('No prefixes or suffixes specified.'); + _Usage(); + } + + if ($action eq 'replace') { + unless (defined($new_x_fix) && $new_x_fix ne '') { + $logger->fatal('No new prefix or suffix specified to replace the existing ones.'); + _Usage(); + } + } + + # Provides default if user did not specify position: + $position = (defined($position) ? $position : 'prefix'); + # Check for valid position. + if ($action eq 'add' || $action eq 'strip') { + unless ($position eq 'prefix' || $position eq 'suffix') { + $logger->fatal('Illegal position specified. Must be \'prefix\' or \'suffix\'.'); + _Usage(); + } + } elsif ($action eq 'replace') { + unless ($position eq 'prefix' || $position eq 'suffix' || $position eq 'pre2suf' || $position eq 'suf2pre') { + $logger->fatal('Illegal position specified. Must be \'prefix\', \'suffix\', \'pre2suf\' or \'suf2pre\'.'); + _Usage(); + } + } + +} elsif ($action eq 'delete' || $action eq 'shuffle') { + + unless (defined($position) && $position ne '') { + $logger->fatal('No position specified.'); + _Usage(); + } + + my @id_indices = split(/,/, $position); + + # Check if the value is a number. + foreach my $index_number (@id_indices) { + + unless ($index_number =~ m/^[1-9][0-9]*$/) { + + $logger->fatal('Illegal character in position list. Must be a single positive integer or comma separated list of positive integers.'); + _Usage(); + + } + + if ($action eq 'delete') { + + $ids_to_delete{$index_number} = 'del'; + + } elsif ($action eq 'shuffle') { + + push(@new_id_order, $index_number); + + } + } + +} else { + $logger->fatal('Illegal action specified. Must be add, strip, replace, delete or shuffle.'); + _Usage(); +} + + +# Provides default if user did not specify log level: +$log_level = (defined($log_level) ? $log_level : 'WARN'); +# Reset log level to default if user specified illegal log level. +$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'WARN'}); + +# Provide default if user did not specify fasta filename extension: +$extension = (defined($extension) ? $extension : 'fa'); + +if ($input =~ /^$/ || $output =~ /^$/) { + # Indir and outdir cannot be empty. + _Usage(); +} +if ($input eq $output) { + $logger->fatal("Output dir/file is the same as the input dir/file. Please choose a different one."); + exit; +} + +# +# Check if input is a single file or a directory. +# +unless (-e $input && -r $input) { + + $logger->fatal("Input $input does not exist or is not readable: $!"); + exit; + +} else { + + if (-f $input) { + + # + # We've got an input file. + # + my $file; + if ($input =~ m/(.+\/)([^\/]+)$/) { + $file = $2; + } else { + $file = $input; + } + + $logger->info('Parsing ' . $file . "...\n"); + + _ConvertFastaHeaders($input, $output, $action, \@x_fixes_array, $new_x_fix, $position, \%ids_to_delete, \@new_id_order); + + $logger->info('Converted ' . $file); + + } else { + + # + # We've got an input directory. + # Assume the output is also a directory. + # Append trailing path separators if they was missing. + # + my $indir; + my $outdir; + unless ($input =~ m/\/$/) { + $input = $input .+ '/'; + } + unless ($output =~ m/\/$/) { + $output = $output .+ '/'; + } + # + # Make sure the input dir is a directory. + # + unless (-d $input) { + $logger->fatal("Input $input is not a file nor a directory: $!"); + exit; + } else { + $indir = $input; + $outdir = $output; + } + + # + # Get all FASTA files from the input dir. + # + my $files = _GetFiles($indir, $outdir, $extension); + + # + # Create the output directory if did not exist yet. + # + if (-e $outdir && -d $outdir) { + unless (-w $outdir) { + $logger->fatal("Cannot write to output directory $outdir. Check for permission errors, read-only file systems, etc."); + exit; + } + } else { + $logger->info("Creating output directory $outdir..."); + eval{mkdir($outdir);}; + if ($@) { + $logger->fatal("Cannot create output directory $outdir: $@"); + exit; + } + } + + # + # Convert FASTA files. + # + foreach my $file (@{$files}) { + + $logger->info('Parsing ' . $file . "...\n"); + + my $pathfrom = $indir .+ $file; + my $pathto = $outdir .+ $file; + + _ConvertFastaHeaders($input, $output, $action, \@x_fixes_array, $new_x_fix, $position, \%ids_to_delete, \@new_id_order); + + $logger->info('Converted ' . $file); + + } + } +} + +$logger->info('Finished!'); + +# +## +### Internal subs. +## +# + +sub _GetFiles { + + my ($indir, $outdir, $extension) = @_; + my @files; + + # + # Get the relative path to the outdir. + # Use this to remove it from the list of files/folders that need to be processed + # in case it's a subfolder of the input directory. + # + $outdir =~ m/\/([^\/]+)\/$/; + my $outdir_rel = $1; + + # + # Get and parse all files from the input dir. + # + eval{ + opendir (INDIR, $indir); + @files = grep { /.+\.$extension/i and not /^\..*/ and not /$outdir_rel/} readdir INDIR; + closedir INDIR; + }; + if ($@) { + $logger->fatal("Cannot read files from input directory $indir: $@"); + exit; + } + + return(\@files); +} + +sub _ConvertFastaHeaders { + + $logger->debug('_ConvertFastaHeaders sub'); + + my ($pathfrom, $pathto, $action, $x_fixes_array, $new_x_fix, $position, $ids_to_delete, $new_id_order) = @_; + + my $header_count = 0; + + #local($/) = "\n\n"; # set line seperator to a blank line + open(READ,"<$pathfrom") or die "\tcan't open input file $pathfrom: $!"; + open(SAVE,">$pathto") or die "\tcan't open output file $pathto: $!"; + while (my $line = <READ>) { + + my $new_line; + + if ($line =~ /^>/) { + + # + # It's a header line. + # + $header_count++; + my $ids_string; + my $description; + my $line_end; + + if ($line =~ /^>([^\s]+)\s+(.+)([\n\r\f]+)/i) { + + # + # Header with descripton + # + $ids_string = $1; + $description = $2; + $line_end = $3; + + } elsif ($line =~ /^>([^\s]+)\s*([\n\r\f]+)/i) { + + # + # Header without descripton + # + $ids_string = $1; + $line_end = $2; + + } else { + + $logger->fatal("Malformed header line. Cannot find ID."); + exit; + + } + + my @ids = split(/\|/, $ids_string); + + if ($action eq 'strip') { + + $new_line = _StripFix($x_fixes_array, $ids_string, $description); + + } elsif ($action eq 'replace') { + + $new_line = _ReplaceFix($x_fixes_array, $new_x_fix, $position, \@ids, $description); + + } elsif ($action eq 'add') { + + $new_line = _AddFix($x_fixes_array, $position, \@ids, $description); + + } elsif ($action eq 'delete') { + + $new_line = _DeleteID($ids_to_delete, \@ids, $description); + + } elsif ($action eq 'shuffle') { + + $new_line = _ShuffleID($new_id_order, \@ids, $description); + + } + + unless (defined($new_line)) { + + $logger->fatal('Cannot convert header number: ' . $header_count); + $logger->fatal('Offending header line was: ' . $line); + exit; + + } + + $new_line .= $line_end; + + } elsif ($line =~ /^[\n\r\f]+$/) { + + # Skip blank line. + + } else { + + # + # It must be a sequence line. + # + $new_line = $line; + + } + + # Save (modified) line. + print SAVE $new_line or die "\tcan't save output to file $pathto: $!"; + + } + + close(READ); + close(SAVE); + +} + +sub _StripFix { + + my ($x_fixes_array, $ids_string, $description) = @_; + my $new_line; + + foreach my $x_fix (@{$x_fixes_array}) { + + $ids_string =~ s/$x_fix//g; + + } + + if (defined($description)) { + $new_line = '>' . $ids_string . ' ' . $description; + } else { + $new_line = '>' . $ids_string; + } + + return($new_line); + +} + +sub _ReplaceFix { + + my ($x_fixes_array, $new_x_fix, $position, $ids, $description) = @_; + my $new_line = '>'; + + for my $count (0 .. $#{$ids}) { + + my $id = ${$ids}[$count]; + my $stripped_id; + my $match = 0; + + if ($position eq 'prefix' || $position eq 'pre2suf') { + + foreach my $x_fix (@{$x_fixes_array}) { + + if ($id =~ m/^$x_fix(.+)/) { + + $stripped_id = $1; + $id = $stripped_id; + $match = 1; + + } + } + + } elsif ($position eq 'suffix' || $position eq 'suf2pre') { + + foreach my $x_fix (@{$x_fixes_array}) { + + if ($id =~ m/(.+)$x_fix$/) { + + $stripped_id = $1; + $id = $stripped_id; + $match = 1; + + } + } + + } else { + + $logger->fatal("Illegal or no position $position specified."); + exit; + + } + + if ($match) { + + # + # Append the new *fix. + # + if ($position eq 'prefix' || $position eq 'suf2pre') { + + $new_line .= $new_x_fix . $stripped_id . '|'; + + } elsif ($position eq 'pre2suf' || $position eq 'suffix') { + + $new_line .= $stripped_id . $new_x_fix . '|'; + + } + + } else { + + # + # Copy the ID unmodified to the result. + # + $new_line .= ${$ids}[$count] . '|'; + + } + } + + $new_line =~ s/\|$//; + if (defined($description)) { + $new_line .= ' ' . $description; + } + + return($new_line); + +} + +sub _AddFix { + + my ($x_fixes_array, $position, $ids, $description) = @_; + my $new_line = '>'; + + my $id_count = scalar(@{$ids}); + my $x_fix_count = scalar(@{$x_fixes_array}); + + unless ($id_count == $x_fix_count) { + $logger->fatal('Amount of pre- or suffixes specified (' . $x_fix_count . ') does not match with amount if IDs found ' . $id_count . ').'); + return(undef); + } + + for my $count (0 .. $#{$ids}) { + + if ($position eq 'prefix') { + + $new_line .= ${$x_fixes_array}[$count] . ${$ids}[$count] . '|'; + + } elsif ($position eq 'suffix') { + + $new_line .= ${$ids}[$count] . ${$x_fixes_array}[$count] . '|'; + + } + } + + $new_line =~ s/\|$//; + if (defined($description)) { + $new_line .= ' ' . $description; + } + + return($new_line); + +} + +sub _DeleteID { + + my ($ids_to_delete, $ids, $description) = @_; + my $new_line = '>'; + + $new_line = '>'; + + for my $offset (0 .. $#{$ids}) { + + my $index = $offset + 1; + + if (defined(${$ids_to_delete}{$index})) { + + # Skip (drop) this ID. + $logger->debug('Dropping ' . ${$ids}[$offset] . ' as it is ID number ' . $index . '.'); + + } else { + + $new_line .= ${$ids}[$offset] . '|'; + + } + } + + $new_line =~ s/\|$//; + if (defined($description)) { + $new_line .= ' ' . $description; + } + + return($new_line); + +} + +sub _ShuffleID { + + my ($new_id_order, $ids, $description) = @_; + my $new_line = '>'; + + my $id_count = scalar(@{$ids}); + my $new_id_order_item_count = scalar(@{$new_id_order}); + + unless ($id_count == $new_id_order_item_count) { + $logger->fatal('Amount of IDs specified to re-order (' . $new_id_order_item_count . ') does not match with amount if IDs found (' . $id_count . ').'); + return(undef); + } + + $new_line = '>'; + + foreach my $rank (@{$new_id_order}) { + + my $offset = $rank - 1; + $logger->debug('ID rank ' . $rank . ' = ' . ${$ids}[$offset] . '.'); + $new_line .= ${$ids}[$offset] . '|'; + $logger->debug('New header line now contains ' . $new_line . '.'); + + } + + $new_line =~ s/\|$//; + if (defined($description)) { + $new_line .= ' ' . $description; + } + + return($new_line); + +} + +sub _Usage { + + print "\n"; + print "ConvertFastaHeaders.pl - Converts sequence headers of FASTA files.\n"; + print "\n"; + print "Usage:\n"; + print "\n"; + print " ConvertFastaHeaders.pl options\n"; + print "\n"; + print "Available options are:\n"; + print "\n"; + print " -i [dir/file] Input can be a single FASTA file or a directory containing FASTA files.\n"; + print " -e [ext] File name extension for the FASTA files in case the input is a directory. (default = fa)\n"; + print " -o [dir/file] Output file or directory where the result(s) will be saved.\n"; + print " -a [action] Action must be one of 'add', 'strip', 'replace', 'delete' or 'shuffle'.\n"; + print " The actions 'delete' and 'shuffle' operate on complete sequence IDs with or without (database namespace) prefixes or suffixes.\n"; + print " The actions 'add', 'strip' and 'replace' operate on sequence ID prefixes or suffixes.\n"; + print " Note in case *fixes are added the order of the *fixes is important! (See below for examples.)\n"; + print " -p [position] Positon must be a comma separated list of numbers in case the action is 'delete' or 'shuffle'.\n"; + print " Position must be one of 'prefix' or 'suffix' when the action is 'add' or 'strip'.\n"; + print " In case the action is 'replace' the position can also be one of pre2suf or suf2pre \n"; + print " to replace a prefix with a suffix or vice versa.\n"; + print " -f '[*fix1 *fix2 *fixN]' Space separated list of prefixes or suffixes, which will be replaced in, added to or removed from pipe separated identifiers.\n"; + print " Note that in case of database namespace prefixes you must specify both the database name space and \n"; + print " the character to separate the namespace from the accession number as the prefix. (See below for examples.) \n"; + print " -n '[*fix]' A single new prefix or suffix to replace the *fixes specified with -f.\n"; + print " (Only required in case the action is 'replace'.)\n"; + print " -l [LEVEL] Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n"; + print "\n"; + print "Examples:\n"; + print "\n"; + print " Adding prefixes\n"; + print " In this case the order of the *fixes specified with -f is important!\n"; + print " With -a add -p prefix -f 'UniProtAcc: UniProtID:', this header:\n"; + print " >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Stripping prefixes\n"; + print " In this case the order of the *fixes specified with -f is not relevant.\n"; + print " With both -a strip -p prefix -f 'UniProtAcc: UniProtID:' or \n"; + print " with -a strip -p prefix -f 'UniProtID: UniProtAcc:', this header:\n"; + print " >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Replacing prefixes with a suffix\n"; + print " In this case the order of the *fixes specified with -f is not relevant.\n"; + print " With -a replace -p pre2suf -f 'REV_' -n '_REV', this header:\n"; + print " >REV_P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >P32234_REV|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Deleting sequence identifiers\n"; + print " Supply a comma separated list of numbers for the ranks of the identifiers / accession numbers you want to remove.\n"; + print " Multiple identifiers must be separated with a pipe symbol.\n"; + print " With -a delete -p '1,3', this header:\n"; + print " >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >UniProtAcc:P32234 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Changing the order of sequence identifiers\n"; + print " Supply a comma separated list of numbers for the new order of all the identifiers / accession numbers in a header.\n"; + print " Multiple identifiers must be separated with a pipe symbol.\n"; + print " Hence if your headers contain 4 pipe separated IDs and you only want to swap the order of the first and the second, \n"; + print " you will still need to specify the new (unchanged) order for number 3 and 4 too.\n"; + print " With -a shuffle -p '2,1,3', this header:\n"; + print " >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " will be converted into:\n"; + print " >UniProtAcc:P32234|UniProtID:128UP_DROME|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n"; + print " Specifying only *2,1* as the New order for the IDs will not work, because this header contains 3 IDs, \n"; + print " so you'll have to include the (new) position for the third one as well.\n"; + print "\n"; + exit; + +}