diff ConvertFastaHeaders.pl @ 0:163892325845 draft default tip

Initial commit.
author galaxyp
date Fri, 10 May 2013 17:15:08 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ConvertFastaHeaders.pl	Fri May 10 17:15:08 2013 -0400
@@ -0,0 +1,669 @@
+#!/usr/bin/perl
+
+#
+# convertFastaHeaders.pl
+#
+# $Id: ConvertFastaHeaders.pl 44 2010-10-18 12:58:41Z pieter.neerincx@gmail.com $
+# $URL: https://trac.nbic.nl/svn/galaxytools/trunk/tools/general/FastaTools/ConvertFastaHeaders.pl $
+# $LastChangedDate: 2010-10-18 07:58:41 -0500 (Mon, 18 Oct 2010) $ 
+# $LastChangedRevision: 44 $
+# $LastChangedBy: pieter.neerincx@gmail.com $
+# 
+# Converts sequence header of FASTA files (in various customisable ways).
+#
+
+#
+# Initialize evironment
+#
+use strict;
+use Getopt::Std;
+use Log::Log4perl qw(:easy);
+
+my %log_levels = (
+	'ALL'	=> $ALL,
+	'TRACE'	=> $TRACE,
+	'DEBUG'	=> $DEBUG,
+	'INFO'	=> $INFO,
+	'WARN'	=> $WARN,
+	'ERROR'	=> $ERROR,
+	'FATAL'	=> $FATAL,
+	'OFF'	=> $OFF,
+);
+
+#
+# Get options.
+#
+my %opts;
+Getopt::Std::getopts('i:o:l:e:f:n:a:p:', \%opts);
+my $input			= $opts{'i'};
+my $output			= $opts{'o'};
+my $log_level		= $opts{'l'};
+my $extension		= $opts{'e'};
+my @x_fixes_array	= split(/\s+/, $opts{'f'});
+my $new_x_fix		= $opts{'n'};
+my $action			= $opts{'a'};
+my $position		= $opts{'p'};
+my %ids_to_delete;
+my @new_id_order;
+
+#
+# Configure logging.
+#
+# Provides default if user did not specify log level:
+$log_level = (defined($log_level) ? $log_level : 'WARN');
+# Reset log level to default if user specified illegal log level.
+$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'WARN'});
+#Log::Log4perl->init('log4perl.properties');
+Log::Log4perl->easy_init(
+	#{ level    => $log_level,
+	#  file     => ">>ConvertFastaHeaders.log",
+	#  layout   => '%F{1}-%L-%M: %m%n' },
+	{ level    => $log_level,
+	  file     => "STDERR",
+	  layout   => '%d L:%L %p> %m%n' },
+);
+my $logger = Log::Log4perl::get_logger();
+
+#
+# Start the conversion process.
+#
+$logger->info("Starting...");
+
+#
+# Check user input.
+#
+
+# Provides default if user did not specify action:
+$action = (defined($action) ? $action : 'add');
+
+# Check for valid action and action specific options.
+if ($action eq 'add' || $action eq 'strip' || $action eq 'replace') {
+	
+	unless (scalar(@x_fixes_array) > 0) {
+		$logger->fatal('No prefixes or suffixes specified.');
+		_Usage();
+	}
+	
+	if ($action eq 'replace') {
+		unless (defined($new_x_fix) && $new_x_fix ne '') {
+			$logger->fatal('No new prefix or suffix specified to replace the existing ones.');
+			_Usage();
+		}
+	}
+	
+	# Provides default if user did not specify position:
+	$position = (defined($position) ? $position : 'prefix');
+	# Check for valid position.
+	if ($action eq 'add' || $action eq 'strip') {
+		unless ($position eq 'prefix' || $position eq 'suffix') {
+			$logger->fatal('Illegal position specified. Must be \'prefix\' or \'suffix\'.');
+			_Usage();
+		}
+	} elsif ($action eq 'replace') {
+		unless ($position eq 'prefix' || $position eq 'suffix' || $position eq 'pre2suf' || $position eq 'suf2pre') {
+			$logger->fatal('Illegal position specified. Must be \'prefix\', \'suffix\', \'pre2suf\' or \'suf2pre\'.');
+			_Usage();
+		}
+	}
+	
+} elsif ($action eq 'delete' || $action eq 'shuffle') {
+	
+	unless (defined($position) && $position ne '') {
+		$logger->fatal('No position specified.');
+		_Usage();
+	}
+	
+	my @id_indices = split(/,/, $position);
+	
+	# Check if the value is a number.
+	foreach my $index_number (@id_indices) {
+		
+		unless ($index_number =~ m/^[1-9][0-9]*$/) {
+			
+			$logger->fatal('Illegal character in position list. Must be a single positive integer or comma separated list of positive integers.');
+			_Usage();
+		
+		}
+		
+		if ($action eq 'delete') {
+		
+			$ids_to_delete{$index_number} = 'del';
+		
+		} elsif ($action eq 'shuffle') {
+			
+			push(@new_id_order, $index_number);
+				
+		}
+	}
+
+} else {
+	$logger->fatal('Illegal action specified. Must be add, strip, replace, delete or shuffle.');
+	_Usage();
+}
+
+
+# Provides default if user did not specify log level:
+$log_level = (defined($log_level) ? $log_level : 'WARN');
+# Reset log level to default if user specified illegal log level.
+$log_level = (defined($log_levels{$log_level}) ? $log_levels{$log_level} : $log_levels{'WARN'});
+
+# Provide default if user did not specify fasta filename extension:
+$extension = (defined($extension) ? $extension : 'fa');
+
+if ($input =~ /^$/ || $output =~ /^$/) {
+	# Indir and outdir cannot be empty.
+	_Usage();
+}
+if ($input eq $output) {
+    $logger->fatal("Output dir/file is the same as the input dir/file. Please choose a different one.");
+    exit;
+}
+
+#
+# Check if input is a single file or a directory. 
+#
+unless (-e $input && -r $input) {
+
+	$logger->fatal("Input $input does not exist or is not readable: $!");
+	exit;
+
+} else {
+
+	if (-f $input) {
+	
+		#
+		# We've got an input file.
+		#
+		my $file;
+		if ($input =~ m/(.+\/)([^\/]+)$/) {
+			$file = $2;
+		} else {
+			$file = $input;
+		}
+		
+		$logger->info('Parsing ' . $file . "...\n");
+		
+		_ConvertFastaHeaders($input, $output, $action, \@x_fixes_array, $new_x_fix, $position, \%ids_to_delete, \@new_id_order);
+		
+		$logger->info('Converted ' . $file);
+		
+	} else {
+	
+		#
+		# We've got an input directory.
+		# Assume the output is also a directory.
+		# Append trailing path separators if they was missing.
+		#
+		my $indir;
+		my $outdir;
+		unless ($input =~ m/\/$/) {
+			$input  = $input .+ '/';
+		}
+		unless ($output =~ m/\/$/) {
+			$output = $output .+ '/';
+		}
+		#
+		# Make sure the input dir is a directory. 
+		#
+		unless (-d $input) {
+		    $logger->fatal("Input $input is not a file nor a directory: $!");
+		    exit;
+		} else {
+			$indir	= $input;
+			$outdir	= $output;
+		}
+		
+		#
+		# Get all FASTA files from the input dir.
+		#
+		my $files = _GetFiles($indir, $outdir, $extension);
+		
+		#
+		# Create the output directory if did not exist yet.
+		#
+		if (-e $outdir && -d $outdir) {
+			unless (-w $outdir) {
+				$logger->fatal("Cannot write to output directory $outdir. Check for permission errors, read-only file systems, etc.");
+				exit;	
+			}
+		} else {
+			$logger->info("Creating output directory $outdir...");
+			eval{mkdir($outdir);};
+			if ($@) {
+				$logger->fatal("Cannot create output directory $outdir: $@");
+				exit;	
+			}
+		}
+		
+		#
+		# Convert FASTA files.
+		#
+		foreach my $file (@{$files}) {
+			
+			$logger->info('Parsing ' . $file . "...\n");
+			
+			my $pathfrom = $indir .+ $file;
+			my $pathto   = $outdir .+ $file;
+			
+			_ConvertFastaHeaders($input, $output, $action, \@x_fixes_array, $new_x_fix, $position, \%ids_to_delete, \@new_id_order);
+			
+			$logger->info('Converted ' . $file);
+		
+		}
+	}
+}
+
+$logger->info('Finished!');
+
+#
+##
+### Internal subs.
+##
+#
+
+sub _GetFiles {
+	
+	my ($indir, $outdir, $extension) = @_;
+	my @files;
+	
+	#
+	# Get the relative path to the outdir.
+	# Use this to remove it from the list of files/folders that need to be processed
+	# in case it's a subfolder of the input directory.
+	#
+	$outdir =~ m/\/([^\/]+)\/$/;
+	my $outdir_rel = $1;
+	
+	#
+	# Get and parse all files from the input dir.
+	#
+	eval{
+		opendir (INDIR, $indir);
+		@files = grep { /.+\.$extension/i and not /^\..*/ and not /$outdir_rel/} readdir INDIR;
+		closedir INDIR;
+	};
+	if ($@) {
+		$logger->fatal("Cannot read files from input directory $indir: $@");
+		exit;	
+	}
+
+	return(\@files);
+}
+
+sub _ConvertFastaHeaders {
+		
+	$logger->debug('_ConvertFastaHeaders sub');
+	
+	my ($pathfrom, $pathto, $action, $x_fixes_array, $new_x_fix, $position, $ids_to_delete, $new_id_order) = @_;
+
+	my $header_count = 0;
+	
+	#local($/) = "\n\n"; # set line seperator to a blank line
+	open(READ,"<$pathfrom") or die "\tcan't open input file $pathfrom: $!";
+	open(SAVE,">$pathto") or die "\tcan't open output file $pathto: $!";
+	while (my $line = <READ>) {
+		
+		my $new_line;
+			
+		if ($line =~ /^>/) {
+			
+			#
+			# It's a header line.
+			#				
+			$header_count++;
+			my $ids_string;
+			my $description;
+			my $line_end;		
+			
+			if ($line =~ /^>([^\s]+)\s+(.+)([\n\r\f]+)/i) {
+				
+				#
+				# Header with descripton
+				#
+				$ids_string		= $1;
+				$description	= $2;
+				$line_end		= $3;
+				
+			} elsif ($line =~ /^>([^\s]+)\s*([\n\r\f]+)/i) {
+			
+				#
+				# Header without descripton
+				#
+				$ids_string	= $1;
+				$line_end	= $2;
+				
+			} else {
+				
+				$logger->fatal("Malformed header line. Cannot find ID.");
+				exit;
+				
+			}
+		
+			my @ids = split(/\|/, $ids_string);	
+			
+			if ($action eq 'strip') {
+				
+				$new_line = _StripFix($x_fixes_array, $ids_string, $description);
+								
+			} elsif ($action eq 'replace') {
+								
+				$new_line = _ReplaceFix($x_fixes_array, $new_x_fix, $position, \@ids, $description);
+				
+			} elsif ($action eq 'add') {
+				
+				$new_line = _AddFix($x_fixes_array, $position, \@ids, $description);
+					
+			} elsif ($action eq 'delete') {
+				
+				$new_line = _DeleteID($ids_to_delete, \@ids, $description);
+					
+			} elsif ($action eq 'shuffle') {
+				
+				$new_line = _ShuffleID($new_id_order, \@ids, $description);
+					
+			}
+			
+			unless (defined($new_line)) {
+				
+				$logger->fatal('Cannot convert header number: ' . $header_count);
+				$logger->fatal('Offending header line was: ' . $line);
+				exit;
+			
+			}
+			
+			$new_line .= $line_end;
+		
+		} elsif ($line =~ /^[\n\r\f]+$/) {
+			
+			# Skip blank line.
+			
+		} else {
+			
+			#
+			# It must be a sequence line.
+			#
+			$new_line = $line;
+			
+		}
+		
+		# Save (modified) line.
+		print SAVE $new_line or die "\tcan't save output to file $pathto: $!";
+		
+	}
+	
+	close(READ);
+	close(SAVE);
+	
+}
+
+sub _StripFix {
+	
+	my ($x_fixes_array, $ids_string, $description) = @_;
+	my $new_line;
+	
+	foreach my $x_fix (@{$x_fixes_array}) {
+		
+		$ids_string =~ s/$x_fix//g;
+		
+	}
+	
+	if (defined($description)) {
+		$new_line = '>' . $ids_string . ' ' . $description;
+	} else {
+		$new_line = '>' . $ids_string;
+	}
+	
+	return($new_line);
+	
+}
+
+sub	_ReplaceFix {
+	
+	my ($x_fixes_array, $new_x_fix, $position, $ids, $description) = @_;
+	my $new_line = '>';
+	
+	for my $count (0 .. $#{$ids}) {
+		
+		my $id = ${$ids}[$count];
+		my $stripped_id;
+		my $match = 0;
+		
+		if ($position eq 'prefix' || $position eq 'pre2suf') {
+			
+			foreach my $x_fix (@{$x_fixes_array}) {
+				
+				if ($id =~ m/^$x_fix(.+)/) {
+					
+					$stripped_id = $1;
+					$id = $stripped_id;
+					$match = 1;
+					
+				}
+			}						
+			
+		} elsif ($position eq 'suffix' || $position eq 'suf2pre') {
+			
+			foreach my $x_fix (@{$x_fixes_array}) {
+				
+				if ($id =~ m/(.+)$x_fix$/) {
+					
+					$stripped_id = $1;
+					$id = $stripped_id;
+					$match = 1;
+					
+				}
+			}			
+			
+		} else {
+		
+			$logger->fatal("Illegal or no position $position specified.");
+			exit;
+			
+		}
+		
+		if ($match) {
+					
+			#
+			# Append the new *fix.
+			#
+			if ($position eq 'prefix' || $position eq 'suf2pre') {
+				
+				$new_line .= $new_x_fix . $stripped_id . '|';
+				
+			} elsif ($position eq 'pre2suf' || $position eq 'suffix') {
+				
+				$new_line .= $stripped_id . $new_x_fix . '|';
+			
+			}
+			
+		} else {
+			
+			#
+			# Copy the ID unmodified to the result.
+			#
+			$new_line .= ${$ids}[$count] . '|';
+					
+		}
+	}
+	
+	$new_line =~ s/\|$//;
+	if (defined($description)) {
+		$new_line .= ' ' . $description;
+	}
+	
+	return($new_line);
+		
+}
+
+sub _AddFix {
+	
+	my ($x_fixes_array, $position, $ids, $description) = @_;
+	my $new_line = '>';
+	
+	my $id_count = scalar(@{$ids});
+	my $x_fix_count = scalar(@{$x_fixes_array});
+	
+	unless ($id_count == $x_fix_count) {
+		$logger->fatal('Amount of pre- or suffixes specified (' . $x_fix_count . ') does not match with amount if IDs found ' . $id_count . ').');
+		return(undef);
+	}
+	
+	for my $count (0 .. $#{$ids}) {
+		
+		if ($position eq 'prefix') {
+			
+			$new_line .= ${$x_fixes_array}[$count] . ${$ids}[$count] . '|';
+			
+		} elsif ($position eq 'suffix') {
+			
+			$new_line .= ${$ids}[$count] . ${$x_fixes_array}[$count] . '|';
+			
+		}
+	}
+	
+	$new_line =~ s/\|$//;
+	if (defined($description)) {
+		$new_line .= ' ' . $description;
+	}
+	
+	return($new_line);
+	
+}
+
+sub _DeleteID {
+	
+	my ($ids_to_delete, $ids, $description) = @_;
+	my $new_line = '>';
+		
+	$new_line = '>';
+	
+	for my $offset (0 .. $#{$ids}) {
+		
+		my $index = $offset + 1;
+		
+		if (defined(${$ids_to_delete}{$index})) {
+			
+			# Skip (drop) this ID.
+			$logger->debug('Dropping ' . ${$ids}[$offset] . ' as it is ID number ' . $index . '.');
+			
+		} else {
+			
+			$new_line .= ${$ids}[$offset] . '|';
+				
+		}
+	}
+	
+	$new_line =~ s/\|$//;
+	if (defined($description)) {
+		$new_line .= ' ' . $description;
+	}
+	
+	return($new_line);
+	
+}
+
+sub _ShuffleID {
+	
+	my ($new_id_order, $ids, $description) = @_;
+	my $new_line = '>';
+	
+	my $id_count = scalar(@{$ids});
+	my $new_id_order_item_count = scalar(@{$new_id_order});
+	
+	unless ($id_count == $new_id_order_item_count) {
+		$logger->fatal('Amount of IDs specified to re-order (' . $new_id_order_item_count . ') does not match with amount if IDs found (' . $id_count . ').');
+		return(undef);
+	}
+	
+	$new_line = '>';
+	
+	foreach my $rank (@{$new_id_order}) {
+		
+		my $offset = $rank - 1;
+		$logger->debug('ID rank ' . $rank . ' = ' . ${$ids}[$offset] . '.');
+		$new_line .= ${$ids}[$offset] . '|';
+		$logger->debug('New header line now contains ' . $new_line . '.');
+		
+	}
+	
+	$new_line =~ s/\|$//;
+	if (defined($description)) {
+		$new_line .= ' ' . $description;
+	}
+	
+	return($new_line);
+	
+}
+
+sub _Usage {
+
+	print "\n";
+	print "ConvertFastaHeaders.pl - Converts sequence headers of FASTA files.\n";
+	print "\n";
+	print "Usage:\n";
+	print "\n";
+	print "   ConvertFastaHeaders.pl options\n";
+	print "\n";
+	print "Available options are:\n";
+	print "\n";
+    print "   -i [dir/file]             Input can be a single FASTA file or a directory containing FASTA files.\n";
+    print "   -e [ext]                  File name extension for the FASTA files in case the input is a directory. (default = fa)\n";
+    print "   -o [dir/file]             Output file or directory where the result(s) will be saved.\n";
+    print "   -a [action]               Action must be one of 'add', 'strip', 'replace', 'delete' or 'shuffle'.\n";
+    print "                             The actions 'delete' and 'shuffle' operate on complete sequence IDs with or without (database namespace) prefixes or suffixes.\n";
+    print "                             The actions 'add', 'strip' and 'replace' operate on sequence ID prefixes or suffixes.\n";
+    print "                             Note in case *fixes are added the order of the *fixes is important! (See below for examples.)\n";
+    print "   -p [position]             Positon must be a comma separated list of numbers in case the action is 'delete' or 'shuffle'.\n";
+    print "                             Position must be one of 'prefix' or 'suffix' when the action is 'add' or 'strip'.\n";
+    print "                             In case the action is 'replace' the position can also be one of pre2suf or suf2pre \n";
+    print "                             to replace a prefix with a suffix or vice versa.\n";
+    print "   -f '[*fix1 *fix2 *fixN]'  Space separated list of prefixes or suffixes, which will be replaced in, added to or removed from pipe separated identifiers.\n";
+	print "                             Note that in case of database namespace prefixes you must specify both the database name space and \n";
+	print "                             the character to separate the namespace from the accession number as the prefix. (See below for examples.) \n";
+    print "   -n '[*fix]'               A single new prefix or suffix to replace the *fixes specified with -f.\n";
+	print "                             (Only required in case the action is 'replace'.)\n";
+    print "   -l [LEVEL]                Log4perl log level. One of: ALL, TRACE, DEBUG, INFO (default), WARN, ERROR, FATAL or OFF.\n";
+    print "\n";
+    print "Examples:\n";
+    print "\n";
+    print "    Adding prefixes\n";
+    print "                        In this case the order of the *fixes specified with -f is important!\n";
+    print "                        With -a add -p prefix -f 'UniProtAcc: UniProtID:', this header:\n";
+    print "                           >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "                        will be converted into:\n";
+    print "                           >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "    Stripping prefixes\n";
+    print "                        In this case the order of the *fixes specified with -f is not relevant.\n";
+    print "                        With both -a strip -p prefix -f 'UniProtAcc: UniProtID:' or \n";
+    print "                        with      -a strip -p prefix -f 'UniProtID: UniProtAcc:', this header:\n";
+    print "                           >UniProtAcc:P32234|UniProtID:128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "                        will be converted into:\n";
+    print "                           >P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "    Replacing prefixes with a suffix\n";
+    print "                        In this case the order of the *fixes specified with -f is not relevant.\n";
+    print "                        With -a replace -p pre2suf -f 'REV_' -n '_REV', this header:\n";
+    print "                           >REV_P32234|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "                        will be converted into:\n";
+    print "                           >P32234_REV|128UP_DROME GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "    Deleting sequence identifiers\n";
+    print "                        Supply a comma separated list of numbers for the ranks of the identifiers / accession numbers you want to remove.\n";
+    print "                        Multiple identifiers must be separated with a pipe symbol.\n";
+    print "                        With -a delete -p '1,3', this header:\n";
+    print "                            >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "                        will be converted into:\n";
+    print "                            >UniProtAcc:P32234 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "    Changing the order of sequence identifiers\n";
+    print "                        Supply a comma separated list of numbers for the new order of all the identifiers / accession numbers in a header.\n";
+    print "                        Multiple identifiers must be separated with a pipe symbol.\n";
+    print "                        Hence if your headers contain 4 pipe separated IDs and you only want to swap the order of the first and the second, \n";
+    print "                        you will still need to specify the new (unchanged) order for number 3 and 4 too.\n";
+    print "                        With -a shuffle -p '2,1,3', this header:\n";
+    print "                            >UniProtID:128UP_DROME|UniProtAcc:P32234|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "                        will be converted into:\n";
+    print "                            >UniProtAcc:P32234|UniProtID:128UP_DROME|EMBL:AY069810 GTP-binding protein 128up - Drosophila melanogaster (Fruit fly)\n";
+    print "                        Specifying only *2,1* as the New order for the IDs will not work, because this header contains 3 IDs, \n";
+    print "                        so you'll have to include the (new) position for the third one as well.\n";
+	print "\n";
+	exit;
+
+}