diff TopHit_namefilter/TopHit_namefilter_galaxy.pl @ 0:9f1fe290345e default tip

Migrated tool version 0.1.Alx from old tool shed archive to new tool shed repository
author abossers
date Tue, 07 Jun 2011 18:07:34 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_namefilter_galaxy.pl	Tue Jun 07 18:07:34 2011 -0400
@@ -0,0 +1,108 @@
+#!/usr/bin/perl -w
+
+# Simple filter to keep just the TOPHIT / first occurrence(s) of some identifier.
+# Useful for keeping only the first tophit in BLAST output when multiple hits are returned.
+#
+# Please be aware that NO additional filtering or checking is done on, for instance,
+# E-values of BLAST hits. Tophit = FIRST hit... not necessarily the best.
+#
+# input list/table having some groupable identifier
+# input the column number to filter on (column number starts at 1)
+# input number of occurrences to keep
+#       note that the hits are displayed in order of occurrence
+#       and NOT sorted on given column!
+# column splitter (default TAB)
+#		Note that: splitting on tab:    \t
+#		           splitting on pipe:   \|
+#		           combined splits:		-|\|	(splits on '-' OR '|')
+#
+# output the same table having only the FIRST <hits> occurrences of each identifier.
+#
+# alex.bossers@wur.nl
+#
+
+my $version = "v0.13.alx 19-5-2011";
+# Version history
+# 		0.13	19-05-2011  added extra cmdline opt hits to keep -> first galaxy version
+#		0.12	19-05-2011	mods to fit initial needs. Not distributed.
+#		0.1		xx-xx-2010	template
+
+use strict;
+use warnings;
+
+# Command line: <infile> <column> <splitter> <hits> <outfile>; all five are required.
+if (@ARGV < 5) {
+	warn "Error: not enough arguments\n";
+	usage();
+}
+# File names must match a conservative whitelist; a rejected name is left undefined.
+my ($input) = $ARGV[0] =~ m/^([A-Z0-9_.\-\/]+)$/i;
+my $column = $ARGV[1];   # column numbers start at 1!
+my $splitter = $ARGV[2]; # regex to split fields on (might need enclosing "")
+my $hits = $ARGV[3];	 # number of occurrences to keep per identifier
+my ($output) = $ARGV[4] =~ m/^([A-Z0-9_.\-\/]+)$/i;
+
+# Fail early with the usage text instead of passing undef to open() later on.
+if (!defined $input || !defined $output) {
+	warn "Invalid input/output file name\n";
+	usage();
+}
+# Both counts must be positive integers; non-numeric text is rejected outright.
+if ($column !~ /\A[0-9]+\z/ || $hits !~ /\A[0-9]+\z/ || $column < 1 || $hits < 1) {
+	warn "Invalid column/hits number\n";
+	usage();
+}
+
+#keeping track
+my $entrycounter = 0;
+my $filter_count = 0;
+
+#open the files; 3-arg open with lexical handles so a name is never parsed as a mode
+open(my $in, '<', $input) or die "Input file error: $!\n";
+open(my $out, '>', $output) or die "Output file error: $!\n";
+
+#per value of the chosen column: number of lines already written to the output
+my %kept_for;
+while (my $line = <$in>){
+	chomp $line;
+	my @fields = split($splitter,$line);
+	$entrycounter++;
+	# a line with too few fields is grouped under the empty identifier
+	my $key = defined $fields[$column-1] ? $fields[$column-1] : '';
+	$kept_for{$key} = 0 unless exists $kept_for{$key};
+	next if $kept_for{$key} >= $hits;	# limit reached for this identifier
+	print {$out} "$line\n";			# lines keep their input order
+	$kept_for{$key}++;
+	$filter_count++;
+}
+
+#end and close; buffered write errors only surface when the output is closed
+close ($in);
+close ($out) or die "Output file error: $!\n";
+
+print "\nVersion   : $version\nComments/bugs : alex.bossers\@wur.nl\n";
+print "Processed : $entrycounter entries\n";
+print "Filtered  : $filter_count entries remain\n";
+
+sub usage {	# print version and usage help on STDERR, then die (never returns)
+  warn "\nVersion: $version\nContact/bugs: alex.bossers\@wur.nl\n";
+  my ($cmd) = $0 =~ m/([A-Z0-9_.-]+)$/ig;	# trailing file-name part of $0
+  die <<EOF;	# backslashes are doubled so the help shows a literal \t and \|
+usage: $cmd <infile> <column> <splitter> <hits> <outfile>
+
+    INPUT:  infile      Input original tabular/text
+
+            column      Input column number to use (>= 1)
+
+            splitter    Splitter char to use (i.e. \\t for tab)
+                        For splitting on pipe use escaping: \\|
+                        Combined splits possible: -|\\| splits both on - as |
+
+            hits        Number of hits to keep (in chronological order)
+                        The results are NOT sorted!
+
+    OUTPUT: outfile     Output filename of filtered table.
+
+EOF
+}
+#end script
\ No newline at end of file