changeset 0:9f1fe290345e default tip

Migrated tool version 0.1.Alx from old tool shed archive to new tool shed repository
author abossers
date Tue, 07 Jun 2011 18:07:34 -0400
parents
children
files TopHit_namefilter/TopHit_README TopHit_namefilter/TopHit_namefilter.test.table TopHit_namefilter/TopHit_namefilter.xml TopHit_namefilter/TopHit_namefilter_galaxy.pl
diffstat 4 files changed, 283 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_README	Tue Jun 07 18:07:34 2011 -0400
@@ -0,0 +1,47 @@
+# Created May 2011
+#
+# Alex Bossers
+# Central Veterinary Institute
+# Wageningen University and Research centre
+# Lelystad, The Netherlands
+#
+# Comments/improvements/bugs: Alex (dot) Bossers (at) wur (dot) nl
+
+
+# WHAT IT DOES
+TopHit_namefilter is a SIMPLE filter to keep just the TOPHIT / first [N] occurrence(s) of some
+identifier. This is useful for keeping only the first N tophits of, for instance, BLAST when
+multiple hits were returned (and you don't want to rerun the BLAST analysis). Of course it is NOT
+restricted to BLAST and can basically filter ANY tabular data for uniqueness.
+
+Please be aware that NO additional filtering or checking is done on for instance E values of BLAST hits.
+Tophit = FIRST hit...not necessarily the best.. If multiple hits are selected to be returned
+they will NOT be sorted (see below example of a number of 2 hits occurring somewhere else in the
+input and therefore in the output file).
+
+
+# REQUIREMENTS 
+Perl
+Galaxy :)
+
+
+# SETUP 
+Just unpack the tool xml and perl script somewhere appropriate and plug the tool in the tool_config.xml
+of your galaxy instance. Refresh the tools or restart the galaxy server.
+
+
+# LICENSE
+Copyright (c) 2011 Central Veterinary Institute of Wageningen UR, Lelystad, The Netherlands.
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or
+(at your option) any later version.
+
+When distributing the tools please include this original reference.
+
+Use this tool at your own risk. Even though we tried to build tools and wrappers that are free of errors,
+check your output since it might be erroneous. We will not be liable for any failure this may have caused.
+
+If you like these scripts, please acknowledge our work.
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_namefilter.test.table	Tue Jun 07 18:07:34 2011 -0400
@@ -0,0 +1,16 @@
+Q3262-21	gi|71066702|gb|AE016828.2|	tjahier wat
+Q3262-23	gi|71066702|gb|AE016828.2|	okay
+Q3262-21	gi|71066702|gb|AE016828.2| and here	can contain multiple columns :)
+Q3262-24	gi|71066702|gb|AE016828.2| nothing there
+Q3262-26	gi|71066702|gb|AE016828.2|	or still
+Q3262-21	gi|71066702|gb|AE016828.2|
+Q3262-21	gi|71066702|gb|AE016828.2|
+Q3262-21	gi|71066702|gb|AE016828.2|
+Q3262-21	gi|71066702|gb|AE016828.2|
+Q3262-21	gi|145004|gb|M80806.1|COXTRANSPO
+Q3262-21	gi|144996|gb|M20482.1|COXHSPAB
+Q3262-21	gi|161761570|gb|CP000890.1|
+Q3262-30	gi|161761570|gb|CP000890.1|
+Q3262-21	gi|161761570|gb|CP000890.1|
+Q3262-21	gi|161761570|gb|CP000890.1|
+Q3262-21	gi|161761570|gb|CP000890.1|
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_namefilter.xml	Tue Jun 07 18:07:34 2011 -0400
@@ -0,0 +1,112 @@
+<tool id="TopHit_namefilter" name="TopHit filter" version="0.1.Alx">
+  <description>Simple filter to keep N occurrences of lines in a file</description>
+  <command interpreter="perl">
+            TopHit_namefilter_galaxy.pl
+                $input
+				$column
+				"$splitter"
+				$hits
+				$output_file
+				<!-- 2&gt;$logfile -->
+  </command>
+  <inputs>
+   <param name="input" type="data" format="tabular,txt" label="Input tabular or plain text file" />
+   <param name="column" type="integer" size="4" value="1" label="Column number to use after the split!" />
+   <param name="splitter" type="text" size="10" value="\t" label="Splitter character/code to use" help="See help below for advanced options and how to use {pipe}" >
+		<sanitizer>
+			<valid>
+				<add value="\"/>
+				<add value=">"/>
+				<add value="%"/>
+				<add value="|"/>
+			</valid>
+		</sanitizer>
+   </param>
+   <param name="hits" type="integer" size="4" value="1" label="Number of occurrences to keep" help="They will not be sorted!" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="input" label="Filtered table/text" />
+  </outputs>
+  <tests>
+  </tests>
+  <help>
+**What it does**
+
+TopHit_namefilter is a SIMPLE filter to keep just the TOPHIT / first [N] occurrence(s) of some identifier.
+It is useful for keeping only the first N tophits in BLAST when multiple hits were returned (and you don't want to rerun the BLAST analysis).
+
+Please be aware that NO additional filtering or checking is done on for instance E values of BLAST hits.
+Tophit = FIRST hit...not necessarily the best. If multiple hits are selected to be returned
+they will NOT be sorted (see the example below, where the second of 2 hits occurs further down in the input
+and therefore also further down in the output file).
+
+**Comments/feedback** on the Perl script or GALAXY wrapper: alex.bossers@wur.nl
+
+-----
+
+**Note!** Beware the special use of splitters! Especially if you want to use special characters that have a "perl" split
+meaning. They need to be escaped by a leading \\.
+
+Examples of splitters before filtering (end result will remain the ORIGINAL unsplit line!):
+
+::
+
+  Splitter   Meaning                           Example line to split          Split result for filtering only!
+  --------   -------------------------------   -----------------------        --------------------------------
+    \t       Single tab                        Foo&lt;tab&gt;Bar&lt;tab&gt;here    ---&gt;   Foo          Bar        here
+    \|       Single pipe                       Foo&lt;tab&gt;Bar|here        ---&gt;   Foo&lt;tab&gt;Bar  here
+    -        Single dash                       Foo-Bar                 ---&gt;   Foo          Bar
+    -|\|     Combined splits on dash OR pipe   Foo-Bar|here            ---&gt;   Foo          Bar        here
+
+
+-----
+
+**EXAMPLE**
+
+Parameters: Column = 1, **hits = 2** and splitter = \\t 
+
+**Input**
+
+Any text/tabular file:
+
+::
+
+   Q3262-21	gi|71066702|gb|AE016828.2|	tja..here something extra
+   Q3262-23	gi|71066702|gb|AE016828.2|	okay
+   Q3262-24	gi|71066702|gb|AE016828.2| nothing there
+   Q3262-21	gi|71066702|gb|AE016828.2| enhier	was zonder space :)
+   Q3262-26	gi|71066702|gb|AE016828.2|	or still
+   Q3262-21	gi|71066702|gb|AE016828.2|
+   Q3262-21	gi|71066702|gb|AE016828.2|
+   Q3262-21	gi|71066702|gb|AE016828.2|
+   Q3262-21	gi|71066702|gb|AE016828.2|
+   Q3262-21	gi|145004|gb|M80806.1|COXTRANSPO
+   Q3262-21	gi|144996|gb|M20482.1|COXHSPAB
+   Q3262-21	gi|161761570|gb|CP000890.1|
+   Q3262-30	gi|161761570|gb|CP000890.1|
+   Q3262-21	gi|161761570|gb|CP000890.1|
+   Q3262-21	gi|161761570|gb|CP000890.1|
+   Q3262-21	gi|161761570|gb|CP000890.1|
+
+
+**Outputs**
+
+::
+
+   Q3262-21	gi|71066702|gb|AE016828.2|	tja..here something extra
+   Q3262-23	gi|71066702|gb|AE016828.2|	okay
+   Q3262-24	gi|71066702|gb|AE016828.2| nothing there
+   Q3262-21	gi|71066702|gb|AE016828.2| enhier	was zonder space :)
+   Q3262-26	gi|71066702|gb|AE016828.2|	or still
+   Q3262-30	gi|161761570|gb|CP000890.1|
+
+-----
+
+Please acknowledge our work when you find it useful!
+
+|
+
+
+  </help>
+</tool>
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/TopHit_namefilter/TopHit_namefilter_galaxy.pl	Tue Jun 07 18:07:34 2011 -0400
@@ -0,0 +1,108 @@
+#!/usr/bin/perl -w
+
+# Simple filter to keep just the TOPHIT / first N occurrence(s) of some identifier;
+# useful for keeping only the first tophit(s) in BLAST when multiple hits are returned
+#
+# Please be aware that NO additional filtering or checking is done on for instance
+# E values of BLAST hits. Tophit = FIRST hit...not necessarily the best..
+#
+# input list/table having some groupable identifier
+# input the column number to filter on (column number starts at 1)
+# input number of occurrences to keep
+#       note that the hits are displayed in order of occurrence
+#       and NOT sorted on given column!
+# column splitter (default TAB)
+#		Note that: splitting on tab:    \t
+#		           splitting on pipe:   \|
+#		           combined splits:		-|\|	(splits on '-' OR '|')
+#
+# output the same table having only the FIRST N occurrence(s) of the identifier.
+#
+# alex.bossers@wur.nl
+#
+
+my $version = "v0.13.alx 19-5-2011";
+# Version history
+# 		0.13	19-05-2011  added extra cmdline opt hits to keep -> first galaxy version
+#		0.12	19-05-2011	mods to fit initial needs. Not distributed.
+#		0.1		xx-xx-2010	template
+
+use strict;
+use warnings;
+
+#cmd line options: <infile> <column> <splitter> <hits> <outfile>
+if (@ARGV < 5) {
+	warn "Error: not enough arguments\n";
+	usage();
+}
+# File names must consist of letters, digits, '_', '.', '-' and '/' only;
+# anything else leaves the variable undefined and is rejected below.
+my ($input) = $ARGV[0] =~ m/^([A-Z0-9_.\-\/]+)$/ig;
+my $column = $ARGV[1];   # column numbers start at 1!
+my $splitter = $ARGV[2]; # splitter for fields to use (might need enclosing "")
+my $hits = $ARGV[3];	 # number of occurrences to keep
+my ($output) = $ARGV[4] =~ m/^([A-Z0-9_.\-\/]+)$/ig;
+
+if (!defined $input || !defined $output) {
+	warn "Invalid input/output file name\n";
+	usage();
+}
+#column and hits must be positive integers
+if ($column !~ /^\d+$/ || $hits !~ /^\d+$/ || $column < 1 || $hits < 1){warn "Invalid column/hits number\n";usage();}
+
+#keeping track
+my $entrycounter = 0;	# lines read from the input
+my $filter_count = 0;	# lines written to the output
+
+#open the files (3-arg open: the file name is never parsed as a mode or pipe)
+open (my $in_fh, '<', $input) or die "Input file error: $!\n";
+open (my $out_fh, '>', $output) or die "Output file error: $!\n";
+
+#read the file line by line; %filtered counts the lines seen so far per KEY,
+#where KEY is the content of the requested column after splitting
+my %filtered;
+while (my $line = <$in_fh>){
+	chomp $line;
+	my @fields = split($splitter,$line);
+	$entrycounter++;
+	#lines with fewer than $column fields (e.g. blank lines) have no key:
+	#group them under the empty string instead of tripping over undef
+	my $key = $fields[$column-1];
+	$key = '' unless defined $key;
+	#keep the first $hits lines of every key, in order of occurrence
+	if (++$filtered{$key} <= $hits){
+		print {$out_fh} "$line\n";
+		$filter_count++;
+	}
+}
+
+#end and close; write errors (e.g. disk full) may only surface at close
+close ($in_fh);
+close ($out_fh) or die "Output file error: $!\n";
+
+print "\nVersion   : $version\nComments/bugs : alex.bossers\@wur.nl\n";
+print "Processed : $entrycounter entries\n";
+print "Filtered  : $filter_count entries remain\n";
+
+sub usage {	# print version/contact and usage text on STDERR, then die (never returns)
+  warn "\nVersion: $version\nContact/bugs: alex.bossers\@wur.nl\n";
+  my ($cmd) = $0 =~ m/([A-Z0-9_.-]+)$/ig;	# script name without its directory part
+  die <<EOF;	# interpolating heredoc: backslashes that must be printed are doubled
+usage: $cmd <infile> <column> <splitter> <hits> <outfile>
+
+    INPUT:  infile      Input original tabular/text
+
+            column      Input column number to use (>= 1)
+
+            splitter    Splitter char to use (i.e. \\t for tab)
+                        For splitting on pipe use escaping: \\|
+                        Combined splits possible: -|\\| splits both on - as |
+
+            hits        Number of hits to keep (in chronological order)
+                        The results are NOT sorted!
+
+    OUTPUT: outfile     Output filename of filtered table.
+
+EOF
+}
+#end script
\ No newline at end of file