Mercurial > repos > abossers > tophit_namefilter
diff TopHit_namefilter/TopHit_namefilter_galaxy.pl @ 0:9f1fe290345e default tip
Migrated tool version 0.1.Alx from old tool shed archive to new tool shed repository
author | abossers |
---|---|
date | Tue, 07 Jun 2011 18:07:34 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/perl -w

# Simple filter to keep just the TOPHIT / first occurrence(s) of some identifier.
# Useful for keeping only the first tophit in BLAST output when multiple hits
# are returned.
#
# Please be aware that NO additional filtering or checking is done on, for
# instance, E values of BLAST hits. Tophit = FIRST hit... not necessarily the best.
#
# input   list/table having some groupable identifier
# input   the column number to filter on (column number starts at 1)
# input   number of occurrences to keep
#           note that the hits are displayed in order of occurrence
#           and NOT sorted on the given column!
# input   column splitter (a regular expression, e.g. \t for TAB)
#           Note that: splitting on tab:  \t
#                      splitting on pipe: \|
#                      combined splits:   -|\|  (splits on '-' OR '|')
#
# output  the same table having only the FIRST <hits> occurrences of each
#         identifier.
#
# alex.bossers@wur.nl
#

use strict;
use warnings;

my $version = "v0.13.alx 19-5-2011";
# Version history
# 0.13 19-05-2011 added extra cmdline opt hits to keep -> first galaxy version
# 0.12 19-05-2011 mods to fit initial needs. Not distributed.
# 0.1  xx-xx-2010 template

# --- command line options -------------------------------------------------
# Count the arguments instead of testing the truth of $ARGV[4]; a truth test
# would wrongly reject an output argument such as "0".
if (@ARGV < 5) {
    warn "Error: not enough arguments\n";
    usage();
}

my ($input_arg, $column, $splitter, $hits, $output_arg) = @ARGV;

# File names are restricted to a conservative character whitelist.
my ($input)  = $input_arg  =~ m/^([A-Z0-9_.\-\/]+)$/i;
my ($output) = $output_arg =~ m/^([A-Z0-9_.\-\/]+)$/i;
die "Input file error: invalid file name '$input_arg'\n"   unless defined $input;
die "Output file error: invalid file name '$output_arg'\n" unless defined $output;

# Column (1-based) and hits must both be positive integers. Check the format
# first so that non-numeric input does not reach the numeric comparison.
for my $number ($column, $hits) {
    if ($number !~ /^[0-9]+$/ || $number < 1) {
        warn "Invalid column/hits number\n";
        usage();
    }
}

# The splitter is used as a regular expression; reject an invalid pattern
# before the output file gets created/truncated.
if (!defined eval { qr/$splitter/ }) {
    die "Invalid splitter pattern '$splitter': $@";
}

# --- bookkeeping ----------------------------------------------------------
my $entrycounter = 0;    # number of lines read
my $filter_count = 0;    # number of lines written

# --- open the files -------------------------------------------------------
open(my $in_fh,  '<', $input)  or die "Input file error: $!\n";
open(my $out_fh, '>', $output) or die "Output file error: $!\n";

# --- filter ---------------------------------------------------------------
# %seen_count maps the value found in the key column to the number of lines
# already written for that value.
my %seen_count;
while (my $line = <$in_fh>) {
    chomp $line;
    $entrycounter++;

    # $splitter is passed as a plain expression (not precompiled) so that
    # split's special handling of particular pattern strings is unchanged.
    my @fields = split($splitter, $line);

    # Lines with fewer fields than requested are grouped under the empty key.
    my $key = $fields[$column - 1];
    $key = '' unless defined $key;

    next if ($seen_count{$key} || 0) >= $hits;

    print {$out_fh} "$line\n";
    $seen_count{$key}++;
    $filter_count++;
}

# --- end and close --------------------------------------------------------
close($in_fh);
# Buffered write errors only surface at close, so check it.
close($out_fh) or die "Output file error: $!\n";

print "\nVersion : $version\nComments/bugs : alex.bossers\@wur.nl\n";
print "Processed : $entrycounter entries\n";
print "Filtered : $filter_count entries remain\n";

# usage()
# Prints version/contact information and the command line help to STDERR,
# then terminates the program via die. Never returns.
sub usage {
    warn "\nVersion: $version\nContact/bugs: alex.bossers\@wur.nl\n";
    my ($cmd) = $0 =~ m/([A-Z0-9_.-]+)$/i;
    $cmd = $0 unless defined $cmd;
    # The heredoc interpolates, so backslashes that must appear literally in
    # the help text are doubled.
    die <<EOF;
usage: $cmd <infile> <column> <splitter> <hits> <outfile>

  INPUT:  infile    Input original tabular/text

          column    Input column number to use (>= 1)

          splitter  Splitter char to use (i.e. \\t for tab)
                    For splitting on pipe use escaping: \\|
                    Combined splits possible: -|\\| splits both on - as |

          hits      Number of hits to keep (in chronological order)
                    The results are NOT sorted!

  OUTPUT: outfile   Output filename of filtered table.

EOF
}
#end script