comparison TopHit_namefilter/TopHit_namefilter_galaxy.pl @ 0:9f1fe290345e default tip

Migrated tool version 0.1.Alx from old tool shed archive to new tool shed repository
author abossers
date Tue, 07 Jun 2011 18:07:34 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9f1fe290345e
1 #!/usr/bin/perl -w
2
3 # Simple filter to keep just the TOPHIT / first occurrence of some identifier
4 # useful for keeping only the first tophit in blast when multiple hits are returned
5 #
6 # Please be aware that NO additional filtering or checking is done on for instance
7 # E values of BLAST hits. Tophit = FIRST hit...not necessarily the best..
8 #
9 # input list/table having some groupable identifier
10 # input the column number to filter on (column number starts at 1)
11 # input number of occurrences to keep
12 # note that the hits are displayed in order of occurrence
13 # and NOT sorted on given column!
14 # column splitter (default TAB)
15 # Note that: splitting on tab: \t
16 # splitting on pipe: \|
17 # combined splits: -|\| (splits on '-' OR '|')
18 #
19 # output the same table having only the FIRST occurrence of the identifier.
20 #
21 # alex.bossers@wur.nl
22 #
23
24 my $version = "v0.13.alx 19-5-2011";
25 # Version history
26 # 0.13 19-05-2011 added extra cmdline opt hits to keep -> first galaxy version
27 # 0.12 19-05-2011 mods to fit initial needs. Not distributed.
28 # 0.1 xx-xx-2010 template
29
30 use strict;
31 use warnings;
32
# Command line options.
# Five positional arguments are expected:
#   <infile> <column> <splitter> <hits> <outfile>
# The arguments are counted rather than testing the truth of $ARGV[4], so that
# an output file literally named "0" is not mistaken for a missing argument.
if (@ARGV < 5) {
    warn "Error: not enough arguments\n";
    usage();
}

# File names are only accepted when they consist of letters, digits and the
# characters _ . - / ; anything else leaves the variable undefined.
my ($input)  = $ARGV[0] =~ m/^([A-Z0-9_.\-\/]+)$/i;
my $column   = $ARGV[1];    # column numbers start at 1!
my $splitter = $ARGV[2];    # splitter for fields to use (might need enclosing "")
my $hits     = $ARGV[3];    # number of occurrences to keep
my ($output) = $ARGV[4] =~ m/^([A-Z0-9_.\-\/]+)$/i;

# Report a rejected file name explicitly instead of letting open() fail later
# on an undefined value.
if (!defined $input || !defined $output) {
    warn "Error: file names may only contain letters, digits and the characters _ . - /\n";
    usage();
}

# Both numbers must be positive integers; checking the format first avoids
# "isn't numeric" warnings from the numeric comparison.
if ($column !~ /^\d+$/ || $hits !~ /^\d+$/ || $column < 1 || $hits < 1) {
    warn "Invalid column/hits number\n";
    usage();
}

# Keeping track.
my $entrycounter = 0;    # number of lines read from the input
my $filter_count = 0;    # number of lines written to the output

# Open the files (three-argument open, lexical handles).
open(my $in_fh,  '<', $input)  or die "Input file error: $!\n";
open(my $out_fh, '>', $output) or die "Output file error: $!\n";

# %filtered maps the value found in the chosen column to the number of lines
# already written for that value.
my %filtered;
while (my $line = <$in_fh>) {
    chomp $line;
    # $splitter is used as the split pattern, so it may be a regular
    # expression such as \t, \| or -|\|
    my @fields = split($splitter, $line);
    $entrycounter++;

    # A line with fewer fields than $column has no value in that column;
    # group such lines under the empty string instead of using undef as a
    # hash key (which triggers warnings).
    my $key = $fields[$column - 1];
    $key = '' if !defined $key;

    # Keep the line only while fewer than $hits lines with this key were kept.
    my $kept = $filtered{$key} || 0;
    next if $kept >= $hits;

    print {$out_fh} "$line\n";
    $filtered{$key} = $kept + 1;
    $filter_count++;
}

# End and close. Buffered write errors only surface when the output handle is
# closed, so that close is checked.
close($in_fh);
close($out_fh) or die "Output file error: $!\n";

print "\nVersion : $version\nComments/bugs : alex.bossers\@wur.nl\n";
print "Processed : $entrycounter entries\n";
print "Filtered : $filter_count entries remain\n";
86
# usage()
#
# Prints the version/contact banner and the command-line synopsis to STDERR
# and terminates the script via die. Takes no arguments and never returns.
# Reads the file-level $version variable.
sub usage {
    warn "\nVersion: $version\nContact/bugs: alex.bossers\@wur.nl\n";

    # Show only the trailing name part of $0; fall back to $0 itself when the
    # pattern does not match, so the synopsis never interpolates undef.
    my ($cmd) = $0 =~ m/([A-Z0-9_.-]+)$/i;
    $cmd = $0 if !defined $cmd;

    # The heredoc interpolates, so backslashes that must appear literally in
    # the help text (\t, \|) are doubled; a single backslash would be consumed
    # as an escape and print a real tab / a bare pipe instead.
    die <<EOF;
usage: $cmd <infile> <column> <splitter> <hits> <outfile>

INPUT:  infile    Input original tabular/text

        column    Input column number to use (>= 1)

        splitter  Splitter char to use (i.e. \\t for tab)
                  For splitting on pipe use escaping: \\|
                  Combined splits possible: -|\\| splits both on - as |

        hits      Number of hits to keep (in chronological order)
                  The results are NOT sorted!

OUTPUT: outfile   Output filename of filtered table.

EOF
}
108 #end script