Mercurial > repos > abossers > tophit_namefilter
comparison TopHit_namefilter/TopHit_namefilter_galaxy.pl @ 0:9f1fe290345e default tip
Migrated tool version 0.1.Alx from old tool shed archive to new tool shed repository
author | abossers |
---|---|
date | Tue, 07 Jun 2011 18:07:34 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9f1fe290345e |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 # Simple filter to keep just the TOPHIT / first occurrence of some identifier | |
4 # useful for keeping only the first tophit in blast when multiple hits are returned | |
5 # | |
6 # Please be aware that NO additional filtering or checking is done on for instance | |
7 # E values of BLAST hits. Tophit = FIRST hit...not necessarily the best.. | |
8 # | |
9 # input list/table having some groupable identifier | |
10 # input the column number to filter on (column number starts at 1) | |
11 # input number of occurrences to keep | |
12 # note that the hits are displayed in order of occurrence | |
13 # and NOT sorted on given column! | |
14 # column splitter (default TAB) | |
15 # Note that: splitting on tab: \t | |
16 # splitting on pipe: \| | |
17 # combined splits: -|\| (splits on '-' OR '|') | |
18 # | |
19 # output the same table having only the FIRST occurrence of the identifier. | |
20 # | |
21 # alex.bossers@wur.nl | |
22 # | |
23 | |
my $version = "v0.13.alx 19-5-2011";
# Version history
#  0.13  19-05-2011  added extra cmdline opt hits to keep -> first galaxy version
#  0.12  19-05-2011  mods to fit initial needs. Not distributed.
#  0.1   xx-xx-2010  template

use strict;
use warnings;

# Main script: copy the input table to the output file, keeping only the
# first <hits> rows seen for every distinct value in the selected column.
# Rows are written in input order; nothing is sorted.
#
# Command line: <infile> <column> <splitter> <hits> <outfile>

# Count the arguments rather than testing the truth of $ARGV[4], so that an
# output name that is false in boolean context (e.g. "0") is still accepted.
if (@ARGV < 5) {
    warn "Error: not enough arguments\n";
    usage();
}

# File names must consist of a conservative character set only; the capture
# is undef when the name contains anything else.
my ($input)  = $ARGV[0] =~ m/^([A-Z0-9_.\-\/]+)$/i;
my $column   = $ARGV[1];    # column numbers start at 1!
my $splitter = $ARGV[2];    # field separator pattern (might need enclosing "")
my $hits     = $ARGV[3];    # number of occurrences to keep
my ($output) = $ARGV[4] =~ m/^([A-Z0-9_.\-\/]+)$/i;

# Fail with a clear message instead of handing undef to open().
if (!defined $input || !defined $output) {
    warn "Error: invalid characters in input/output file name\n";
    usage();
}

# Require plain positive integers; non-numeric values would otherwise only
# trigger "isn't numeric" warnings and be treated as 0.
if ($column !~ m/^\d+$/ || $hits !~ m/^\d+$/ || $column < 1 || $hits < 1) {
    warn "Invalid column/hits number\n";
    usage();
}

# keeping track
my $entrycounter = 0;    # rows read from the input
my $filter_count = 0;    # rows written to the output

# Open the files. Three-argument open with lexical handles: the names are
# taken literally and can never be interpreted as an open mode.
open(my $in_fh,  '<', $input)  or die "Input file error: $!\n";
open(my $out_fh, '>', $output) or die "Output file error: $!\n";

# %filtered maps each key (the value of the selected column) to the number
# of rows already written for that key.
my %filtered;
while (my $line = <$in_fh>) {
    chomp $line;
    $entrycounter++;

    my @fields = split($splitter, $line);

    # Rows with fewer fields than $column are grouped under the empty key
    # instead of raising an "uninitialized value" warning for every row.
    my $key = $fields[$column - 1];
    $key = '' unless defined $key;

    my $seen = $filtered{$key} || 0;
    next if $seen >= $hits;    # quota for this key already reached

    print {$out_fh} "$line\n";
    $filtered{$key} = $seen + 1;
    $filter_count++;
}

# end and close; buffered write errors only surface when the output handle
# is closed, so that close is checked.
close($in_fh);
close($out_fh) or die "Output file error: $!\n";

print "\nVersion : $version\nComments/bugs : alex.bossers\@wur.nl\n";
print "Processed : $entrycounter entries\n";
print "Filtered : $filter_count entries remain\n";
86 | |
# Print version/contact information and the command line synopsis to STDERR,
# then terminate the script through die. This sub never returns.
#
# Reads the file-level $version and derives the script name from $0.
sub usage {
    warn "\nVersion: $version\nContact/bugs: alex.bossers\@wur.nl\n";
    my ($cmd) = $0 =~ m/([A-Z0-9_.-]+)$/ig;

    # The heredoc interpolates, so backslashes that must appear literally in
    # the help text (\t, \|) are doubled; otherwise "\t" would print a real
    # tab and "\|" a bare pipe.
    die <<EOF;
usage: $cmd <infile> <column> <splitter> <hits> <outfile>

INPUT:   infile     Input original tabular/text

         column     Input column number to use (>= 1)

         splitter   Splitter char to use (i.e. \\t for tab)
                    For splitting on pipe use escaping: \\|
                    Combined splits possible: -|\\| splits both on - as |

         hits       Number of hits to keep (in chronological order)
                    The results are NOT sorted!

OUTPUT:  outfile    Output filename of filtered table.

EOF
}
108 #end script |