Mercurial > repos > xuebing > sharplabtool
comparison tools/filters/uniq.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 # Filename: uniq.py | |
2 # Author: Ian N. Schenck | |
3 # Version: 19/12/2005 | |
4 # | |
5 # This script accepts an input file, an output file, a column | |
6 # delimiter, and a list of columns. The script then grabs unique | |
7 # lines based on the columns, and returns those records with a count | |
8 # of occurrences of each unique column, inserted before the columns. | |
9 # | |
10 # This executes the command pipeline: | |
11 # cut -f $fields | sort | uniq -c | |
12 # | |
13 # -i Input file | |
14 # -o Output file | |
15 # -d Delimiter | |
16 # -c Column list (Comma Separated) | |
17 | |
18 import sys | |
19 import re | |
20 import string | |
21 import commands | |
22 | |
23 # This function is exceedingly useful, perhaps package for reuse? | |
def getopts(argv):
    """Collect ``-flag value`` pairs from an argument list into a dict.

    Every argument that starts with '-' is taken as an option name, and the
    argument immediately following it is taken as that option's value (even
    if the value itself starts with '-').  Arguments that do not start with
    '-' are skipped.  When a flag is repeated, the last value wins.

    argv -- sequence of argument strings; it is not modified.

    Returns a dict mapping each flag (leading '-' included) to its value.

    Raises IndexError when a flag is the last argument and therefore has no
    value after it.
    """
    opts = {}
    pos = 0
    total = len(argv)
    while pos < total:
        arg = argv[pos]
        # startswith() instead of arg[0]: an empty-string argument has no
        # first character and must simply be skipped, not raise IndexError.
        if arg.startswith('-'):
            # Deliberately unguarded: a missing value raises IndexError.
            opts[arg] = argv[pos + 1]
            pos += 2
        else:
            pos += 1
    return opts
33 | |
def main():
    """Count the distinct values of selected columns with a shell pipeline.

    Options are taken from sys.argv[1:]:
      -i  input file name
      -o  output file name
      -d  delimiter code: C, D, U, P, Dt or Sp.  Any other code (including
          the documented T) adds no -d flag, leaving cut on its default
          delimiter, which is tab.
      -c  comma-separated list of column numbers

    The command that is built and run through the shell is:
      cut [-d X] -f COLS INPUT | sed (drop spaces) | sort | uniq -c
          | sed (drop leading spaces) | tr (space -> tab) > OUTPUT

    Returns 0 after printing the usage text, -1 to -6 for a missing or
    illegal argument, and otherwise the status value reported by
    commands.getstatusoutput for the pipeline.
    """
    # print(...) with a single argument produces the same output as the
    # print statements this replaces.
    try:
        opts = getopts(sys.argv[1:])
    except IndexError:
        # getopts() reads the argument after each flag, so a flag given
        # without a value surfaces here as IndexError.
        for usage_line in (
            "Usage:",
            " -i Input file",
            " -o Output file",
            " -c Column list (comma seperated)",
            " -d Delimiter:",
            " T Tab",
            " C Comma",
            " D Dash",
            " U Underscore",
            " P Pipe",
            " Dt Dot",
            " Sp Space",
        ):
            print(usage_line)
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # Everything below is spliced into a shell command line, so validation
    # must cover the WHOLE string.  Both patterns are anchored at the start
    # and at the true end (\Z, which unlike $ does not tolerate a trailing
    # newline).  The column pattern used to be unanchored, letting input
    # such as "1;some-command" through to the shell.
    file_pattern = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
    column_pattern = re.compile(r"^[0-9]+(,[0-9]+)*\Z")

    if not column_pattern.match(columns):
        print("Illegal column specification.")
        return -4
    if not file_pattern.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not file_pattern.match(inputfile):
        print("Illegal input filename.")
        return -6

    # Human-readable column list for the summary line, e.g. "c1, c2, ".
    columns_for_display = "".join(
        ["c" + col + ", " for col in columns.split(",")])

    # Delimiter code -> literal character handed to cut -d.
    delimiter_chars = {
        'C': ',',
        'D': '-',
        'U': '_',
        'P': '|',
        'Dt': '.',
        'Sp': ' ',
    }

    commandline = "cut "
    if delim in delimiter_chars:
        commandline += '-d "%s" ' % delimiter_chars[delim]
    commandline += "-f " + columns
    # The "\t" below is a real tab character inside the tr argument.
    commandline += (" " + inputfile
                    + " | sed s/\\ //g | sort | uniq -c | sed s/^\\ *//"
                    + ' | tr " " "\t" > ' + outputfile)

    errorcode, diagnostics = commands.getstatusoutput(commandline)
    # The pipeline's stdout goes to the output file, so anything captured
    # here is diagnostic text from the tools; surface it instead of
    # discarding it.
    if diagnostics:
        sys.stderr.write(diagnostics + "\n")

    print("Count of unique values in " + columns_for_display)
    return errorcode
115 | |
if __name__ == "__main__":
    # Report failure to the caller instead of discarding main()'s result.
    # The result is collapsed to 0/1 because main() returns negative codes
    # and raw command status values, which do not map cleanly onto the
    # 0-255 range of a process exit status.
    status = main()
    sys.exit(1 if status else 0)