Mercurial > repos > xuebing > sharplabtool
diff tools/filters/uniq.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
line wrap: on
line diff
# Filename: uniq.py
# Author: Ian N. Schenck
# Version: 19/12/2005
#
# This script accepts an input file, an output file, a column
# delimiter, and a list of columns.  The script then grabs unique
# lines based on the columns, and returns those records with a count
# of occurrences of each unique column, inserted before the columns.
#
# This executes the command pipeline:
#       cut [-d $delim] -f $fields $input | sed 's/ //g' | sort
#           | uniq -c | sed 's/^ *//' | tr ' ' '\t' > $output
#
# -i            Input file
# -o            Output file
# -d            Delimiter
# -c            Column list (Comma Separated)

import sys
import re
import string
import subprocess

# Delimiter codes accepted by -d, mapped to the character given to
# `cut -d`.  Any code not listed here (including "T") adds no -d option,
# so cut falls back to its default delimiter, the tab.
_DELIMITERS = {
    "C": ",",
    "D": "-",
    "U": "_",
    "P": "|",
    "Dt": ".",
    "Sp": " ",
}

# Both patterns are anchored with \Z rather than $ because $ also matches
# just before a trailing newline, and these values end up in a shell
# command line.
_FILE_RE = re.compile(r"^[A-Za-z0-9./\-_]+\Z")
_COLUMNS_RE = re.compile(r"^[0-9]+(,[0-9]+)*\Z")


# This function is exceedingly useful, perhaps package for reuse?
def getopts(argv):
    """Collect ``-flag value`` pairs from *argv* into a dict.

    Every argument that starts with "-" is taken as a flag and the
    argument right after it as its value (even if that value itself
    starts with "-").  Arguments that are not flags are skipped.  When a
    flag is repeated the last value wins.

    Raises IndexError when a flag is the final argument and therefore has
    no value; main() relies on this to print the usage text.
    """
    opts = {}
    index = 0
    while index < len(argv):
        arg = argv[index]
        # startswith() instead of arg[0] so that an empty-string argument
        # is skipped rather than raising IndexError.
        if arg.startswith("-"):
            opts[arg] = argv[index + 1]
            index += 2
        else:
            index += 1
    return opts


def _print_usage():
    """Print the command-line help text to stdout."""
    print("Usage:")
    print(" -i Input file")
    print(" -o Output file")
    print(" -c Column list (comma seperated)")
    print(" -d Delimiter:")
    print(" T Tab")
    print(" C Comma")
    print(" D Dash")
    print(" U Underscore")
    print(" P Pipe")
    print(" Dt Dot")
    print(" Sp Space")


def _build_commandline(columns, delim, inputfile, outputfile):
    """Return the shell pipeline that counts unique column values.

    All arguments must already have been validated by main(); they are
    inserted into the command line without further quoting.
    """
    cut = "cut "
    if delim in _DELIMITERS:
        cut += "-d '%s' " % _DELIMITERS[delim]
    cut += "-f " + columns + " " + inputfile
    # The sed/tr arguments are single-quoted so the shell passes them
    # through untouched (the "*" in the second sed script would otherwise
    # be subject to pathname expansion).
    return (cut
            + " | sed 's/ //g'"       # strip every space from the cut output
            + " | sort | uniq -c"     # count identical lines
            + " | sed 's/^ *//'"      # drop the padding uniq puts before the count
            + " | tr ' ' '\\t'"       # remaining spaces (after the count) become tabs
            + " > " + outputfile)


def main():
    """Parse sys.argv, validate the options and run the uniq pipeline.

    Returns:
        0   when the usage text was printed (a flag had no value),
        -1  no output file given,
        -2  no input file given,
        -3  no delimiter given,
        -4  columns missing or not a comma-separated list of numbers,
        -5  output filename contains characters outside [A-Za-z0-9./_-],
        -6  input filename contains characters outside [A-Za-z0-9./_-],
        otherwise the exit status the shell reports for the pipeline.
    """
    try:
        opts = getopts(sys.argv[1:])
    except IndexError:
        _print_usage()
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # All inputs have been specified at this point, now validate.  The
    # values are pasted into a shell command, so each pattern must match
    # the whole string, not just a prefix.
    if not _COLUMNS_RE.match(columns):
        print("Illegal column specification.")
        return -4
    if not _FILE_RE.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not _FILE_RE.match(inputfile):
        print("Illegal input filename.")
        return -6

    commandline = _build_commandline(columns, delim, inputfile, outputfile)

    # subprocess replaces the Python-2-only `commands` module.  As with
    # commands.getstatusoutput(), stdout and stderr of the pipeline are
    # captured together and discarded; only the status is kept.
    proc = subprocess.Popen(commandline, shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT)
    proc.communicate()

    columns_for_display = ", ".join("c" + col for col in columns.split(","))
    print("Count of unique values in " + columns_for_display)
    return proc.returncode


if __name__ == "__main__":
    # Hand main()'s result to the OS as the process exit status.
    sys.exit(main())