view tools/filters/uniq.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
line wrap: on
line source

# Filename: uniq.py
# Author: Ian N. Schenck
# Version: 19/12/2005
#
# This script accepts an input file, an output file, a column
# delimiter, and a list of columns.  The script then grabs unique
# lines based on the columns, and returns those records with a count
# of occurrences of each unique column, inserted before the columns.
#
# This executes the command pipeline:
#       cut [-d $delim] -f $fields $input | sed 's/ //g' | sort | uniq -c
#           | sed 's/^ *//' | tr ' ' '\t' > $output
#
# -i            Input file
# -o            Output file
# -d            Delimiter code (T, C, D, U, P, Dt or Sp)
# -c            Column list (Comma Separated)

import sys
import re
import string
import commands

# This function is exceedingly useful, perhaps package for reuse?
def getopts(argv):
    """Collect ``-flag value`` pairs from a list of command-line arguments.

    Every argument that starts with '-' is taken as an option name and the
    argument immediately after it as that option's value (even if the value
    itself starts with '-').  Arguments that do not start with '-',
    including empty strings, are skipped.

    argv -- sequence of argument strings, e.g. sys.argv[1:]; not modified.

    Returns a dict mapping each option name (leading dash included) to its
    value.  When an option is repeated, the last value wins.

    Raises IndexError when the last argument is an option with no value
    after it; main() catches this to print the usage text.
    """
    opts = {}
    pos = 0
    count = len(argv)
    while pos < count:
        arg = argv[pos]
        # startswith() instead of arg[0]: an empty-string argument must be
        # skipped rather than raise the IndexError that is reserved for
        # "option without a value".
        if arg.startswith('-'):
            opts[arg] = argv[pos + 1]  # IndexError here means missing value
            pos += 2
        else:
            pos += 1
    return opts

def main():
    """Count unique values of the selected columns with a shell pipeline.

    Options are read from sys.argv:
        -i  input file        -o  output file
        -d  delimiter code    -c  comma separated column numbers

    After validation the function runs, through the shell,
    ``cut | sed | sort | uniq -c | sed | tr`` and redirects the result to
    the output file, then prints a one-line summary to stdout.

    Returns:
         0  usage was printed (an option was given without a value)
        -1  no output file          -2  no input file
        -3  no delimiter            -4  columns missing or malformed
        -5  illegal output filename -6  illegal input filename
        otherwise the status reported by commands.getstatusoutput().
    """
    try:
        opts = getopts(sys.argv[1:])
    except IndexError:
        # getopts() raises IndexError when an option has no value after it.
        usage = (
            "Usage:",
            " -i        Input file",
            " -o        Output file",
            " -c        Column list (comma seperated)",
            " -d        Delimiter:",
            "                     T   Tab",
            "                     C   Comma",
            "                     D   Dash",
            "                     U   Underscore",
            "                     P   Pipe",
            "                     Dt  Dot",
            "                     Sp  Space",
        )
        for line in usage:
            print(line)
        return 0

    outputfile = opts.get("-o")
    if outputfile is None:
        print("No output file specified.")
        return -1

    inputfile = opts.get("-i")
    if inputfile is None:
        print("No input file specified.")
        return -2

    delim = opts.get("-d")
    if delim is None:
        print("Field delimiter not specified.")
        return -3

    columns = opts.get("-c")
    # The literal string 'None' is also treated as "not specified".
    if columns is None or columns == 'None':
        print("Columns not specified.")
        return -4

    # Everything below is spliced into a shell command line, so each value
    # must match its whitelist pattern in full.
    file_regex = re.compile(r"^[A-Za-z0-9./\-_]+$")
    # Anchored at the end with \Z: the previous pattern only checked the
    # start of the string, so "1;some command" passed validation and
    # reached the shell.
    column_regex = re.compile(r"[0-9]+(,[0-9]+)*\Z")

    if not column_regex.match(columns):
        print("Illegal column specification.")
        return -4
    if not file_regex.match(outputfile):
        print("Illegal output filename.")
        return -5
    if not file_regex.match(inputfile):
        print("Illegal input filename.")
        return -6

    # Delimiter code -> character handed to `cut -d`.  Any other code
    # (including 'T') adds no -d flag, leaving cut on its default delimiter.
    delimiter_chars = {
        'C': ',',
        'D': '-',
        'U': '_',
        'P': '|',
        'Dt': '.',
        'Sp': ' ',
    }

    commandline = "cut "
    if delim in delimiter_chars:
        commandline += '-d "%s" ' % delimiter_chars[delim]
    commandline += "-f " + columns
    # Pipeline stages after cut:
    #   sed s/\ //g   strip every space from the selected fields
    #   sort | uniq -c   count identical lines
    #   sed s/^\ *//  drop the padding uniq puts before the count
    #   tr " " TAB    turn the count/value separator into a tab
    commandline += (
        " " + inputfile
        + " | sed s/\\ //g | sort | uniq -c | sed s/^\\ *//"
        + " | tr \" \" \"\t\" > " + outputfile
    )
    errorcode, _output = commands.getstatusoutput(commandline)

    # join() avoids the dangling ", " the old string concatenation left
    # at the end of the summary.
    shown_columns = ", ".join("c" + col for col in columns.split(","))
    print("Count of unique values in " + shown_columns)
    return errorcode

if __name__ == "__main__":
    # main() returns 0 for success/usage and a non-zero code for every
    # failure; previously that value was dropped and the process always
    # exited 0.  The status is collapsed to 0/1 because main() may return
    # negative codes or a raw status from commands.getstatusoutput()
    # (NOTE(review): presumably a wait()-style value such as 256, which
    # would wrap to exit code 0 if passed to sys.exit() unchanged).
    sys.exit(1 if main() else 0)