view tools/filters/joinWrapper.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
line wrap: on
line source

#!/usr/bin/env python
#Guruprasad Ananda
"""
This tool provides the UNIX "join" functionality.
"""
import sys, os, tempfile, subprocess

def stop_err(msg):
    """Write *msg* to standard error and abort with a failing exit status.

    The message is written verbatim (no newline is appended).

    Raises:
        SystemExit: always, with exit code 1 so that callers and job
            runners that inspect the exit status see the failure.
    """
    sys.stderr.write(msg)
    # A bare sys.exit() would report success (status 0) despite the error.
    sys.exit(1)

def _file1_output_format(path):
    """Return a join ``-o`` field list naming every column of file 1.

    The column count is taken from the first non-empty line of *path*
    (tab-separated).  Returns ``""`` when the file has no such line.
    """
    # Binary mode: only tabs are counted, so no text decoding is needed.
    with open(path, "rb") as handle:
        for line in handle:
            # Strip the line terminator only; leading/trailing tabs delimit
            # empty fields and must still be counted as columns.
            line = line.rstrip(b"\r\n")
            if line:
                ncols = line.count(b"\t") + 1
                return ",".join("1.%d" % col for col in range(1, ncols + 1))
    return ""


def _join_flags():
    """Return the extra command-line flags the local ``join`` needs.

    GNU join accepts ``--version`` while BSD join does not (it exits with a
    non-zero status).  From GNU version 7 onwards join checks input ordering
    and raises an error on duplicated items, so ``--nocheck-order`` is
    requested for those versions.  Any failure to determine the version
    yields no extra flags.
    """
    try:
        with open(os.devnull, "w") as devnull:
            probe = subprocess.Popen(["join", "--version"],
                                     stdout=subprocess.PIPE, stderr=devnull)
            stdout, _ = probe.communicate()
    except OSError:
        return []
    if probe.returncode != 0:
        return []
    lines = stdout.decode("utf-8", "replace").splitlines()
    tokens = lines[0].split() if lines else []
    if not tokens:
        return []
    # Only the major component matters; versions may have any number of
    # dot-separated parts (e.g. "8.32" or "8.4.1").
    major = tokens[-1].split(".")[0]
    if major.isdigit() and int(major) >= 7:
        return ["--nocheck-order"]
    return []


def main():
    """Join two tab-separated files on the given fields using UNIX join.

    Command-line arguments (``sys.argv``):
        1. first input file
        2. second input file
        3. join field (1-based) in the first file
        4. join field (1-based) in the second file
        5. mode: ``"V"`` keeps the lines of the first file that have no
           match in the second (``join -v 1``); any other value keeps the
           lines that do match
        6. output file

    Both inputs are sorted on their join field into temporary files first.
    The output contains only the columns of the first file.  Failures of
    ``sort`` or ``join`` are reported through ``stop_err``.
    """
    infile1 = sys.argv[1]
    infile2 = sys.argv[2]
    field1 = int(sys.argv[3])
    field2 = int(sys.argv[4])
    mode = sys.argv[5]
    outfile = sys.argv[6]

    tmpfile1 = tempfile.NamedTemporaryFile()
    tmpfile2 = tempfile.NamedTemporaryFile()
    try:
        # Sort the two files based on the specified fields.  Argument lists
        # (no shell) keep paths with spaces or metacharacters intact, and
        # check_call surfaces a failing sort instead of ignoring it.
        try:
            for field, source, target in ((field1, infile1, tmpfile1),
                                          (field2, infile2, tmpfile2)):
                subprocess.check_call(["sort", "-t", "\t",
                                       "-k", "%d,%d" % (field, field),
                                       "-o", target.name, source])
        except (OSError, subprocess.CalledProcessError) as exc:
            stop_err('Initialization error -> %s' % str(exc))

        option = _file1_output_format(tmpfile1.name)
        if not option:
            # Nothing to join: the first file has no records, so the result
            # is empty in either mode.  Avoid running join with a bare -o.
            open(outfile, "w").close()
            return

        cmd = ["join"] + _join_flags() + ["-t", "\t"]
        if mode == "V":
            cmd += ["-v", "1"]
        cmd += ["-o", option, "-1", str(field1), "-2", str(field2),
                tmpfile1.name, tmpfile2.name]

        try:
            with open(outfile, "wb") as out:
                subprocess.check_call(cmd, stdout=out)
        except (OSError, subprocess.CalledProcessError) as exj:
            stop_err('Error joining the two datasets -> %s' % str(exj))
    finally:
        # Closing a NamedTemporaryFile also removes it from disk.
        tmpfile1.close()
        tmpfile2.close()
       
# Run the join only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()