view tools/taxonomy/gi2taxonomy.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
line wrap: on
line source

import sys
import string
import tempfile
import subprocess
from os import path

# -----------------------------------------------------------------------------------

def stop_err(msg):
    sys.stderr.write(msg)
    sys.exit()

# -----------------------------------------------------------------------------------
def gi_name_to_sorted_list(file_name, gi_col, name_col):
    """ Suppose input file looks like this:
        a       2
        b       4
        c       5
        d       5
        where column 1 is gi_col and column 0 is name_col
        output of this function will look like this:
        [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']]
    """

    result = []
    try:
        F = open( file_name, 'r' )
        try:
            for line in F:
                file_cols = string.split(line.rstrip(), '\t')
                file_cols[gi_col] = int(  file_cols[gi_col] )
                result.append( [ file_cols[gi_col], file_cols[name_col] ] )
        except:
            print >>sys.stderr, 'Non numeric GI field...skipping'
            
    except Exception, e:
        stop_err('%s\n' % e)
    F.close()
    result.sort()
    return result   

# -----------------------------------------------------------------------------------

def collapse_repeating_gis( L ):
    """ Accepts 2-d array of gi-key pairs such as this
        L = [
                [gi1, 'key1'],
                [gi1, 'key2'],
                [gi2','key3']
            ]

         Returns this:
         [      [gi1, 'key1', 'key2'],
                [gi2, 'key3' ]
         ]
         
         The first value in each sublist MUST be int
    """
    gi = []
    i = 0
    result = []
    
    try:
        for item in L:
            if i == 0:
                prev = item[0]
            
            if prev != item[0]:
                prev_L = []
                prev_L.append( prev )
                result.append( prev_L + gi )
                prev = item[0]
                gi =[]
                
            gi.append( item[1] )
            i += 1
            
    except Exception, e:
        stop_err('%s\n' % e)
        
    prev_L = []
    prev_L.append( prev )
    result.append( prev_L + gi )
    del(L)
    return result

# -----------------------------------------------------------------------------------

def get_taxId( gi2tax_file, gi_name_list, out_file ):
    """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and
        prints result to out_file

        gi2tax_file MUST be sorted on GI column

        gi_name_list is a list that look slike this:
        [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']]
        where the first element of each sublist is a GI number
        this list MUST also be sorted on GI

        This function searches through 117,000,000 rows of gi2taxId file from NCBI
        in approximately 4 minutes. This time is not dependent on the length of
        gi_name_list
    """

    L = gi_name_list.pop(0)
    my_gi = L[0]
    F = open( out_file, 'w' )
    gi = 0
    for line in file( gi2tax_file ):
        line = line.rstrip()
        gi, taxId = string.split( line, '\t' )
        gi = int( gi )
        
        if gi > my_gi:
            try:
                while ( my_gi < gi ):
                    L = gi_name_list.pop(0)
                    my_gi = L[0]
            except:
                break
    
        if  gi == my_gi:
            for i in range( 1,len( L ) ):
                print >>F, '%s\t%s\t%d' % (L[i], taxId, gi)
            try:
                L = gi_name_list.pop(0)
                my_gi = L[0]
            except:
                break

# -----------------------------------------------------------------------------------


try:
    in_f          = sys.argv[1]            # input file with GIs
    gi_col        = int( sys.argv[2] ) - 1 # column in input containing GIs
    name_col      = int( sys.argv[3] ) - 1 # column containing sequence names
    out_f         = sys.argv[4]            # output file
    tool_data     = sys.argv[5]
except:
    stop_err('Check arguments\n')

#  GI2TAX point to a file produced by concatenation of:
#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip
#  and
#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip
#  a sorting using this command:
#  sort -n -k 1

GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' )

#  NAME_FILE and NODE_FILE point to names.dmg and nodes.dmg
#  files contained within:
#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz

NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' )
NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' )

g2n =  gi_name_to_sorted_list(in_f, gi_col, name_col)

if len(g2n) == 0:
    stop_err('No valid GI-containing fields. Please, check your column assignments.\n')

tb_F = tempfile.NamedTemporaryFile('w')

get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name )

try:
    tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f )
    retcode = subprocess.call( tb_cmd, shell=True )
    if retcode < 0:
        print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode
except OSError, e:
    print >>sys.stderr, "Execution of taxBuilder2tree failed:", e