| 
0
 | 
     1 import sys
 | 
| 
 | 
     2 import string
 | 
| 
 | 
     3 import tempfile
 | 
| 
 | 
     4 import subprocess
 | 
| 
 | 
     5 from os import path
 | 
| 
 | 
     6 
 | 
| 
 | 
     7 # -----------------------------------------------------------------------------------
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 def stop_err(msg):
 | 
| 
 | 
    10     sys.stderr.write(msg)
 | 
| 
 | 
    11     sys.exit()
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 # -----------------------------------------------------------------------------------
 | 
| 
 | 
    14 def gi_name_to_sorted_list(file_name, gi_col, name_col):
 | 
| 
 | 
    15     """ Suppose input file looks like this:
 | 
| 
 | 
    16         a       2
 | 
| 
 | 
    17         b       4
 | 
| 
 | 
    18         c       5
 | 
| 
 | 
    19         d       5
 | 
| 
 | 
    20         where column 1 is gi_col and column 0 is name_col
 | 
| 
 | 
    21         output of this function will look like this:
 | 
| 
 | 
    22         [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']]
 | 
| 
 | 
    23     """
 | 
| 
 | 
    24 
 | 
| 
 | 
    25     result = []
 | 
| 
 | 
    26     try:
 | 
| 
 | 
    27         F = open( file_name, 'r' )
 | 
| 
 | 
    28         try:
 | 
| 
 | 
    29             for line in F:
 | 
| 
 | 
    30                 file_cols = string.split(line.rstrip(), '\t')
 | 
| 
 | 
    31                 file_cols[gi_col] = int(  file_cols[gi_col] )
 | 
| 
 | 
    32                 result.append( [ file_cols[gi_col], file_cols[name_col] ] )
 | 
| 
 | 
    33         except:
 | 
| 
 | 
    34             print >>sys.stderr, 'Non numeric GI field...skipping'
 | 
| 
 | 
    35             
 | 
| 
 | 
    36     except Exception, e:
 | 
| 
 | 
    37         stop_err('%s\n' % e)
 | 
| 
 | 
    38     F.close()
 | 
| 
 | 
    39     result.sort()
 | 
| 
 | 
    40     return result   
 | 
| 
 | 
    41 
 | 
| 
 | 
    42 # -----------------------------------------------------------------------------------
 | 
| 
 | 
    43 
 | 
| 
 | 
    44 def collapse_repeating_gis( L ):
 | 
| 
 | 
    45     """ Accepts 2-d array of gi-key pairs such as this
 | 
| 
 | 
    46         L = [
 | 
| 
 | 
    47                 [gi1, 'key1'],
 | 
| 
 | 
    48                 [gi1, 'key2'],
 | 
| 
 | 
    49                 [gi2','key3']
 | 
| 
 | 
    50             ]
 | 
| 
 | 
    51 
 | 
| 
 | 
    52          Returns this:
 | 
| 
 | 
    53          [      [gi1, 'key1', 'key2'],
 | 
| 
 | 
    54                 [gi2, 'key3' ]
 | 
| 
 | 
    55          ]
 | 
| 
 | 
    56          
 | 
| 
 | 
    57          The first value in each sublist MUST be int
 | 
| 
 | 
    58     """
 | 
| 
 | 
    59     gi = []
 | 
| 
 | 
    60     i = 0
 | 
| 
 | 
    61     result = []
 | 
| 
 | 
    62     
 | 
| 
 | 
    63     try:
 | 
| 
 | 
    64         for item in L:
 | 
| 
 | 
    65             if i == 0:
 | 
| 
 | 
    66                 prev = item[0]
 | 
| 
 | 
    67             
 | 
| 
 | 
    68             if prev != item[0]:
 | 
| 
 | 
    69                 prev_L = []
 | 
| 
 | 
    70                 prev_L.append( prev )
 | 
| 
 | 
    71                 result.append( prev_L + gi )
 | 
| 
 | 
    72                 prev = item[0]
 | 
| 
 | 
    73                 gi =[]
 | 
| 
 | 
    74                 
 | 
| 
 | 
    75             gi.append( item[1] )
 | 
| 
 | 
    76             i += 1
 | 
| 
 | 
    77             
 | 
| 
 | 
    78     except Exception, e:
 | 
| 
 | 
    79         stop_err('%s\n' % e)
 | 
| 
 | 
    80         
 | 
| 
 | 
    81     prev_L = []
 | 
| 
 | 
    82     prev_L.append( prev )
 | 
| 
 | 
    83     result.append( prev_L + gi )
 | 
| 
 | 
    84     del(L)
 | 
| 
 | 
    85     return result
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 # -----------------------------------------------------------------------------------
 | 
| 
 | 
    88 
 | 
| 
 | 
    89 def get_taxId( gi2tax_file, gi_name_list, out_file ):
 | 
| 
 | 
    90     """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and
 | 
| 
 | 
    91         prints result to out_file
 | 
| 
 | 
    92 
 | 
| 
 | 
    93         gi2tax_file MUST be sorted on GI column
 | 
| 
 | 
    94 
 | 
| 
 | 
    95         gi_name_list is a list that look slike this:
 | 
| 
 | 
    96         [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']]
 | 
| 
 | 
    97         where the first element of each sublist is a GI number
 | 
| 
 | 
    98         this list MUST also be sorted on GI
 | 
| 
 | 
    99 
 | 
| 
 | 
   100         This function searches through 117,000,000 rows of gi2taxId file from NCBI
 | 
| 
 | 
   101         in approximately 4 minutes. This time is not dependent on the length of
 | 
| 
 | 
   102         gi_name_list
 | 
| 
 | 
   103     """
 | 
| 
 | 
   104 
 | 
| 
 | 
   105     L = gi_name_list.pop(0)
 | 
| 
 | 
   106     my_gi = L[0]
 | 
| 
 | 
   107     F = open( out_file, 'w' )
 | 
| 
 | 
   108     gi = 0
 | 
| 
 | 
   109     for line in file( gi2tax_file ):
 | 
| 
 | 
   110         line = line.rstrip()
 | 
| 
 | 
   111         gi, taxId = string.split( line, '\t' )
 | 
| 
 | 
   112         gi = int( gi )
 | 
| 
 | 
   113         
 | 
| 
 | 
   114         if gi > my_gi:
 | 
| 
 | 
   115             try:
 | 
| 
 | 
   116                 while ( my_gi < gi ):
 | 
| 
 | 
   117                     L = gi_name_list.pop(0)
 | 
| 
 | 
   118                     my_gi = L[0]
 | 
| 
 | 
   119             except:
 | 
| 
 | 
   120                 break
 | 
| 
 | 
   121     
 | 
| 
 | 
   122         if  gi == my_gi:
 | 
| 
 | 
   123             for i in range( 1,len( L ) ):
 | 
| 
 | 
   124                 print >>F, '%s\t%s\t%d' % (L[i], taxId, gi)
 | 
| 
 | 
   125             try:
 | 
| 
 | 
   126                 L = gi_name_list.pop(0)
 | 
| 
 | 
   127                 my_gi = L[0]
 | 
| 
 | 
   128             except:
 | 
| 
 | 
   129                 break
 | 
| 
 | 
   130 
 | 
| 
 | 
   131 # -----------------------------------------------------------------------------------
 | 
| 
 | 
   132 
 | 
| 
 | 
   133 
 | 
| 
 | 
   134 try:
 | 
| 
 | 
   135     in_f          = sys.argv[1]            # input file with GIs
 | 
| 
 | 
   136     gi_col        = int( sys.argv[2] ) - 1 # column in input containing GIs
 | 
| 
 | 
   137     name_col      = int( sys.argv[3] ) - 1 # column containing sequence names
 | 
| 
 | 
   138     out_f         = sys.argv[4]            # output file
 | 
| 
 | 
   139     tool_data     = sys.argv[5]
 | 
| 
 | 
   140 except:
 | 
| 
 | 
   141     stop_err('Check arguments\n')
 | 
| 
 | 
   142 
 | 
| 
 | 
   143 #  GI2TAX point to a file produced by concatenation of:
 | 
| 
 | 
   144 #  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip
 | 
| 
 | 
   145 #  and
 | 
| 
 | 
   146 #  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip
 | 
| 
 | 
   147 #  a sorting using this command:
 | 
| 
 | 
   148 #  sort -n -k 1
 | 
| 
 | 
   149 
 | 
| 
 | 
   150 GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' )
 | 
| 
 | 
   151 
 | 
| 
 | 
   152 #  NAME_FILE and NODE_FILE point to names.dmg and nodes.dmg
 | 
| 
 | 
   153 #  files contained within:
 | 
| 
 | 
   154 #  ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
 | 
| 
 | 
   155 
 | 
| 
 | 
   156 NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' )
 | 
| 
 | 
   157 NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' )
 | 
| 
 | 
   158 
 | 
| 
 | 
   159 g2n =  gi_name_to_sorted_list(in_f, gi_col, name_col)
 | 
| 
 | 
   160 
 | 
| 
 | 
   161 if len(g2n) == 0:
 | 
| 
 | 
   162     stop_err('No valid GI-containing fields. Please, check your column assignments.\n')
 | 
| 
 | 
   163 
 | 
| 
 | 
   164 tb_F = tempfile.NamedTemporaryFile('w')
 | 
| 
 | 
   165 
 | 
| 
 | 
   166 get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name )
 | 
| 
 | 
   167 
 | 
| 
 | 
   168 try:
 | 
| 
 | 
   169     tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f )
 | 
| 
 | 
   170     retcode = subprocess.call( tb_cmd, shell=True )
 | 
| 
 | 
   171     if retcode < 0:
 | 
| 
 | 
   172         print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode
 | 
| 
 | 
   173 except OSError, e:
 | 
| 
 | 
   174     print >>sys.stderr, "Execution of taxBuilder2tree failed:", e
 |