0
|
1 import sys
|
|
2 import string
|
|
3 import tempfile
|
|
4 import subprocess
|
|
5 from os import path
|
|
6
|
|
7 # -----------------------------------------------------------------------------------
|
|
8
|
|
9 def stop_err(msg):
|
|
10 sys.stderr.write(msg)
|
|
11 sys.exit()
|
|
12
|
|
13 # -----------------------------------------------------------------------------------
|
|
14 def gi_name_to_sorted_list(file_name, gi_col, name_col):
|
|
15 """ Suppose input file looks like this:
|
|
16 a 2
|
|
17 b 4
|
|
18 c 5
|
|
19 d 5
|
|
20 where column 1 is gi_col and column 0 is name_col
|
|
21 output of this function will look like this:
|
|
22 [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']]
|
|
23 """
|
|
24
|
|
25 result = []
|
|
26 try:
|
|
27 F = open( file_name, 'r' )
|
|
28 try:
|
|
29 for line in F:
|
|
30 file_cols = string.split(line.rstrip(), '\t')
|
|
31 file_cols[gi_col] = int( file_cols[gi_col] )
|
|
32 result.append( [ file_cols[gi_col], file_cols[name_col] ] )
|
|
33 except:
|
|
34 print >>sys.stderr, 'Non numeric GI field...skipping'
|
|
35
|
|
36 except Exception, e:
|
|
37 stop_err('%s\n' % e)
|
|
38 F.close()
|
|
39 result.sort()
|
|
40 return result
|
|
41
|
|
42 # -----------------------------------------------------------------------------------
|
|
43
|
|
44 def collapse_repeating_gis( L ):
|
|
45 """ Accepts 2-d array of gi-key pairs such as this
|
|
46 L = [
|
|
47 [gi1, 'key1'],
|
|
48 [gi1, 'key2'],
|
|
49 [gi2','key3']
|
|
50 ]
|
|
51
|
|
52 Returns this:
|
|
53 [ [gi1, 'key1', 'key2'],
|
|
54 [gi2, 'key3' ]
|
|
55 ]
|
|
56
|
|
57 The first value in each sublist MUST be int
|
|
58 """
|
|
59 gi = []
|
|
60 i = 0
|
|
61 result = []
|
|
62
|
|
63 try:
|
|
64 for item in L:
|
|
65 if i == 0:
|
|
66 prev = item[0]
|
|
67
|
|
68 if prev != item[0]:
|
|
69 prev_L = []
|
|
70 prev_L.append( prev )
|
|
71 result.append( prev_L + gi )
|
|
72 prev = item[0]
|
|
73 gi =[]
|
|
74
|
|
75 gi.append( item[1] )
|
|
76 i += 1
|
|
77
|
|
78 except Exception, e:
|
|
79 stop_err('%s\n' % e)
|
|
80
|
|
81 prev_L = []
|
|
82 prev_L.append( prev )
|
|
83 result.append( prev_L + gi )
|
|
84 del(L)
|
|
85 return result
|
|
86
|
|
87 # -----------------------------------------------------------------------------------
|
|
88
|
|
89 def get_taxId( gi2tax_file, gi_name_list, out_file ):
|
|
90 """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and
|
|
91 prints result to out_file
|
|
92
|
|
93 gi2tax_file MUST be sorted on GI column
|
|
94
|
|
95 gi_name_list is a list that look slike this:
|
|
96 [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']]
|
|
97 where the first element of each sublist is a GI number
|
|
98 this list MUST also be sorted on GI
|
|
99
|
|
100 This function searches through 117,000,000 rows of gi2taxId file from NCBI
|
|
101 in approximately 4 minutes. This time is not dependent on the length of
|
|
102 gi_name_list
|
|
103 """
|
|
104
|
|
105 L = gi_name_list.pop(0)
|
|
106 my_gi = L[0]
|
|
107 F = open( out_file, 'w' )
|
|
108 gi = 0
|
|
109 for line in file( gi2tax_file ):
|
|
110 line = line.rstrip()
|
|
111 gi, taxId = string.split( line, '\t' )
|
|
112 gi = int( gi )
|
|
113
|
|
114 if gi > my_gi:
|
|
115 try:
|
|
116 while ( my_gi < gi ):
|
|
117 L = gi_name_list.pop(0)
|
|
118 my_gi = L[0]
|
|
119 except:
|
|
120 break
|
|
121
|
|
122 if gi == my_gi:
|
|
123 for i in range( 1,len( L ) ):
|
|
124 print >>F, '%s\t%s\t%d' % (L[i], taxId, gi)
|
|
125 try:
|
|
126 L = gi_name_list.pop(0)
|
|
127 my_gi = L[0]
|
|
128 except:
|
|
129 break
|
|
130
|
|
131 # -----------------------------------------------------------------------------------
|
|
132
|
|
133
|
|
134 try:
|
|
135 in_f = sys.argv[1] # input file with GIs
|
|
136 gi_col = int( sys.argv[2] ) - 1 # column in input containing GIs
|
|
137 name_col = int( sys.argv[3] ) - 1 # column containing sequence names
|
|
138 out_f = sys.argv[4] # output file
|
|
139 tool_data = sys.argv[5]
|
|
140 except:
|
|
141 stop_err('Check arguments\n')
|
|
142
|
|
143 # GI2TAX point to a file produced by concatenation of:
|
|
144 # ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip
|
|
145 # and
|
|
146 # ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip
|
|
147 # a sorting using this command:
|
|
148 # sort -n -k 1
|
|
149
|
|
150 GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' )
|
|
151
|
|
152 # NAME_FILE and NODE_FILE point to names.dmg and nodes.dmg
|
|
153 # files contained within:
|
|
154 # ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
|
|
155
|
|
156 NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' )
|
|
157 NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' )
|
|
158
|
|
159 g2n = gi_name_to_sorted_list(in_f, gi_col, name_col)
|
|
160
|
|
161 if len(g2n) == 0:
|
|
162 stop_err('No valid GI-containing fields. Please, check your column assignments.\n')
|
|
163
|
|
164 tb_F = tempfile.NamedTemporaryFile('w')
|
|
165
|
|
166 get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name )
|
|
167
|
|
168 try:
|
|
169 tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f )
|
|
170 retcode = subprocess.call( tb_cmd, shell=True )
|
|
171 if retcode < 0:
|
|
172 print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode
|
|
173 except OSError, e:
|
|
174 print >>sys.stderr, "Execution of taxBuilder2tree failed:", e
|