annotate gi2taxonomy.py @ 0:7b1b03c4465d draft default tip

Imported from capsule None
author devteam
date Mon, 27 Jan 2014 09:28:26 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
1 import sys
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
2 import string
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
3 import tempfile
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
4 import subprocess
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
5 from os import path
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
6
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
7 # -----------------------------------------------------------------------------------
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
8
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
def stop_err(msg):
    """Write *msg* to standard error and terminate the process.

    The process exits with status 1 so that callers inspecting the exit
    code (and not only stderr) can tell that the run failed.

    msg -- text written verbatim to stderr; no newline is appended.

    Raises SystemExit (always).
    """
    sys.stderr.write(msg)
    sys.exit(1)
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
12
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
13 # -----------------------------------------------------------------------------------
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
def gi_name_to_sorted_list(file_name, gi_col, name_col):
    """Read GI/name pairs from a tab-delimited file and return them sorted.

    Suppose the input file looks like this (tab separated):
        a       2
        b       4
        c       5
        d       5
    where column 1 is gi_col and column 0 is name_col; the output of this
    function will look like this:
        [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']]

    file_name -- path of the tab-delimited input file
    gi_col    -- 0-based index of the column holding the integer GI
    name_col  -- 0-based index of the column holding the sequence name

    Lines whose GI field is not an integer, or which lack one of the two
    columns, are skipped; a single notice is written to stderr when that
    happens.  If the file cannot be opened or read, the error is reported
    through stop_err().
    """
    result = []
    skipped = False
    try:
        with open(file_name, 'r') as handle:
            for line in handle:
                file_cols = line.rstrip().split('\t')
                # Guard each line on its own so that one bad line (for
                # example a header row) does not end the whole read.
                try:
                    result.append([int(file_cols[gi_col]), file_cols[name_col]])
                except (ValueError, IndexError):
                    skipped = True
    except (IOError, OSError) as e:
        stop_err('%s\n' % e)

    if skipped:
        sys.stderr.write('Non numeric GI field...skipping\n')

    result.sort()
    return result
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
41
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
42 # -----------------------------------------------------------------------------------
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
43
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
def collapse_repeating_gis( L ):
    """Merge consecutive entries that share the same GI.

    Accepts a 2-d array of gi-key pairs such as this
        L = [
                [gi1, 'key1'],
                [gi1, 'key2'],
                [gi2, 'key3']
            ]

    Returns this:
        [   [gi1, 'key1', 'key2'],
            [gi2, 'key3' ]
        ]

    Only adjacent entries are merged, so L is expected to be sorted on
    its first element.  The first value in each sublist MUST be int.
    An empty L yields an empty list.  L itself is not modified.

    A malformed entry (not indexable, or shorter than two elements) is
    reported through stop_err().
    """
    result = []
    try:
        for item in L:
            gi, key = item[0], item[1]
            if result and result[-1][0] == gi:
                # Same GI as the previous entry: extend the open group.
                result[-1].append( key )
            else:
                result.append( [ gi, key ] )
    except (IndexError, TypeError) as e:
        stop_err('%s\n' % e)
    return result
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
86
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
87 # -----------------------------------------------------------------------------------
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
88
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
def get_taxId( gi2tax_file, gi_name_list, out_file ):
    """Map GI numbers from gi_name_list to TaxId identifiers from
    gi2tax_file and write the result to out_file.

    gi2tax_file MUST be a two-column, tab-delimited file (GI, TaxId)
    sorted on the GI column.

    gi_name_list is a list that looks like this:
        [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']]
    where the first element of each sublist is a GI number; this list
    MUST also be sorted on GI.  The list is only read, never modified.

    For every GI found in both inputs one line "name<TAB>taxId<TAB>gi"
    is written per name attached to that GI.  out_file is always created
    (or truncated), even when gi_name_list is empty.

    Both inputs are walked once in lock-step (a merge join), so the scan
    stops as soon as gi_name_list is exhausted.

    Raises ValueError if a line of gi2tax_file does not hold exactly two
    tab-separated fields or its GI field is not an integer.
    """
    pending = iter( gi_name_list )
    current = next( pending, None )

    with open( out_file, 'w' ) as out:
        if current is None:
            # Nothing to look up; leave an empty output file behind.
            return
        with open( gi2tax_file ) as tax:
            for line in tax:
                gi, taxId = line.rstrip().split( '\t' )
                gi = int( gi )

                # Drop requested GIs that are smaller than the current
                # file GI: they cannot appear later in a sorted file.
                while current is not None and current[0] < gi:
                    current = next( pending, None )
                if current is None:
                    break

                if current[0] == gi:
                    for name in current[1:]:
                        out.write( '%s\t%s\t%d\n' % (name, taxId, gi) )
                    current = next( pending, None )
                    if current is None:
                        break
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
130
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
131 # -----------------------------------------------------------------------------------
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
132
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
133
7b1b03c4465d Imported from capsule None
devteam
parents:
diff changeset
# Command line: <input> <gi column> <name column> <output> <tool-data dir>
# Column numbers are 1-based on the command line and converted to 0-based.
try:
    in_f = sys.argv[1]                  # input file with GIs
    gi_col = int( sys.argv[2] ) - 1     # column in input containing GIs
    name_col = int( sys.argv[3] ) - 1   # column containing sequence names
    out_f = sys.argv[4]                 # output file
    tool_data = sys.argv[5]             # directory holding the 'taxonomy' data files
except (IndexError, ValueError):
    # Too few arguments, or a column number that is not an integer.
    stop_err('Check arguments\n')

# GI2TAX points to a file produced by concatenation of:
# ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip
# and
# ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip
# and sorting using this command:
# sort -n -k 1

GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' )

# NAME_FILE and NODE_FILE point to names.dmp and nodes.dmp
# files contained within:
# ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz

NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' )
NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' )

g2n = gi_name_to_sorted_list(in_f, gi_col, name_col)

if len(g2n) == 0:
    stop_err('No valid GI-containing fields. Please, check your column assignments.\n')

# Intermediate name/taxId/gi table handed to taxBuilder; the temporary
# file is removed when it is closed below.
tb_F = tempfile.NamedTemporaryFile('w')

try:
    get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name )

    # Pass the arguments as a list (no shell) so that paths containing
    # spaces or shell metacharacters reach taxBuilder unchanged, and so
    # that a missing executable surfaces as OSError below.
    tb_cmd = [ 'taxBuilder', NAME_FILE, NODE_FILE, tb_F.name, out_f ]
    retcode = subprocess.call( tb_cmd )
    if retcode < 0:
        # A negative return code means the child was killed by a signal.
        sys.stderr.write( 'Execution of taxBuilder terminated by signal %d\n' % -retcode )
    # NOTE(review): a positive (non-zero) exit status from taxBuilder is
    # still ignored, as before - confirm whether it should be reported.
except OSError as e:
    sys.stderr.write( 'Execution of taxBuilder failed: %s\n' % e )
finally:
    tb_F.close()