Mercurial > repos > devteam > gi2taxonomy
comparison gi2taxonomy.py @ 0:7b1b03c4465d draft default tip
Imported from capsule None
| author | devteam |
|---|---|
| date | Mon, 27 Jan 2014 09:28:26 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7b1b03c4465d |
|---|---|
| 1 import sys | |
| 2 import string | |
| 3 import tempfile | |
| 4 import subprocess | |
| 5 from os import path | |
| 6 | |
| 7 # ----------------------------------------------------------------------------------- | |
| 8 | |
| 9 def stop_err(msg): | |
| 10 sys.stderr.write(msg) | |
| 11 sys.exit() | |
| 12 | |
| 13 # ----------------------------------------------------------------------------------- | |
| 14 def gi_name_to_sorted_list(file_name, gi_col, name_col): | |
| 15 """ Suppose input file looks like this: | |
| 16 a 2 | |
| 17 b 4 | |
| 18 c 5 | |
| 19 d 5 | |
| 20 where column 1 is gi_col and column 0 is name_col | |
| 21 output of this function will look like this: | |
| 22 [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']] | |
| 23 """ | |
| 24 | |
| 25 result = [] | |
| 26 try: | |
| 27 F = open( file_name, 'r' ) | |
| 28 try: | |
| 29 for line in F: | |
| 30 file_cols = string.split(line.rstrip(), '\t') | |
| 31 file_cols[gi_col] = int( file_cols[gi_col] ) | |
| 32 result.append( [ file_cols[gi_col], file_cols[name_col] ] ) | |
| 33 except: | |
| 34 print >>sys.stderr, 'Non numeric GI field...skipping' | |
| 35 | |
| 36 except Exception, e: | |
| 37 stop_err('%s\n' % e) | |
| 38 F.close() | |
| 39 result.sort() | |
| 40 return result | |
| 41 | |
| 42 # ----------------------------------------------------------------------------------- | |
| 43 | |
| 44 def collapse_repeating_gis( L ): | |
| 45 """ Accepts 2-d array of gi-key pairs such as this | |
| 46 L = [ | |
| 47 [gi1, 'key1'], | |
| 48 [gi1, 'key2'], | |
| 49 [gi2','key3'] | |
| 50 ] | |
| 51 | |
| 52 Returns this: | |
| 53 [ [gi1, 'key1', 'key2'], | |
| 54 [gi2, 'key3' ] | |
| 55 ] | |
| 56 | |
| 57 The first value in each sublist MUST be int | |
| 58 """ | |
| 59 gi = [] | |
| 60 i = 0 | |
| 61 result = [] | |
| 62 | |
| 63 try: | |
| 64 for item in L: | |
| 65 if i == 0: | |
| 66 prev = item[0] | |
| 67 | |
| 68 if prev != item[0]: | |
| 69 prev_L = [] | |
| 70 prev_L.append( prev ) | |
| 71 result.append( prev_L + gi ) | |
| 72 prev = item[0] | |
| 73 gi =[] | |
| 74 | |
| 75 gi.append( item[1] ) | |
| 76 i += 1 | |
| 77 | |
| 78 except Exception, e: | |
| 79 stop_err('%s\n' % e) | |
| 80 | |
| 81 prev_L = [] | |
| 82 prev_L.append( prev ) | |
| 83 result.append( prev_L + gi ) | |
| 84 del(L) | |
| 85 return result | |
| 86 | |
| 87 # ----------------------------------------------------------------------------------- | |
| 88 | |
| 89 def get_taxId( gi2tax_file, gi_name_list, out_file ): | |
| 90 """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and | |
| 91 prints result to out_file | |
| 92 | |
| 93 gi2tax_file MUST be sorted on GI column | |
| 94 | |
| 95 gi_name_list is a list that look slike this: | |
| 96 [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']] | |
| 97 where the first element of each sublist is a GI number | |
| 98 this list MUST also be sorted on GI | |
| 99 | |
| 100 This function searches through 117,000,000 rows of gi2taxId file from NCBI | |
| 101 in approximately 4 minutes. This time is not dependent on the length of | |
| 102 gi_name_list | |
| 103 """ | |
| 104 | |
| 105 L = gi_name_list.pop(0) | |
| 106 my_gi = L[0] | |
| 107 F = open( out_file, 'w' ) | |
| 108 gi = 0 | |
| 109 for line in file( gi2tax_file ): | |
| 110 line = line.rstrip() | |
| 111 gi, taxId = string.split( line, '\t' ) | |
| 112 gi = int( gi ) | |
| 113 | |
| 114 if gi > my_gi: | |
| 115 try: | |
| 116 while ( my_gi < gi ): | |
| 117 L = gi_name_list.pop(0) | |
| 118 my_gi = L[0] | |
| 119 except: | |
| 120 break | |
| 121 | |
| 122 if gi == my_gi: | |
| 123 for i in range( 1,len( L ) ): | |
| 124 print >>F, '%s\t%s\t%d' % (L[i], taxId, gi) | |
| 125 try: | |
| 126 L = gi_name_list.pop(0) | |
| 127 my_gi = L[0] | |
| 128 except: | |
| 129 break | |
| 130 | |
| 131 # ----------------------------------------------------------------------------------- | |
| 132 | |
| 133 | |
| 134 try: | |
| 135 in_f = sys.argv[1] # input file with GIs | |
| 136 gi_col = int( sys.argv[2] ) - 1 # column in input containing GIs | |
| 137 name_col = int( sys.argv[3] ) - 1 # column containing sequence names | |
| 138 out_f = sys.argv[4] # output file | |
| 139 tool_data = sys.argv[5] | |
| 140 except: | |
| 141 stop_err('Check arguments\n') | |
| 142 | |
| 143 # GI2TAX point to a file produced by concatenation of: | |
| 144 # ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip | |
| 145 # and | |
| 146 # ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip | |
| 147 # a sorting using this command: | |
| 148 # sort -n -k 1 | |
| 149 | |
| 150 GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' ) | |
| 151 | |
| 152 # NAME_FILE and NODE_FILE point to names.dmg and nodes.dmg | |
| 153 # files contained within: | |
| 154 # ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz | |
| 155 | |
| 156 NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' ) | |
| 157 NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' ) | |
| 158 | |
| 159 g2n = gi_name_to_sorted_list(in_f, gi_col, name_col) | |
| 160 | |
| 161 if len(g2n) == 0: | |
| 162 stop_err('No valid GI-containing fields. Please, check your column assignments.\n') | |
| 163 | |
| 164 tb_F = tempfile.NamedTemporaryFile('w') | |
| 165 | |
| 166 get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name ) | |
| 167 | |
| 168 try: | |
| 169 tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f ) | |
| 170 retcode = subprocess.call( tb_cmd, shell=True ) | |
| 171 if retcode < 0: | |
| 172 print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode | |
| 173 except OSError, e: | |
| 174 print >>sys.stderr, "Execution of taxBuilder2tree failed:", e |
