0
|
1 #!/usr/bin/env python
|
|
2 #Guruprasad Ananda
|
|
3 """
|
|
4 Least Common Ancestor tool.
|
|
5 """
|
|
6 import sys, string, re, commands, tempfile, random
|
|
7
|
|
8 def stop_err(msg):
|
|
9 sys.stderr.write(msg)
|
|
10 sys.exit()
|
|
11
|
|
12 def main():
|
|
13 try:
|
|
14 inputfile = sys.argv[1]
|
|
15 outfile = sys.argv[2]
|
|
16 rank_bound = int( sys.argv[3] )
|
|
17 """
|
|
18 Mapping of ranks:
|
|
19 root :2,
|
|
20 superkingdom:3,
|
|
21 kingdom :4,
|
|
22 subkingdom :5,
|
|
23 superphylum :6,
|
|
24 phylum :7,
|
|
25 subphylum :8,
|
|
26 superclass :9,
|
|
27 class :10,
|
|
28 subclass :11,
|
|
29 superorder :12,
|
|
30 order :13,
|
|
31 suborder :14,
|
|
32 superfamily :15,
|
|
33 family :16,
|
|
34 subfamily :17,
|
|
35 tribe :18,
|
|
36 subtribe :19,
|
|
37 genus :20,
|
|
38 subgenus :21,
|
|
39 species :22,
|
|
40 subspecies :23,
|
|
41 """
|
|
42 except:
|
|
43 stop_err("Syntax error: Use correct syntax: program infile outfile")
|
|
44
|
|
45 fin = open(sys.argv[1],'r')
|
|
46 for j, line in enumerate( fin ):
|
|
47 elems = line.strip().split('\t')
|
|
48 if len(elems) < 24:
|
|
49 stop_err("The format of the input dataset is incorrect. Taxonomy datatype should contain at least 24 columns.")
|
|
50 if j > 30:
|
|
51 break
|
|
52 cols = range(1,len(elems))
|
|
53 fin.close()
|
|
54
|
|
55 group_col = 0
|
|
56 tmpfile = tempfile.NamedTemporaryFile()
|
|
57
|
|
58 try:
|
|
59 """
|
|
60 The -k option for the Posix sort command is as follows:
|
|
61 -k, --key=POS1[,POS2]
|
|
62 start a key at POS1, end it at POS2 (origin 1)
|
|
63 In other words, column positions start at 1 rather than 0, so
|
|
64 we need to add 1 to group_col.
|
|
65 if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
|
|
66 """
|
|
67 command_line = "sort -f -k " + str(group_col+1) +"," + str(group_col+1) + " -o " + tmpfile.name + " " + inputfile
|
|
68 except Exception, exc:
|
|
69 stop_err( 'Initialization error -> %s' %str(exc) )
|
|
70
|
|
71 error_code, stdout = commands.getstatusoutput(command_line)
|
|
72
|
|
73 if error_code != 0:
|
|
74 stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
|
|
75
|
|
76 prev_item = ""
|
|
77 prev_vals = []
|
|
78 remaining_vals = []
|
|
79 skipped_lines = 0
|
|
80 fout = open(outfile, "w")
|
|
81 block_valid = False
|
|
82
|
|
83
|
|
84 for ii, line in enumerate( file( tmpfile.name )):
|
|
85 if line and not line.startswith( '#' ) and len(line.split('\t')) >= 24: #Taxonomy datatype should have at least 24 columns
|
|
86 line = line.rstrip( '\r\n' )
|
|
87 try:
|
|
88 fields = line.split("\t")
|
|
89 item = fields[group_col]
|
|
90 if prev_item != "":
|
|
91 # At this level, we're grouping on values (item and prev_item) in group_col
|
|
92 if item == prev_item:
|
|
93 # Keep iterating and storing values until a new value is encountered.
|
|
94 if block_valid:
|
|
95 for i, col in enumerate(cols):
|
|
96 if col >= 3:
|
|
97 prev_vals[i].append(fields[col].strip())
|
|
98 if len(set(prev_vals[i])) > 1:
|
|
99 block_valid = False
|
|
100 break
|
|
101
|
|
102 else:
|
|
103 """
|
|
104 When a new value is encountered, write the previous value and the
|
|
105 corresponding aggregate values into the output file. This works
|
|
106 due to the sort on group_col we've applied to the data above.
|
|
107 """
|
|
108 out_list = ['']*24
|
|
109 out_list[0] = str(prev_item)
|
|
110 out_list[1] = str(prev_vals[0][0])
|
|
111 out_list[2] = str(prev_vals[1][0])
|
|
112
|
|
113 for k, col in enumerate(cols):
|
|
114 if col >= 3 and col < 24:
|
|
115 if len(set(prev_vals[k])) == 1:
|
|
116 out_list[col] = prev_vals[k][0]
|
|
117 else:
|
|
118 break
|
|
119 while k < 23:
|
|
120 out_list[k+1] = 'n'
|
|
121 k += 1
|
|
122
|
|
123 j = 0
|
|
124 while True:
|
|
125 try:
|
|
126 out_list.append(str(prev_vals[23+j][0]))
|
|
127 j += 1
|
|
128 except:
|
|
129 break
|
|
130
|
|
131 if rank_bound == 0:
|
|
132 print >>fout, '\t'.join(out_list).strip()
|
|
133 else:
|
|
134 if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
|
|
135 print >>fout, '\t'.join(out_list).strip()
|
|
136
|
|
137 block_valid = True
|
|
138 prev_item = item
|
|
139 prev_vals = []
|
|
140 for col in cols:
|
|
141 val_list = []
|
|
142 val_list.append(fields[col].strip())
|
|
143 prev_vals.append(val_list)
|
|
144
|
|
145 else:
|
|
146 # This only occurs once, right at the start of the iteration.
|
|
147 block_valid = True
|
|
148 prev_item = item #groupby item
|
|
149 for col in cols: #everyting else
|
|
150 val_list = []
|
|
151 val_list.append(fields[col].strip())
|
|
152 prev_vals.append(val_list)
|
|
153
|
|
154 except:
|
|
155 skipped_lines += 1
|
|
156 else:
|
|
157 skipped_lines += 1
|
|
158
|
|
159 # Handle the last grouped value
|
|
160 out_list = ['']*24
|
|
161 out_list[0] = str(prev_item)
|
|
162 out_list[1] = str(prev_vals[0][0])
|
|
163 out_list[2] = str(prev_vals[1][0])
|
|
164
|
|
165 for k, col in enumerate(cols):
|
|
166 if col >= 3 and col < 24:
|
|
167 if len(set(prev_vals[k])) == 1:
|
|
168 out_list[col] = prev_vals[k][0]
|
|
169 else:
|
|
170 break
|
|
171 while k < 23:
|
|
172 out_list[k+1] = 'n'
|
|
173 k += 1
|
|
174
|
|
175 j = 0
|
|
176 while True:
|
|
177 try:
|
|
178 out_list.append(str(prev_vals[23+j][0]))
|
|
179 j += 1
|
|
180 except:
|
|
181 break
|
|
182
|
|
183 if rank_bound == 0:
|
|
184 print >>fout, '\t'.join(out_list).strip()
|
|
185 else:
|
|
186 if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
|
|
187 print >>fout, '\t'.join(out_list).strip()
|
|
188
|
|
189 if skipped_lines > 0:
|
|
190 print "Skipped %d invalid lines." % ( skipped_lines )
|
|
191
|
|
192 if __name__ == "__main__":
|
|
193 main() |