comparison microsats_alignment_level.py @ 0:d4368a5a3fc7

Imported from capsule None
author devteam
date Tue, 01 Apr 2014 10:50:33 -0400
parents
children ad471b193191
comparison
equal deleted inserted replaced
-1:000000000000 0:d4368a5a3fc7
1 #!/usr/bin/env python
2 #Guruprasad Ananda
3 """
4 Uses SPUTNIK to fetch microsatellites and extracts orthologous repeats from the sputnik output.
5 """
6 import os
7 import re
8 import string
9 import sys
10 import tempfile
11
12 def reverse_complement(text):
13 DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
14 comp = [ch for ch in text.translate(DNA_COMP)]
15 comp.reverse()
16 return "".join(comp)
17
18
19 def main():
20 if len(sys.argv) != 8:
21 print >> sys.stderr, "Insufficient number of arguments."
22 sys.exit()
23
24 infile = open(sys.argv[1],'r')
25 separation = int(sys.argv[2])
26 outfile = sys.argv[3]
27 mono_threshold = int(sys.argv[5])
28 non_mono_threshold = int(sys.argv[6])
29 allow_different_units = int(sys.argv[7])
30
31 print "Min distance = %d bp; Min threshold for mono repeats = %d; Min threshold for non-mono repeats = %d; Allow different motifs = %s" % ( separation, mono_threshold, non_mono_threshold, allow_different_units==1 )
32 try:
33 fout = open(outfile, "w")
34 print >> fout, "#Block\tSeq1_Name\tSeq1_Start\tSeq1_End\tSeq1_Type\tSeq1_Length\tSeq1_RepeatNumber\tSeq1_Unit\tSeq2_Name\tSeq2_Start\tSeq2_End\tSeq2_Type\tSeq2_Length\tSeq2_RepeatNumber\tSeq2_Unit"
35 #sputnik_cmd = os.path.join(os.path.split(sys.argv[0])[0], "sputnik")
36 sputnik_cmd = "sputnik"
37 input = infile.read()
38 block_num = 0
39 input = input.replace('\r','\n')
40 for block in input.split('\n\n'):
41 block_num += 1
42 tmpin = tempfile.NamedTemporaryFile()
43 tmpout = tempfile.NamedTemporaryFile()
44 tmpin.write(block.strip())
45 cmdline = sputnik_cmd + " " + tmpin.name + " > /dev/null 2>&1 >> " + tmpout.name
46 try:
47 os.system(cmdline)
48 except Exception:
49 continue
50 sputnik_out = tmpout.read()
51 tmpin.close()
52 tmpout.close()
53 if sputnik_out != "":
54 if len(block.split('>')[1:]) != 2: #len(sputnik_out.split('>')):
55 continue
56 align_block = block.strip().split('>')
57
58 lendict = {'mononucleotide':1, 'dinucleotide':2, 'trinucleotide':3, 'tetranucleotide':4, 'pentanucleotide':5, 'hexanucleotide':6}
59 blockdict = {}
60 r = 0
61 namelist = []
62 for k, sput_block in enumerate(sputnik_out.split('>')[1:]):
63 whole_seq = ''.join(align_block[k+1].split('\n')[1:]).replace('\n','').strip()
64 p = re.compile('\n(\S*nucleotide)')
65 repeats = p.split(sput_block.strip())
66 repeats_count = len(repeats)
67 j = 1
68 name = repeats[0].strip()
69 try:
70 coords = re.search('\d+[-_:]\d+', name).group()
71 coords = coords.replace('_', '-').replace(':', '-')
72 except Exception:
73 coords = '0-0'
74 r += 1
75 blockdict[r] = {}
76 try:
77 sp_name = name[:name.index('.')]
78 chr_name = name[name.index('.'):name.index('(')]
79 namelist.append(sp_name + chr_name)
80 except:
81 namelist.append(name[:20])
82 while j < repeats_count:
83 try:
84 if repeats[j].strip() not in lendict:
85 j += 2
86 continue
87
88 if blockdict[r].has_key('types'):
89 blockdict[r]['types'].append(repeats[j].strip()) #type of microsat
90 else:
91 blockdict[r]['types'] = [repeats[j].strip()] #type of microsat
92
93 start = int(repeats[j+1].split('--')[0].split(':')[0].strip())
94 #check to see if there are gaps before the start of the repeat, and change the start accordingly
95 sgaps = 0
96 ch_pos = start - 1
97 while ch_pos >= 0:
98 if whole_seq[ch_pos] == '-':
99 sgaps += 1
100 else:
101 break #break at the 1st non-gap character
102 ch_pos -= 1
103 if blockdict[r].has_key('starts'):
104 blockdict[r]['starts'].append(start+sgaps) #start co-ords adjusted with alignment co-ords to include GAPS
105 else:
106 blockdict[r]['starts'] = [start+sgaps]
107
108 end = int(repeats[j+1].split('--')[0].split(':')[1].strip())
109 #check to see if there are gaps after the end of the repeat, and change the end accordingly
110 egaps = 0
111 for ch in whole_seq[end:]:
112 if ch == '-':
113 egaps += 1
114 else:
115 break #break at the 1st non-gap character
116 if blockdict[r].has_key('ends'):
117 blockdict[r]['ends'].append(end+egaps) #end co-ords adjusted with alignment co-ords to include GAPS
118 else:
119 blockdict[r]['ends'] = [end+egaps]
120
121 repeat_seq = ''.join(repeats[j+1].replace('\r','\n').split('\n')[1:]).strip() #Repeat Sequence
122 repeat_len = repeats[j+1].split('--')[1].split()[1].strip()
123 gap_count = repeat_seq.count('-')
124 #print repeats[j+1].split('--')[1], len(repeat_seq), repeat_len, gap_count
125 repeat_len = str(int(repeat_len) - gap_count)
126
127 rel_start = blockdict[r]['starts'][-1]
128 gaps_before_start = whole_seq[:rel_start].count('-')
129
130 if blockdict[r].has_key('gaps_before_start'):
131 blockdict[r]['gaps_before_start'].append(gaps_before_start) #lengths
132 else:
133 blockdict[r]['gaps_before_start'] = [gaps_before_start] #lengths
134
135 whole_seq_start = int(coords.split('-')[0])
136 if blockdict[r].has_key('whole_seq_start'):
137 blockdict[r]['whole_seq_start'].append(whole_seq_start) #lengths
138 else:
139 blockdict[r]['whole_seq_start'] = [whole_seq_start] #lengths
140
141 if blockdict[r].has_key('lengths'):
142 blockdict[r]['lengths'].append(repeat_len) #lengths
143 else:
144 blockdict[r]['lengths'] = [repeat_len] #lengths
145
146 if blockdict[r].has_key('counts'):
147 blockdict[r]['counts'].append(str(int(repeat_len)/lendict[repeats[j].strip()])) #Repeat Unit
148 else:
149 blockdict[r]['counts'] = [str(int(repeat_len)/lendict[repeats[j].strip()])] #Repeat Unit
150
151 if blockdict[r].has_key('units'):
152 blockdict[r]['units'].append(repeat_seq[:lendict[repeats[j].strip()]]) #Repeat Unit
153 else:
154 blockdict[r]['units'] = [repeat_seq[:lendict[repeats[j].strip()]]] #Repeat Unit
155
156 except Exception:
157 pass
158 j += 2
159 #check the co-ords of all repeats corresponding to a sequence and remove adjacent repeats separated by less than the user-specified 'separation'.
160 delete_index_list = []
161 for ind, item in enumerate(blockdict[r]['ends']):
162 try:
163 if blockdict[r]['starts'][ind+1]-item < separation:
164 if ind not in delete_index_list:
165 delete_index_list.append(ind)
166 if ind+1 not in delete_index_list:
167 delete_index_list.append(ind+1)
168 except Exception:
169 pass
170 for index in delete_index_list: #mark them for deletion
171 try:
172 blockdict[r]['starts'][index] = 'marked'
173 blockdict[r]['ends'][index] = 'marked'
174 blockdict[r]['types'][index] = 'marked'
175 blockdict[r]['gaps_before_start'][index] = 'marked'
176 blockdict[r]['whole_seq_start'][index] = 'marked'
177 blockdict[r]['lengths'][index] = 'marked'
178 blockdict[r]['counts'][index] = 'marked'
179 blockdict[r]['units'][index] = 'marked'
180 except Exception:
181 pass
182 #remove 'marked' elements from all the lists
183 """
184 for key in blockdict[r].keys():
185 for elem in blockdict[r][key]:
186 if elem == 'marked':
187 blockdict[r][key].remove(elem)
188 """
189 #print blockdict
190
191 #make sure that the blockdict has keys for both the species
192 if (1 not in blockdict) or (2 not in blockdict):
193 continue
194
195 visited_2 = [0 for x in range(len(blockdict[2]['starts']))]
196 for ind1, coord_s1 in enumerate(blockdict[1]['starts']):
197 if coord_s1 == 'marked':
198 continue
199 coord_e1 = blockdict[1]['ends'][ind1]
200 out = []
201 for ind2, coord_s2 in enumerate(blockdict[2]['starts']):
202 if coord_s2 == 'marked':
203 visited_2[ind2] = 1
204 continue
205 coord_e2 = blockdict[2]['ends'][ind2]
206 #skip if the 2 repeats are not of the same type or don't have the same repeating unit.
207 if allow_different_units == 0:
208 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
209 continue
210 else:
211 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2) and (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2):
212 continue
213 #print >> sys.stderr, (reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2)
214 #skip if the repeat number thresholds are not met
215 if blockdict[1]['types'][ind1] == 'mononucleotide':
216 if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
217 continue
218 else:
219 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
220 continue
221
222 if blockdict[2]['types'][ind2] == 'mononucleotide':
223 if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
224 continue
225 else:
226 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
227 continue
228 #print "s1,e1=%s,%s; s2,e2=%s,%s" % ( coord_s1, coord_e1, coord_s2, coord_e2 )
229 if (coord_s1 in range(coord_s2, coord_e2)) or (coord_e1 in range(coord_s2, coord_e2)):
230 out.append(str(block_num))
231 out.append(namelist[0])
232 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
233 rel_end = rel_start + int(blockdict[1]['lengths'][ind1])
234 out.append(str(rel_start))
235 out.append(str(rel_end))
236 out.append(blockdict[1]['types'][ind1])
237 out.append(blockdict[1]['lengths'][ind1])
238 out.append(blockdict[1]['counts'][ind1])
239 out.append(blockdict[1]['units'][ind1])
240 out.append(namelist[1])
241 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
242 rel_end = rel_start + int(blockdict[2]['lengths'][ind2])
243 out.append(str(rel_start))
244 out.append(str(rel_end))
245 out.append(blockdict[2]['types'][ind2])
246 out.append(blockdict[2]['lengths'][ind2])
247 out.append(blockdict[2]['counts'][ind2])
248 out.append(blockdict[2]['units'][ind2])
249 print >> fout, '\t'.join(out)
250 visited_2[ind2] = 1
251 out = []
252
253 if 0 in visited_2: #there are still some elements in 2nd set which haven't found orthologs yet.
254 for ind2, coord_s2 in enumerate(blockdict[2]['starts']):
255 if coord_s2 == 'marked':
256 continue
257 if visited_2[ind] != 0:
258 continue
259 coord_e2 = blockdict[2]['ends'][ind2]
260 out = []
261 for ind1, coord_s1 in enumerate(blockdict[1]['starts']):
262 if coord_s1 == 'marked':
263 continue
264 coord_e1 = blockdict[1]['ends'][ind1]
265 #skip if the 2 repeats are not of the same type or don't have the same repeating unit.
266 if allow_different_units == 0:
267 if (blockdict[1]['types'][ind1] != blockdict[2]['types'][ind2]):
268 continue
269 else:
270 if (blockdict[1]['units'][ind1] not in blockdict[2]['units'][ind2]*2):# and reverse_complement(blockdict[1]['units'][ind1]) not in blockdict[2]['units'][ind2]*2:
271 continue
272 #skip if the repeat number thresholds are not met
273 if blockdict[1]['types'][ind1] == 'mononucleotide':
274 if (int(blockdict[1]['counts'][ind1]) < mono_threshold):
275 continue
276 else:
277 if (int(blockdict[1]['counts'][ind1]) < non_mono_threshold):
278 continue
279
280 if blockdict[2]['types'][ind2] == 'mononucleotide':
281 if (int(blockdict[2]['counts'][ind2]) < mono_threshold):
282 continue
283 else:
284 if (int(blockdict[2]['counts'][ind2]) < non_mono_threshold):
285 continue
286
287 if (coord_s2 in range(coord_s1, coord_e1)) or (coord_e2 in range(coord_s1, coord_e1)):
288 out.append(str(block_num))
289 out.append(namelist[0])
290 rel_start = blockdict[1]['whole_seq_start'][ind1] + coord_s1 - blockdict[1]['gaps_before_start'][ind1]
291 rel_end = rel_start + int(blockdict[1]['lengths'][ind1])
292 out.append(str(rel_start))
293 out.append(str(rel_end))
294 out.append(blockdict[1]['types'][ind1])
295 out.append(blockdict[1]['lengths'][ind1])
296 out.append(blockdict[1]['counts'][ind1])
297 out.append(blockdict[1]['units'][ind1])
298 out.append(namelist[1])
299 rel_start = blockdict[2]['whole_seq_start'][ind2] + coord_s2 - blockdict[2]['gaps_before_start'][ind2]
300 rel_end = rel_start + int(blockdict[2]['lengths'][ind2])
301 out.append(str(rel_start))
302 out.append(str(rel_end))
303 out.append(blockdict[2]['types'][ind2])
304 out.append(blockdict[2]['lengths'][ind2])
305 out.append(blockdict[2]['counts'][ind2])
306 out.append(blockdict[2]['units'][ind2])
307 print >> fout, '\t'.join(out)
308 visited_2[ind2] = 1
309 out = []
310
311 #print >> fout, blockdict
312 except Exception, exc:
313 print >> sys.stderr, "type(exc),args,exc: %s, %s, %s" % ( type(exc), exc.args, exc )
314
315
316 if __name__ == "__main__":
317 main()