comparison commons/tools/refalign2fasta.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
comparison
equal deleted inserted replaced
17:b0e8584489e6 18:94ab73e8a190
1 #!/usr/bin/env python
2
3 ##@file
4 # Convert the output from Refalign (MSA program) into the 'fasta' format.
5 # Usually used before subsequent analysis such as the estimation of deletion rate.
6 #
7 # usage: refalign2fasta.py [ options ]
8 # options:
9 # -h: this help
10 # -i: name of the input file (output from refalign)
11 # -r: name of the reference sequence (discard if not provided)
12 # -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')
13 # -o: name of the output file (default=inFileName'.fa_aln',format='fasta')
14
15 import os
16 import sys
17 import getopt
18 import exceptions
19
20 if not os.environ.has_key( "REPET_PATH" ):
21 print "ERROR: no environment variable REPET_PATH"
22 sys.exit(1)
23 sys.path.append( os.environ["REPET_PATH"] )
24
25 import pyRepet.seq.Bioseq
26
27
28 def help():
29 """
30 Give the list of the command-line options.
31 """
32 print
33 print "usage:",sys.argv[0]," [ options ]"
34 print "options:"
35 print " -h: this help"
36 print " -i: name of the input file (output from refalign)"
37 print " -r: name of the reference sequence (discard if not provided)"
38 print " -g: for the gaps, keep only deletions ('d'), only insertions ('i') or both (default='id')"
39 print " -o: name of the output file (default=inFileName'.fa_aln',format='fasta')"
40 print
41
42
43 def getAlignments( inFileName ):
44 """
45 Retrieve the alignments from the input file.
46
47 @param inFileName: name of the input file
48 @type: string
49
50 @return: list of alignments ( refseq, seq, header of seq )
51 @rtype: list of 3d-tuples
52 """
53
54 lAlign = []
55
56 inFile = open( inFileName, "r" )
57 line = inFile.readline()
58 while True:
59 if line == "":
60 break
61 refseq, seq, label = line[:-1].split("\t")[:3]
62 lAlign.append( ( refseq, seq, label ) )
63 line = inFile.readline()
64 inFile.close()
65
66 return lAlign
67
68
69 def getGaps( seq ):
70 """
71 Get the gaps on a sequence, start coordinate and length. The start
72 coordinate of a gap is the # of the nucleotide after which it starts.
73
74 @param seq: sequence to analyse
75 @type seq: string
76
77 @return: list of gaps ( start coordinate, length )
78 @rtype: list of 2d-tuples
79 """
80
81 prev = "N"
82 lGapsOnSeq = []
83 i = 0
84 lengthGap = 0
85 for c in seq:
86 if c == "-" and prev != "-":
87 startGap = i
88 if c != "-" and prev == "-":
89 lGapsOnSeq.append( ( startGap, lengthGap ) )
90 lengthGap = 0
91 if c != "-":
92 i += 1
93 else:
94 lengthGap += 1
95 prev = c
96
97 # case with a gap at the end of the sequence
98 if seq[ len(seq) - 1 ] == "-":
99 lGapsOnSeq.append( ( startGap, lengthGap ) )
100
101 return lGapsOnSeq
102
103
104 def getGapsOnRefSeq( lAlign ):
105 """
106 Retrieve the gaps on the ref seq in all the alignments.
107
108 @param lAlign: list of alignments ( refseq, seq, header of seq )
109 @type lAlign: list of 3d-tuples
110
111 @return: list of gaps per alignment
112 @rtype: list of lists of 2d-tuples
113 """
114
115 lGapsOnRef = []
116
117 for align in lAlign:
118 refseq = align[0]
119 lGapsOnRef.append( getGaps( refseq ) )
120
121 return lGapsOnRef
122
123
124 def insertGap( seq, startGap, lengthGap ):
125 """
126 Get a new seq by inserting a gap in the give seq.
127
128 @param seq: sequence
129 @type seq: string
130
131 @param startGap:
132 @type: startGap: integer
133
134 @param lengthGap: length of the gap
135 @type lengthGap: integer
136
137 @return: new seq made from the give seq by inserting the gap
138 @rtype: string
139 """
140
141 new_seq = seq[:startGap] + (lengthGap*'-') + seq[startGap:]
142 return new_seq
143
144
145 def insertListGaps( inSeq, lGaps ):
146 """
147 Insert all the gaps from the list into the sequence.
148
149 @param inSeq: sequence
150 @type inSeq: string
151
152 @param lGaps: list of gaps ( start coordinate, length )
153 @type: list of 2d-tuples
154
155 @return: sequence with the gaps
156 @rtype: string
157 """
158
159 # insert gaps from the right to the left
160 lGaps.sort()
161 lGaps.reverse()
162
163 prevPos = 0
164 outSeq = inSeq
165
166 for startGap, lengthGap in lGaps:
167 if startGap != prevPos:
168 outSeq = insertGap( outSeq, startGap, lengthGap )
169 prevPos = startGap
170
171 return outSeq
172
173
174 def insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName ):
175 """
176 Get the ref seq with all its gaps inserted.
177
178 @param lAlign: list of alignments ( refseq, seq, header of seq )
179 @type lAlign: list of 3d-tuples
180
181 @param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
182 @type lGapsOnRefSeqPerAlign: list of list of 2d-tuples
183
184 @param refseqName: name of the reference sequence
185 @type refseqName: string
186
187 @return: ref seq with all its gaps inserted
188 @rtype: string
189 """
190
191 # retrieve the initial ref seq, ie without any gap
192 initRefSeq = lAlign[0][0].replace("-","")
193
194 # convert the list of lists of gaps into a list of gaps
195 flat_lGaps = []
196 for gaps in lGapsOnRefSeqPerAlign:
197 flat_lGaps.extend( gaps )
198
199 # insert the gaps in the sequence of ref seq
200 finalRefSeq = insertListGaps( initRefSeq, flat_lGaps )
201
202 Align_refseq = ( refseqName, finalRefSeq )
203
204 return Align_refseq
205
206
207 def insertgap_seq( lAlign, lGapsOnRefSeqPerAlign ):
208 """
209 Insert in the sequences all the gaps found on the ref seq.
210
211 @param lAlign: list of alignments ( refseq, seq, header of seq )
212 @type lAlign: list of 3d-tuples
213
214 @param lGapsOnRefSeqPerAlign: list of list of gaps on the seq ref per alignment
215 @type lGapsOnRefSeqPerAlign: list of list of 2d-tuples
216
217 @return: list of lists (sequence with gaps, header)
218 @rtype: list of 2d-tuples
219
220 insert dans les seq les gaps donnes par la liste de liste de gap
221 retourne une liste des nouvelles seq apres insertion des gaps
222 """
223
224 # for each gap, add the nb of the alignment in which it has been found
225 newlistgap_seq =[]
226 alignID = 0
227 for lGaps in lGapsOnRefSeqPerAlign:
228 for startGap,lengthGap in lGaps:
229 newlistgap_seq.append( ( startGap, lengthGap, alignID ) )
230 alignID += 1
231 newlistgap_seq.sort()
232
233 Align_seq = []
234
235 # for each alignment
236 for j in xrange(0,len(lAlign)):
237
238 #create a new list = list of gaps to be inserted in a given seq
239 newlist = []
240 offset = 0
241 longmax = 0
242 longself = 0
243 posprec = 0
244 for startGap, lengthGap, alignID in newlistgap_seq:
245 # when 2 gaps have the same start, we keep the longer one
246 if startGap != posprec:
247 if longmax > longself:
248 newlist.append( (posprec + offset, longmax - longself) )
249 offset += longself
250 longmax=0
251 longself=0
252 #lorsque le numero de la seq du tuple a la meme valeur que j
253 #on stocke la valeur de lengthGap
254 if j == alignID:
255 longself = lengthGap
256 #sinon on prend comme valeur de longmax la valeur maximale de longmax et lengthGap
257 else:
258 longmax = max(longmax, lengthGap)
259 posprec = startGap
260 if longmax > longself:
261 newlist.append((posprec + offset, longmax - longself))
262
263 newSeq = insertListGaps( (lAlign[j][1]), newlist )
264 header = lAlign[j][2]
265 Align_seq.append( ( header, newSeq ) )
266
267 return Align_seq
268
269
270 def getSeqWithDeletions( lAlign ):
271 """
272 Get the sequences by putting gaps only when they correspond to a deletion compare to ref seq.
273 Used for instance when we want to estimate the deletion rate.
274
275 @param lAlign: list of alignments ( refseq, seq, header of seq )
276 @type lAlign: list of 3d-tuples
277
278 @return: list of lists ( header, sequence with gaps )
279 @rtype: list of 2d-tuples
280 """
281
282 Align_seq = []
283
284 for align in lAlign:
285 refseq = align[0]
286 seq = align[1]
287 header = align[2]
288 newSeq = ""
289 for i in xrange(0,len(refseq)):
290 if refseq[i] != "-":
291 newSeq += seq[i]
292 Align_seq.append( ( header, newSeq ) )
293
294 return Align_seq
295
296
297 def saveMSA( outFileName, Align_seq, Align_seqref=None ):
298 """
299 Save the results as a multiple sequence alignment (MSA) in the 'fasta' format.
300
301 @param outFileName: name of the output file
302 @type outFileName: string
303
304 @param Align_seqref: sequence of ref seq
305 @type Align_seqref: string
306 """
307
308 outFile = open( outFileName, "w" )
309 bs = pyRepet.seq.Bioseq.Bioseq()
310
311 # if provided, save the ref seq
312 if Align_seqref != None:
313 bs.header = Align_seqref[0]
314 bs.sequence = Align_seqref[1]
315 bs.write( outFile )
316
317 # save the other sequences
318 for i in Align_seq:
319 bs.header = i[0]
320 bs.sequence = i[1]
321 bs.write( outFile )
322
323 outFile.close()
324
325
326 def saveOnlyWithDeletions( lAlign, refseqName, outFileName ):
327 Align_seq = getSeqWithDeletions( lAlign )
328 if refseqName != "":
329 Align_seqref = ( refseqName, lAlign[0][0].replace("-","") )
330 saveMSA( outFileName, Align_seq, Align_seqref )
331 else:
332 saveMSA( outFileName, Align_seq )
333
334
335 def main():
336
337 inFileName = ""
338 refseqName = ""
339 keepGap = "id"
340 outFileName = ""
341 global verbose
342 verbose = 0
343
344 try:
345 opts, args = getopt.getopt(sys.argv[1:],"hi:r:g:o:v:")
346 except getopt.GetoptError, err:
347 print str(err)
348 help()
349 sys.exit(1)
350 for o,a in opts:
351 if o == "-h":
352 help()
353 sys.exit(0)
354 elif o == "-i":
355 inFileName = a
356 elif o == "-r":
357 refseqName = a
358 elif o == "-g":
359 keepGap = a
360 elif o == "-o":
361 outFileName = a
362 elif o == "-v":
363 verbose = int(a)
364
365 if inFileName == "":
366 print "ERROR: missing input file name"
367 help()
368 sys.exit(1)
369
370 if verbose > 0:
371 print "START %s" % (sys.argv[0].split("/")[-1])
372 sys.stdout.flush()
373
374 lAlign = getAlignments( inFileName )
375 if verbose > 0:
376 print "nb of alignments: %i" % ( len(lAlign) )
377 sys.stdout.flush()
378
379 if outFileName == "":
380 outFileName = "%s.fa_aln" % ( inFileName )
381 if verbose > 0:
382 print "output file: '%s'" % ( outFileName )
383
384 if keepGap == "id":
385 lGapsOnRefSeqPerAlign = getGapsOnRefSeq( lAlign )
386 Align_seq = insertgap_seq( lAlign, lGapsOnRefSeqPerAlign )
387 if refseqName != "":
388 Align_seqref = insertGapsInRefSeq( lAlign, lGapsOnRefSeqPerAlign, refseqName )
389 saveMSA( outFileName, Align_seq, Align_seqref )
390 else:
391 saveMSA( outFileName, Align_seq )
392
393 elif keepGap == "d":
394 saveOnlyWithDeletions( lAlign, refseqName, outFileName )
395
396 elif keepGap == "i":
397 print "ERROR: '-g i' not yet available"
398 sys.exit(1)
399
400 if verbose > 0:
401 print "END %s" % (sys.argv[0].split("/")[-1])
402 sys.stdout.flush()
403
404 return 0
405
406
407 if __name__ == "__main__" :
408 main ()