18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Interface to orient sequences before making a multiple alignment.
|
|
5 Use hashing or suffix tree to get an idea of the appropriate strand.
|
|
6 Use 'orienter' by default, otherwise use 'mummer'.
|
|
7 """
|
|
8
|
|
9 import sys
|
|
10 import os
|
|
11 import glob
|
|
12 import getopt
|
|
13
|
|
14 from commons.core.seq.BioseqDB import BioseqDB
|
|
15 import pyRepet.seq.fastaDB
|
|
16 from commons.core.checker.CheckerUtils import CheckerUtils
|
|
17
|
|
18 class OrientSequences( object ):
|
|
19 """
|
|
20 Interface to orient sequences before making a multiple alignment.
|
|
21 Use hashing or suffix tree to get an idea of the appropriate strand.
|
|
22 Use 'orienter' by default, otherwise use 'mummer'.
|
|
23 """
|
|
24
|
|
25 def __init__(self, inFileName="", minMatchLength=10, prgToOrient = "orienter", outFileName="", clean=False, verbosity=1):
|
|
26 """
|
|
27 Constructor.
|
|
28 """
|
|
29 self._inFileName = inFileName
|
|
30 self._minMatchLength = minMatchLength
|
|
31 self._prgToOrient = prgToOrient
|
|
32 self._outFileName = outFileName
|
|
33 self._clean = clean
|
|
34 self._verbose = verbosity
|
|
35
|
|
36 def help( self ):
|
|
37 """
|
|
38 Display the help on stdout.
|
|
39 """
|
|
40 print
|
|
41 print "usage:",sys.argv[0].split("/")[-1],"[options]"
|
|
42 print "options:"
|
|
43 print " -h: this help"
|
|
44 print " -i: name of the input file (format='fasta')"
|
|
45 print " -m: minimum match length (default=10)"
|
|
46 print " -p: program to use first (default=orienter/mummer)"
|
|
47 print " -o: name of the output file (default=inFileName+'.oriented')"
|
|
48 print " -c: clean"
|
|
49 print " -v: verbosity level (0/default=1/2)"
|
|
50 print
|
|
51
|
|
52 def setAttributesFromCmdLine( self ):
|
|
53 """
|
|
54 Set the attributes from the command-line.
|
|
55 """
|
|
56 try:
|
|
57 opts, args = getopt.getopt(sys.argv[1:],"hi:m:p:o:cv:")
|
|
58 except getopt.GetoptError, err:
|
|
59 print str(err); self.help(); sys.exit(1)
|
|
60 for o,a in opts:
|
|
61 if o == "-h":
|
|
62 self.help(); sys.exit(0)
|
|
63 elif o == "-i":
|
|
64 self.setInputFileName( a )
|
|
65 elif o == "-m":
|
|
66 self.setMinMatchLength( a )
|
|
67 elif o == "-p":
|
|
68 self.setPrgToOrient( a )
|
|
69 elif o == "-o":
|
|
70 self.setOutputFileName( a )
|
|
71 elif o == "-c":
|
|
72 self.setClean()
|
|
73 elif o == "-v":
|
|
74 self.setVerbosityLevel( a )
|
|
75
|
|
76 def setInputFileName( self, inFileName ):
|
|
77 self._inFileName = inFileName
|
|
78
|
|
79 def setMinMatchLength( self, minMatchLength ):
|
|
80 self._minMatchLength = int(minMatchLength)
|
|
81
|
|
82 def setPrgToOrient( self, prgToOrient ):
|
|
83 self._prgToOrient = prgToOrient
|
|
84
|
|
85 def setOutputFileName( self, outFileName ):
|
|
86 self._outFileName = outFileName
|
|
87
|
|
88 def setClean( self ):
|
|
89 self._clean = True
|
|
90
|
|
91 def setVerbosityLevel( self, verbose ):
|
|
92 self._verbose = int(verbose)
|
|
93
|
|
94 def checkAttributes( self ):
|
|
95 """
|
|
96 Check the attributes are valid before running the algorithm.
|
|
97 """
|
|
98 if self._inFileName == "":
|
|
99 print "ERROR: missing input file name"
|
|
100 self.help(); sys.exit(1)
|
|
101 if not os.path.exists( self._inFileName ):
|
|
102 print "ERROR: input file '%s' doesn't exist" % ( self._inFileName )
|
|
103 self.help(); sys.exit(1)
|
|
104 if self._prgToOrient not in [ "orienter", "mummer" ]:
|
|
105 print "ERROR: unknown program '%s'" % ( self._prgToOrient )
|
|
106 self.help(); sys.exit(1)
|
|
107 if self._outFileName == "":
|
|
108 self._outFileName = "%s.oriented" % ( self._inFileName )
|
|
109
|
|
110 def useOrienter( self ):
|
|
111 """
|
|
112 Use 'orienter'.
|
|
113 @return: exit value of 'orienter'
|
|
114 """
|
|
115 prg = "orienter"
|
|
116 cmd = prg
|
|
117 cmd += " -k"
|
|
118 # cmd += " -l %i" % ( self._minMatchLength )
|
|
119 cmd += " -v %i" % ( self._verbose )
|
|
120 cmd += " %s" % ( self._inFileName )
|
|
121 log = os.system( cmd )
|
|
122 if log == 0 and self._outFileName.split(".")[-1] != "oriented":
|
|
123 os.rename( self._inFileName + ".oriented", self._outFileName )
|
|
124 return log
|
|
125
|
|
126 def compareInputSequencesWithMummer( self, nbInSeq ):
|
|
127 """
|
|
128 Launch MUmmer on two single-sequence fasta files to find all maximal matches regardless of their uniqueness and record stdout.
|
|
129 Only N(N-1)/2 comparisons are made.
|
|
130 @param nbInSeq: number of input sequences
|
|
131 @type nbInSeq: integer
|
|
132 """
|
|
133 if self._verbose > 0:
|
|
134 print "aligning input sequences..."
|
|
135 sys.stdout.flush()
|
|
136 if not CheckerUtils.isExecutableInUserPath( "mummer" ):
|
|
137 msg = "ERROR: 'mummer' is not in your PATH"
|
|
138 sys.stderr.write( "%s\n" % ( msg ) )
|
|
139 sys.exit(1)
|
|
140 lInFiles = glob.glob( "batch_*.fa" )
|
|
141 for i in range( 1, nbInSeq+1 ):
|
|
142 for j in range( i+1, nbInSeq+1 ):
|
|
143 if self._verbose > 1:
|
|
144 print "launch MUmmer on %i versus %i" % ( i, j )
|
|
145 sys.stdout.flush()
|
|
146 prg = "mummer"
|
|
147 cmd = prg
|
|
148 cmd += " -maxmatch"
|
|
149 cmd += " -l %i" % ( self._minMatchLength )
|
|
150 cmd += " -b"
|
|
151 cmd += " -F"
|
|
152 cmd += " batch_%s.fa" % ( str(j).zfill( len( str( len(lInFiles) ) ) ) )
|
|
153 cmd += " batch_%s.fa" % ( str(i).zfill( len( str( len(lInFiles) ) ) ) )
|
|
154 cmd += " > mummer_%i_vs_%i.txt" % ( i, j )
|
|
155 if self._verbose < 3:
|
|
156 cmd += " 2> /dev/null"
|
|
157 returnStatus = os.system( cmd )
|
|
158 if returnStatus != 0:
|
|
159 msg = "ERROR: '%s' returned '%i'" % ( prg, returnStatus )
|
|
160 sys.stderr.write( "%s\n" % ( msg ) )
|
|
161 sys.exit(1)
|
|
162
|
|
163 def parseMummerOutput( self, mummerFileName, nameSeq1, nameSeq2 ):
|
|
164 """
|
|
165 Parse the output from MUmmer.
|
|
166 @param mummerFileName: file with the output from MUmmer
|
|
167 @type mummerFileName: string
|
|
168 @param nameSeq1: name of the first sequence in the pairwise comparison
|
|
169 @type nameSeq1: string
|
|
170 @param nameSeq2: name of the first sequence in the pairwise comparison
|
|
171 @type nameSeq2: string
|
|
172 @return: dictionary whose keys are strands and values the number of maximal matches
|
|
173 """
|
|
174 if self._verbose > 1:
|
|
175 print "parse '%s'" % ( mummerFileName )
|
|
176 sys.stdout.flush()
|
|
177 dStrand2NbMatches = {}
|
|
178 mummerF = open( mummerFileName, "r" )
|
|
179 strand = "direct"
|
|
180 nbMatches = 0
|
|
181 line = mummerF.readline()
|
|
182 while True:
|
|
183 if line == "":
|
|
184 break
|
|
185 if line[0] == ">":
|
|
186 if nameSeq1 not in line:
|
|
187 print "ERROR"
|
|
188 print nameSeq1
|
|
189 print nameSeq2
|
|
190 sys.exit(1)
|
|
191 if "Reverse" in line:
|
|
192 dStrand2NbMatches[ strand ] = nbMatches
|
|
193 strand = "reverse"
|
|
194 nbMatches = 0
|
|
195 else:
|
|
196 nbMatches += 1
|
|
197 line = mummerF.readline()
|
|
198 dStrand2NbMatches[ strand ] = nbMatches
|
|
199 mummerF.close()
|
|
200 if self._verbose > 1:
|
|
201 print " direct: %i maximal matches" % ( dStrand2NbMatches["direct"] )
|
|
202 print " reverse: %i maximal matches" % ( dStrand2NbMatches["reverse"] )
|
|
203 sys.stdout.flush()
|
|
204 return dStrand2NbMatches
|
|
205
|
|
206 def getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( self, lInHeaders, nbInSeq ):
|
|
207 """
|
|
208 For each pairwise comparison, retrieve the number of matches on both strand.
|
|
209 @param lInHeaders: list of the sequence headers
|
|
210 @type lInHeaders: list of strings
|
|
211 @param nbInSeq: number of input sequences
|
|
212 @type nbInSeq: integer
|
|
213 @return: dictionary whose keys are pairwise comparisons and values are number of matches on both strands
|
|
214 """
|
|
215 dMatrix = {}
|
|
216 for i in range( 1, nbInSeq+1 ):
|
|
217 for j in range( i+1, nbInSeq+1 ):
|
|
218 pairComp = "%i_vs_%i" % ( i, j )
|
|
219 dStrand2NbMatches = self.parseMummerOutput( "mummer_%s.txt" % ( pairComp ), lInHeaders[i-1], lInHeaders[j-1] )
|
|
220 dMatrix[ pairComp ] = dStrand2NbMatches
|
|
221 return dMatrix
|
|
222
|
|
223 def showResultsAsOrienter( self, nbInSeq, dMatrix ):
|
|
224 """
|
|
225 Not used for the moment but can be useful...
|
|
226 """
|
|
227 for i in range( 1, nbInSeq+1 ):
|
|
228 for j in range( i+1, nbInSeq+1 ):
|
|
229 pairComp = "%i_vs_%i" % ( i, j )
|
|
230 string = "aligning %i with %i" % ( i, j )
|
|
231 string += " %i %i" % ( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] )
|
|
232 string += " max=%i" % ( max( dMatrix[pairComp]["direct"], dMatrix[pairComp]["reverse"] ) )
|
|
233 if dMatrix[pairComp]["reverse"] > dMatrix[pairComp]["direct"]:
|
|
234 string += " strand=-"
|
|
235 string += " nb=%i" % ( dMatrix[pairComp]["reverse"] )
|
|
236 else:
|
|
237 string += " strand=+"
|
|
238 string += " nb=%i" % ( dMatrix[pairComp]["direct"] )
|
|
239 print string; sys.stdout.flush()
|
|
240
|
|
241 def getSequencesToReverseFromMatrix( self, dMatrix, lNewHeaders ):
|
|
242 """
|
|
243 Retrieve the sequences than need to be re-oriented.
|
|
244 @param dMatrix:
|
|
245 @type dMatrix:
|
|
246 @param lInHeaders: list of the new sequence headers
|
|
247 @type lInHeaders: list of strings
|
|
248 @return: list of headers corresponding to sequences than need to be re-oriented
|
|
249 """
|
|
250 lSeqToOrient = []
|
|
251
|
|
252 for i in range( 1, len(lNewHeaders)+1 ):
|
|
253 for j in range( i+1, len(lNewHeaders)+1 ):
|
|
254 comp = "%i_vs_%i" % ( i, j )
|
|
255 nbDirectMatches = dMatrix[ comp ][ "direct" ]
|
|
256 nbReverseMatches = dMatrix[ comp ][ "reverse" ]
|
|
257 if self._verbose > 1:
|
|
258 print "%s: direct=%i reverse=%i" % ( comp, nbDirectMatches, nbReverseMatches )
|
|
259 if nbReverseMatches > nbDirectMatches and lNewHeaders[i-1] not in lSeqToOrient:
|
|
260 if lNewHeaders[ j-1 ] not in lSeqToOrient:
|
|
261 if self._verbose > 2:
|
|
262 "need to reverse '%s'" % ( lNewHeaders[j-1] )
|
|
263 lSeqToOrient.append( lNewHeaders[ j-1 ] )
|
|
264 return lSeqToOrient
|
|
265
|
|
266
|
|
267 def getSequencesToReverse( self ):
|
|
268 """
|
|
269 Return a list with the headers of the sequences to reverse.
|
|
270 """
|
|
271 lSequenceHeadersToReverse = []
|
|
272
|
|
273 if self._prgToOrient == "orienter":
|
|
274 exitStatus = self.useOrienter()
|
|
275 if exitStatus == 0:
|
|
276 self.end()
|
|
277 sys.exit(0)
|
|
278 if exitStatus != 0:
|
|
279 print "\nWARNING: 'orienter' had a problem, switching to 'mummer'"
|
|
280 sys.stdout.flush()
|
|
281
|
|
282 lInHeaders = pyRepet.seq.fastaDB.dbHeaders( self._inFileName )
|
|
283 nbInSeq = len( lInHeaders )
|
|
284 if self._verbose > 0:
|
|
285 print "nb of input sequences: %i" % ( nbInSeq )
|
|
286 sys.stdout.flush()
|
|
287
|
|
288 pyRepet.seq.fastaDB.shortenSeqHeaders( self._inFileName, 1 )
|
|
289 tmpFileName = "%s.shortH" % ( self._inFileName )
|
|
290 lNewHeaders = pyRepet.seq.fastaDB.dbHeaders( tmpFileName )
|
|
291 dNew2Init = pyRepet.seq.fastaDB.retrieveLinksNewInitialHeaders( "%slink" % ( tmpFileName ) )
|
|
292
|
|
293 pyRepet.seq.fastaDB.dbSplit( tmpFileName, nbSeqPerBatch=1, newDir=True )
|
|
294 os.chdir( "batches" )
|
|
295 self.compareInputSequencesWithMummer( nbInSeq )
|
|
296 dMatrix = self.getCumulativeMatchLengthsOnBothStrandForEachPairwiseComparison( lNewHeaders, nbInSeq )
|
|
297 os.chdir( ".." )
|
|
298
|
|
299 lNewHeadersToReverse = self.getSequencesToReverseFromMatrix( dMatrix, lNewHeaders )
|
|
300 for newH in lNewHeadersToReverse:
|
|
301 lSequenceHeadersToReverse.append( dNew2Init[ newH ] )
|
|
302 if self._verbose > 0:
|
|
303 print "nb of sequences to reverse: %i" % ( len(lNewHeadersToReverse) )
|
|
304 for initH in lSequenceHeadersToReverse: print " %s" % ( initH )
|
|
305 sys.stdout.flush()
|
|
306
|
|
307 if self._clean:
|
|
308 os.remove( tmpFileName )
|
|
309 os.remove( "%slink" % ( tmpFileName ) )
|
|
310
|
|
311 return lSequenceHeadersToReverse
|
|
312
|
|
313 def orientInputSequences( self, lSequenceHeadersToReverse, tmpFileName="" ):
|
|
314 """
|
|
315 Save input sequences while re-orienting those needing it.
|
|
316 @param lSequenceHeadersToReverse: list of headers corresponding to sequences than need to be re-oriented
|
|
317 @type lSequenceHeadersToReverse: list of strings
|
|
318 @param tmpFileName: name of a fasta file (inFileName by default)
|
|
319 @type tmpFileName: string
|
|
320 """
|
|
321 if self._verbose > 0:
|
|
322 print "saving oriented sequences..."
|
|
323 sys.stdout.flush()
|
|
324 if tmpFileName == "":
|
|
325 tmpFileName = self._inFileName
|
|
326 inDB = BioseqDB( tmpFileName )
|
|
327 outDB = BioseqDB()
|
|
328 for bs in inDB.db:
|
|
329 if bs.header in lSequenceHeadersToReverse:
|
|
330 bs.reverseComplement()
|
|
331 bs.header += " re-oriented"
|
|
332 outDB.add( bs )
|
|
333 outDB.save( self._outFileName )
|
|
334
|
|
335 def clean( self ):
|
|
336 if os.path.exists( "batches" ):
|
|
337 os.system( "rm -rf batches" )
|
|
338 if os.path.exists( "orienter_error.log" ):
|
|
339 os.remove( "orienter_error.log" )
|
|
340 for f in glob.glob( "core.*" ):
|
|
341 os.remove( f )
|
|
342
|
|
343 def start( self ):
|
|
344 """
|
|
345 Useful commands before running the program.
|
|
346 """
|
|
347 self.checkAttributes()
|
|
348 if self._verbose > 0:
|
|
349 print "START %s" % ( type(self).__name__ )
|
|
350 print "input file: %s" % ( self._inFileName )
|
|
351 sys.stdout.flush()
|
|
352
|
|
353 def end( self ):
|
|
354 """
|
|
355 Useful commands before ending the program.
|
|
356 """
|
|
357 if self._clean:
|
|
358 self.clean()
|
|
359 if self._verbose > 0:
|
|
360 print "END %s" % ( type(self).__name__ )
|
|
361 sys.stdout.flush()
|
|
362
|
|
363 def run( self ):
|
|
364 """
|
|
365 Run the program.
|
|
366 """
|
|
367 self.start()
|
|
368 lSequenceHeadersToReverse = self.getSequencesToReverse()
|
|
369 self.orientInputSequences( lSequenceHeadersToReverse )
|
|
370 self.end()
|
|
371
|
|
372 if __name__ == "__main__":
|
|
373 i = OrientSequences()
|
|
374 i.setAttributesFromCmdLine()
|
|
375 i.run()
|