Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.0/commons/core/coord/AlignUtils.py @ 6:20ec0d14798e draft
Uploaded
author | urgi-team |
---|---|
date | Wed, 20 Jul 2016 05:00:24 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:4093a2fb58be | 6:20ec0d14798e |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import os | |
33 import sys | |
34 import shutil | |
35 from commons.core.coord.Align import Align | |
36 | |
37 | |
38 ## Static methods manipulating Align instances | |
39 # | |
class AlignUtils( object ):

    ## Return a list with Align instances from the given file
    #
    # @param inFile name of a file in the Align format
    # @return list of Align instances, one per line of the file, in file order
    #
    @staticmethod
    def getAlignListFromFile( inFile ):
        lAlignInstances = []
        # 'with' releases the handle even if parsing a line raises
        with open( inFile, "r" ) as inFileHandler:
            for line in inFileHandler:
                a = Align()
                a.setFromString( line )
                lAlignInstances.append( a )
        return lAlignInstances


    ## Return a list with all the scores
    #
    # @param lAlignInstances: list of Align instances
    # @return list with the 'score' attribute of each instance, in input order
    #
    @staticmethod
    def getListOfScores( lAlignInstances ):
        return [ iAlign.score for iAlign in lAlignInstances ]


    ## Return a list with all the scores from the given file
    #
    # @param inFile name of a file in the Align format
    # @return list of integers read from the 8th tab-separated field of each non-blank line
    #
    @staticmethod
    def getScoreListFromFile( inFile ):
        lScores = []
        with open( inFile, "r" ) as inFileHandler:
            for line in inFileHandler:
                # blank (or whitespace-only) lines carry no record
                if line.strip() == "":
                    continue
                lScores.append( int( line.split( "\t" )[7] ) )
        return lScores


    ## for each line of a given Align file, write the coordinates on the query and the subject as two distinct lines in a Map file
    #
    # @param alignFile: name of the input Align file
    # @param mapFile: name of the output Map file (overwritten)
    #
    @staticmethod
    def convertAlignFileIntoMapFileWithQueriesAndSubjects( alignFile, mapFile ):
        # a single Align instance is re-filled for every line
        iAlign = Align()
        with open( alignFile, "r" ) as alignFileHandler:
            with open( mapFile, "w" ) as mapFileHandler:
                for line in alignFileHandler:
                    iAlign.setFromString( line )
                    iMapQ, iMapS = iAlign.getMapsOfQueryAndSubject()
                    iMapQ.write( mapFileHandler )
                    iMapS.write( mapFileHandler )


    ## for each line of a given Align file, write the coordinates of the subject on the query as one line in a Map file
    #
    # @param alignFile: name of the input Align file
    # @param mapFile: name of the output Map file (overwritten)
    #
    @staticmethod
    def convertAlignFileIntoMapFileWithSubjectsOnQueries( alignFile, mapFile ):
        iAlign = Align()
        with open( alignFile, "r" ) as alignFileHandler:
            with open( mapFile, "w" ) as mapFileHandler:
                for line in alignFileHandler:
                    iAlign.setFromString( line )
                    iMapQ = iAlign.getSubjectAsMapOfQuery()
                    iMapQ.write( mapFileHandler )


    ## return a list of Align instances sorted in decreasing order according to their score, then their length on the query and finally their initial order
    #
    # @param lAligns: list of Align instances
    # @return new sorted list (the input list itself is not reordered)
    #
    # @note the keys are negated instead of inverted, so a score or a length
    #       equal to zero no longer raises ZeroDivisionError and negative
    #       values are ranked after positive ones; sorted() is stable, which
    #       keeps the initial order for ties
    #
    @staticmethod
    def getAlignListSortedByDecreasingScoreThenLength( lAligns ):
        return sorted( lAligns,
                       key=lambda iAlign: ( -float( iAlign.getScore() ),
                                            -float( iAlign.getLengthOnQuery() ) ) )


    ## Convert an Align file into a Path file
    #
    # @param alignFile string name of the input Align file
    # @param pathFile string name of the output Path file (overwritten)
    #
    # @note each output line is the 1-based rank of the input line, a tab, then the Align as a string
    #
    @staticmethod
    def convertAlignFileIntoPathFile( alignFile, pathFile ):
        iAlign = Align()
        with open( alignFile, "r" ) as alignFileHandler:
            with open( pathFile, "w" ) as pathFileHandler:
                for countAlign, line in enumerate( alignFileHandler, 1 ):
                    iAlign.setFromString( line, "\t" )
                    pathFileHandler.write( "%i\t%s\n" % ( countAlign, iAlign.toString() ) )


    ## Sort an Align file
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output file (default: '<inFile>.sort')
    #
    # @note relies on the external 'sort' program: fields 1 and 4 are compared
    #       as text, then fields 2, 3, 5, 6 and 8 numerically
    # @note on failure an error is written on stderr and the process exits
    #       with the exit code of 'sort'
    #
    @staticmethod
    def sortAlignFile( inFile, outFile="" ):
        import subprocess
        if outFile == "":
            outFile = "%s.sort" % ( inFile )
        prg = "sort"
        # argument list (no shell): file names containing spaces or shell
        # metacharacters are passed through unchanged
        lCmd = [ prg,
                 "-k", "1,1", "-k", "4,4",
                 "-k", "2,2n", "-k", "3,3n",
                 "-k", "5,5n", "-k", "6,6n", "-k", "8,8n",
                 inFile ]
        with open( outFile, "w" ) as outFileHandler:
            try:
                # the real exit code is returned, not a wait status whose
                # low byte is zero (which sys.exit() would turn into success)
                exitStatus = subprocess.call( lCmd, stdout=outFileHandler )
            except OSError:
                # the program could not be started at all
                exitStatus = 127
        if exitStatus != 0:
            msg = "ERROR: '%s' returned '%i'" % ( prg, exitStatus )
            sys.stderr.write( "%s\n" % ( msg ) )
            sys.exit( exitStatus )


    ## Write Align instances contained in the given list
    #
    # @param lAlign a list of Align instances
    # @param fileName name of the file to write the Align instances
    # @param mode the open mode of the file "w" or "a"
    #
    @staticmethod
    def writeListInFile( lAlign, fileName, mode="w" ):
        with open( fileName, mode ) as fileHandler:
            for iAlign in lAlign:
                iAlign.write( fileHandler )


    ## Split a list of Align instances according to the name of the query
    #
    # @param lInAlign list of align instances
    # @return lOutAlignList list of align instances lists, one per query name, ordered by query name
    #
    @staticmethod
    def splitAlignListByQueryName( lInAlign ):
        lOutAlignList = []
        previousQuery = None
        for iAlign in sorted( lInAlign, key=lambda o: o.range_query.seqname ):
            currentQuery = iAlign.range_query.seqname
            # open a new group for the first instance and at each change of query name
            if not lOutAlignList or currentQuery != previousQuery:
                lOutAlignList.append( [] )
                previousQuery = currentQuery
            lOutAlignList[-1].append( iAlign )
        return lOutAlignList


    ## Create an Align file from each list of Align instances in the input list
    #
    # @param lAlignList list of lists with Align instances
    # @param pattern string prefix of the file names ('<pattern>_<rank>.align', rank zero-padded)
    # @param dirName string directory in which the files are written (created if needed; default: current directory)
    #
    # @note the current working directory is never changed
    #
    @staticmethod
    def createAlignFiles( lAlignList, pattern, dirName="" ):
        if dirName != "":
            try:
                os.makedirs( dirName )
            except OSError:
                # an already existing directory is fine, anything else is a real error
                if not os.path.isdir( dirName ):
                    raise
        # pad the rank so that all file names have the same length
        width = len( str( len( lAlignList ) ) )
        for countFile, lAlign in enumerate( lAlignList, 1 ):
            fileName = "%s_%s.align" % ( pattern, str( countFile ).zfill( width ) )
            AlignUtils.writeListInFile( lAlign, os.path.join( dirName, fileName ) )


    ## Return a list with Align instances sorted by query name, subject name, query start, query end and score
    #
    # @param lAligns list of Align instances
    # @return new sorted list
    #
    @staticmethod
    def sortList( lAligns ):
        def getSortKey( iAlign ):
            return ( iAlign.getQueryName(),
                     iAlign.getSubjectName(),
                     iAlign.getQueryStart(),
                     iAlign.getQueryEnd(),
                     iAlign.getScore() )
        return sorted( lAligns, key=getSortKey )


    ## Return a list after merging all overlapping Align instances
    #
    # @param lAligns list of Align instances
    # @return list of merged Align instances
    #
    # @note the instances of the input list are modified in place through merge()
    #
    @staticmethod
    def mergeList( lAligns ):
        lMerged = []
        lSorted = AlignUtils.sortList( lAligns )

        for index, iAlign in enumerate( lSorted ):
            # first absorb every later instance overlapping the current one
            for iNext in lSorted[ index + 1: ]:
                if iAlign.isOverlapping( iNext ):
                    iAlign.merge( iNext )
            # then fold the current instance into the already kept ones
            isAlreadyInList = False
            for newAlign in lMerged:
                if newAlign.isOverlapping( iAlign ):
                    isAlreadyInList = True
                    newAlign.merge( iAlign )
                    # NOTE(review): looks like a no-op, but list.index() relies on
                    # Align equality, which is not visible here - kept as is
                    lMerged[ lMerged.index( newAlign ) ] = newAlign
            if not isAlreadyInList:
                lMerged.append( iAlign )

        return lMerged


    ## Merge all Align instance in a given Align file
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output file (default: '<inFile>.merged'); removed first if it already exists
    #
    # @note the input is first sorted into the temporary file '<inFile>.sorted',
    #       then each run of consecutive lines sharing the same query and
    #       subject names is merged and appended to the output file
    #
    @staticmethod
    def mergeFile( inFile, outFile="" ):
        if outFile == "":
            outFile = "%s.merged" % ( inFile )
        if os.path.exists( outFile ):
            os.remove( outFile )

        tmpFile = "%s.sorted" % ( inFile )
        AlignUtils.sortAlignFile( inFile, tmpFile )

        try:
            lCurrentAligns = []
            prevPairQrySbj = None
            with open( tmpFile, "r" ) as tmpF:
                for line in tmpF:
                    iAlign = Align()
                    iAlign.setFromString( line )
                    # a tuple cannot mix up ('a_b', 'c') with ('a', 'b_c')
                    pairQrySbj = ( iAlign.getQueryName(), iAlign.getSubjectName() )
                    if pairQrySbj != prevPairQrySbj:
                        # new pair: flush the previous group before starting the next one
                        if lCurrentAligns:
                            lMerged = AlignUtils.mergeList( lCurrentAligns )
                            AlignUtils.writeListInFile( lMerged, outFile, "a" )
                        lCurrentAligns = []
                        prevPairQrySbj = pairQrySbj
                    lCurrentAligns.append( iAlign )
            # flush the last group
            if lCurrentAligns:
                lMerged = AlignUtils.mergeList( lCurrentAligns )
                AlignUtils.writeListInFile( lMerged, outFile, "a" )
        finally:
            # never leave the temporary sorted file behind
            if os.path.exists( tmpFile ):
                os.remove( tmpFile )


    ## Update the scores of each match in the input file
    #
    # @param inFile name of the input Align file
    # @param outFile name of the output Align file (overwritten)
    #
    # @note the new score is the length on the query times the percentage of identity
    #
    @staticmethod
    def updateScoresInFile( inFile, outFile ):
        iAlign = Align()
        with open( inFile, "r" ) as inHandler:
            with open( outFile, "w" ) as outHandler:
                for line in inHandler:
                    iAlign.reset()
                    iAlign.setFromString( line, "\t" )
                    iAlign.updateScore()
                    iAlign.write( outHandler )