comparison commons/core/coord/ConvCoord.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 #!/usr/bin/env python
2
3 ##@file
4 # Convert coordinates from chunks to chromosomes or the opposite.
5 #
6 # usage: ConvCoord.py [ options ]
7 # options:
8 # -h: this help
9 # -i: input data with coordinates to convert (file or table)
10 # -f: input data format (default='align'/'path')
11 # -c: coordinates to convert (query, subject or both; default='q'/'s'/'qs')
12 # -m: mapping of chunks on chromosomes (format='map')
13 # -x: convert from chromosomes to chunks (opposite by default)
14 # -o: output data (file or table, same as input)
15 # -C: configuration file (for database connection)
16 # -v: verbosity level (default=0/1/2)
17
18
19 import os
20 import sys
21 import getopt
22 import time
23 from commons.core.sql.DbFactory import DbFactory
24 from commons.core.coord.MapUtils import MapUtils
25 from commons.core.sql.TableMapAdaptator import TableMapAdaptator
26 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
27 from commons.core.coord.PathUtils import PathUtils
28 from commons.core.coord.Align import Align
29 from commons.core.coord.Path import Path
30 from commons.core.coord.Range import Range
31
32
33 ## Class to handle coordinate conversion
34 #
35 class ConvCoord( object ):
36
37 ## Constructor
38 #
39 def __init__( self, inData="", mapData="", outData="", configFile="", verbosity=0):
40 self._inData = inData
41 self._formatInData = "align"
42 self._coordToConvert = "q"
43 self._mapData = mapData
44 self._mergeChunkOverlaps = True
45 self._convertChunks = True
46 self._outData = outData
47 self._configFile = configFile
48 self._verbose = verbosity
49 self._typeInData = "file"
50 self._typeMapData = "file"
51 self._tpa = None
52 if self._configFile != "" and os.path.exists(self._configFile):
53 self._iDb = DbFactory.createInstance(self._configFile)
54 else:
55 self._iDb = DbFactory.createInstance()
56
57
58 ## Display the help on stdout
59 #
60 def help( self ):
61 print
62 print "usage: ConvCoord.py [ options ]"
63 print "options:"
64 print " -h: this help"
65 print " -i: input data with coordinates to convert (file or table)"
66 print " -f: input data format (default='align'/'path')"
67 print " -c: coordinates to convert (query, subject or both; default='q'/'s'/'qs')"
68 print " -m: mapping of chunks on chromosomes (format='map')"
69 print " -M: merge chunk overlaps (default=yes/no)"
70 print " -x: convert from chromosomes to chunks (opposite by default)"
71 print " -o: output data (file or table, same as input)"
72 print " -C: configuration file (for database connection)"
73 print " -v: verbosity level (default=0/1/2)"
74 print
75
76
77 ## Set the attributes from the command-line
78 #
79 def setAttributesFromCmdLine( self ):
80 try:
81 opts, args = getopt.getopt(sys.argv[1:],"hi:f:c:m:M:xo:C:v:")
82 except getopt.GetoptError, err:
83 sys.stderr.write( "%s\n" % ( str(err) ) )
84 self.help(); sys.exit(1)
85 for o,a in opts:
86 if o == "-h":
87 self.help(); sys.exit(0)
88 elif o == "-i":
89 self.setInputData( a )
90 elif o == "-f":
91 self.setInputFormat( a )
92 elif o == "-c":
93 self.setCoordinatesToConvert( a )
94 elif o == "-m":
95 self.setMapData( a )
96 elif o == "-M":
97 self.setMergeChunkOverlaps( a )
98 elif o == "-o":
99 self.setOutputData( a )
100 elif o == "-C":
101 self.setConfigFile( a )
102 elif o == "-v":
103 self.setVerbosityLevel( a )
104
105
106 def setInputData( self, inData ):
107 self._inData = inData
108
109 def setInputFormat( self, formatInData ):
110 self._formatInData = formatInData
111
112 def setCoordinatesToConvert( self, coordToConvert ):
113 self._coordToConvert = coordToConvert
114
115 def setMapData( self, mapData ):
116 self._mapData = mapData
117
118 def setMergeChunkOverlaps( self, mergeChunkOverlaps ):
119 if mergeChunkOverlaps == "yes":
120 self._mergeChunkOverlaps = True
121 else:
122 self._mergeChunkOverlaps = False
123
124 def setOutputData( self, outData ):
125 self._outData = outData
126
127 def setConfigFile( self, configFile ):
128 self._configFile = configFile
129
130 def setVerbosityLevel( self, verbose ):
131 self._verbose = int(verbose)
132
133
134 ## Check the attributes are valid before running the algorithm
135 #
136 def checkAttributes( self ):
137 if self._inData == "":
138 msg = "ERROR: missing input data (-i)"
139 sys.stderr.write( "%s\n" % ( msg ) )
140 self.help(); sys.exit(1)
141 if self._formatInData not in ["align","path"]:
142 msg = "ERROR: unrecognized format '%s' (-f)" % ( self._formatInData )
143 sys.stderr.write( "%s\n" % ( msg ) )
144 self.help(); sys.exit(1)
145 if self._configFile == "":
146 self._iDb = DbFactory.createInstance()
147 elif not os.path.exists( self._configFile ):
148 msg = "ERROR: configuration file '%s' doesn't exist" % ( self._configFile )
149 sys.stderr.write( "%s\n" % ( msg ) )
150 self.help(); sys.exit(1)
151 else:
152 self._iDb = DbFactory.createInstance(self._configFile)
153 if not os.path.exists( self._inData ) and not self._iDb.doesTableExist( self._inData ):
154 msg = "ERROR: input data '%s' doesn't exist" % ( self._inData )
155 sys.stderr.write( "%s\n" % ( msg ) )
156 self.help(); sys.exit(1)
157 if os.path.exists( self._inData ):
158 self._typeInData = "file"
159 elif self._iDb.doesTableExist( self._inData ):
160 self._typeInData = "table"
161 if self._coordToConvert == "":
162 msg = "ERROR: missing coordinates to convert (-c)"
163 sys.stderr.write( "%s\n" % ( msg ) )
164 self.help(); sys.exit(1)
165 if self._coordToConvert not in [ "q", "s", "qs" ]:
166 msg = "ERROR: unrecognized coordinates to convert '%s' (-c)" % ( self._coordToConvert )
167 sys.stderr.write( "%s\n" % ( msg ) )
168 self.help(); sys.exit(1)
169 if self._mapData == "":
170 msg = "ERROR: missing mapping coordinates of chunks on chromosomes (-m)"
171 sys.stderr.write( "%s\n" % ( msg ) )
172 self.help(); sys.exit(1)
173 if not os.path.exists( self._mapData ) and not self._iDb.doesTableExist( self._mapData ):
174 msg = "ERROR: mapping data '%s' doesn't exist" % ( self._mapData )
175 sys.stderr.write( "%s\n" % ( msg ) )
176 self.help(); sys.exit(1)
177 if os.path.exists( self._mapData ):
178 self._typeMapData = "file"
179 elif self._iDb.doesTableExist( self._mapData ):
180 self._typeMapData = "table"
181 if self._outData == "":
182 if self._convertChunks:
183 self._outData = "%s.onChr" % ( self._inData )
184 else:
185 self._outData = "%s.onChk" % ( self._inData )
186 if self._typeInData == "table":
187 self._outData = self._outData.replace(".","_")
188
189
190 ## Return a dictionary with the mapping of the chunks on the chromosomes
191 #
192 def getChunkCoordsOnChromosomes( self ):
193 if self._typeMapData == "file":
194 dChunks2CoordMaps = MapUtils.getDictPerNameFromMapFile( self._mapData )
195 elif self._typeMapData == "table":
196 tma = TableMapAdaptator( self._iDb, self._mapData )
197 dChunks2CoordMaps = tma.getDictPerName()
198 if self._verbose > 0:
199 msg = "nb of chunks: %i" % ( len(dChunks2CoordMaps.keys()) )
200 sys.stdout.write( "%s\n" % ( msg ) )
201 return dChunks2CoordMaps
202
203
204 def getRangeOnChromosome( self, chkRange, dChunks2CoordMaps ):
205 chrRange = Range()
206 chunkName = chkRange.seqname
207 chrRange.seqname = dChunks2CoordMaps[ chunkName ].seqname
208 if dChunks2CoordMaps[ chunkName ].start == 1:
209 chrRange.start = chkRange.start
210 chrRange.end = chkRange.end
211 else:
212 startOfChkOnChr = dChunks2CoordMaps[ chunkName ].start
213 chrRange.start = startOfChkOnChr + chkRange.start - 1
214 chrRange.end = startOfChkOnChr + chkRange.end - 1
215 return chrRange
216
217
218 def convCoordsChkToChrFromAlignFile( self, inFile, dChunks2CoordMaps ):
219 return self.convCoordsChkToChrFromAlignOrPathFile( inFile, dChunks2CoordMaps, "align" )
220
221
222 def convCoordsChkToChrFromPathFile( self, inFile, dChunks2CoordMaps ):
223 return self.convCoordsChkToChrFromAlignOrPathFile( inFile, dChunks2CoordMaps, "path" )
224
225
226
227 ## Convert coordinates of a Path or Align file from chunks to chromosomes
228 #
229 def convCoordsChkToChrFromAlignOrPathFile( self, inFile, dChunks2CoordMaps, format ):
230 if self._verbose > 0:
231 msg = "start method 'convCoordsChkToChrFromAlignOrPathFile'"
232 sys.stdout.write( "%s\n" % ( msg ) )
233 outFile = "%s.tmp" % ( inFile )
234 inFileHandler = open( inFile, "r" )
235 outFileHandler = open( outFile, "w" )
236 if format == "align":
237 iObject = Align()
238 else:
239 iObject = Path()
240 countLine = 0
241
242 while True:
243 line = inFileHandler.readline()
244 if line == "":
245 break
246 countLine += 1
247 iObject.setFromString( line )
248 if self._coordToConvert in [ "q", "qs" ]:
249 queryOnChr = self.getRangeOnChromosome( iObject.range_query, dChunks2CoordMaps )
250 iObject.range_query = queryOnChr
251 if self._coordToConvert in [ "s", "qs" ]:
252 subjectOnChr = self.getRangeOnChromosome( iObject.range_subject, dChunks2CoordMaps )
253 iObject.range_subject = subjectOnChr
254 iObject.write( outFileHandler )
255 iObject.reset()
256
257 inFileHandler.close()
258 outFileHandler.close()
259 if self._verbose > 0:
260 msg = "end method 'convCoordsChkToChrFromAlignOrPathFile'"
261 sys.stdout.write( "%s\n" % ( msg ) )
262 return outFile
263
264 ## Convert coordinates of a file from chunks to chromosomes
265 #
266 def convCoordsChkToChrFromFile( self, inFile, format, dChunks2CoordMaps ):
267 if self._verbose > 0:
268 msg = "start convCoordsChkToChrFromFile"
269 sys.stdout.write( "%s\n" % ( msg ) )
270 if format == "align":
271 tmpAlignFile = self.convCoordsChkToChrFromAlignFile( inFile, dChunks2CoordMaps )
272 tmpAlignTable = tmpAlignFile.replace(".","_").replace("-","_")
273 self._iDb.createTable( tmpAlignTable, "align", tmpAlignFile, True)
274 os.remove( tmpAlignFile )
275 self._iDb.removeDoublons( tmpAlignTable )
276 outTable = "%s_path" % ( tmpAlignTable )
277 self._iDb.convertAlignTableIntoPathTable( tmpAlignTable, outTable )
278 self._iDb.dropTable( tmpAlignTable )
279 elif format == "path":
280 tmpPathFile = self.convCoordsChkToChrFromPathFile( inFile, dChunks2CoordMaps )
281 outTable = tmpPathFile.replace(".","_").replace("-","_")
282 self._iDb.createTable( outTable, "path", tmpPathFile, True)
283 os.remove( tmpPathFile )
284 if self._verbose > 0:
285 msg = "end convCoordsChkToChrFromFile"
286 sys.stdout.write( "%s\n" % ( msg ) )
287 return outTable
288
289
290 ## Convert coordinates of a table from chunks to chromosomes
291 #
292 def convCoordsChkToChrFromTable( self, inTable, format, dChunks2CoordMaps ):
293 tmpFile = inTable
294 self._iDb.exportDataToFile( inTable, tmpFile, False )
295 outTable = self.convCoordsChkToChrFromFile( tmpFile, format, dChunks2CoordMaps )
296 os.remove( tmpFile )
297 return outTable
298
299
300 def getListsDirectAndReversePaths( self, lPaths ):
301 lDirectPaths = []
302 lReversePaths = []
303 for iPath in lPaths:
304 if iPath.isQueryOnDirectStrand() and iPath.isSubjectOnDirectStrand():
305 lDirectPaths.append( iPath )
306 else:
307 lReversePaths.append( iPath )
308 return lDirectPaths, lReversePaths
309
310
311 def mergePaths( self, lPaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId ):
312 if len(lPaths) < 2:
313 lIdsToInsert.append( lPaths[0].id )
314 return
315 i = 0
316 while i < len(lPaths) - 1:
317 i += 1
318 if self._verbose > 1 and i==1 :
319 print lPaths[i-1]
320 if self._verbose > 1:
321 print lPaths[i]
322 sys.stdout.flush()
323 idPrev = lPaths[i-1].id
324 idNext = lPaths[i].id
325 if lPaths[i-1].canMerge( lPaths[i] ):
326 dOldIdToNewId[ idNext ] = idPrev
327 if idPrev not in lIdsToInsert:
328 lIdsToInsert.append( idPrev )
329 if idNext not in lIdsToDelete:
330 lIdsToDelete.append( idNext )
331 lPaths[i-1].merge( lPaths[i] )
332 del lPaths[i]
333 i -= 1
334
335
336 def insertPaths( self, lPaths, lIdsToInsert, dOldIdToNewId ):
337 for iPath in lPaths:
338 if dOldIdToNewId.has_key( iPath.id ):
339 iPath.id = dOldIdToNewId[ iPath.id ]
340 if iPath.id in lIdsToInsert:
341 self._tpa.insert( iPath )
342
343
344 ## Merge Path instances in a Path table when they correspond to chunk overlaps
345 #
346 def mergeCoordsOnChunkOverlaps( self, dChunks2CoordMaps, tmpPathTable ):
347 if self._verbose > 0:
348 msg = "start method 'mergeCoordsOnChunkOverlaps'"
349 sys.stdout.write( "%s\n" % ( msg ) )
350 self._tpa = TablePathAdaptator( self._iDb, tmpPathTable )
351 nbChunks = len(dChunks2CoordMaps.keys())
352 for numChunk in range(1,nbChunks):
353 chunkName1 = "chunk%s" % ( str(numChunk).zfill( len(str(nbChunks)) ) )
354 chunkName2 = "chunk%s" % ( str(numChunk+1).zfill( len(str(nbChunks)) ) )
355 if not dChunks2CoordMaps.has_key( chunkName2 ):
356 break
357 if self._verbose > 1:
358 msg = "try merge on '%s' and '%s'" % ( chunkName1, chunkName2 )
359 sys.stdout.write( "%s\n" % ( msg ) )
360 sys.stdout.flush()
361 chrName = dChunks2CoordMaps[ chunkName1 ].seqname
362 if dChunks2CoordMaps[ chunkName2 ].seqname != chrName:
363 if self._verbose > 1:
364 msg = "not on same chromosome (%s != %s)" % ( dChunks2CoordMaps[ chunkName2 ].seqname, chrName )
365 sys.stdout.write( "%s\n" % ( msg ) )
366 sys.stdout.flush()
367 continue
368 minCoord = min( dChunks2CoordMaps[ chunkName1 ].end, dChunks2CoordMaps[ chunkName2 ].start )
369 maxCoord = max( dChunks2CoordMaps[ chunkName1 ].end, dChunks2CoordMaps[ chunkName2 ].start )
370 lPaths = self._tpa.getChainListOverlappingQueryCoord( chrName, minCoord, maxCoord )
371 if len(lPaths) == 0:
372 if self._verbose > 1:
373 msg = "no overlapping matches on %s (%i->%i)" % ( chrName, minCoord, maxCoord )
374 sys.stdout.write( "%s\n" % ( msg ) )
375 sys.stdout.flush()
376 continue
377 if self._verbose > 1:
378 msg = "%i overlapping matche(s) on %s (%i->%i)" % ( len(lPaths), chrName, minCoord, maxCoord )
379 sys.stdout.write( "%s\n" % ( msg ) )
380 sys.stdout.flush()
381 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
382 lDirectPaths, lReversePaths = self.getListsDirectAndReversePaths( lSortedPaths )
383 lIdsToInsert = []
384 lIdsToDelete = []
385 dOldIdToNewId = {}
386 if len(lDirectPaths) > 0:
387 self.mergePaths( lDirectPaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId )
388 if len(lReversePaths) > 0:
389 self.mergePaths( lReversePaths, lIdsToInsert, lIdsToDelete, dOldIdToNewId )
390 self._tpa.deleteFromIdList( lIdsToDelete )
391 self._tpa.deleteFromIdList( lIdsToInsert )
392 self.insertPaths( lDirectPaths, lIdsToInsert, dOldIdToNewId )
393 self.insertPaths( lReversePaths, lIdsToInsert, dOldIdToNewId )
394 if self._verbose > 0:
395 msg = "end method 'mergeCoordsOnChunkOverlaps'"
396 sys.stdout.write( "%s\n" % ( msg ) )
397 sys.stdout.flush()
398
399
400 def saveChrCoordsAsFile( self, tmpPathTable, outFile ):
401 self._iDb.exportDataToFile( tmpPathTable, tmpPathTable, False )
402 self._iDb.dropTable( tmpPathTable )
403 if self._formatInData == "align":
404 PathUtils.convertPathFileIntoAlignFile( tmpPathTable, outFile )
405 os.remove( tmpPathTable )
406 elif self._formatInData == "path":
407 os.rename( tmpPathTable, outFile )
408
409
410 def saveChrCoordsAsTable( self, tmpPathTable, outTable ):
411 if self._formatInData == "align":
412 self._iDb.convertPathTableIntoAlignTable( tmpPathTable, outTable )
413 self._iDb.dropTable( tmpPathTable )
414 elif self._formatInData == "path":
415 self._iDb.renameTable( tmpPathTable, outTable )
416
417
418 ## Convert coordinates from chunks to chromosomes
419 #
420 def convertCoordinatesFromChunksToChromosomes( self ):
421 dChunks2CoordMaps = self.getChunkCoordsOnChromosomes()
422
423 if self._typeInData == "file":
424 tmpPathTable = self.convCoordsChkToChrFromFile( self._inData, self._formatInData, dChunks2CoordMaps )
425 elif self._typeInData == "table":
426 tmpPathTable = self.convCoordsChkToChrFromTable( self._inData, self._formatInData, dChunks2CoordMaps )
427
428 if self._mergeChunkOverlaps:
429 self.mergeCoordsOnChunkOverlaps( dChunks2CoordMaps, tmpPathTable );
430
431 if self._typeInData == "file":
432 self.saveChrCoordsAsFile( tmpPathTable, self._outData )
433 elif self._typeInData == "table":
434 self.saveChrCoordsAsTable( tmpPathTable, self._outData )
435
436
437 ## Convert coordinates from chromosomes to chunks
438 #
439 def convertCoordinatesFromChromosomesToChunks( self ):
440 msg = "ERROR: convert coordinates from chromosomes to chunks not yet available"
441 sys.stderr.write( "%s\n" % ( msg ) )
442 sys.exit(1)
443
444
445 ## Useful commands before running the program
446 #
447 def start( self ):
448 self.checkAttributes()
449 if self._verbose > 0:
450 msg = "START ConvCoord.py (%s)" % ( time.strftime("%m/%d/%Y %H:%M:%S") )
451 msg += "\ninput data: %s" % ( self._inData )
452 if self._typeInData == "file":
453 msg += " (file)\n"
454 else:
455 msg += " (table)\n"
456 msg += "format: %s\n" % ( self._formatInData )
457 msg += "coordinates to convert: %s\n" % ( self._coordToConvert )
458 msg += "mapping data: %s" % ( self._mapData )
459 if self._typeMapData == "file":
460 msg += " (file)\n"
461 else:
462 msg += " (table)\n"
463 if self._mergeChunkOverlaps:
464 msg += "merge chunk overlaps\n"
465 else:
466 msg += "don't merge chunk overlaps\n"
467 if self._convertChunks:
468 msg += "convert chunks to chromosomes\n"
469 else:
470 msg += "convert chromosomes to chunks\n"
471 msg += "output data: %s" % ( self._outData )
472 if self._typeInData == "file":
473 msg += " (file)\n"
474 else:
475 msg += " (table)\n"
476 sys.stdout.write( msg )
477
478
479 ## Useful commands before ending the program
480 #
481 def end( self ):
482 self._iDb.close()
483 if self._verbose > 0:
484 msg = "END ConvCoord.py (%s)" % ( time.strftime("%m/%d/%Y %H:%M:%S") )
485 sys.stdout.write( "%s\n" % ( msg ) )
486
487
488 ## Run the program
489 #
490 def run( self ):
491 self.start()
492
493 if self._convertChunks:
494 self.convertCoordinatesFromChunksToChromosomes()
495 else:
496 self.convertCoordinatesFromChromosomesToChunks()
497
498 self.end()
499
500
501 if __name__ == "__main__":
502 i = ConvCoord()
503 i.setAttributesFromCmdLine()
504 i.run()