6
|
1 #
|
|
2 # Copyright INRA-URGI 2009-2010
|
|
3 #
|
|
4 # This software is governed by the CeCILL license under French law and
|
|
5 # abiding by the rules of distribution of free software. You can use,
|
|
6 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
7 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
8 # "http://www.cecill.info".
|
|
9 #
|
|
10 # As a counterpart to the access to the source code and rights to copy,
|
|
11 # modify and redistribute granted by the license, users are provided only
|
|
12 # with a limited warranty and the software's author, the holder of the
|
|
13 # economic rights, and the successive licensors have only limited
|
|
14 # liability.
|
|
15 #
|
|
16 # In this respect, the user's attention is drawn to the risks associated
|
|
17 # with loading, using, modifying and/or developing or reproducing the
|
|
18 # software by the user in light of its specific status of free software,
|
|
19 # that may mean that it is complicated to manipulate, and that also
|
|
20 # therefore means that it is reserved for developers and experienced
|
|
21 # professionals having in-depth computer knowledge. Users are therefore
|
|
22 # encouraged to load and test the software's suitability as regards their
|
|
23 # requirements in conditions enabling the security of their systems and/or
|
|
24 # data to be ensured and, more generally, to use and operate it in the
|
|
25 # same conditions as regards security.
|
|
26 #
|
|
27 # The fact that you are presently reading this means that you have had
|
|
28 # knowledge of the CeCILL license and that you accept its terms.
|
|
29 #
|
|
30 import sys
|
|
31 import random
|
|
32 from SMART.Java.Python.misc import Utils
|
|
33 from SMART.Java.Python.structure.Transcript import Transcript
|
|
34 from SMART.Java.Python.structure.TranscriptList import TranscriptList
|
|
35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
|
|
36 from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
|
|
37 from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable
|
|
38 from SMART.Java.Python.misc.Progress import Progress
|
|
39 from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
|
|
40
|
|
41
|
|
42
|
|
43 class TranscriptListsComparator(object):
|
|
44 """
|
|
45 Compare two transcript lists, using a database for one of the list
|
|
46 Uses one TranscriptContainer for query data,
|
|
47 one TranscriptContainer exported to MySqlTranscriptTable for reference data,
|
|
48 one MySqlTranscriptTable for transformed reference data
|
|
49 @ivar inputTranscriptContainers: parsers to the list of query transcripts
|
|
50 @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>}
|
|
51 @ivar writer: transcript list writer
|
|
52 @type writer: class L{TranscriptListWriter<TranscriptListWriter>}
|
|
53 @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently)
|
|
54 @type mySqlConnection: class L{MySqlConnection<MySqlConnection>}
|
|
55 @ivar introns: compare transcripts or exons only
|
|
56 @type introns: list of 2 boolean
|
|
57 @ivar starts: restrict the query transcripts to first nucleotides
|
|
58 @type starts: list of 2 int or None
|
|
59 @ivar fivePrimes: extend a list of transcripts by their 5' end
|
|
60 @type fivePrimes: list of 2 int or None
|
|
61 @ivar threePrimes: extend a list of transcripts by their 3' end
|
|
62 @type threePrimes: list of 2 int or None
|
|
63 @ivar minDistance: min distance between two transcripts [default: 0]
|
|
64 @type minDistance: int
|
|
65 @ivar maxDistance: max distance between two transcripts [default: 0]
|
|
66 @type maxDistance: int
|
|
67 @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap
|
|
68 @type minOverlap: int
|
|
69 @ivar pcOverlap: percentage of overlapping nucleotides to declare an overlap
|
|
70 @type pcOverlap: int
|
|
71 @ivar upstreams: consider distances with elements which are upstream of the transcripts
|
|
72 @type upstreams: boolean
|
|
73 @ivar downstreams: consider distances with elements which are downstream of the transcripts
|
|
74 @type downstreams: boolean
|
|
75 @ivar colinear: whether transcripts should overlap in the same direction
|
|
76 @type colinear: boolean
|
|
77 @ivar antisense: whether transcripts should overlap in the opposite direction
|
|
78 @type antisense: boolean
|
|
79 @ivar outputDistance: output distance between query and reference instead of query transcript
|
|
80 @type outputDistance: boolean
|
|
81 @ivar absolute: do not consider the strand while computing distance
|
|
82 @type absolute: boolean
|
|
83 @ivar strandedDistance: return a line per strand while computing distances
|
|
84 @type strandedDistance: boolean
|
|
85 @ivar QUERY: constant specifying the query objects
|
|
86 @type QUERY: int
|
|
87 @ivar REFERENCE: constant specifying the reference objects
|
|
88 @type REFERENCE: int
|
|
89 @ivar INPUTTYPES: set of input types of data (query or reference) objects
|
|
90 @type INPUTTYPES: list of 2 int
|
|
91 @ivar typeToString: string representation of the previous types
|
|
92 @type typeToString: dict
|
|
93 @ivar tableNames: name of the transcript tables
|
|
94 @type tableNames: dict of strings
|
|
95 @ivar nbTranscripts: number of transcript in the query/reference set
|
|
96 @type nbTranscripts: list of 2 int or None
|
|
97 @ivar nbNucleotides: number of nucleotides in the query/reference set
|
|
98 @type nbNucleotides: list of 2 int or None
|
|
99 @ivar transcriptsToBeStored: transcripts that will be stored into database
|
|
100 @type transcriptsToBeStored: dict of class L{TranscriptList<TranscriptList>}
|
|
101 @ivar multiple: in merge mode, aggregate multiple transcripts
|
|
102 @type multiple: boolean
|
|
103 @ivar normalization: normalize each element by the number of mappings of this element
|
|
104 @type normalization: boolean
|
|
105 @ivar invert: invert the current comparison
|
|
106 @type invert: boolean
|
|
107 @ivar splitDifference: split into intervals when computing difference
|
|
108 @type splitDifference: boolean
|
|
109 @ivar odds: whether odds about the comparison should be computed
|
|
110 @type odds: boolean
|
|
111 @ivar overlapResults: count the number of overlaps
|
|
112 @type overlapResults: dictionary
|
|
113 @ivar oddResults: compute the number of times each transcript overlaps (or is merged with) another one
|
|
114 @type oddResults: dictionary
|
|
115 @ivar outputContainer: container of the output transcripts
|
|
116 @type outputContainer: class L{TranscriptContainer<TranscriptContainer>}
|
|
117 @ivar logHandle: log handle
|
|
118 @type logHandle: file
|
|
119 @ivar verbosity: verbosity
|
|
120 @type verbosity: int
|
|
121 """
|
|
122
|
|
123 def __init__(self, logHandle = None, verbosity = 0):
|
|
124 """
|
|
125 Constructor
|
|
126 @param transcriptListParser2: parser to the list of reference transcripts
|
|
127 @type transcriptListParser2: class L{TranscriptListParser<TranscriptListParser>}
|
|
128 @param logHandle: log handle
|
|
129 @type logHandle: file
|
|
130 @param verbosity: verbosity
|
|
131 @type verbosity: int
|
|
132 """
|
|
133 self.QUERY = 0
|
|
134 self.REFERENCE = 1
|
|
135 self.WORKING = 2
|
|
136 self.INPUTTYPES = (self.QUERY, self.REFERENCE)
|
|
137 self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING)
|
|
138 self.typeToString = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"}
|
|
139
|
|
140 self.logHandle = logHandle
|
|
141 self.verbosity = verbosity
|
|
142 self.mySqlConnection = MySqlConnection(self.verbosity-1)
|
|
143 self.inputTranscriptContainers = [None, None]
|
|
144 self.tableNames = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))]
|
|
145 self.mySqlTranscriptWriters = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames]
|
|
146 self.writer = None
|
|
147 self.introns = [False, False]
|
|
148 self.starts = [None, None]
|
|
149 self.ends = [None, None]
|
|
150 self.fivePrimes = [None, None]
|
|
151 self.threePrimes = [None, None]
|
|
152 self.minDistance = None
|
|
153 self.maxDistance = 0
|
|
154 self.minOverlap = 1
|
|
155 self.pcOverlap = None
|
|
156 self.colinear = False
|
|
157 self.antisense = False
|
|
158 self.downstreams = [False, False]
|
|
159 self.upstreams = [False, False]
|
|
160 self.outputDistance = False
|
|
161 self.absolute = False
|
|
162 self.strandedDistance = False
|
|
163 self.nbTranscripts = [None, None]
|
|
164 self.nbNucleotides = [None, None]
|
|
165 self.normalization = False
|
|
166 self.included = False
|
|
167 self.including = False
|
|
168 self.invert = False
|
|
169 self.notOverlapping = False
|
|
170 self.splitDifference = False
|
|
171 self.multiple = False
|
|
172 self.odds = False
|
|
173 self.overlapResults = None
|
|
174 self.oddResults = None
|
|
175 self.outputContainer = None
|
|
176 self.transcriptsToBeStored = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES])
|
|
177 self.nbPrinted = 0
|
|
178
|
|
179 self.mySqlConnection.createDatabase()
|
|
180
|
|
181
|
|
182 def __del__(self):
|
|
183 """
|
|
184 Destructor
|
|
185 Remove all temporary tables
|
|
186 """
|
|
187 for type in self.INPUTWORKINGTYPES:
|
|
188 self.mySqlTranscriptWriters[type].removeTables()
|
|
189 self.mySqlConnection.deleteDatabase()
|
|
190
|
|
191 def acceptIntrons(self, type, bool):
|
|
192 """
|
|
193 Compare transcripts or exons only
|
|
194 @param type: whether use query/reference data
|
|
195 @type type: int
|
|
196 @param bool: include introns or not
|
|
197 @type bool: boolean
|
|
198 """
|
|
199 self.introns[type] = bool
|
|
200
|
|
201
|
|
202 def restrictToStart(self, type, size):
|
|
203 """
|
|
204 Restrict a list of transcripts to first nucleotides
|
|
205 @param type: whether use query/reference data
|
|
206 @type type: int
|
|
207 @param size: the size of the transcript to be considered
|
|
208 @type size: int
|
|
209 """
|
|
210 self.starts[type] = size
|
|
211 self.introns[type] = False
|
|
212
|
|
213
|
|
214 def restrictToEnd(self, type, size):
|
|
215 """
|
|
216 Restrict a list of transcripts to first nucleotides
|
|
217 @param type: whether use query/reference data
|
|
218 @type type: int
|
|
219 @param size: the size of the transcript to be considered
|
|
220 @type size: int
|
|
221 """
|
|
222 self.ends[type] = size
|
|
223 self.introns[type] = False
|
|
224
|
|
225
|
|
226 def extendFivePrime(self, type, size):
|
|
227 """
|
|
228 Extend a list of transcripts by their 5' end
|
|
229 @param type: whether use query/reference data
|
|
230 @type type: int
|
|
231 @param size: size of the extension
|
|
232 @type size: int
|
|
233 """
|
|
234 self.fivePrimes[type] = size
|
|
235
|
|
236
|
|
237 def extendThreePrime(self, type, size):
|
|
238 """
|
|
239 Extend the list of query transcripts by their 3' end
|
|
240 @param type: whether use query/reference data
|
|
241 @type type: int
|
|
242 @param size: size of the extension
|
|
243 @type size: int
|
|
244 """
|
|
245 self.threePrimes[type] = size
|
|
246
|
|
247
|
|
248 def setMinDistance(self, distance):
|
|
249 """
|
|
250 Set the min distance between two transcripts
|
|
251 @param distance: distance
|
|
252 @type distance: int
|
|
253 """
|
|
254 self.minDistance = distance
|
|
255
|
|
256
|
|
257 def setMaxDistance(self, distance):
|
|
258 """
|
|
259 Set the max distance between two transcripts
|
|
260 @param distance: distance
|
|
261 @type distance: int
|
|
262 """
|
|
263 self.maxDistance = distance
|
|
264
|
|
265
|
|
266 def setMinOverlap(self, overlap):
|
|
267 """
|
|
268 Set the minimum number of nucleotides to declare an overlap
|
|
269 @param overlap: minimum number of nucleotides
|
|
270 @type overlap: int
|
|
271 """
|
|
272 self.minOverlap = overlap
|
|
273
|
|
274
|
|
275 def setPcOverlap(self, overlap):
|
|
276 """
|
|
277 Set the percentage of nucleotides to declare an overlap
|
|
278 @param overlap: percentage of nucleotides
|
|
279 @type overlap: int
|
|
280 """
|
|
281 self.pcOverlap = overlap
|
|
282
|
|
283
|
|
284 def setUpstream(self, type, boolean):
|
|
285 """
|
|
286 Consider transcripts which are upstream of some transcripts
|
|
287 @param type: whether use query/reference data
|
|
288 @type type: int
|
|
289 @param boolean: consider only these transcripts or not
|
|
290 @type boolean: boolean
|
|
291 """
|
|
292 self.upstreams[type] = boolean
|
|
293
|
|
294
|
|
295 def setDownstream(self, type, boolean):
|
|
296 """
|
|
297 Consider transcripts which are downstream of some transcripts
|
|
298 @param type: whether use query/reference data
|
|
299 @type type: int
|
|
300 @param boolean: consider only these transcripts or not
|
|
301 @type boolean: boolean
|
|
302 """
|
|
303 self.downstreams[type] = boolean
|
|
304
|
|
305
|
|
306 def setOutputDistance(self, boolean):
|
|
307 """
|
|
308 Output distance between query and reference instead of query transcript
|
|
309 @param boolean: whether distance should be output
|
|
310 @type boolean: boolean
|
|
311 """
|
|
312 self.outputDistance = boolean
|
|
313
|
|
314
|
|
315 def setAbsolute(self, boolean):
|
|
316 """
|
|
317 Do not consider strand when computing distance (thus, having only non-negative values)
|
|
318 @param boolean: whether we should consider strands
|
|
319 @type boolean: boolean
|
|
320 """
|
|
321 self.absolute = boolean
|
|
322
|
|
323
|
|
324 def setStrandedDistance(self, boolean):
|
|
325 """
|
|
326 Return two distance distributions, one per strand
|
|
327 @param boolean: whether we should return 2 distance distance
|
|
328 @type boolean: boolean
|
|
329 """
|
|
330 self.strandedDistance = boolean
|
|
331
|
|
332
|
|
333 def getColinearOnly(self, boolean):
|
|
334 """
|
|
335 Only consider transcripts that overlap in the same direction
|
|
336 @param boolean: whether transcripts should overlap in the same direction
|
|
337 @type boolean: boolean
|
|
338 """
|
|
339 self.colinear = boolean
|
|
340
|
|
341
|
|
342 def getAntisenseOnly(self, boolean):
|
|
343 """
|
|
344 Only consider transcripts that overlap in the opposite direction
|
|
345 @param boolean: whether transcripts should overlap in the opposite direction
|
|
346 @type boolean: boolean
|
|
347 """
|
|
348 self.antisense = boolean
|
|
349
|
|
350
|
|
351 def setIncludedOnly(self, boolean):
|
|
352 """
|
|
353 Keep the elements from first set which are included in the second set
|
|
354 @param boolean: whether to keep included elements only
|
|
355 @type boolean: boolean
|
|
356 """
|
|
357 self.included = boolean
|
|
358
|
|
359
|
|
360 def setIncludingOnly(self, boolean):
|
|
361 """
|
|
362 Keep the elements from second set which are included in the first set
|
|
363 @param boolean: whether to keep included elements only
|
|
364 @type boolean: boolean
|
|
365 """
|
|
366 self.including = boolean
|
|
367
|
|
368
|
|
369 def setNormalization(self, boolean):
|
|
370 """
|
|
371 Normalize the elements by the number of mappings in the genome
|
|
372 @param boolean: whether normalize
|
|
373 @type boolean: boolean
|
|
374 """
|
|
375 self.normalization = boolean
|
|
376
|
|
377
|
|
378 def getInvert(self, boolean):
|
|
379 """
|
|
380 Only consider transcripts that do not overlap
|
|
381 @param boolean: whether invert the selection
|
|
382 @type boolean: boolean
|
|
383 """
|
|
384 self.invert = boolean
|
|
385
|
|
386
|
|
387 def includeNotOverlapping(self, boolean):
|
|
388 """
|
|
389 Also output the elements which do not overlap
|
|
390 @param boolean: whether output the elements which do not overlap
|
|
391 @type boolean: boolean
|
|
392 """
|
|
393 self.notOverlapping = boolean
|
|
394
|
|
395
|
|
396 def setSplitDifference(self, boolean):
|
|
397 """
|
|
398 Split into intervals when computing difference
|
|
399 @param boolean: whether to split
|
|
400 @type boolean: boolean
|
|
401 """
|
|
402 self.splitDifference = boolean
|
|
403
|
|
404
|
|
405 def aggregate(self, boolean):
|
|
406 """
|
|
407 In merge mode, aggregate multiple transcripts
|
|
408 @param boolean: aggregate multiple transcripts
|
|
409 @type boolean: boolean
|
|
410 """
|
|
411 self.multiple = boolean
|
|
412
|
|
413
|
|
414 def getTables(self, type):
|
|
415 """
|
|
416 Get the SQL tables
|
|
417 @param type: type of the table (query, reference, etc.)
|
|
418 @type type: int
|
|
419 """
|
|
420 return self.mySqlTranscriptWriters[type].getTables()
|
|
421
|
|
422
|
|
423 def computeOdds(self, boolean):
|
|
424 """
|
|
425 Compute odds
|
|
426 @param boolean: whether odds should be computed
|
|
427 @type boolean: boolean
|
|
428 """
|
|
429 self.odds = boolean
|
|
430 if self.odds:
|
|
431 self.overlapResults = dict()
|
|
432
|
|
433
|
|
434 def computeOddsPerTranscript(self, boolean):
|
|
435 """
|
|
436 Compute odds for each transcript
|
|
437 @param boolean: whether odds for each transcript should be computed
|
|
438 @type boolean: boolean
|
|
439 """
|
|
440 self.odds = boolean
|
|
441 if self.odds:
|
|
442 self.overlapResults = dict()
|
|
443
|
|
444
|
|
445 def removeTables(self):
|
|
446 """
|
|
447 Remove the temporary MySQL tables
|
|
448 """
|
|
449 for type in self.INPUTWORKINGTYPES:
|
|
450 for chromosome in self.getTables(type):
|
|
451 self.getTables(type)[chromosome].remove()
|
|
452
|
|
453
|
|
454 def clearTables(self):
|
|
455 """
|
|
456 Empty the content of the databases
|
|
457 """
|
|
458 for type in self.INPUTWORKINGTYPES:
|
|
459 if self.transcriptListParsers[type] != None:
|
|
460 for chromosome in self.getTables(type):
|
|
461 self.getTables(type)[chromosome].clear()
|
|
462
|
|
463
|
|
464 def extendTranscript(self, type, transcript):
|
|
465 """
|
|
466 Extend a transcript corresponding to the parameters of the class
|
|
467 @param transcript: a transcript
|
|
468 @type transcript: class L{Transcript<Transcript>}
|
|
469 @return: the possibly extended transcript
|
|
470 """
|
|
471 extendedTranscript = Transcript()
|
|
472 extendedTranscript.copy(transcript)
|
|
473 if self.starts[type] != None:
|
|
474 extendedTranscript.restrictStart(self.starts[type])
|
|
475 if self.ends[type] != None:
|
|
476 extendedTranscript.restrictEnd(self.ends[type])
|
|
477 if self.fivePrimes[type] != None:
|
|
478 extendedTranscript.extendStart(self.fivePrimes[type])
|
|
479 if self.threePrimes[type] != None:
|
|
480 extendedTranscript.extendEnd(self.threePrimes[type])
|
|
481 return extendedTranscript
|
|
482
|
|
483
|
|
484 def storeTranscript(self, type, transcript, now = True):
|
|
485 """
|
|
486 Add a transcript to a MySQL database, or postpone the store
|
|
487 @param type: whether use query/reference table
|
|
488 @type type: int
|
|
489 @param transcript: a transcript
|
|
490 @type transcript: class L{Transcript<Transcript>}
|
|
491 @param now: whether transcript should be stored now (or stored can be postponed)
|
|
492 @type now: bool
|
|
493 """
|
|
494 self.mySqlTranscriptWriters[type].addTranscript(transcript)
|
|
495 if type == self.REFERENCE:
|
|
496 self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript)
|
|
497 if now:
|
|
498 self.mySqlTranscriptWriters[type].write()
|
|
499 if type == self.REFERENCE:
|
|
500 self.mySqlTranscriptWriters[self.WORKING].write()
|
|
501
|
|
502
|
|
503 def writeTranscript(self, transcript):
|
|
504 """
|
|
505 Write a transcript in the output file
|
|
506 @param transcript: a transcript
|
|
507 @type transcript: class L{Transcript<Transcript>}
|
|
508 """
|
|
509 if self.writer != None:
|
|
510 self.writer.addTranscript(transcript)
|
|
511 self.nbPrinted += 1
|
|
512
|
|
513
|
|
514 def flushData(self, type = None):
|
|
515 """
|
|
516 Store the remaining transcripts
|
|
517 @param type: whether use query/reference table (None for all)
|
|
518 @type type: int or None
|
|
519 """
|
|
520 if type == None:
|
|
521 types = self.INPUTWORKINGTYPES
|
|
522 else:
|
|
523 types = [type]
|
|
524 for type in types:
|
|
525 self.mySqlTranscriptWriters[type].write()
|
|
526 if self.writer != None:
|
|
527 self.writer.write()
|
|
528
|
|
529
|
|
530 def unstoreTranscript(self, type, transcript):
|
|
531 """
|
|
532 Remove a transcript from a MySQL database
|
|
533 @param type: whether use query/reference table
|
|
534 @type type: int
|
|
535 @param transcript: a transcript
|
|
536 @type transcript: class L{Transcript<Transcript>}
|
|
537 """
|
|
538 self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript)
|
|
539 if type == self.REFERENCE:
|
|
540 self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript)
|
|
541
|
|
542
|
|
543 def addIndexes(self, tables):
|
|
544 """
|
|
545 Add useful indexes to the tables
|
|
546 @param tables: which tables should be indexed
|
|
547 @type tables: list of int
|
|
548 """
|
|
549 for type in tables:
|
|
550 for chromosome in self.getTables(type):
|
|
551 self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"])
|
|
552 self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"])
|
|
553
|
|
554
|
|
555 def storeTranscriptList(self, type, transcriptListParser, extension):
|
|
556 """
|
|
557 Store a transcript list into database
|
|
558 @param type: whether use query/reference parser
|
|
559 @type type: int
|
|
560 @param parser: a parser of transcript list
|
|
561 @type parser: class L{TranscriptContainer<TranscriptContainer>}
|
|
562 @param extension: extend (or not) the transcripts
|
|
563 @type extension: boolean
|
|
564 """
|
|
565 progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1)
|
|
566 for transcript in transcriptListParser.getIterator():
|
|
567 if extension:
|
|
568 transcript = self.extendTranscript(type, transcript)
|
|
569 self.mySqlTranscriptWriters[type].addTranscript(transcript)
|
|
570 progress.inc()
|
|
571 self.mySqlTranscriptWriters[type].write()
|
|
572 progress.done()
|
|
573 if type == self.REFERENCE:
|
|
574 for chromosome in self.getTables(self.REFERENCE):
|
|
575 self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1)
|
|
576 self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome])
|
|
577
|
|
578
|
|
579 def setInputTranscriptContainer(self, type, inputTranscriptContainer):
|
|
580 """
|
|
581 Set an input transcript list container
|
|
582 @param type: whether use query/reference parser
|
|
583 @type type: int
|
|
584 @param inputTranscriptContainer: a container
|
|
585 @type inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>}
|
|
586 """
|
|
587 self.inputTranscriptContainers[type] = inputTranscriptContainer
|
|
588 self.nbTranscripts[type] = self.inputTranscriptContainers[type].getNbTranscripts()
|
|
589 self.nbNucleotides[type] = self.inputTranscriptContainers[type].getNbNucleotides()
|
|
590
|
|
591
|
|
592 def setOutputWriter(self, writer):
|
|
593 """
|
|
594 Set an output transcript list writer
|
|
595 @param writer: a writer
|
|
596 @type writer: class L{TranscriptListWriter<TranscriptListWriter>}
|
|
597 """
|
|
598 self.writer = writer
|
|
599
|
|
600
|
|
601 def compareTranscript(self, transcript1, transcript2, includeDistance = False):
|
|
602 """
|
|
603 Compare two transcripts, using user defined parameters
|
|
604 @param transcript1: a transcript from the query set (already extended)
|
|
605 @type transcript1: class L{Transcript<Transcript>}
|
|
606 @param transcript2: a transcript from the reference set (already extended)
|
|
607 @type transcript2: class L{Transcript<Transcript>}
|
|
608 @param includeDistance: take into account the distance too
|
|
609 @type includeDistance: boolean
|
|
610 @return: true, if they overlap
|
|
611 """
|
|
612 extendedTranscript1 = Transcript()
|
|
613 extendedTranscript1.copy(transcript1)
|
|
614 if includeDistance:
|
|
615 if self.maxDistance > 0:
|
|
616 extendedTranscript1.extendStart(self.maxDistance)
|
|
617 extendedTranscript1.extendEnd(self.maxDistance)
|
|
618
|
|
619 minOverlap = self.minOverlap
|
|
620 if self.pcOverlap != None:
|
|
621 minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap)
|
|
622 if not extendedTranscript1.overlapWith(transcript2, self.minOverlap):
|
|
623 return False
|
|
624 if (self.downstreams[self.QUERY] and transcript2.getStart() > extendedTranscript1.getStart()) or \
|
|
625 (self.upstreams[self.QUERY] and transcript2.getEnd() < extendedTranscript1.getEnd()) or \
|
|
626 (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \
|
|
627 (self.upstreams[self.REFERENCE] and extendedTranscript1.getEnd() < transcript2.getEnd()):
|
|
628 return False
|
|
629 if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != transcript2.getDirection()):
|
|
630 return False
|
|
631 if self.included and not transcript2.include(extendedTranscript1):
|
|
632 return False
|
|
633 if self.including and not extendedTranscript1.include(transcript2):
|
|
634 return False
|
|
635 if self.introns[self.REFERENCE] and self.introns[self.QUERY]:
|
|
636 if self.logHandle != None:
|
|
637 self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2)))
|
|
638 return True
|
|
639 if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap):
|
|
640 if self.logHandle != None:
|
|
641 self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2)))
|
|
642 return True
|
|
643 return False
|
|
644
|
|
645
|
|
646 def compareTranscriptToList(self, transcript1):
|
|
647 """
|
|
648 Compare a transcript to the reference list of transcripts
|
|
649 (Do not extend the transcripts, except for the distance)
|
|
650 @param transcript1: a transcript (from the query set)
|
|
651 @type transcript1: class L{Transcript<Transcript>}
|
|
652 @return: the reference transcripts overlapping
|
|
653 """
|
|
654 # no transcript in the reference table
|
|
655 if transcript1.getChromosome() not in self.getTables(self.WORKING):
|
|
656 return
|
|
657
|
|
658 # retrieve the the transcripts that may overlap in the working tables
|
|
659 clauses = []
|
|
660 extendedTranscript1 = Transcript()
|
|
661 extendedTranscript1.copy(transcript1)
|
|
662 if self.maxDistance > 0:
|
|
663 extendedTranscript1.extendStart(self.maxDistance)
|
|
664 if self.maxDistance > 0:
|
|
665 extendedTranscript1.extendEnd(self.maxDistance)
|
|
666 command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName())
|
|
667 for binPair in extendedTranscript1.getBins():
|
|
668 clause = "bin "
|
|
669 if binPair[0] == binPair[1]:
|
|
670 clause += "= %i" % (binPair[0])
|
|
671 else:
|
|
672 clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1])
|
|
673 clauses.append(clause)
|
|
674 command += " OR ".join(clauses)
|
|
675 command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart())
|
|
676
|
|
677 for index2, transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command):
|
|
678 if self.compareTranscript(extendedTranscript1, transcript2):
|
|
679 yield transcript2
|
|
680
|
|
681
|
|
682 def compareTranscriptList(self):
|
|
683 """
|
|
684 Compare a list of transcript to the reference one
|
|
685 @return: the transcripts that overlap with the reference set
|
|
686 """
|
|
687 distance = 0
|
|
688 nbClustersIn = 0
|
|
689 nbClustersOut = 0
|
|
690 if self.maxDistance != None:
|
|
691 distance = self.maxDistance
|
|
692
|
|
693 self.addIndexes([self.QUERY, self.REFERENCE])
|
|
694
|
|
695 # export the container into tables
|
|
696 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
|
|
697 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
|
|
698
|
|
699 # looping
|
|
700 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
|
|
701 # get range of transcripts
|
|
702 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
|
|
703 query = self.mySqlConnection.executeQuery(command)
|
|
704 result = query.getLine()
|
|
705 first = result[0]
|
|
706 last = result[1]
|
|
707 nb = result[2]
|
|
708
|
|
709 transcripts1 = []
|
|
710 toBeRemoved1 = []
|
|
711 transcripts2 = []
|
|
712 toBeRemoved2 = []
|
|
713 overlapsWith = []
|
|
714 nbOverlaps = []
|
|
715 nbChunks = max(1, nb / 100)
|
|
716 chunkSize = (last - first) / nbChunks
|
|
717 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
|
|
718 for chunk in range(nbChunks + 1):
|
|
719
|
|
720 # load transcripts
|
|
721 start = first + chunk * chunkSize
|
|
722 end = start + chunkSize - 1
|
|
723 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
|
|
724 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
|
|
725 transcripts1.append(transcript1)
|
|
726 overlapsWith.append([])
|
|
727 nbOverlaps.append(0)
|
|
728 nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
|
|
729 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
|
|
730 self.mySqlConnection.executeQuery(command)
|
|
731 if chromosome1 in self.getTables(self.REFERENCE):
|
|
732 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
|
|
733 if chunk == 0:
|
|
734 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
|
|
735 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
|
|
736 transcripts2.append(transcript2)
|
|
737 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
|
|
738 self.mySqlConnection.executeQuery(command)
|
|
739
|
|
740 # compare sets
|
|
741 for index1, transcript1 in enumerate(transcripts1):
|
|
742 overlappingNames = []
|
|
743 nbElements1 = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
|
|
744 for transcript2 in transcripts2:
|
|
745 if self.compareTranscript(transcript1, transcript2, True):
|
|
746 id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
|
|
747 if id2 not in overlapsWith[index1]:
|
|
748 overlapsWith[index1].append(id2)
|
|
749 nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements")
|
|
750 if self.odds:
|
|
751 if transcript2.getName() not in self.overlapResults:
|
|
752 self.overlapResults[transcript2.getName()] = 0
|
|
753 self.overlapResults[transcript2.getName()] += nbElements1
|
|
754
|
|
755 # check if query transcript extends bounds of the chunk
|
|
756 if transcript1.getEnd() < end:
|
|
757 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
|
|
758 if overlapsWith[index1]:
|
|
759 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
|
|
760 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
|
|
761 elif self.notOverlapping:
|
|
762 transcript1.setTagValue("nbOverlaps", "0")
|
|
763 self.writeTranscript(transcript1)
|
|
764 nbClustersOut += nbElements1
|
|
765 toBeRemoved1.append(index1)
|
|
766
|
|
767 # update list of query transcripts
|
|
768 for index1 in reversed(toBeRemoved1):
|
|
769 del transcripts1[index1]
|
|
770 del overlapsWith[index1]
|
|
771 del nbOverlaps[index1]
|
|
772 toBeRemoved1 = []
|
|
773
|
|
774 # check if the reference transcripts extends bounds of the chunk
|
|
775 for index2, transcript2 in enumerate(transcripts2):
|
|
776 if transcript2.getEnd() + distance < end:
|
|
777 toBeRemoved2.append(index2)
|
|
778 for index2 in reversed(toBeRemoved2):
|
|
779 del transcripts2[index2]
|
|
780 toBeRemoved2 = []
|
|
781
|
|
782 progress.inc()
|
|
783
|
|
784 for index1, transcript1 in enumerate(transcripts1):
|
|
785 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
|
|
786 if overlapsWith[index1]:
|
|
787 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
|
|
788 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
|
|
789 elif self.notOverlapping:
|
|
790 transcript1.setTagValue("nbOverlaps", "0")
|
|
791 self.writeTranscript(transcript1)
|
|
792 nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
|
|
793 progress.done()
|
|
794 self.getTables(self.QUERY)[chromosome1].remove()
|
|
795 if chromosome1 in self.getTables(self.REFERENCE):
|
|
796 self.getTables(self.REFERENCE)[chromosome1].remove()
|
|
797 self.getTables(self.WORKING)[chromosome1].remove()
|
|
798
|
|
799 self.flushData()
|
|
800 if self.writer != None:
|
|
801 self.writer.close()
|
|
802 self.writer = None
|
|
803
|
|
804 if self.verbosity > 0:
|
|
805 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
|
|
806 print "query: %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn)
|
|
807 if self.nbTranscripts[self.QUERY] != 0:
|
|
808 print "output: %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100),
|
|
809 if nbClustersOut != 0:
|
|
810 print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100)
|
|
811
|
|
812
|
|
813 def compareTranscriptListDistance(self):
|
|
814 """
|
|
815 Compare a list of transcript to the reference one
|
|
816 @return: the distance distributions in a hash
|
|
817 """
|
|
818 nbDistances = 0
|
|
819 distances = {}
|
|
820 absDistances = {}
|
|
821 strandedDistances = dict([(strand, {}) for strand in (1, -1)])
|
|
822
|
|
823 # export the container into tables
|
|
824 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
|
|
825 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
|
|
826
|
|
827 progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1)
|
|
828 for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():
|
|
829 # get the distance
|
|
830 transcript1 = self.extendTranscript(self.QUERY, transcript1)
|
|
831 distance = self.maxDistance + 1
|
|
832 strand = None
|
|
833 closestElement = "None"
|
|
834 for transcript2 in self.compareTranscriptToList(transcript1):
|
|
835 thisStrand = transcript1.getDirection() * transcript2.getDirection()
|
|
836 if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()):
|
|
837 transcript2.reverse()
|
|
838 if self.absolute:
|
|
839 transcript2.setDirection(transcript1.getDirection())
|
|
840 if transcript2.getDirection() == transcript1.getDirection():
|
|
841 if self.starts[self.REFERENCE] != None:
|
|
842 transcript2.restrictStart(self.starts[self.REFERENCE])
|
|
843 if self.ends[self.REFERENCE] != None:
|
|
844 transcript2.restrictEnd(self.ends[self.REFERENCE])
|
|
845 thisDistance = transcript1.getRelativeDistance(transcript2)
|
|
846 if (self.absolute):
|
|
847 thisDistance = abs(thisDistance)
|
|
848 if abs(thisDistance) < abs(distance):
|
|
849 distance = thisDistance
|
|
850 strand = thisStrand
|
|
851 closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
|
|
852 if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance):
|
|
853 nbDistances += 1
|
|
854 distances[distance] = distances.get(distance, 0) + 1
|
|
855 absDistance = abs(distance)
|
|
856 absDistances[absDistance] = absDistances.get(absDistance, 0) + 1
|
|
857 strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0)
|
|
858 if distance not in strandedDistances[-strand]:
|
|
859 strandedDistances[-strand][distance] = 0
|
|
860
|
|
861 # write transcript
|
|
862 if distance == self.maxDistance + 1:
|
|
863 distance = "None"
|
|
864 tmpTranscript = Transcript()
|
|
865 tmpTranscript.copy(transcript1)
|
|
866 tmpTranscript.setTagValue("distance", distance)
|
|
867 tmpTranscript.setTagValue("closestElement", closestElement)
|
|
868 self.writeTranscript(tmpTranscript)
|
|
869
|
|
870 progress.inc()
|
|
871 progress.done()
|
|
872
|
|
873 self.flushData()
|
|
874
|
|
875 if self.verbosity > 0:
|
|
876 print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE])
|
|
877 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
|
|
878 if nbDistances == 0:
|
|
879 print "Nothing matches"
|
|
880 else:
|
|
881 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances)
|
|
882 print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100)
|
|
883
|
|
884 if self.strandedDistance:
|
|
885 return strandedDistances
|
|
886 return distances
|
|
887
|
|
888
|
|
889 def compareTranscriptListMerge(self):
|
|
890 """
|
|
891 Merge the query list of transcript with itself
|
|
892 @return: the merged transcripts in a transcript list database
|
|
893 """
|
|
894 nbMerges = 0
|
|
895
|
|
896 for type in (self.QUERY, self.REFERENCE):
|
|
897 self.storeTranscriptList(type, self.inputTranscriptContainers[type], True)
|
|
898 self.flushData()
|
|
899
|
|
900 # Loop on the chromosomes
|
|
901 for chromosome in sorted(self.getTables(self.QUERY).keys()):
|
|
902 if chromosome not in self.getTables(self.REFERENCE):
|
|
903 continue
|
|
904
|
|
905 # Get the size of the chromosome
|
|
906 maxEnd = 0
|
|
907 nbChunks = 0
|
|
908 for type in (self.QUERY, self.REFERENCE):
|
|
909 command = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName())
|
|
910 query = self.mySqlConnection.executeQuery(command)
|
|
911 maxEnd = max(maxEnd, int(query.getLine()[0]))
|
|
912 nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements())
|
|
913
|
|
914 mergedTranscripts = {}
|
|
915 transcripts = {self.QUERY: [], self.REFERENCE: []}
|
|
916 progress = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1)
|
|
917 for i in range(nbChunks):
|
|
918 rangeStart = int(i * (float(maxEnd) / nbChunks)) + 1
|
|
919 rangeEnd = int((i+1) * (float(maxEnd) / nbChunks))
|
|
920
|
|
921 # Get all transcripts in query and reference from chunk
|
|
922 for type in (self.QUERY, self.REFERENCE):
|
|
923 correction = 0 if self.QUERY else self.maxDistance
|
|
924 command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction)
|
|
925 for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command):
|
|
926 transcripts[type].append(transcript)
|
|
927
|
|
928 # Merge elements between the two samples
|
|
929 for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]):
|
|
930 for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]):
|
|
931 if referenceTranscript == None: continue
|
|
932 if self.compareTranscript(queryTranscript, referenceTranscript, True):
|
|
933 if queryTranscript.getDirection() != referenceTranscript.getDirection():
|
|
934 referenceTranscript.setDirection(queryTranscript.getDirection())
|
|
935 queryTranscript.merge(referenceTranscript, self.normalization)
|
|
936 nbMerges += 1
|
|
937 transcripts[self.REFERENCE][iReference] = None
|
|
938 if not self.multiple:
|
|
939 mergedTranscripts[iQuery] = 0
|
|
940
|
|
941 # Remove transcripts from database
|
|
942 for type in (self.QUERY, self.REFERENCE):
|
|
943 correction = 0 if self.QUERY else self.maxDistance
|
|
944 command = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction)
|
|
945 query = self.mySqlConnection.executeQuery(command)
|
|
946
|
|
947 # Just in case, self-merge the elements in the query (beware of mergedTranscripts!)
|
|
948 if (self.multiple):
|
|
949 for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]):
|
|
950 if queryTranscript1 == None: continue
|
|
951 for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]):
|
|
952 if iQuery2 <= iQuery1 or queryTranscript2 == None: continue
|
|
953 minOverlap = self.minOverlap
|
|
954 if self.pcOverlap != None:
|
|
955 minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap)
|
|
956 if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear):
|
|
957 if queryTranscript1.getDirection() != queryTranscript2.getDirection():
|
|
958 queryTranscript2.setDirection(queryTranscript1.getDirection())
|
|
959 queryTranscript1.merge(queryTranscript2, self.normalization)
|
|
960 transcripts[self.QUERY][iQuery2] = None
|
|
961 nbMerges += 1
|
|
962 if not self.multiple:
|
|
963 mergedTranscripts[iQuery1] = 0
|
|
964
|
|
965 # Update the sets of transcripts and write into database (also update mergedTranscripts)
|
|
966 newTranscripts = {self.QUERY: [], self.REFERENCE: []}
|
|
967 newMergedTranscripts = {}
|
|
968 for type in (self.QUERY, self.REFERENCE):
|
|
969 for i, transcript in enumerate(transcripts[type]):
|
|
970 if transcript == None: continue
|
|
971 correction = 0 if self.QUERY else self.maxDistance
|
|
972 if transcript.getEnd() < rangeEnd - correction:
|
|
973 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)):
|
|
974 self.writeTranscript(transcripts[type][i])
|
|
975 else:
|
|
976 if type == self.QUERY and i in mergedTranscripts:
|
|
977 newMergedTranscripts[len(newTranscripts[type])] = 0
|
|
978 newTranscripts[type].append(transcript)
|
|
979 transcripts = newTranscripts
|
|
980 mergedTranscripts = newMergedTranscripts
|
|
981
|
|
982 progress.inc()
|
|
983 progress.done()
|
|
984
|
|
985 for type in (self.QUERY, self.REFERENCE):
|
|
986 for i, transcript in enumerate(transcripts[type]):
|
|
987 if transcripts == None: continue
|
|
988 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)):
|
|
989 self.writeTranscript(transcripts[type][i])
|
|
990
|
|
991 # Manage chromosomes with no corresponding data
|
|
992 if self.multiple:
|
|
993 for type in self.INPUTTYPES:
|
|
994 for chromosome in self.getTables(type):
|
|
995 if chromosome in self.getTables(1 - type):
|
|
996 continue
|
|
997 for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator():
|
|
998 self.writeTranscript(transcript)
|
|
999
|
|
1000 self.flushData()
|
|
1001 if self.writer != None:
|
|
1002 self.writer.close()
|
|
1003 self.writer = None
|
|
1004
|
|
1005 if self.verbosity > 0:
|
|
1006 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
|
|
1007 print "# merges: %d" % (nbMerges)
|
|
1008 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
|
|
1009
|
|
1010
|
|
1011 def compareTranscriptListSelfMerge(self):
|
|
1012 """
|
|
1013 Merge the query list of transcript with itself
|
|
1014 @return: the merged transcripts in a transcript list database
|
|
1015 """
|
|
1016 nbMerges = 0
|
|
1017 distance = self.maxDistance if self.maxDistance != None else 0
|
|
1018
|
|
1019 self.addIndexes([self.QUERY])
|
|
1020 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
|
|
1021 self.flushData()
|
|
1022
|
|
1023 # looping
|
|
1024 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
|
|
1025 transcripts2 = []
|
|
1026
|
|
1027 # get range of transcripts
|
|
1028 progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
|
|
1029 command = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName())
|
|
1030 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
|
|
1031
|
|
1032 # compare sets
|
|
1033 toBeRemoved = set()
|
|
1034 toBePrinted = set()
|
|
1035 for index2, transcript2 in enumerate(transcripts2):
|
|
1036
|
|
1037 if self.compareTranscript(transcript1, transcript2, True):
|
|
1038 if transcript1.getDirection() != transcript2.getDirection():
|
|
1039 transcript2.setDirection(transcript1.getDirection())
|
|
1040 transcript1.merge(transcript2, self.normalization)
|
|
1041 toBeRemoved.add(index2)
|
|
1042 nbMerges += 1
|
|
1043 elif transcript2.getEnd() + distance < transcript1.getStart():
|
|
1044 toBePrinted.add(index2)
|
|
1045 transcripts2.append(transcript1)
|
|
1046
|
|
1047 for index2 in sorted(toBePrinted):
|
|
1048 self.writeTranscript(transcripts2[index2])
|
|
1049 transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)]
|
|
1050
|
|
1051 for transcript2 in transcripts2:
|
|
1052 self.writeTranscript(transcript2)
|
|
1053 progress.done()
|
|
1054 self.getTables(self.QUERY)[chromosome1].remove()
|
|
1055
|
|
1056 self.flushData()
|
|
1057 if self.writer != None:
|
|
1058 self.writer.close()
|
|
1059 self.writer = None
|
|
1060
|
|
1061 if self.verbosity > 0:
|
|
1062 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
|
|
1063 print "# merges: %d" % (nbMerges)
|
|
1064 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
|
|
1065
|
|
1066
|
|
1067 def getDifferenceTranscriptList(self):
|
|
1068 """
|
|
1069 Get the elements of the first list which do not overlap the second list (at the nucleotide level)
|
|
1070 @return: the transcripts that overlap with the reference set
|
|
1071 """
|
|
1072 distance = 0 if self.maxDistance == None else self.maxDistance
|
|
1073
|
|
1074 self.addIndexes([self.QUERY, self.REFERENCE])
|
|
1075
|
|
1076 # export the container into tables
|
|
1077 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
|
|
1078 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
|
|
1079
|
|
1080 # looping
|
|
1081 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
|
|
1082 # get range of transcripts
|
|
1083 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
|
|
1084 query = self.mySqlConnection.executeQuery(command)
|
|
1085 result = query.getLine()
|
|
1086 first = result[0]
|
|
1087 last = result[1]
|
|
1088 nb = result[2]
|
|
1089
|
|
1090 transcripts1 = []
|
|
1091 transcripts2 = []
|
|
1092 nbChunks = max(1, nb / 100)
|
|
1093 chunkSize = (last - first) / nbChunks
|
|
1094 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
|
|
1095 for chunk in range(nbChunks + 1):
|
|
1096
|
|
1097 # load transcripts
|
|
1098 start = first + chunk * chunkSize
|
|
1099 end = start + chunkSize - 1
|
|
1100 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
|
|
1101 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
|
|
1102 transcripts1.append(transcript1)
|
|
1103 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
|
|
1104 self.mySqlConnection.executeQuery(command)
|
|
1105 if chromosome1 in self.getTables(self.REFERENCE):
|
|
1106 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
|
|
1107 if chunk == 0:
|
|
1108 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
|
|
1109 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
|
|
1110 transcripts2.append(transcript2)
|
|
1111 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
|
|
1112 self.mySqlConnection.executeQuery(command)
|
|
1113
|
|
1114 # compare sets
|
|
1115 toBeRemoved1 = []
|
|
1116 for index1, transcript1 in enumerate(transcripts1):
|
|
1117 newTranscript1 = Transcript()
|
|
1118 newTranscript1.copy(transcript1)
|
|
1119 for transcript2 in transcripts2:
|
|
1120 newTranscript1 = newTranscript1.getDifference(transcript2)
|
|
1121 if newTranscript1 == None:
|
|
1122 toBeRemoved1.append(index1)
|
|
1123 break
|
|
1124 transcripts1[index1] = newTranscript1
|
|
1125
|
|
1126 # check if query transcript extends bounds of the chunk
|
|
1127 if newTranscript1 != None and newTranscript1.getEnd() < end:
|
|
1128 if self.splitDifference:
|
|
1129 for exon in newTranscript1.getExons():
|
|
1130 transcript = Transcript()
|
|
1131 transcript.copy(exon)
|
|
1132 self.writeTranscript(transcript)
|
|
1133 else:
|
|
1134 self.writeTranscript(newTranscript1)
|
|
1135 toBeRemoved1.append(index1)
|
|
1136
|
|
1137 # update list of query transcripts
|
|
1138 for index1 in reversed(toBeRemoved1):
|
|
1139 del transcripts1[index1]
|
|
1140
|
|
1141 # check if the reference transcripts extends bounds of the chunk
|
|
1142 toBeRemoved2 = []
|
|
1143 for index2, transcript2 in enumerate(transcripts2):
|
|
1144 if transcript2.getEnd() + distance < end:
|
|
1145 toBeRemoved2.append(index2)
|
|
1146 for index2 in reversed(toBeRemoved2):
|
|
1147 del transcripts2[index2]
|
|
1148
|
|
1149 progress.inc()
|
|
1150
|
|
1151 for transcript1 in transcripts1:
|
|
1152 if self.splitDifference:
|
|
1153 for exon in transcript1.getExons():
|
|
1154 transcript = Transcript()
|
|
1155 transcript.copy(exon)
|
|
1156 self.writeTranscript(transcript)
|
|
1157 else:
|
|
1158 self.writeTranscript(transcript1)
|
|
1159 progress.done()
|
|
1160 self.getTables(self.QUERY)[chromosome1].remove()
|
|
1161 if chromosome1 in self.getTables(self.REFERENCE):
|
|
1162 self.getTables(self.REFERENCE)[chromosome1].remove()
|
|
1163 self.getTables(self.WORKING)[chromosome1].remove()
|
|
1164
|
|
1165 self.flushData()
|
|
1166 if self.writer != None:
|
|
1167 self.writer.close()
|
|
1168 self.writer = None
|
|
1169
|
|
1170 if self.verbosity > 0:
|
|
1171 print "query: %d elements" % (self.nbTranscripts[self.QUERY])
|
|
1172 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
|
|
1173 print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
|
|
1174
|
|
1175
|
|
1176 def getOddsPerTranscript(self):
|
|
1177 """
|
|
1178 Return overlap results
|
|
1179 @return a dict of data
|
|
1180 """
|
|
1181 if not self.odds:
|
|
1182 raise Exception("Did not compute odds!")
|
|
1183 return self.overlapResults
|
|
1184
|
|
1185
|
|
1186 def getOdds(self):
|
|
1187 """
|
|
1188 Return odds about the overlap
|
|
1189 @return a dict of data
|
|
1190 """
|
|
1191 if not self.odds:
|
|
1192 raise Exception("Did not compute odds!")
|
|
1193 if self.oddResults != None:
|
|
1194 return self.oddResults
|
|
1195 self.oddResults = {}
|
|
1196 for name, value in self.overlapResults.iteritems():
|
|
1197 self.oddResults[value] = self.oddResults.get(value, 0) + 1
|
|
1198 return self.oddResults
|