comparison SMART/Java/Python/structure/TranscriptListsComparator.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 #
2 # Copyright INRA-URGI 2009-2010
3 #
4 # This software is governed by the CeCILL license under French law and
5 # abiding by the rules of distribution of free software. You can use,
6 # modify and/ or redistribute the software under the terms of the CeCILL
7 # license as circulated by CEA, CNRS and INRIA at the following URL
8 # "http://www.cecill.info".
9 #
10 # As a counterpart to the access to the source code and rights to copy,
11 # modify and redistribute granted by the license, users are provided only
12 # with a limited warranty and the software's author, the holder of the
13 # economic rights, and the successive licensors have only limited
14 # liability.
15 #
16 # In this respect, the user's attention is drawn to the risks associated
17 # with loading, using, modifying and/or developing or reproducing the
18 # software by the user in light of its specific status of free software,
19 # that may mean that it is complicated to manipulate, and that also
20 # therefore means that it is reserved for developers and experienced
21 # professionals having in-depth computer knowledge. Users are therefore
22 # encouraged to load and test the software's suitability as regards their
23 # requirements in conditions enabling the security of their systems and/or
24 # data to be ensured and, more generally, to use and operate it in the
25 # same conditions as regards security.
26 #
27 # The fact that you are presently reading this means that you have had
28 # knowledge of the CeCILL license and that you accept its terms.
29 #
30 import sys
31 import random
32 from SMART.Java.Python.misc import Utils
33 from SMART.Java.Python.structure.Transcript import Transcript
34 from SMART.Java.Python.structure.TranscriptList import TranscriptList
35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer
36 from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
37 from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable
38 from SMART.Java.Python.misc.Progress import Progress
39 from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
40
41
42
43 class TranscriptListsComparator(object):
44 """
45 Compare two transcript lists, using a database for one of the list
46 Uses one TranscriptContainer for query data,
47 one TranscriptContainer exported to MySqlTranscriptTable for reference data,
48 one MySqlTranscriptTable for transformed reference data
49 @ivar inputTranscriptContainers: parsers to the list of query transcripts
50 @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>}
51 @ivar writer: transcript list writer
52 @type writer: class L{TranscriptListWriter<TranscriptListWriter>}
53 @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently)
54 @type mySqlConnection: class L{MySqlConnection<MySqlConnection>}
55 @ivar introns: compare transcripts or exons only
56 @type introns: list of 2 boolean
57 @ivar starts: restrict the query transcripts to first nucleotides
58 @type starts: list of 2 int or None
59 @ivar fivePrimes: extend a list of transcripts by their 5' end
60 @type fivePrimes: list of 2 int or None
61 @ivar threePrimes: extend a list of transcripts by their 3' end
62 @type threePrimes: list of 2 int or None
63 @ivar minDistance: min distance between two transcripts [default: 0]
64 @type minDistance: int
65 @ivar maxDistance: max distance between two transcripts [default: 0]
66 @type maxDistance: int
67 @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap
68 @type minOverlap: int
69 @ivar pcOverlap: percentage of overlapping nucleotides to declare an overlap
70 @type pcOverlap: int
71 @ivar upstreams: consider distances with elements which are upstream of the transcripts
72 @type upstreams: boolean
73 @ivar downstreams: consider distances with elements which are downstream of the transcripts
74 @type downstreams: boolean
75 @ivar colinear: whether transcripts should overlap in the same direction
76 @type colinear: boolean
77 @ivar antisense: whether transcripts should overlap in the opposite direction
78 @type antisense: boolean
79 @ivar outputDistance: output distance between query and reference instead of query transcript
80 @type outputDistance: boolean
81 @ivar absolute: do not consider the strand while computing distance
82 @type absolute: boolean
83 @ivar strandedDistance: return a line per strand while computing distances
84 @type strandedDistance: boolean
85 @ivar QUERY: constant specifying the query objects
86 @type QUERY: int
87 @ivar REFERENCE: constant specifying the reference objects
88 @type REFERENCE: int
89 @ivar INPUTTYPES: set of input types of data (query or reference) objects
90 @type INPUTTYPES: list of 2 int
91 @ivar typeToString: string representation of the previous types
92 @type typeToString: dict
93 @ivar tableNames: name of the transcript tables
94 @type tableNames: dict of strings
95 @ivar nbTranscripts: number of transcript in the query/reference set
96 @type nbTranscripts: list of 2 int or None
97 @ivar nbNucleotides: number of nucleotides in the query/reference set
98 @type nbNucleotides: list of 2 int or None
99 @ivar transcriptsToBeStored: transcripts that will be stored into database
100 @type transcriptsToBeStored: dict of class L{TranscriptList<TranscriptList>}
101 @ivar multiple: in merge mode, aggregate multiple transcripts
102 @type multiple: boolean
103 @ivar normalization: normalize each element by the number of mappings of this element
104 @type normalization: boolean
105 @ivar invert: invert the current comparison
106 @type invert: boolean
107 @ivar splitDifference: split into intervals when computing difference
108 @type splitDifference: boolean
109 @ivar odds: whether odds about the comparison should be computed
110 @type odds: boolean
111 @ivar overlapResults: count the number of overlaps
112 @type overlapResults: dictionary
113 @ivar oddResults: compute the number of times each transcript overlaps (or is merged with) another one
114 @type oddResults: dictionary
115 @ivar outputContainer: container of the output transcripts
116 @type outputContainer: class L{TranscriptContainer<TranscriptContainer>}
117 @ivar logHandle: log handle
118 @type logHandle: file
119 @ivar verbosity: verbosity
120 @type verbosity: int
121 """
122
123 def __init__(self, logHandle = None, verbosity = 0):
124 """
125 Constructor
126 @param transcriptListParser2: parser to the list of reference transcripts
127 @type transcriptListParser2: class L{TranscriptListParser<TranscriptListParser>}
128 @param logHandle: log handle
129 @type logHandle: file
130 @param verbosity: verbosity
131 @type verbosity: int
132 """
133 self.QUERY = 0
134 self.REFERENCE = 1
135 self.WORKING = 2
136 self.INPUTTYPES = (self.QUERY, self.REFERENCE)
137 self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING)
138 self.typeToString = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"}
139
140 self.logHandle = logHandle
141 self.verbosity = verbosity
142 self.mySqlConnection = MySqlConnection(self.verbosity-1)
143 self.inputTranscriptContainers = [None, None]
144 self.tableNames = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))]
145 self.mySqlTranscriptWriters = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames]
146 self.writer = None
147 self.introns = [False, False]
148 self.starts = [None, None]
149 self.ends = [None, None]
150 self.fivePrimes = [None, None]
151 self.threePrimes = [None, None]
152 self.minDistance = None
153 self.maxDistance = 0
154 self.minOverlap = 1
155 self.pcOverlap = None
156 self.colinear = False
157 self.antisense = False
158 self.downstreams = [False, False]
159 self.upstreams = [False, False]
160 self.outputDistance = False
161 self.absolute = False
162 self.strandedDistance = False
163 self.nbTranscripts = [None, None]
164 self.nbNucleotides = [None, None]
165 self.normalization = False
166 self.included = False
167 self.including = False
168 self.invert = False
169 self.notOverlapping = False
170 self.splitDifference = False
171 self.multiple = False
172 self.odds = False
173 self.overlapResults = None
174 self.oddResults = None
175 self.outputContainer = None
176 self.transcriptsToBeStored = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES])
177 self.nbPrinted = 0
178
179 self.mySqlConnection.createDatabase()
180
181
182 def __del__(self):
183 """
184 Destructor
185 Remove all temporary tables
186 """
187 for type in self.INPUTWORKINGTYPES:
188 self.mySqlTranscriptWriters[type].removeTables()
189 self.mySqlConnection.deleteDatabase()
190
191 def acceptIntrons(self, type, bool):
192 """
193 Compare transcripts or exons only
194 @param type: whether use query/reference data
195 @type type: int
196 @param bool: include introns or not
197 @type bool: boolean
198 """
199 self.introns[type] = bool
200
201
202 def restrictToStart(self, type, size):
203 """
204 Restrict a list of transcripts to first nucleotides
205 @param type: whether use query/reference data
206 @type type: int
207 @param size: the size of the transcript to be considered
208 @type size: int
209 """
210 self.starts[type] = size
211 self.introns[type] = False
212
213
214 def restrictToEnd(self, type, size):
215 """
216 Restrict a list of transcripts to first nucleotides
217 @param type: whether use query/reference data
218 @type type: int
219 @param size: the size of the transcript to be considered
220 @type size: int
221 """
222 self.ends[type] = size
223 self.introns[type] = False
224
225
226 def extendFivePrime(self, type, size):
227 """
228 Extend a list of transcripts by their 5' end
229 @param type: whether use query/reference data
230 @type type: int
231 @param size: size of the extension
232 @type size: int
233 """
234 self.fivePrimes[type] = size
235
236
237 def extendThreePrime(self, type, size):
238 """
239 Extend the list of query transcripts by their 3' end
240 @param type: whether use query/reference data
241 @type type: int
242 @param size: size of the extension
243 @type size: int
244 """
245 self.threePrimes[type] = size
246
247
248 def setMinDistance(self, distance):
249 """
250 Set the min distance between two transcripts
251 @param distance: distance
252 @type distance: int
253 """
254 self.minDistance = distance
255
256
257 def setMaxDistance(self, distance):
258 """
259 Set the max distance between two transcripts
260 @param distance: distance
261 @type distance: int
262 """
263 self.maxDistance = distance
264
265
266 def setMinOverlap(self, overlap):
267 """
268 Set the minimum number of nucleotides to declare an overlap
269 @param overlap: minimum number of nucleotides
270 @type overlap: int
271 """
272 self.minOverlap = overlap
273
274
275 def setPcOverlap(self, overlap):
276 """
277 Set the percentage of nucleotides to declare an overlap
278 @param overlap: percentage of nucleotides
279 @type overlap: int
280 """
281 self.pcOverlap = overlap
282
283
284 def setUpstream(self, type, boolean):
285 """
286 Consider transcripts which are upstream of some transcripts
287 @param type: whether use query/reference data
288 @type type: int
289 @param boolean: consider only these transcripts or not
290 @type boolean: boolean
291 """
292 self.upstreams[type] = boolean
293
294
295 def setDownstream(self, type, boolean):
296 """
297 Consider transcripts which are downstream of some transcripts
298 @param type: whether use query/reference data
299 @type type: int
300 @param boolean: consider only these transcripts or not
301 @type boolean: boolean
302 """
303 self.downstreams[type] = boolean
304
305
306 def setOutputDistance(self, boolean):
307 """
308 Output distance between query and reference instead of query transcript
309 @param boolean: whether distance should be output
310 @type boolean: boolean
311 """
312 self.outputDistance = boolean
313
314
315 def setAbsolute(self, boolean):
316 """
317 Do not consider strand when computing distance (thus, having only non-negative values)
318 @param boolean: whether we should consider strands
319 @type boolean: boolean
320 """
321 self.absolute = boolean
322
323
324 def setStrandedDistance(self, boolean):
325 """
326 Return two distance distributions, one per strand
327 @param boolean: whether we should return 2 distance distance
328 @type boolean: boolean
329 """
330 self.strandedDistance = boolean
331
332
333 def getColinearOnly(self, boolean):
334 """
335 Only consider transcripts that overlap in the same direction
336 @param boolean: whether transcripts should overlap in the same direction
337 @type boolean: boolean
338 """
339 self.colinear = boolean
340
341
342 def getAntisenseOnly(self, boolean):
343 """
344 Only consider transcripts that overlap in the opposite direction
345 @param boolean: whether transcripts should overlap in the opposite direction
346 @type boolean: boolean
347 """
348 self.antisense = boolean
349
350
351 def setIncludedOnly(self, boolean):
352 """
353 Keep the elements from first set which are included in the second set
354 @param boolean: whether to keep included elements only
355 @type boolean: boolean
356 """
357 self.included = boolean
358
359
360 def setIncludingOnly(self, boolean):
361 """
362 Keep the elements from second set which are included in the first set
363 @param boolean: whether to keep included elements only
364 @type boolean: boolean
365 """
366 self.including = boolean
367
368
369 def setNormalization(self, boolean):
370 """
371 Normalize the elements by the number of mappings in the genome
372 @param boolean: whether normalize
373 @type boolean: boolean
374 """
375 self.normalization = boolean
376
377
378 def getInvert(self, boolean):
379 """
380 Only consider transcripts that do not overlap
381 @param boolean: whether invert the selection
382 @type boolean: boolean
383 """
384 self.invert = boolean
385
386
387 def includeNotOverlapping(self, boolean):
388 """
389 Also output the elements which do not overlap
390 @param boolean: whether output the elements which do not overlap
391 @type boolean: boolean
392 """
393 self.notOverlapping = boolean
394
395
396 def setSplitDifference(self, boolean):
397 """
398 Split into intervals when computing difference
399 @param boolean: whether to split
400 @type boolean: boolean
401 """
402 self.splitDifference = boolean
403
404
405 def aggregate(self, boolean):
406 """
407 In merge mode, aggregate multiple transcripts
408 @param boolean: aggregate multiple transcripts
409 @type boolean: boolean
410 """
411 self.multiple = boolean
412
413
414 def getTables(self, type):
415 """
416 Get the SQL tables
417 @param type: type of the table (query, reference, etc.)
418 @type type: int
419 """
420 return self.mySqlTranscriptWriters[type].getTables()
421
422
423 def computeOdds(self, boolean):
424 """
425 Compute odds
426 @param boolean: whether odds should be computed
427 @type boolean: boolean
428 """
429 self.odds = boolean
430 if self.odds:
431 self.overlapResults = dict()
432
433
434 def computeOddsPerTranscript(self, boolean):
435 """
436 Compute odds for each transcript
437 @param boolean: whether odds for each transcript should be computed
438 @type boolean: boolean
439 """
440 self.odds = boolean
441 if self.odds:
442 self.overlapResults = dict()
443
444
445 def removeTables(self):
446 """
447 Remove the temporary MySQL tables
448 """
449 for type in self.INPUTWORKINGTYPES:
450 for chromosome in self.getTables(type):
451 self.getTables(type)[chromosome].remove()
452
453
454 def clearTables(self):
455 """
456 Empty the content of the databases
457 """
458 for type in self.INPUTWORKINGTYPES:
459 if self.transcriptListParsers[type] != None:
460 for chromosome in self.getTables(type):
461 self.getTables(type)[chromosome].clear()
462
463
464 def extendTranscript(self, type, transcript):
465 """
466 Extend a transcript corresponding to the parameters of the class
467 @param transcript: a transcript
468 @type transcript: class L{Transcript<Transcript>}
469 @return: the possibly extended transcript
470 """
471 extendedTranscript = Transcript()
472 extendedTranscript.copy(transcript)
473 if self.starts[type] != None:
474 extendedTranscript.restrictStart(self.starts[type])
475 if self.ends[type] != None:
476 extendedTranscript.restrictEnd(self.ends[type])
477 if self.fivePrimes[type] != None:
478 extendedTranscript.extendStart(self.fivePrimes[type])
479 if self.threePrimes[type] != None:
480 extendedTranscript.extendEnd(self.threePrimes[type])
481 return extendedTranscript
482
483
484 def storeTranscript(self, type, transcript, now = True):
485 """
486 Add a transcript to a MySQL database, or postpone the store
487 @param type: whether use query/reference table
488 @type type: int
489 @param transcript: a transcript
490 @type transcript: class L{Transcript<Transcript>}
491 @param now: whether transcript should be stored now (or stored can be postponed)
492 @type now: bool
493 """
494 self.mySqlTranscriptWriters[type].addTranscript(transcript)
495 if type == self.REFERENCE:
496 self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript)
497 if now:
498 self.mySqlTranscriptWriters[type].write()
499 if type == self.REFERENCE:
500 self.mySqlTranscriptWriters[self.WORKING].write()
501
502
503 def writeTranscript(self, transcript):
504 """
505 Write a transcript in the output file
506 @param transcript: a transcript
507 @type transcript: class L{Transcript<Transcript>}
508 """
509 if self.writer != None:
510 self.writer.addTranscript(transcript)
511 self.nbPrinted += 1
512
513
514 def flushData(self, type = None):
515 """
516 Store the remaining transcripts
517 @param type: whether use query/reference table (None for all)
518 @type type: int or None
519 """
520 if type == None:
521 types = self.INPUTWORKINGTYPES
522 else:
523 types = [type]
524 for type in types:
525 self.mySqlTranscriptWriters[type].write()
526 if self.writer != None:
527 self.writer.write()
528
529
530 def unstoreTranscript(self, type, transcript):
531 """
532 Remove a transcript from a MySQL database
533 @param type: whether use query/reference table
534 @type type: int
535 @param transcript: a transcript
536 @type transcript: class L{Transcript<Transcript>}
537 """
538 self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript)
539 if type == self.REFERENCE:
540 self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript)
541
542
543 def addIndexes(self, tables):
544 """
545 Add useful indexes to the tables
546 @param tables: which tables should be indexed
547 @type tables: list of int
548 """
549 for type in tables:
550 for chromosome in self.getTables(type):
551 self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"])
552 self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"])
553
554
555 def storeTranscriptList(self, type, transcriptListParser, extension):
556 """
557 Store a transcript list into database
558 @param type: whether use query/reference parser
559 @type type: int
560 @param parser: a parser of transcript list
561 @type parser: class L{TranscriptContainer<TranscriptContainer>}
562 @param extension: extend (or not) the transcripts
563 @type extension: boolean
564 """
565 progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1)
566 for transcript in transcriptListParser.getIterator():
567 if extension:
568 transcript = self.extendTranscript(type, transcript)
569 self.mySqlTranscriptWriters[type].addTranscript(transcript)
570 progress.inc()
571 self.mySqlTranscriptWriters[type].write()
572 progress.done()
573 if type == self.REFERENCE:
574 for chromosome in self.getTables(self.REFERENCE):
575 self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1)
576 self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome])
577
578
579 def setInputTranscriptContainer(self, type, inputTranscriptContainer):
580 """
581 Set an input transcript list container
582 @param type: whether use query/reference parser
583 @type type: int
584 @param inputTranscriptContainer: a container
585 @type inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>}
586 """
587 self.inputTranscriptContainers[type] = inputTranscriptContainer
588 self.nbTranscripts[type] = self.inputTranscriptContainers[type].getNbTranscripts()
589 self.nbNucleotides[type] = self.inputTranscriptContainers[type].getNbNucleotides()
590
591
592 def setOutputWriter(self, writer):
593 """
594 Set an output transcript list writer
595 @param writer: a writer
596 @type writer: class L{TranscriptListWriter<TranscriptListWriter>}
597 """
598 self.writer = writer
599
600
601 def compareTranscript(self, transcript1, transcript2, includeDistance = False):
602 """
603 Compare two transcripts, using user defined parameters
604 @param transcript1: a transcript from the query set (already extended)
605 @type transcript1: class L{Transcript<Transcript>}
606 @param transcript2: a transcript from the reference set (already extended)
607 @type transcript2: class L{Transcript<Transcript>}
608 @param includeDistance: take into account the distance too
609 @type includeDistance: boolean
610 @return: true, if they overlap
611 """
612 extendedTranscript1 = Transcript()
613 extendedTranscript1.copy(transcript1)
614 if includeDistance:
615 if self.maxDistance > 0:
616 extendedTranscript1.extendStart(self.maxDistance)
617 extendedTranscript1.extendEnd(self.maxDistance)
618
619 minOverlap = self.minOverlap
620 if self.pcOverlap != None:
621 minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap)
622 if not extendedTranscript1.overlapWith(transcript2, self.minOverlap):
623 return False
624 if (self.downstreams[self.QUERY] and transcript2.getStart() > extendedTranscript1.getStart()) or \
625 (self.upstreams[self.QUERY] and transcript2.getEnd() < extendedTranscript1.getEnd()) or \
626 (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \
627 (self.upstreams[self.REFERENCE] and extendedTranscript1.getEnd() < transcript2.getEnd()):
628 return False
629 if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != transcript2.getDirection()):
630 return False
631 if self.included and not transcript2.include(extendedTranscript1):
632 return False
633 if self.including and not extendedTranscript1.include(transcript2):
634 return False
635 if self.introns[self.REFERENCE] and self.introns[self.QUERY]:
636 if self.logHandle != None:
637 self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2)))
638 return True
639 if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap):
640 if self.logHandle != None:
641 self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2)))
642 return True
643 return False
644
645
646 def compareTranscriptToList(self, transcript1):
647 """
648 Compare a transcript to the reference list of transcripts
649 (Do not extend the transcripts, except for the distance)
650 @param transcript1: a transcript (from the query set)
651 @type transcript1: class L{Transcript<Transcript>}
652 @return: the reference transcripts overlapping
653 """
654 # no transcript in the reference table
655 if transcript1.getChromosome() not in self.getTables(self.WORKING):
656 return
657
658 # retrieve the the transcripts that may overlap in the working tables
659 clauses = []
660 extendedTranscript1 = Transcript()
661 extendedTranscript1.copy(transcript1)
662 if self.maxDistance > 0:
663 extendedTranscript1.extendStart(self.maxDistance)
664 if self.maxDistance > 0:
665 extendedTranscript1.extendEnd(self.maxDistance)
666 command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName())
667 for binPair in extendedTranscript1.getBins():
668 clause = "bin "
669 if binPair[0] == binPair[1]:
670 clause += "= %i" % (binPair[0])
671 else:
672 clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1])
673 clauses.append(clause)
674 command += " OR ".join(clauses)
675 command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart())
676
677 for index2, transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command):
678 if self.compareTranscript(extendedTranscript1, transcript2):
679 yield transcript2
680
681
682 def compareTranscriptList(self):
683 """
684 Compare a list of transcript to the reference one
685 @return: the transcripts that overlap with the reference set
686 """
687 distance = 0
688 nbClustersIn = 0
689 nbClustersOut = 0
690 if self.maxDistance != None:
691 distance = self.maxDistance
692
693 self.addIndexes([self.QUERY, self.REFERENCE])
694
695 # export the container into tables
696 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
697 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
698
699 # looping
700 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
701 # get range of transcripts
702 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
703 query = self.mySqlConnection.executeQuery(command)
704 result = query.getLine()
705 first = result[0]
706 last = result[1]
707 nb = result[2]
708
709 transcripts1 = []
710 toBeRemoved1 = []
711 transcripts2 = []
712 toBeRemoved2 = []
713 overlapsWith = []
714 nbOverlaps = []
715 nbChunks = max(1, nb / 100)
716 chunkSize = (last - first) / nbChunks
717 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
718 for chunk in range(nbChunks + 1):
719
720 # load transcripts
721 start = first + chunk * chunkSize
722 end = start + chunkSize - 1
723 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
724 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
725 transcripts1.append(transcript1)
726 overlapsWith.append([])
727 nbOverlaps.append(0)
728 nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
729 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
730 self.mySqlConnection.executeQuery(command)
731 if chromosome1 in self.getTables(self.REFERENCE):
732 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
733 if chunk == 0:
734 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
735 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
736 transcripts2.append(transcript2)
737 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
738 self.mySqlConnection.executeQuery(command)
739
740 # compare sets
741 for index1, transcript1 in enumerate(transcripts1):
742 overlappingNames = []
743 nbElements1 = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
744 for transcript2 in transcripts2:
745 if self.compareTranscript(transcript1, transcript2, True):
746 id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
747 if id2 not in overlapsWith[index1]:
748 overlapsWith[index1].append(id2)
749 nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements")
750 if self.odds:
751 if transcript2.getName() not in self.overlapResults:
752 self.overlapResults[transcript2.getName()] = 0
753 self.overlapResults[transcript2.getName()] += nbElements1
754
755 # check if query transcript extends bounds of the chunk
756 if transcript1.getEnd() < end:
757 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
758 if overlapsWith[index1]:
759 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
760 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
761 elif self.notOverlapping:
762 transcript1.setTagValue("nbOverlaps", "0")
763 self.writeTranscript(transcript1)
764 nbClustersOut += nbElements1
765 toBeRemoved1.append(index1)
766
767 # update list of query transcripts
768 for index1 in reversed(toBeRemoved1):
769 del transcripts1[index1]
770 del overlapsWith[index1]
771 del nbOverlaps[index1]
772 toBeRemoved1 = []
773
774 # check if the reference transcripts extends bounds of the chunk
775 for index2, transcript2 in enumerate(transcripts2):
776 if transcript2.getEnd() + distance < end:
777 toBeRemoved2.append(index2)
778 for index2 in reversed(toBeRemoved2):
779 del transcripts2[index2]
780 toBeRemoved2 = []
781
782 progress.inc()
783
784 for index1, transcript1 in enumerate(transcripts1):
785 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
786 if overlapsWith[index1]:
787 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
788 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
789 elif self.notOverlapping:
790 transcript1.setTagValue("nbOverlaps", "0")
791 self.writeTranscript(transcript1)
792 nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
793 progress.done()
794 self.getTables(self.QUERY)[chromosome1].remove()
795 if chromosome1 in self.getTables(self.REFERENCE):
796 self.getTables(self.REFERENCE)[chromosome1].remove()
797 self.getTables(self.WORKING)[chromosome1].remove()
798
799 self.flushData()
800 if self.writer != None:
801 self.writer.close()
802 self.writer = None
803
804 if self.verbosity > 0:
805 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
806 print "query: %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn)
807 if self.nbTranscripts[self.QUERY] != 0:
808 print "output: %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100),
809 if nbClustersOut != 0:
810 print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100)
811
812
813 def compareTranscriptListDistance(self):
814 """
815 Compare a list of transcript to the reference one
816 @return: the distance distributions in a hash
817 """
818 nbDistances = 0
819 distances = {}
820 absDistances = {}
821 strandedDistances = dict([(strand, {}) for strand in (1, -1)])
822
823 # export the container into tables
824 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
825 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
826
827 progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1)
828 for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():
829 # get the distance
830 transcript1 = self.extendTranscript(self.QUERY, transcript1)
831 distance = self.maxDistance + 1
832 strand = None
833 closestElement = "None"
834 for transcript2 in self.compareTranscriptToList(transcript1):
835 thisStrand = transcript1.getDirection() * transcript2.getDirection()
836 if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()):
837 transcript2.reverse()
838 if self.absolute:
839 transcript2.setDirection(transcript1.getDirection())
840 if transcript2.getDirection() == transcript1.getDirection():
841 if self.starts[self.REFERENCE] != None:
842 transcript2.restrictStart(self.starts[self.REFERENCE])
843 if self.ends[self.REFERENCE] != None:
844 transcript2.restrictEnd(self.ends[self.REFERENCE])
845 thisDistance = transcript1.getRelativeDistance(transcript2)
846 if (self.absolute):
847 thisDistance = abs(thisDistance)
848 if abs(thisDistance) < abs(distance):
849 distance = thisDistance
850 strand = thisStrand
851 closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
852 if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance):
853 nbDistances += 1
854 distances[distance] = distances.get(distance, 0) + 1
855 absDistance = abs(distance)
856 absDistances[absDistance] = absDistances.get(absDistance, 0) + 1
857 strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0)
858 if distance not in strandedDistances[-strand]:
859 strandedDistances[-strand][distance] = 0
860
861 # write transcript
862 if distance == self.maxDistance + 1:
863 distance = "None"
864 tmpTranscript = Transcript()
865 tmpTranscript.copy(transcript1)
866 tmpTranscript.setTagValue("distance", distance)
867 tmpTranscript.setTagValue("closestElement", closestElement)
868 self.writeTranscript(tmpTranscript)
869
870 progress.inc()
871 progress.done()
872
873 self.flushData()
874
875 if self.verbosity > 0:
876 print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE])
877 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
878 if nbDistances == 0:
879 print "Nothing matches"
880 else:
881 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances)
882 print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100)
883
884 if self.strandedDistance:
885 return strandedDistances
886 return distances
887
888
889 def compareTranscriptListMerge(self):
890 """
891 Merge the query list of transcript with itself
892 @return: the merged transcripts in a transcript list database
893 """
894 nbMerges = 0
895
896 for type in (self.QUERY, self.REFERENCE):
897 self.storeTranscriptList(type, self.inputTranscriptContainers[type], True)
898 self.flushData()
899
900 # Loop on the chromosomes
901 for chromosome in sorted(self.getTables(self.QUERY).keys()):
902 if chromosome not in self.getTables(self.REFERENCE):
903 continue
904
905 # Get the size of the chromosome
906 maxEnd = 0
907 nbChunks = 0
908 for type in (self.QUERY, self.REFERENCE):
909 command = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName())
910 query = self.mySqlConnection.executeQuery(command)
911 maxEnd = max(maxEnd, int(query.getLine()[0]))
912 nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements())
913
914 mergedTranscripts = {}
915 transcripts = {self.QUERY: [], self.REFERENCE: []}
916 progress = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1)
917 for i in range(nbChunks):
918 rangeStart = int(i * (float(maxEnd) / nbChunks)) + 1
919 rangeEnd = int((i+1) * (float(maxEnd) / nbChunks))
920
921 # Get all transcripts in query and reference from chunk
922 for type in (self.QUERY, self.REFERENCE):
923 correction = 0 if self.QUERY else self.maxDistance
924 command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction)
925 for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command):
926 transcripts[type].append(transcript)
927
928 # Merge elements between the two samples
929 for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]):
930 for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]):
931 if referenceTranscript == None: continue
932 if self.compareTranscript(queryTranscript, referenceTranscript, True):
933 if queryTranscript.getDirection() != referenceTranscript.getDirection():
934 referenceTranscript.setDirection(queryTranscript.getDirection())
935 queryTranscript.merge(referenceTranscript, self.normalization)
936 nbMerges += 1
937 transcripts[self.REFERENCE][iReference] = None
938 if not self.multiple:
939 mergedTranscripts[iQuery] = 0
940
941 # Remove transcripts from database
942 for type in (self.QUERY, self.REFERENCE):
943 correction = 0 if self.QUERY else self.maxDistance
944 command = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction)
945 query = self.mySqlConnection.executeQuery(command)
946
947 # Just in case, self-merge the elements in the query (beware of mergedTranscripts!)
948 if (self.multiple):
949 for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]):
950 if queryTranscript1 == None: continue
951 for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]):
952 if iQuery2 <= iQuery1 or queryTranscript2 == None: continue
953 minOverlap = self.minOverlap
954 if self.pcOverlap != None:
955 minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap)
956 if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear):
957 if queryTranscript1.getDirection() != queryTranscript2.getDirection():
958 queryTranscript2.setDirection(queryTranscript1.getDirection())
959 queryTranscript1.merge(queryTranscript2, self.normalization)
960 transcripts[self.QUERY][iQuery2] = None
961 nbMerges += 1
962 if not self.multiple:
963 mergedTranscripts[iQuery1] = 0
964
965 # Update the sets of transcripts and write into database (also update mergedTranscripts)
966 newTranscripts = {self.QUERY: [], self.REFERENCE: []}
967 newMergedTranscripts = {}
968 for type in (self.QUERY, self.REFERENCE):
969 for i, transcript in enumerate(transcripts[type]):
970 if transcript == None: continue
971 correction = 0 if self.QUERY else self.maxDistance
972 if transcript.getEnd() < rangeEnd - correction:
973 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)):
974 self.writeTranscript(transcripts[type][i])
975 else:
976 if type == self.QUERY and i in mergedTranscripts:
977 newMergedTranscripts[len(newTranscripts[type])] = 0
978 newTranscripts[type].append(transcript)
979 transcripts = newTranscripts
980 mergedTranscripts = newMergedTranscripts
981
982 progress.inc()
983 progress.done()
984
985 for type in (self.QUERY, self.REFERENCE):
986 for i, transcript in enumerate(transcripts[type]):
987 if transcripts == None: continue
988 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)):
989 self.writeTranscript(transcripts[type][i])
990
991 # Manage chromosomes with no corresponding data
992 if self.multiple:
993 for type in self.INPUTTYPES:
994 for chromosome in self.getTables(type):
995 if chromosome in self.getTables(1 - type):
996 continue
997 for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator():
998 self.writeTranscript(transcript)
999
1000 self.flushData()
1001 if self.writer != None:
1002 self.writer.close()
1003 self.writer = None
1004
1005 if self.verbosity > 0:
1006 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
1007 print "# merges: %d" % (nbMerges)
1008 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
1009
1010
1011 def compareTranscriptListSelfMerge(self):
1012 """
1013 Merge the query list of transcript with itself
1014 @return: the merged transcripts in a transcript list database
1015 """
1016 nbMerges = 0
1017 distance = self.maxDistance if self.maxDistance != None else 0
1018
1019 self.addIndexes([self.QUERY])
1020 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
1021 self.flushData()
1022
1023 # looping
1024 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
1025 transcripts2 = []
1026
1027 # get range of transcripts
1028 progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
1029 command = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName())
1030 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
1031
1032 # compare sets
1033 toBeRemoved = set()
1034 toBePrinted = set()
1035 for index2, transcript2 in enumerate(transcripts2):
1036
1037 if self.compareTranscript(transcript1, transcript2, True):
1038 if transcript1.getDirection() != transcript2.getDirection():
1039 transcript2.setDirection(transcript1.getDirection())
1040 transcript1.merge(transcript2, self.normalization)
1041 toBeRemoved.add(index2)
1042 nbMerges += 1
1043 elif transcript2.getEnd() + distance < transcript1.getStart():
1044 toBePrinted.add(index2)
1045 transcripts2.append(transcript1)
1046
1047 for index2 in sorted(toBePrinted):
1048 self.writeTranscript(transcripts2[index2])
1049 transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)]
1050
1051 for transcript2 in transcripts2:
1052 self.writeTranscript(transcript2)
1053 progress.done()
1054 self.getTables(self.QUERY)[chromosome1].remove()
1055
1056 self.flushData()
1057 if self.writer != None:
1058 self.writer.close()
1059 self.writer = None
1060
1061 if self.verbosity > 0:
1062 print "query: %d sequences" % (self.nbTranscripts[self.QUERY])
1063 print "# merges: %d" % (nbMerges)
1064 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
1065
1066
1067 def getDifferenceTranscriptList(self):
1068 """
1069 Get the elements of the first list which do not overlap the second list (at the nucleotide level)
1070 @return: the transcripts that overlap with the reference set
1071 """
1072 distance = 0 if self.maxDistance == None else self.maxDistance
1073
1074 self.addIndexes([self.QUERY, self.REFERENCE])
1075
1076 # export the container into tables
1077 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
1078 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)
1079
1080 # looping
1081 for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
1082 # get range of transcripts
1083 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
1084 query = self.mySqlConnection.executeQuery(command)
1085 result = query.getLine()
1086 first = result[0]
1087 last = result[1]
1088 nb = result[2]
1089
1090 transcripts1 = []
1091 transcripts2 = []
1092 nbChunks = max(1, nb / 100)
1093 chunkSize = (last - first) / nbChunks
1094 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
1095 for chunk in range(nbChunks + 1):
1096
1097 # load transcripts
1098 start = first + chunk * chunkSize
1099 end = start + chunkSize - 1
1100 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
1101 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
1102 transcripts1.append(transcript1)
1103 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
1104 self.mySqlConnection.executeQuery(command)
1105 if chromosome1 in self.getTables(self.REFERENCE):
1106 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
1107 if chunk == 0:
1108 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
1109 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
1110 transcripts2.append(transcript2)
1111 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
1112 self.mySqlConnection.executeQuery(command)
1113
1114 # compare sets
1115 toBeRemoved1 = []
1116 for index1, transcript1 in enumerate(transcripts1):
1117 newTranscript1 = Transcript()
1118 newTranscript1.copy(transcript1)
1119 for transcript2 in transcripts2:
1120 newTranscript1 = newTranscript1.getDifference(transcript2)
1121 if newTranscript1 == None:
1122 toBeRemoved1.append(index1)
1123 break
1124 transcripts1[index1] = newTranscript1
1125
1126 # check if query transcript extends bounds of the chunk
1127 if newTranscript1 != None and newTranscript1.getEnd() < end:
1128 if self.splitDifference:
1129 for exon in newTranscript1.getExons():
1130 transcript = Transcript()
1131 transcript.copy(exon)
1132 self.writeTranscript(transcript)
1133 else:
1134 self.writeTranscript(newTranscript1)
1135 toBeRemoved1.append(index1)
1136
1137 # update list of query transcripts
1138 for index1 in reversed(toBeRemoved1):
1139 del transcripts1[index1]
1140
1141 # check if the reference transcripts extends bounds of the chunk
1142 toBeRemoved2 = []
1143 for index2, transcript2 in enumerate(transcripts2):
1144 if transcript2.getEnd() + distance < end:
1145 toBeRemoved2.append(index2)
1146 for index2 in reversed(toBeRemoved2):
1147 del transcripts2[index2]
1148
1149 progress.inc()
1150
1151 for transcript1 in transcripts1:
1152 if self.splitDifference:
1153 for exon in transcript1.getExons():
1154 transcript = Transcript()
1155 transcript.copy(exon)
1156 self.writeTranscript(transcript)
1157 else:
1158 self.writeTranscript(transcript1)
1159 progress.done()
1160 self.getTables(self.QUERY)[chromosome1].remove()
1161 if chromosome1 in self.getTables(self.REFERENCE):
1162 self.getTables(self.REFERENCE)[chromosome1].remove()
1163 self.getTables(self.WORKING)[chromosome1].remove()
1164
1165 self.flushData()
1166 if self.writer != None:
1167 self.writer.close()
1168 self.writer = None
1169
1170 if self.verbosity > 0:
1171 print "query: %d elements" % (self.nbTranscripts[self.QUERY])
1172 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE])
1173 print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)
1174
1175
1176 def getOddsPerTranscript(self):
1177 """
1178 Return overlap results
1179 @return a dict of data
1180 """
1181 if not self.odds:
1182 raise Exception("Did not compute odds!")
1183 return self.overlapResults
1184
1185
1186 def getOdds(self):
1187 """
1188 Return odds about the overlap
1189 @return a dict of data
1190 """
1191 if not self.odds:
1192 raise Exception("Did not compute odds!")
1193 if self.oddResults != None:
1194 return self.oddResults
1195 self.oddResults = {}
1196 for name, value in self.overlapResults.iteritems():
1197 self.oddResults[value] = self.oddResults.get(value, 0) + 1
1198 return self.oddResults