comparison SMART/Java/Python/structure/Interval.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents
children 169d364ddd91
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 #
2 # Copyright INRA-URGI 2009-2010
3 #
4 # This software is governed by the CeCILL license under French law and
5 # abiding by the rules of distribution of free software. You can use,
6 # modify and/ or redistribute the software under the terms of the CeCILL
7 # license as circulated by CEA, CNRS and INRIA at the following URL
8 # "http://www.cecill.info".
9 #
10 # As a counterpart to the access to the source code and rights to copy,
11 # modify and redistribute granted by the license, users are provided only
12 # with a limited warranty and the software's author, the holder of the
13 # economic rights, and the successive licensors have only limited
14 # liability.
15 #
16 # In this respect, the user's attention is drawn to the risks associated
17 # with loading, using, modifying and/or developing or reproducing the
18 # software by the user in light of its specific status of free software,
19 # that may mean that it is complicated to manipulate, and that also
20 # therefore means that it is reserved for developers and experienced
21 # professionals having in-depth computer knowledge. Users are therefore
22 # encouraged to load and test the software's suitability as regards their
23 # requirements in conditions enabling the security of their systems and/or
24 # data to be ensured and, more generally, to use and operate it in the
25 # same conditions as regards security.
26 #
27 # The fact that you are presently reading this means that you have had
28 # knowledge of the CeCILL license and that you accept its terms.
29 #
30
31 from SMART.Java.Python.structure.Bins import *
32 from commons.core.coord.Range import Range
33
34 class Interval(Range):
35 """
36 Store a genomic interval
37 @ivar name: name of the interval [optional]
38 @type name: string
39 @ivar id: id of the interval [optional]
40 @type id: int
41 @ivar bin: bin in which the interval should be if stored in a database [computed]
42 @type bin: int
43 @ival tags: information about the transcript [optional]
44 @type tags: dict
45 @ivar verbosity: verbosity
46 @type verbosity: int [default: 0]
47 """
48
49 def __init__(self, interval = None, verbosity = 0):
50 """
51 Constructor
52 @param interval: interval to be copied
53 @type interval: class L{Interval<Interval>}
54 @param verbosity: verbosity
55 @type verbosity: int
56 """
57 Range.__init__(self)
58 self.name = None
59 self.id = None
60 self.bin = None
61 self.verbosity = verbosity
62 self.tags = {}
63 if interval != None:
64 self.copy(interval)
65
66 #!!!! Warning: two methods getStart() and getEnd() give the information maximum and minimum in interval.!!!!#
67 #In case strand = "+", start < end; strand = "-", start > end
68 def getStart(self):
69 if self.start == -1:
70 return -1
71 if self.end == -1:
72 return self.start
73 return self.getMin()
74
75
76 def getEnd(self):
77 if self.end == -1:
78 return -1
79 if self.start == -1:
80 return self.end
81 return self.getMax()
82
83
84 def getChromosome(self):
85 return self.getSeqname()
86
87
88 def getDirection(self):
89 return 1 if self.getStrand() == "+" else -1
90
91
92 def getName(self):
93 return self.name
94
95
96 def isSet(self):
97 """
98 Check if the interval is set
99 """
100 return self.getStart() == None and self.getEnd() == None
101
102
103 def copy(self, interval):
104 """
105 Copy method
106 @param interval: interval to be copied
107 @type interval: class L{Interval<Interval>}
108 """
109 self.setStart(interval.getStart())
110 self.setEnd(interval.getEnd())
111 self.setChromosome(interval.getChromosome())
112 self.setDirection(interval.getDirection())
113 self.name = interval.name
114 self.id = interval.id
115 self.bin = interval.bin
116 self.tags = {}
117 for tag in interval.tags:
118 self.tags[tag] = interval.tags[tag]
119 self.verbosity = interval.verbosity
120
121
122 def setName(self, name):
123 """
124 Set the name
125 @param name: name of the interval
126 @type name: string
127 """
128 if len(name) > 100:
129 name = name[:100]
130 self.name = name
131
132
133 def setChromosome(self, chromosome=""):
134 """
135 Set the chromosome
136 @param chromosome: chromosome on which the interval is
137 @type chromosome: string
138 """
139 if not chromosome:
140 self.seqname = None
141 else:
142 self.seqname = chromosome.replace(".", "_").replace("|", "_")
143
144
145 def setStart(self, start):
146 """
147 Set the start point
148 Possibly reset bin
149 @param start: start point of the interval
150 @type start: int
151 """
152 self.bin = None
153 direction = self.getDirection()
154 if self.start == -1:
155 self.start = start
156 elif self.end == -1:
157 self.end = start
158 else:
159 if direction == 1:
160 self.start = start
161 else:
162 self.end = start
163 if direction == 1:
164 self.start, self.end = min(self.start, self.end), max(self.start, self.end)
165 else:
166 self.start, self.end = max(self.start, self.end), min(self.start, self.end)
167
168
169 def setEnd(self, end):
170 """
171 Set the end point
172 Possibly reset bin
173 @param end: end point of the interval of the interval
174 @type end: int
175 """
176 self.bin = None
177 direction = self.getDirection()
178 if self.end == -1:
179 self.end = end
180 elif self.start == -1:
181 self.start = end
182 else:
183 if direction == 1:
184 self.end = end
185 else:
186 self.start = end
187 if direction == 1:
188 self.start, self.end = min(self.start, self.end), max(self.start, self.end)
189 else:
190 self.start, self.end = max(self.start, self.end), min(self.start, self.end)
191
192
193 def setSize(self, size):
194 """
195 Possibly modify the end point
196 @param size: size of the transcript
197 @type size: int
198 """
199 if self.end == None and self.start != None:
200 self.setEnd(self.start + self.getSize() - 1)
201 elif self.start == None and self.end != None:
202 self.setStart(self.end - self.getSize() + 1)
203
204
205 def getSize(self):
206 """
207 Get the size
208 """
209 return self.getEnd() - self.getStart() + 1
210
211
212 def _setDirection(self, direction):
213 """
214 Set the direction of the interval (connection to Range)
215 @param direction: direction of the transcript (+ / -)
216 @type direction: int (1 or -1)
217 """
218 if direction * self.getDirection() < 0:
219 self.reverse()
220
221
222 def setDirection(self, direction):
223 """
224 Set the direction of the interval
225 Possibly parse different formats
226 @param direction: direction of the transcript (+ / -)
227 @type direction: int or string
228 """
229 if type(direction).__name__ == 'int':
230 self._setDirection(direction / abs(direction))
231 elif type(direction).__name__ == 'str':
232 if direction == "+":
233 self._setDirection(1)
234 elif direction == "-":
235 self._setDirection(-1)
236 elif direction == "1" or direction == "-1":
237 self._setDirection(int(direction))
238 elif direction.lower() == "plus":
239 self._setDirection(1)
240 elif direction.lower() == "minus":
241 self._setDirection(-1)
242 else:
243 raise Exception("Cannot understand direction %s" % (direction))
244 else:
245 raise Exception("Cannot understand direction %s" % (direction))
246
247
248 def extendStart(self, size):
249 """
250 Extend the interval by the 5' end
251 @param size: the size to be exended
252 @type size: int
253 """
254 if self.getDirection() == 1:
255 self.setStart(max(0, self.getStart() - size))
256 else:
257 self.setEnd(self.getEnd() + size)
258 self.bin = None
259
260
261 def extendEnd(self, size):
262 """
263 Extend the interval by the 3' end
264 @param size: the size to be exended
265 @type size: int
266 """
267 if self.getDirection() == 1:
268 self.setEnd(self.getEnd() + size)
269 else:
270 self.setStart(max(0, self.getStart() - size))
271 self.bin = None
272
273
274 def restrictStart(self, size = 1):
275 """
276 Restrict the interval by some nucleotides, start from its start position
277 Remove the exons
278 @param size: the size to be restricted to
279 @type size: int
280 """
281 if self.getDirection() == 1:
282 self.setEnd(min(self.getEnd(), self.getStart() + size - 1))
283 else:
284 self.setStart(max(self.getStart(), self.getEnd() - size + 1))
285 self.bin = None
286
287
288 def restrictEnd(self, size = 1):
289 """
290 Restrict the interval by some nucleotides, end from its end position
291 Remove the exons
292 @param size: the size to be restricted to
293 @type size: int
294 """
295 if self.getDirection() == 1:
296 self.setStart(max(self.getStart(), self.getEnd() - size + 1))
297 else:
298 self.setEnd(min(self.getEnd(), self.getStart() + size - 1))
299 self.bin = None
300
301
302
303 def setTagValue(self, name, value):
304 """
305 Set a tag
306 @param name: name of the tag
307 @type name: string
308 @param value: value of the tag
309 @type value: int or string
310 """
311 self.tags[name] = value
312
313
314 def getTagNames(self):
315 """
316 Get all the names of the tags
317 """
318 return self.tags.keys()
319
320
321 def getTagValue(self, tag):
322 """
323 Get the value of a tag
324 @param tag: name of a tag
325 @type tag: string
326 """
327 if tag not in self.tags:
328 return None
329 return self.tags[tag]
330
331
332 def getTagValues(self, tagSep = "; ", fieldSep = " ", surrounder = ""):
333 """
334 Get the formatted tag values
335 @param tagSep: separator between tags
336 @type tagSep: string
337 @param fieldSep: separator between tag name and tag value
338 @type fieldSep: string
339 @param surrounder: string which optionally surround values
340 @type surrounder: string
341 """
342 tags = []
343 for name, value in self.tags.iteritems():
344 if value == None:
345 continue
346 if isinstance(value, basestring):
347 tags.append("%s%s%s%s%s" % (name, fieldSep, surrounder, value.replace("'", "\\'"), surrounder))
348 elif type(value) is int:
349 tags.append("%s%s%s%i%s" % (name, fieldSep, surrounder, value, surrounder))
350 elif type(value) is float:
351 tags.append("%s%s%s%f%s" % (name, fieldSep, surrounder, value, surrounder))
352 else:
353 raise Exception("Do not know how to print '" + value + "'.")
354 if self.getName() != None:
355 tags.append("%s%s%s%s%s" % ("Name", fieldSep, surrounder, self.getName(), surrounder))
356 return tagSep.join(tags)
357
358
359 def setTagValues(self, tags, tagSep = "; ", fieldSep = " "):
360 """
361 Set the tag values using given string
362 @param tags: the tags, concatenated
363 @type tags: string
364 @param tagSep: separator between tags
365 @type tagSep: string
366 @param fieldSep: separator between tag name and tag value
367 @type fieldSep: string
368 """
369 if tags == "":
370 self.tags = {}
371 return
372 for splittedTag in tags.split(tagSep):
373 if fieldSep not in splittedTag:
374 raise Exception("Weird field '%s' in tags '%s'" % (splittedTag, tags))
375 tag, value = splittedTag.split(fieldSep, 1)
376 if tag == "Name":
377 self.setName(value)
378 continue
379 try:
380 intValue = int(value)
381 self.tags[tag] = intValue
382 except ValueError:
383 try:
384 floatValue = float(value)
385 self.tags[tag] = floatValue
386 except ValueError:
387 self.tags[tag] = value
388
389
390 def deleteTag(self, tag):
391 """
392 Remove a tag
393 @param tag: the tag to be removed
394 @type tag: string
395 """
396 if tag in self.tags:
397 del self.tags[tag]
398
399
400 def setNbOccurrences(self, nbOccurrences):
401 """
402 Set the number of occurrences of the interval
403 @param nbOccurrences: number of occurrences of the interval
404 @type nbOccurrences: int
405 """
406 self.setTagValue("nbOccurrences", nbOccurrences)
407
408
409 def setOccurrence(self, occurrence):
410 """
411 Set the occurrence of this interval
412 @param occurrence: an occurrence for this transcript
413 @type occurrence: int
414 """
415 self.setTagValue("occurrence", occurrence)
416
417 def __eq__(self, interval):
418 """
419 Whether two intervals are equal (start and end at same position)
420 @param interval: object to be compared to
421 @type interval: class L{Interval<Interval>}
422 """
423 if not interval:
424 return False
425 return self.getChromosome() == interval.getChromosome() and self.getStart() == interval.getStart() and self.getEnd() == interval.getEnd() and self.getDirection() == interval.getDirection()
426
427
428 def overlapWith(self, interval, nbNucleotides = 1):
429 """
430 Whether two intervals overlap
431 @param interval: object to be compared to
432 @type interval: class L{Interval<Interval>}
433 @param nbNucleotides: minimum number of nucleotides to declare and overlap
434 @type nbNucleotides: int
435 """
436 if self.getChromosome() != interval.getChromosome():
437 return False
438 return (min(self.getEnd(), interval.getEnd()) - max(self.getStart(), interval.getStart()) + 1 >= nbNucleotides)
439
440 def isIncludeIn(self, interval):
441 return interval.include(self)
442
443
444 def include(self, interval):
445 """
446 Whether this interval includes the other one
447 @param interval: object to be compared to
448 @type interval: class L{Interval<Interval>}
449 """
450 if self.getChromosome() != interval.getChromosome():
451 return False
452 return ((self.getStart() <= interval.getStart()) and (self.getEnd() >= interval.getEnd()))
453
454
455 def getDifference(self, interval, sameStrand = False):
456 """
457 Get the difference between this cluster and another one
458 @param interval: object to be compared to
459 @type interval: class L{Interval<Interval>}
460 @param sameStrand: do the comparison iff the intervals are on the same strand
461 @type sameStrand: boolean
462 @return: a (possibly empty) list of intervals
463 """
464 newInterval = Interval()
465 newInterval.copy(self)
466 if self.getChromosome() != interval.getChromosome():
467 return [newInterval]
468 if not self.overlapWith(interval):
469 return [newInterval]
470 if sameStrand and self.getDirection() != interval.getDirection():
471 return [newInterval]
472 intervals = []
473 if self.getStart() < interval.getStart():
474 newInterval = Interval()
475 newInterval.copy(self)
476 newInterval.setEnd(min(self.getEnd(), interval.getStart() - 1))
477 intervals.append(newInterval)
478 if self.getEnd() > interval.getEnd():
479 newInterval = Interval()
480 newInterval.copy(self)
481 newInterval.setStart(max(self.getStart(), interval.getEnd() + 1))
482 intervals.append(newInterval)
483 return intervals
484
485
486 def getIntersection(self, interval):
487 """
488 Get the intersection between this interval and another one
489 @param interval: object to be compared to
490 @type interval: class L{Interval<Interval>}
491 @return: an other interval
492 """
493 if not self.overlapWith(interval):
494 return None
495 newInterval = Interval()
496 newInterval.setChromosome(self.getChromosome())
497 newInterval.setDirection(self.getDirection())
498 newInterval.setName("%s_intersect_%s" % (self.getName(), interval.getName()))
499 newInterval.setStart(max(self.getStart(), interval.getStart()))
500 newInterval.setEnd(min(self.getEnd(), interval.getEnd()))
501 return newInterval
502
503
504 def getDistance(self, interval):
505 """
506 Get the distance between two intervals (a non-negative value)
507 @param interval: another interval
508 @type interval: class L{Interval<Interval>}
509 """
510 if self.overlapWith(interval):
511 return 0
512 if self.getChromosome() != interval.getChromosome():
513 raise Exception("Cannot get the distance between %s and %s" % (str(self), str(interval)))
514 return min(abs(self.getStart() - interval.getEnd()), abs(self.getEnd() - interval.getStart()))
515
516
517 def getRelativeDistance(self, interval):
518 """
519 Get the distance between two intervals (negative if first interval is before)
520 @param interval: another interval
521 @type interval: class L{Interval<Interval>}
522 """
523 if self.overlapWith(interval):
524 return 0
525 if self.getChromosome() != interval.getChromosome():
526 raise Exception("Cannot get the distance between %s and %s" % (str(self), str(interval)))
527 if self.getEnd() < interval.getStart():
528 distance = interval.getStart() - self.getEnd()
529 else:
530 distance = interval.getEnd() - self.getStart()
531 distance *= self.getDirection()
532 return distance
533
534
535 def merge(self, interval, normalization = False):
536 """
537 Merge two intervals
538 @param interval: another interval
539 @type interval: class L{Interval<Interval>}
540 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements
541 @type normalization: boolean
542 """
543 if self.getChromosome() != interval.getChromosome():
544 raise Exception("Cannot merge '%s' and '%s' for they are on different chromosomes." % (str(self), str(interval)))
545 direction = None
546 if self.getStart() == self.getEnd():
547 direction = interval.getDirection()
548 elif interval.getStart() == interval.getEnd():
549 direction = self.getDirection()
550 elif self.getDirection() != interval.getDirection():
551 raise Exception("Cannot merge '%s' and '%s' for they are on different strands." % (str(self), str(interval)))
552 self.setStart(min(self.getStart(), interval.getStart()))
553 self.setEnd(max(self.getEnd(), interval.getEnd()))
554 if direction != None:
555 self.setDirection(direction)
556 nbElements = 0.0
557 for element in (self, interval):
558 for tagName in ("nbElements", "nbOccurrences"):
559 if tagName not in element.getTagNames():
560 element.setTagValue(tagName, 1)
561 nbElements += float(element.getTagValue("nbElements")) / float(element.getTagValue("nbOccurrences")) if normalization else float(element.getTagValue("nbElements"))
562 self.setTagValue("nbElements", nbElements)
563 self.bin = None
564 for tagName in ("identity", "nbOccurrences", "occurrence", "nbMismatches", "nbGaps", "rank", "evalue", "bestRegion"):
565 if tagName in self.getTagNames():
566 del self.tags[tagName]
567
568
569 def getBin(self):
570 """
571 Get the bin of the interval
572 Computed on the fly
573 """
574 if self.bin == None:
575 self.bin = getBin(self.getStart(), self.getEnd())
576 return self.bin
577
578
579 def getBins(self):
580 """
581 Get all the bin this interval could fall into
582 """
583 return getOverlappingBins(self.getStart(), self.getEnd())
584
585
586 def getSqlVariables(cls):
587 """
588 Get the properties of the object that should be saved in a database
589 """
590 variables = ["name", "chromosome", "start", "end", "direction", "tags", "bin"]
591 return variables
592 getSqlVariables = classmethod(getSqlVariables)
593
594
595 def setSqlValues(self, array):
596 """
597 Set the values of the properties of this object as given by a results line of a SQL query
598 """
599 self.id = array[0]
600 self.name = array[1].strip("'")
601 self.setChromosome(array[2].strip("'"))
602 self.setStart(array[3])
603 self.setEnd(array[4])
604 self.setDirection(array[5])
605 self.setTagValues(array[6].strip("'"), ";", "=")
606 self.bin = array[7]
607
608
609 def getSqlValues(self):
610 """
611 Get the values of the properties that should be saved in a database
612 """
613 values = dict()
614 values["name"] = self.name
615 values["chromosome"] = self.getChromosome()
616 values["start"] = self.getStart()
617 values["end"] = self.getEnd()
618 values["direction"] = self.getDirection()
619 values["tags"] = self.getTagValues(";", "=")
620 values["bin"] = self.getBin()
621 return values
622
623
624 def getSqlTypes(cls):
625 """
626 Get the values of the properties that should be saved in a database
627 """
628 types = dict()
629 types["name"] = "varchar"
630 types["chromosome"] = "varchar"
631 types["start"] = "int"
632 types["end"] = "int"
633 types["direction"] = "tinyint"
634 types["tags"] = "varchar"
635 types["bin"] = "int"
636 return types
637 getSqlTypes = classmethod(getSqlTypes)
638
639
640 def getSqlSizes(cls):
641 """
642 Get the sizes of the properties that should be saved in a database
643 """
644 sizes = dict()
645 sizes["name"] = 255
646 sizes["chromosome"] = 255
647 sizes["start"] = 11
648 sizes["end"] = 11
649 sizes["direction"] = 4
650 sizes["tags"] = 1023
651 sizes["bin"] = 11
652 return sizes
653 getSqlSizes = classmethod(getSqlSizes)
654
655
656 def printCoordinates(self):
657 """
658 Print start and end positions (depending on the direction of the interval)
659 """
660 if self.getDirection() == 1:
661 return "%d-%d" % (self.getStart(), self.getEnd())
662 else:
663 return "%d-%d" % (self.getEnd(), self.getStart())
664
665
666 def extractSequence(self, parser):
667 """
668 Get the sequence corresponding to this interval
669 @param parser: a parser to a FASTA file
670 @type parser: class L{SequenceListParser<SequenceListParser>}
671 @return : a instance of L{Sequence<Sequence>}
672 """
673 return parser.getSubSequence(self.getChromosome(), self.getStart(), self.getEnd(), self.getDirection(), self.name)
674
675
676 def extractWigData(self, parser):
677 """
678 Get the data retrieved from a wig file
679 @param parser: a parser class to a WIG file
680 @type parser: class L{WigParser<WigParser>}
681 """
682 data = parser.getRange(self.getChromosome(), self.getStart(), self.getEnd())
683 if self.getDirection() == -1:
684 if parser.strands:
685 newData = {}
686 for strand in data:
687 data[strand].reverse()
688 newData[-strand] = data[strand]
689 data = newData
690 else:
691 data.reverse()
692 return data
693
694
695 def __str__(self):
696 """
697 Output a simple representation of this interval
698 """
699 direction = "+"
700 if self.getDirection() == -1:
701 direction = "-"
702 string = "%s:%d-%d (%s)" % (self.getChromosome(), self.getStart(), self.getEnd(), direction)
703 if self.name != "":
704 string = "(%s) %s" % (self.name, string)
705 return string
706