0
|
1 #! /usr/bin/env python
|
|
2
|
|
3 # Read MAF-format alignments. Write them, omitting alignments whose
|
|
4 # coordinates in the top-most sequence are contained in those of >=
|
|
5 # cullingLimit higher-scoring alignments.
|
|
6
|
|
7 # Alignments on opposite strands are not considered to contain each
|
|
8 # other.
|
|
9
|
|
10 # The alignments must be sorted by sequence name, then strand, then
|
|
11 # start coordinate.
|
|
12
|
|
13 # This algorithm is not theoretically optimal, but it is simple and
|
|
14 # probably fast in practice. Optimal algorithms are described in:
|
|
15 # Winnowing sequences from a database search.
|
|
16 # Berman P, Zhang Z, Wolf YI, Koonin EV, Miller W.
|
|
17 # J Comput Biol. 2000 Feb-Apr;7(1-2):293-302.
|
|
18 # (Use a "priority search tree" or an "interval tree".)
|
|
19
|
|
20 # Seems to work with Python 2.x, x>=4
|
|
21
|
|
22 import fileinput, itertools, operator, optparse, os, signal, sys
|
|
23
|
|
24 # The intervals must have size > 0.
|
|
25
|
|
26 def isFresh(oldInterval, newInterval):
|
|
27 return oldInterval.end > newInterval.start
|
|
28
|
|
29 def freshIntervals(storedIntervals, newInterval):
|
|
30 """Yields those storedIntervals that overlap newInterval."""
|
|
31 return [i for i in storedIntervals if isFresh(i, newInterval)]
|
|
32
|
|
33 def isDominated(dog, queen):
|
|
34 return dog.score < queen.score and dog.end <= queen.end
|
|
35
|
|
36 def isWanted(newInterval, storedIntervals, cullingLimit):
|
|
37 """Is newInterval dominated by < cullingLimit storedIntervals?"""
|
|
38 dominators = (i for i in storedIntervals if isDominated(newInterval, i))
|
|
39 return len(list(dominators)) < cullingLimit
|
|
40
|
|
41 # Check that the intervals are sorted by start position, and further
|
|
42 # sort them in descending order of score.
|
|
43 def sortedIntervals(intervals):
|
|
44 oldStart = ()
|
|
45 for k, v in itertools.groupby(intervals, operator.attrgetter("start")):
|
|
46 if k < oldStart: raise Exception("the input is not sorted properly")
|
|
47 oldStart = k
|
|
48 for i in sorted(v, key=operator.attrgetter("score"), reverse=True):
|
|
49 yield i
|
|
50
|
|
51 def culledIntervals(intervals, cullingLimit):
|
|
52 """Yield intervals contained in < cullingLimit higher-scoring intervals."""
|
|
53 storedIntervals = []
|
|
54 for i in sortedIntervals(intervals):
|
|
55 storedIntervals = freshIntervals(storedIntervals, i)
|
|
56 if isWanted(i, storedIntervals, cullingLimit):
|
|
57 yield i
|
|
58 storedIntervals.append(i)
|
|
59
|
|
60 class Maf:
|
|
61 def __init__(self, lines):
|
|
62 self.lines = lines
|
|
63 try:
|
|
64 aLine = lines[0]
|
|
65 aWords = aLine.split()
|
|
66 scoreGen = (i for i in aWords if i.startswith("score="))
|
|
67 scoreWord = scoreGen.next()
|
|
68 self.score = float(scoreWord.split("=")[1])
|
|
69 except: raise Exception("missing score")
|
|
70 try:
|
|
71 sLine = lines[1]
|
|
72 sWords = sLine.split()
|
|
73 seqName = sWords[1]
|
|
74 alnStart = int(sWords[2])
|
|
75 alnSize = int(sWords[3])
|
|
76 strand = sWords[4]
|
|
77 self.start = seqName, strand, alnStart
|
|
78 self.end = seqName, strand, alnStart + alnSize
|
|
79 except: raise Exception("can't interpret the MAF data")
|
|
80
|
|
81 def mafInput(lines):
|
|
82 for k, v in itertools.groupby(lines, str.isspace):
|
|
83 if not k: yield Maf(list(v))
|
|
84
|
|
85 def filterComments(lines):
|
|
86 for i in lines:
|
|
87 if i.startswith("#"): print i,
|
|
88 else: yield i
|
|
89
|
|
90 def mafCull(opts, args):
|
|
91 inputLines = fileinput.input(args)
|
|
92 inputMafs = mafInput(filterComments(inputLines))
|
|
93 for maf in culledIntervals(inputMafs, opts.limit):
|
|
94 for i in maf.lines: print i,
|
|
95 print # blank line after each alignment
|
|
96
|
|
97 if __name__ == "__main__":
|
|
98 signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message
|
|
99
|
|
100 usage = "%prog [options] my-alignments.maf"
|
|
101 description = "Cull alignments whose top-sequence coordinates are contained in LIMIT or more higher-scoring alignments."
|
|
102 op = optparse.OptionParser(usage=usage, description=description)
|
|
103 op.add_option("-l", "--limit", type="int", default=2,
|
|
104 help="culling limit (default: %default)")
|
|
105 (opts, args) = op.parse_args()
|
|
106 if opts.limit < 1: op.error("option -l: should be >= 1")
|
|
107
|
|
108 try: mafCull(opts, args)
|
|
109 except KeyboardInterrupt: pass # avoid silly error message
|
|
110 except Exception, e:
|
|
111 prog = os.path.basename(sys.argv[0])
|
|
112 sys.exit(prog + ": error: " + str(e))
|