Mercurial > repos > yutaka-saito > bsfcall
comparison bin/maf-join @ 0:06f8460885ff
migrate from GitHub
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 20:51:13 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:06f8460885ff |
---|---|
1 #! /usr/bin/env python | |
2 | |
3 # Copyright 2009, 2010, 2011 Martin C. Frith | |
4 | |
5 # Join two or more sets of MAF-format multiple alignments into bigger | |
6 # multiple alignments. The 'join field' is the top genome, which | |
7 # should be the same for each input. Each input should be sorted by | |
8 # position in the top genome. | |
9 | |
10 # WARNING: Alignment columns with a gap in the top genome are joined | |
11 # arbitrarily!!! | |
12 | |
13 import sys, os, fileinput, optparse, signal | |
14 | |
15 signal.signal(signal.SIGPIPE, signal.SIG_DFL) # stop spurious error message | |
16 | |
17 class peekable: # Adapted from Python Cookbook 2nd edition | |
18 """An iterator that supports a peek operation.""" | |
19 def __init__(self, iterable): | |
20 self.it = iter(iterable) | |
21 self.cache = [] | |
22 def __iter__(self): | |
23 return self | |
24 def next(self): | |
25 if self.cache: return self.cache.pop() | |
26 else: return self.it.next() | |
27 def peek(self): | |
28 if not self.cache: self.cache.append(self.it.next()) | |
29 return self.cache[0] | |
30 | |
31 def maxLen(things): return max(map(len, things)) | |
32 | |
33 class MafBlock: | |
34 def __init__(self, chr, beg, end, strand, chrSize, seq, prob): | |
35 self.chr = chr # chromosome names | |
36 self.beg = beg # alignment begin coordinates | |
37 self.end = end # alignment end coordinates | |
38 self.strand = strand | |
39 self.chrSize = chrSize # chromosome sizes | |
40 self.seq = seq # aligned sequences, including gaps | |
41 self.prob = prob # probabilities (may be empty) | |
42 | |
43 def __nonzero__(self): | |
44 return len(self.seq) > 0 | |
45 | |
46 def __cmp__(self, other): | |
47 return cmp(self.chr[:1] + self.beg[:1], other.chr[:1] + other.beg[:1]) | |
48 | |
49 def before(self, other): | |
50 return (self.chr[0], self.end[0]) <= (other.chr[0], other.beg[0]) | |
51 | |
52 def after(self, other): | |
53 return (self.chr[0], self.beg[0]) >= (other.chr[0], other.end[0]) | |
54 | |
55 def addLine(self, line): | |
56 words = line.split() | |
57 if line.startswith('s'): | |
58 self.chr.append(words[1]) | |
59 self.beg.append(int(words[2])) | |
60 self.end.append(int(words[2]) + int(words[3])) | |
61 self.strand.append(words[4]) | |
62 self.chrSize.append(words[5]) | |
63 self.seq.append(list(words[6])) | |
64 elif line.startswith('p'): | |
65 self.prob.append(words[1]) | |
66 | |
67 def write(self): | |
68 beg = map(str, self.beg) | |
69 size = [str(e-b) for b, e in zip(self.beg, self.end)] | |
70 seq = [''.join(i) for i in self.seq] | |
71 columns = self.chr, beg, size, self.strand, self.chrSize, seq | |
72 widths = map(maxLen, columns) | |
73 print 'a' | |
74 for row in zip(*columns): | |
75 widthsAndFields = zip(widths, row) | |
76 field0 = "%-*s" % widthsAndFields[0] # left-justify | |
77 fields = ["%*s" % i for i in widthsAndFields[1:]] # right-justify | |
78 print 's', field0, ' '.join(fields) | |
79 pad = ' '.join(' ' * i for i in widths[:-1]) | |
80 for i in self.prob: | |
81 print 'p', pad, i | |
82 print # blank line afterwards | |
83 | |
84 def topSeqBeg(maf): return maf.beg[0] | |
85 def topSeqEnd(maf): return maf.end[0] | |
86 def emptyMaf(): return MafBlock([], [], [], [], [], [], []) | |
87 | |
88 def joinOnFirstItem(x, y): | |
89 if x[0] != y[0]: | |
90 raise ValueError('join fields not equal:\n'+str(x[0])+'\n'+str(y[0])) | |
91 return x + y[1:] | |
92 | |
93 def mafEasyJoin(x, y): | |
94 '''Join two MAF blocks on the top sequence.''' | |
95 xJoin = zip(x.chr, x.beg, x.end, x.strand, x.chrSize, x.seq) | |
96 yJoin = zip(y.chr, y.beg, y.end, y.strand, y.chrSize, y.seq) | |
97 joined = joinOnFirstItem(xJoin, yJoin) | |
98 chr, beg, end, strand, chrSize, seq = zip(*joined) | |
99 prob = x.prob + y.prob | |
100 return MafBlock(chr, beg, end, strand, chrSize, seq, prob) | |
101 | |
102 def countNonGaps(s): return len(s) - s.count('-') | |
103 | |
104 def nthNonGap(s, n): | |
105 '''Get the start position of the n-th non-gap.''' | |
106 for i, x in enumerate(s): | |
107 if x != '-': | |
108 if n == 0: return i | |
109 n -= 1 | |
110 raise ValueError('non-gap not found') | |
111 | |
112 def nthLastNonGap(s, n): | |
113 '''Get the end position of the n-th last non-gap.''' | |
114 return len(s) - nthNonGap(s[::-1], n) | |
115 | |
116 def mafSlice(maf, alnBeg, alnEnd): | |
117 '''Return a slice of a MAF block, using coordinates in the alignment.''' | |
118 beg = [b + countNonGaps(s[:alnBeg]) for b, s in zip(maf.beg, maf.seq)] | |
119 end = [e - countNonGaps(s[alnEnd:]) for e, s in zip(maf.end, maf.seq)] | |
120 seq = [i[alnBeg:alnEnd] for i in maf.seq] | |
121 prob = [i[alnBeg:alnEnd] for i in maf.prob] | |
122 return MafBlock(maf.chr, beg, end, maf.strand, maf.chrSize, seq, prob) | |
123 | |
124 def mafSliceTopSeq(maf, newTopSeqBeg, newTopSeqEnd): | |
125 '''Return a slice of a MAF block, using coordinates in the top sequence.''' | |
126 lettersFromBeg = newTopSeqBeg - topSeqBeg(maf) | |
127 lettersFromEnd = topSeqEnd(maf) - newTopSeqEnd | |
128 alnBeg = nthNonGap(maf.seq[0], lettersFromBeg) | |
129 alnEnd = nthLastNonGap(maf.seq[0], lettersFromEnd) | |
130 return mafSlice(maf, alnBeg, alnEnd) | |
131 | |
132 def jumpGaps(sequence, index): | |
133 '''Return the next index of the sequence where there is a non-gap.''' | |
134 nextIndex = index | |
135 while sequence[nextIndex] == '-': nextIndex += 1 | |
136 return nextIndex | |
137 | |
138 def gapsToAdd(sequences): | |
139 '''Return new gaps and their positions, needed to align the non-gaps.''' | |
140 gapInfo = [[] for i in sequences] | |
141 gapBeg = [0 for i in sequences] | |
142 try: | |
143 while True: | |
144 gapEnd = [jumpGaps(s, p) for s, p in zip(sequences, gapBeg)] | |
145 gapSize = [e-b for b, e in zip(gapBeg, gapEnd)] | |
146 maxGapSize = max(gapSize) | |
147 for s, e, i in zip(gapSize, gapEnd, gapInfo): | |
148 if s < maxGapSize: | |
149 newGap = maxGapSize - s | |
150 i.append((newGap, e)) | |
151 gapBeg = [e+1 for e in gapEnd] | |
152 except IndexError: return gapInfo | |
153 | |
154 def chunksAndGaps(s, gapsAndPositions, oneGap): | |
155 '''Yield chunks of "s" interspersed with gaps at given positions.''' | |
156 oldPosition = 0 | |
157 for gapLen, position in gapsAndPositions: | |
158 yield s[oldPosition:position] | |
159 yield oneGap * gapLen | |
160 oldPosition = position | |
161 yield s[oldPosition:] | |
162 | |
163 def mafAddGaps(maf, gapsAndPositions): | |
164 '''Add the given gaps at the given positions to a MAF block.''' | |
165 maf.seq = [sum(chunksAndGaps(i, gapsAndPositions, ['-']), []) | |
166 for i in maf.seq] | |
167 maf.prob = [''.join(chunksAndGaps(i, gapsAndPositions, '~')) | |
168 for i in maf.prob] | |
169 | |
170 def mafJoin(mafs): | |
171 '''Intersect and join overlapping MAF blocks.''' | |
172 newTopSeqBeg = max(map(topSeqBeg, mafs)) | |
173 newTopSeqEnd = min(map(topSeqEnd, mafs)) | |
174 mafs = [mafSliceTopSeq(i, newTopSeqBeg, newTopSeqEnd) for i in mafs] | |
175 topSeqs = [i.seq[0] for i in mafs] | |
176 gapInfo = gapsToAdd(topSeqs) | |
177 for maf, gapsAndPositions in zip(mafs, gapInfo): | |
178 mafAddGaps(maf, gapsAndPositions) | |
179 return reduce(mafEasyJoin, mafs) | |
180 | |
181 def mafInput(lines): | |
182 '''Read lines and yield MAF blocks.''' | |
183 maf = emptyMaf() | |
184 for line in lines: | |
185 if line.isspace(): | |
186 if maf: yield maf | |
187 maf = emptyMaf() | |
188 else: | |
189 maf.addLine(line) | |
190 if maf: yield maf | |
191 | |
192 def sortedMafInput(lines): | |
193 '''Read lines and yield MAF blocks, checking that they are in order.''' | |
194 old = emptyMaf() | |
195 for maf in mafInput(lines): | |
196 if maf < old: sys.exit(progName + ": MAF blocks not sorted properly") | |
197 yield maf | |
198 old = maf | |
199 | |
200 def allOverlaps(sequences, beg, end): | |
201 '''Yield all combinations of MAF blocks that overlap in the top genome.''' | |
202 assert beg < end | |
203 if not sequences: | |
204 yield () | |
205 return | |
206 for i in sequences[0]: | |
207 if topSeqEnd(i) <= beg: continue | |
208 if topSeqBeg(i) >= end: break # assumes they're sorted by position | |
209 newBeg = max(beg, topSeqBeg(i)) | |
210 newEnd = min(end, topSeqEnd(i)) | |
211 for j in allOverlaps(sequences[1:], newBeg, newEnd): | |
212 yield (i,) + j | |
213 | |
214 def nextWindow(window, input, referenceMaf): | |
215 '''Yield "relevant" MAF blocks, based on overlap with referenceMaf.''' | |
216 for maf in window: | |
217 if not maf.before(referenceMaf): yield maf | |
218 try: | |
219 while True: | |
220 maf = input.peek() | |
221 if maf.after(referenceMaf): break | |
222 maf = input.next() | |
223 if not maf.before(referenceMaf): yield maf | |
224 except StopIteration: pass | |
225 | |
226 def overlappingMafs(sortedMafInputs): | |
227 '''Yield all combinations of MAF blocks that overlap in the top genome.''' | |
228 if not sortedMafInputs: return | |
229 head, tail = sortedMafInputs[0], sortedMafInputs[1:] | |
230 windows = [[] for t in tail] | |
231 for h in head: # iterate over MAF blocks in the first input | |
232 windows = [list(nextWindow(w, t, h)) for w, t in zip(windows, tail)] | |
233 for i in allOverlaps(windows, topSeqBeg(h), topSeqEnd(h)): | |
234 yield (h,) + i | |
235 | |
236 op = optparse.OptionParser(usage="%prog sorted-file1.maf sorted-file2.maf ...") | |
237 (opts, args) = op.parse_args() | |
238 | |
239 progName = os.path.basename(sys.argv[0]) | |
240 | |
241 inputs = [peekable(sortedMafInput(fileinput.input(i))) for i in args] | |
242 | |
243 for mafs in overlappingMafs(inputs): | |
244 mafJoin(mafs).write() |