comparison bin/maf-join @ 0:06f8460885ff

migrate from GitHub
author yutaka-saito
date Sun, 19 Apr 2015 20:51:13 +0900
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:06f8460885ff
1 #! /usr/bin/env python
2
3 # Copyright 2009, 2010, 2011 Martin C. Frith
4
5 # Join two or more sets of MAF-format multiple alignments into bigger
6 # multiple alignments. The 'join field' is the top genome, which
7 # should be the same for each input. Each input should be sorted by
8 # position in the top genome.
9
10 # WARNING: Alignment columns with a gap in the top genome are joined
11 # arbitrarily!!!
12
13 import sys, os, fileinput, optparse, signal
14
15 signal.signal(signal.SIGPIPE, signal.SIG_DFL) # stop spurious error message
16
17 class peekable: # Adapted from Python Cookbook 2nd edition
18 """An iterator that supports a peek operation."""
19 def __init__(self, iterable):
20 self.it = iter(iterable)
21 self.cache = []
22 def __iter__(self):
23 return self
24 def next(self):
25 if self.cache: return self.cache.pop()
26 else: return self.it.next()
27 def peek(self):
28 if not self.cache: self.cache.append(self.it.next())
29 return self.cache[0]
30
31 def maxLen(things): return max(map(len, things))
32
33 class MafBlock:
34 def __init__(self, chr, beg, end, strand, chrSize, seq, prob):
35 self.chr = chr # chromosome names
36 self.beg = beg # alignment begin coordinates
37 self.end = end # alignment end coordinates
38 self.strand = strand
39 self.chrSize = chrSize # chromosome sizes
40 self.seq = seq # aligned sequences, including gaps
41 self.prob = prob # probabilities (may be empty)
42
43 def __nonzero__(self):
44 return len(self.seq) > 0
45
46 def __cmp__(self, other):
47 return cmp(self.chr[:1] + self.beg[:1], other.chr[:1] + other.beg[:1])
48
49 def before(self, other):
50 return (self.chr[0], self.end[0]) <= (other.chr[0], other.beg[0])
51
52 def after(self, other):
53 return (self.chr[0], self.beg[0]) >= (other.chr[0], other.end[0])
54
55 def addLine(self, line):
56 words = line.split()
57 if line.startswith('s'):
58 self.chr.append(words[1])
59 self.beg.append(int(words[2]))
60 self.end.append(int(words[2]) + int(words[3]))
61 self.strand.append(words[4])
62 self.chrSize.append(words[5])
63 self.seq.append(list(words[6]))
64 elif line.startswith('p'):
65 self.prob.append(words[1])
66
67 def write(self):
68 beg = map(str, self.beg)
69 size = [str(e-b) for b, e in zip(self.beg, self.end)]
70 seq = [''.join(i) for i in self.seq]
71 columns = self.chr, beg, size, self.strand, self.chrSize, seq
72 widths = map(maxLen, columns)
73 print 'a'
74 for row in zip(*columns):
75 widthsAndFields = zip(widths, row)
76 field0 = "%-*s" % widthsAndFields[0] # left-justify
77 fields = ["%*s" % i for i in widthsAndFields[1:]] # right-justify
78 print 's', field0, ' '.join(fields)
79 pad = ' '.join(' ' * i for i in widths[:-1])
80 for i in self.prob:
81 print 'p', pad, i
82 print # blank line afterwards
83
84 def topSeqBeg(maf): return maf.beg[0]
85 def topSeqEnd(maf): return maf.end[0]
86 def emptyMaf(): return MafBlock([], [], [], [], [], [], [])
87
88 def joinOnFirstItem(x, y):
89 if x[0] != y[0]:
90 raise ValueError('join fields not equal:\n'+str(x[0])+'\n'+str(y[0]))
91 return x + y[1:]
92
93 def mafEasyJoin(x, y):
94 '''Join two MAF blocks on the top sequence.'''
95 xJoin = zip(x.chr, x.beg, x.end, x.strand, x.chrSize, x.seq)
96 yJoin = zip(y.chr, y.beg, y.end, y.strand, y.chrSize, y.seq)
97 joined = joinOnFirstItem(xJoin, yJoin)
98 chr, beg, end, strand, chrSize, seq = zip(*joined)
99 prob = x.prob + y.prob
100 return MafBlock(chr, beg, end, strand, chrSize, seq, prob)
101
102 def countNonGaps(s): return len(s) - s.count('-')
103
104 def nthNonGap(s, n):
105 '''Get the start position of the n-th non-gap.'''
106 for i, x in enumerate(s):
107 if x != '-':
108 if n == 0: return i
109 n -= 1
110 raise ValueError('non-gap not found')
111
112 def nthLastNonGap(s, n):
113 '''Get the end position of the n-th last non-gap.'''
114 return len(s) - nthNonGap(s[::-1], n)
115
116 def mafSlice(maf, alnBeg, alnEnd):
117 '''Return a slice of a MAF block, using coordinates in the alignment.'''
118 beg = [b + countNonGaps(s[:alnBeg]) for b, s in zip(maf.beg, maf.seq)]
119 end = [e - countNonGaps(s[alnEnd:]) for e, s in zip(maf.end, maf.seq)]
120 seq = [i[alnBeg:alnEnd] for i in maf.seq]
121 prob = [i[alnBeg:alnEnd] for i in maf.prob]
122 return MafBlock(maf.chr, beg, end, maf.strand, maf.chrSize, seq, prob)
123
124 def mafSliceTopSeq(maf, newTopSeqBeg, newTopSeqEnd):
125 '''Return a slice of a MAF block, using coordinates in the top sequence.'''
126 lettersFromBeg = newTopSeqBeg - topSeqBeg(maf)
127 lettersFromEnd = topSeqEnd(maf) - newTopSeqEnd
128 alnBeg = nthNonGap(maf.seq[0], lettersFromBeg)
129 alnEnd = nthLastNonGap(maf.seq[0], lettersFromEnd)
130 return mafSlice(maf, alnBeg, alnEnd)
131
132 def jumpGaps(sequence, index):
133 '''Return the next index of the sequence where there is a non-gap.'''
134 nextIndex = index
135 while sequence[nextIndex] == '-': nextIndex += 1
136 return nextIndex
137
138 def gapsToAdd(sequences):
139 '''Return new gaps and their positions, needed to align the non-gaps.'''
140 gapInfo = [[] for i in sequences]
141 gapBeg = [0 for i in sequences]
142 try:
143 while True:
144 gapEnd = [jumpGaps(s, p) for s, p in zip(sequences, gapBeg)]
145 gapSize = [e-b for b, e in zip(gapBeg, gapEnd)]
146 maxGapSize = max(gapSize)
147 for s, e, i in zip(gapSize, gapEnd, gapInfo):
148 if s < maxGapSize:
149 newGap = maxGapSize - s
150 i.append((newGap, e))
151 gapBeg = [e+1 for e in gapEnd]
152 except IndexError: return gapInfo
153
154 def chunksAndGaps(s, gapsAndPositions, oneGap):
155 '''Yield chunks of "s" interspersed with gaps at given positions.'''
156 oldPosition = 0
157 for gapLen, position in gapsAndPositions:
158 yield s[oldPosition:position]
159 yield oneGap * gapLen
160 oldPosition = position
161 yield s[oldPosition:]
162
163 def mafAddGaps(maf, gapsAndPositions):
164 '''Add the given gaps at the given positions to a MAF block.'''
165 maf.seq = [sum(chunksAndGaps(i, gapsAndPositions, ['-']), [])
166 for i in maf.seq]
167 maf.prob = [''.join(chunksAndGaps(i, gapsAndPositions, '~'))
168 for i in maf.prob]
169
170 def mafJoin(mafs):
171 '''Intersect and join overlapping MAF blocks.'''
172 newTopSeqBeg = max(map(topSeqBeg, mafs))
173 newTopSeqEnd = min(map(topSeqEnd, mafs))
174 mafs = [mafSliceTopSeq(i, newTopSeqBeg, newTopSeqEnd) for i in mafs]
175 topSeqs = [i.seq[0] for i in mafs]
176 gapInfo = gapsToAdd(topSeqs)
177 for maf, gapsAndPositions in zip(mafs, gapInfo):
178 mafAddGaps(maf, gapsAndPositions)
179 return reduce(mafEasyJoin, mafs)
180
181 def mafInput(lines):
182 '''Read lines and yield MAF blocks.'''
183 maf = emptyMaf()
184 for line in lines:
185 if line.isspace():
186 if maf: yield maf
187 maf = emptyMaf()
188 else:
189 maf.addLine(line)
190 if maf: yield maf
191
192 def sortedMafInput(lines):
193 '''Read lines and yield MAF blocks, checking that they are in order.'''
194 old = emptyMaf()
195 for maf in mafInput(lines):
196 if maf < old: sys.exit(progName + ": MAF blocks not sorted properly")
197 yield maf
198 old = maf
199
200 def allOverlaps(sequences, beg, end):
201 '''Yield all combinations of MAF blocks that overlap in the top genome.'''
202 assert beg < end
203 if not sequences:
204 yield ()
205 return
206 for i in sequences[0]:
207 if topSeqEnd(i) <= beg: continue
208 if topSeqBeg(i) >= end: break # assumes they're sorted by position
209 newBeg = max(beg, topSeqBeg(i))
210 newEnd = min(end, topSeqEnd(i))
211 for j in allOverlaps(sequences[1:], newBeg, newEnd):
212 yield (i,) + j
213
214 def nextWindow(window, input, referenceMaf):
215 '''Yield "relevant" MAF blocks, based on overlap with referenceMaf.'''
216 for maf in window:
217 if not maf.before(referenceMaf): yield maf
218 try:
219 while True:
220 maf = input.peek()
221 if maf.after(referenceMaf): break
222 maf = input.next()
223 if not maf.before(referenceMaf): yield maf
224 except StopIteration: pass
225
226 def overlappingMafs(sortedMafInputs):
227 '''Yield all combinations of MAF blocks that overlap in the top genome.'''
228 if not sortedMafInputs: return
229 head, tail = sortedMafInputs[0], sortedMafInputs[1:]
230 windows = [[] for t in tail]
231 for h in head: # iterate over MAF blocks in the first input
232 windows = [list(nextWindow(w, t, h)) for w, t in zip(windows, tail)]
233 for i in allOverlaps(windows, topSeqBeg(h), topSeqEnd(h)):
234 yield (h,) + i
235
236 op = optparse.OptionParser(usage="%prog sorted-file1.maf sorted-file2.maf ...")
237 (opts, args) = op.parse_args()
238
239 progName = os.path.basename(sys.argv[0])
240
241 inputs = [peekable(sortedMafInput(fileinput.input(i))) for i in args]
242
243 for mafs in overlappingMafs(inputs):
244 mafJoin(mafs).write()