Mercurial > repos > yutaka-saito > bsfcall
comparison bin/maf-convert @ 0:06f8460885ff
migrate from GitHub
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 20:51:13 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:06f8460885ff |
---|---|
1 #! /usr/bin/env python | |
2 # Copyright 2010, 2011, 2013, 2014 Martin C. Frith | |
3 # Read MAF-format alignments: write them in other formats. | |
4 # Seems to work with Python 2.x, x>=4 | |
5 | |
6 # By "MAF" we mean "multiple alignment format" described in the UCSC | |
7 # Genome FAQ, not e.g. "MIRA assembly format". | |
8 | |
9 from itertools import * | |
10 import sys, os, fileinput, math, operator, optparse, signal, string | |
11 | |
12 def maxlen(s): | |
13 return max(map(len, s)) | |
14 | |
15 def joined(things, delimiter): | |
16 return delimiter.join(map(str, things)) | |
17 | |
18 identityTranslation = string.maketrans("", "") | |
19 def deleted(myString, deleteChars): | |
20 return myString.translate(identityTranslation, deleteChars) | |
21 | |
22 def quantify(iterable, pred=bool): | |
23 """Count how many times the predicate is true.""" | |
24 return sum(map(pred, iterable)) | |
25 | |
26 class Maf: | |
27 def __init__(self, aLine, sLines, qLines, pLines): | |
28 self.namesAndValues = dict(i.split("=") for i in aLine[1:]) | |
29 | |
30 if not sLines: raise Exception("empty alignment") | |
31 cols = zip(*sLines) | |
32 self.seqNames = cols[1] | |
33 self.alnStarts = map(int, cols[2]) | |
34 self.alnSizes = map(int, cols[3]) | |
35 self.strands = cols[4] | |
36 self.seqSizes = map(int, cols[5]) | |
37 self.alnStrings = cols[6] | |
38 | |
39 self.qLines = qLines | |
40 self.pLines = pLines | |
41 | |
42 def dieUnlessPairwise(maf): | |
43 if len(maf.alnStrings) != 2: | |
44 raise Exception("pairwise alignments only, please") | |
45 | |
46 def insertionCounts(alnString): | |
47 gaps = alnString.count("-") | |
48 forwardFrameshifts = alnString.count("\\") | |
49 reverseFrameshifts = alnString.count("/") | |
50 letters = len(alnString) - gaps - forwardFrameshifts - reverseFrameshifts | |
51 return letters, forwardFrameshifts, reverseFrameshifts | |
52 | |
53 def coordinatesPerLetter(alnString, alnSize): | |
54 letters, forwardShifts, reverseShifts = insertionCounts(alnString) | |
55 if forwardShifts or reverseShifts or letters < alnSize: return 3 | |
56 else: return 1 | |
57 | |
58 def mafLetterSizes(maf): | |
59 return map(coordinatesPerLetter, maf.alnStrings, maf.alnSizes) | |
60 | |
61 def alnLetterSizes(sLines): | |
62 return [coordinatesPerLetter(i[6], int(i[3])) for i in sLines] | |
63 | |
64 def isTranslated(sLines): | |
65 for i in sLines: | |
66 alnString = i[6] | |
67 if "\\" in alnString or "/" in alnString: return True | |
68 if len(alnString) - alnString.count("-") < int(i[3]): return True | |
69 return False | |
70 | |
71 def insertionSize(alnString, letterSize): | |
72 """Get the length of sequence included in the alnString.""" | |
73 letters, forwardShifts, reverseShifts = insertionCounts(alnString) | |
74 return letters * letterSize + forwardShifts - reverseShifts | |
75 | |
76 def symbolSize(symbol, letterSize): | |
77 if symbol == "-": return 0 | |
78 if symbol == "\\": return 1 | |
79 if symbol == "/": return -1 | |
80 return letterSize | |
81 | |
82 def isMatch(alignmentColumn): | |
83 # No special treatment of ambiguous bases/residues: same as NCBI BLAST. | |
84 first = alignmentColumn[0].upper() | |
85 for i in alignmentColumn[1:]: | |
86 if i.upper() != first: return False | |
87 return True | |
88 | |
89 def isGapless(alignmentColumn): | |
90 return "-" not in alignmentColumn | |
91 | |
92 def matchAndInsertSizes(alignmentColumns, letterSizes): | |
93 """Get sizes of gapless blocks, and of the inserts between them.""" | |
94 for k, v in groupby(alignmentColumns, isGapless): | |
95 if k: | |
96 sizeOfGaplessBlock = len(list(v)) | |
97 yield str(sizeOfGaplessBlock) | |
98 else: | |
99 blockRows = izip(*v) | |
100 blockRows = imap(''.join, blockRows) | |
101 insertSizes = imap(insertionSize, blockRows, letterSizes) | |
102 yield joined(insertSizes, ":") | |
103 | |
104 ##### Routines for converting to AXT format: ##### | |
105 | |
106 axtCounter = count() | |
107 | |
108 def writeAxt(maf): | |
109 if maf.strands[0] != "+": | |
110 raise Exception("for AXT, the 1st strand in each alignment must be +") | |
111 | |
112 # Convert to AXT's 1-based coordinates: | |
113 alnStarts = imap(operator.add, maf.alnStarts, repeat(1)) | |
114 alnEnds = imap(operator.add, maf.alnStarts, maf.alnSizes) | |
115 | |
116 rows = zip(maf.seqNames, alnStarts, alnEnds, maf.strands) | |
117 head, tail = rows[0], rows[1:] | |
118 | |
119 outWords = [] | |
120 outWords.append(axtCounter.next()) | |
121 outWords.extend(head[:3]) | |
122 outWords.extend(chain(*tail)) | |
123 outWords.append(maf.namesAndValues["score"]) | |
124 | |
125 print joined(outWords, " ") | |
126 for i in maf.alnStrings: print i | |
127 print # print a blank line at the end | |
128 | |
129 ##### Routines for converting to tabular format: ##### | |
130 | |
131 def writeTab(maf): | |
132 aLine, sLines, qLines, pLines = maf | |
133 | |
134 score = "0" | |
135 expect = None | |
136 mismap = None | |
137 for i in aLine: | |
138 if i.startswith("score="): score = i[6:] | |
139 elif i.startswith("expect="): expect = i[7:] | |
140 elif i.startswith("mismap="): mismap = i[7:] | |
141 | |
142 outWords = [] | |
143 outWords.append(score) | |
144 | |
145 for i in sLines: outWords.extend(i[1:6]) | |
146 | |
147 alnStrings = [i[6] for i in sLines] | |
148 alignmentColumns = zip(*alnStrings) | |
149 letterSizes = alnLetterSizes(sLines) | |
150 gapStrings = matchAndInsertSizes(alignmentColumns, letterSizes) | |
151 gapWord = ",".join(gapStrings) | |
152 outWords.append(gapWord) | |
153 | |
154 if expect: outWords.append(expect) | |
155 if mismap: outWords.append(mismap) | |
156 | |
157 print "\t".join(outWords) | |
158 | |
159 ##### Routines for converting to PSL format: ##### | |
160 | |
161 def pslBlocks(alignmentColumns, alnStarts, letterSizes): | |
162 """Get sizes and start coordinates of gapless blocks in an alignment.""" | |
163 start1, start2 = alnStarts | |
164 letterSize1, letterSize2 = letterSizes | |
165 size = 0 | |
166 for x, y in alignmentColumns: | |
167 if x == "-": | |
168 if size: | |
169 yield size, start1, start2 | |
170 start1 += size * letterSize1 | |
171 start2 += size * letterSize2 | |
172 size = 0 | |
173 start2 += symbolSize(y, letterSize2) | |
174 elif y == "-": | |
175 if size: | |
176 yield size, start1, start2 | |
177 start1 += size * letterSize1 | |
178 start2 += size * letterSize2 | |
179 size = 0 | |
180 start1 += symbolSize(x, letterSize1) | |
181 else: | |
182 size += 1 | |
183 if size: yield size, start1, start2 | |
184 | |
185 def pslCommaString(things): | |
186 # UCSC software seems to prefer a trailing comma | |
187 return joined(things, ",") + "," | |
188 | |
189 def gapRunCount(letters): | |
190 """Get the number of runs of gap characters.""" | |
191 uniqLetters = map(operator.itemgetter(0), groupby(letters)) | |
192 return uniqLetters.count("-") | |
193 | |
194 def pslEndpoints(alnStart, alnSize, strand, seqSize): | |
195 alnEnd = alnStart + alnSize | |
196 if strand == "+": return alnStart, alnEnd | |
197 else: return seqSize - alnEnd, seqSize - alnStart | |
198 | |
199 def caseSensitivePairwiseMatchCounts(columns, isProtein): | |
200 # repMatches is always zero | |
201 # for proteins, nCount is always zero, because that's what BLATv34 does | |
202 standardBases = "ACGTU" | |
203 matches = mismatches = repMatches = nCount = 0 | |
204 for i in columns: | |
205 if "-" in i: continue | |
206 x, y = i | |
207 if x in standardBases and y in standardBases or isProtein: | |
208 if x == y: matches += 1 | |
209 else: mismatches += 1 | |
210 else: nCount += 1 | |
211 return matches, mismatches, repMatches, nCount | |
212 | |
213 def writePsl(maf, isProtein): | |
214 dieUnlessPairwise(maf) | |
215 | |
216 alnStrings = map(str.upper, maf.alnStrings) | |
217 alignmentColumns = zip(*alnStrings) | |
218 letterSizes = mafLetterSizes(maf) | |
219 | |
220 matchCounts = caseSensitivePairwiseMatchCounts(alignmentColumns, isProtein) | |
221 matches, mismatches, repMatches, nCount = matchCounts | |
222 numGaplessColumns = sum(matchCounts) | |
223 | |
224 qNumInsert = gapRunCount(maf.alnStrings[0]) | |
225 qBaseInsert = maf.alnSizes[1] - numGaplessColumns * letterSizes[1] | |
226 | |
227 tNumInsert = gapRunCount(maf.alnStrings[1]) | |
228 tBaseInsert = maf.alnSizes[0] - numGaplessColumns * letterSizes[0] | |
229 | |
230 strand = maf.strands[1] | |
231 if max(letterSizes) > 1: | |
232 strand += maf.strands[0] | |
233 elif maf.strands[0] != "+": | |
234 raise Exception("for non-translated PSL, the 1st strand in each alignment must be +") | |
235 | |
236 tName, qName = maf.seqNames | |
237 tSeqSize, qSeqSize = maf.seqSizes | |
238 | |
239 rows = zip(maf.alnStarts, maf.alnSizes, maf.strands, maf.seqSizes) | |
240 tStart, tEnd = pslEndpoints(*rows[0]) | |
241 qStart, qEnd = pslEndpoints(*rows[1]) | |
242 | |
243 blocks = list(pslBlocks(alignmentColumns, maf.alnStarts, letterSizes)) | |
244 blockCount = len(blocks) | |
245 blockSizes, tStarts, qStarts = map(pslCommaString, zip(*blocks)) | |
246 | |
247 outWords = (matches, mismatches, repMatches, nCount, | |
248 qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand, | |
249 qName, qSeqSize, qStart, qEnd, tName, tSeqSize, tStart, tEnd, | |
250 blockCount, blockSizes, qStarts, tStarts) | |
251 print joined(outWords, "\t") | |
252 | |
253 ##### Routines for converting to SAM format: ##### | |
254 | |
255 def readGroupId(readGroupItems): | |
256 for i in readGroupItems: | |
257 if i.startswith("ID:"): | |
258 return i[3:] | |
259 raise Exception("readgroup must include ID") | |
260 | |
261 def readSequenceLengths(lines): | |
262 """Read name & length of topmost sequence in each maf block.""" | |
263 sequenceLengths = {} # an OrderedDict might be nice | |
264 isSearching = True | |
265 for line in lines: | |
266 if line.isspace(): isSearching = True | |
267 elif isSearching: | |
268 w = line.split() | |
269 if w[0] == "s": | |
270 seqName, seqSize = w[1], w[5] | |
271 sequenceLengths[seqName] = seqSize | |
272 isSearching = False | |
273 return sequenceLengths | |
274 | |
275 def naturalSortKey(s): | |
276 """Return a key that sorts strings in "natural" order.""" | |
277 return [(str, int)[k]("".join(v)) for k, v in groupby(s, str.isdigit)] | |
278 | |
279 def karyotypicSortKey(s): | |
280 """Attempt to sort chromosomes in GATK's ridiculous order.""" | |
281 if s == "chrM": return [] | |
282 if s == "MT": return ["~"] | |
283 return naturalSortKey(s) | |
284 | |
285 def writeSamHeader(sequenceLengths, dictFile, readGroupItems): | |
286 print "@HD\tVN:1.3\tSO:unsorted" | |
287 for k in sorted(sequenceLengths, key=karyotypicSortKey): | |
288 print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k]) | |
289 if dictFile: | |
290 for i in fileinput.input(dictFile): | |
291 if i.startswith("@SQ"): print i, | |
292 elif not i.startswith("@"): break | |
293 if readGroupItems: | |
294 print "@RG\t" + "\t".join(readGroupItems) | |
295 | |
296 mapqMissing = "255" | |
297 mapqMaximum = "254" | |
298 mapqMaximumNum = float(mapqMaximum) | |
299 | |
300 def mapqFromProb(probString): | |
301 try: p = float(probString) | |
302 except ValueError: raise Exception("bad probability: " + probString) | |
303 if p < 0 or p > 1: raise Exception("bad probability: " + probString) | |
304 if p == 0: return mapqMaximum | |
305 phred = -10 * math.log(p, 10) | |
306 if phred >= mapqMaximumNum: return mapqMaximum | |
307 return str(int(round(phred))) | |
308 | |
309 def cigarParts(beg, alignmentColumns, end): | |
310 if beg: yield str(beg) + "H" | |
311 | |
312 # (doesn't handle translated alignments) | |
313 isActive = True | |
314 for x, y in alignmentColumns: break | |
315 else: isActive = False | |
316 while isActive: | |
317 size = 1 | |
318 if x == "-": | |
319 for x, y in alignmentColumns: | |
320 if x != "-": break | |
321 size += 1 | |
322 else: isActive = False | |
323 yield str(size) + "I" | |
324 elif y == "-": | |
325 for x, y in alignmentColumns: | |
326 if y != "-": break | |
327 size += 1 | |
328 else: isActive = False | |
329 yield str(size) + "D" | |
330 else: | |
331 for x, y in alignmentColumns: | |
332 if x == "-" or y == "-": break | |
333 size += 1 | |
334 else: isActive = False | |
335 yield str(size) + "M" | |
336 | |
337 if end: yield str(end) + "H" | |
338 | |
339 def writeSam(maf, rg): | |
340 aLine, sLines, qLines, pLines = maf | |
341 | |
342 if isTranslated(sLines): | |
343 raise Exception("this looks like translated DNA - can't convert to SAM format") | |
344 | |
345 score = None | |
346 evalue = None | |
347 mapq = mapqMissing | |
348 for i in aLine: | |
349 if i.startswith("score="): | |
350 v = i[6:] | |
351 if v.isdigit(): score = "AS:i:" + v # it must be an integer | |
352 elif i.startswith("expect="): | |
353 evalue = "EV:Z:" + i[7:] | |
354 elif i.startswith("mismap="): | |
355 mapq = mapqFromProb(i[7:]) | |
356 | |
357 head, tail = sLines[0], sLines[1:] | |
358 | |
359 s, rName, rStart, rAlnSize, rStrand, rSeqSize, rAlnString = head | |
360 if rStrand != "+": | |
361 raise Exception("for SAM, the 1st strand in each alignment must be +") | |
362 pos = str(int(rStart) + 1) # convert to 1-based coordinate | |
363 | |
364 for s, qName, qStart, qAlnSize, qStrand, qSeqSize, qAlnString in tail: | |
365 alignmentColumns = zip(rAlnString.upper(), qAlnString.upper()) | |
366 | |
367 qStart = int(qStart) | |
368 qRevStart = int(qSeqSize) - qStart - int(qAlnSize) | |
369 cigar = "".join(cigarParts(qStart, iter(alignmentColumns), qRevStart)) | |
370 | |
371 seq = deleted(qAlnString, "-") | |
372 | |
373 qual = "*" | |
374 if qLines: | |
375 q, qualityName, qualityString = qLines[0] | |
376 # don't try to handle multiple alignments for now (YAGNI): | |
377 if len(qLines) > 1 or len(tail) > 1 or qualityName != qName: | |
378 raise Exception("can't interpret the quality data") | |
379 qual = ''.join(j for i, j in izip(qAlnString, qualityString) | |
380 if i != "-") | |
381 | |
382 # It's hard to get all the pair info, so this is very | |
383 # incomplete, but hopefully good enough. | |
384 # I'm not sure whether to add 2 and/or 8 to flag. | |
385 if qName.endswith("/1"): | |
386 qName = qName[:-2] | |
387 if qStrand == "+": flag = "99" # 1 + 2 + 32 + 64 | |
388 else: flag = "83" # 1 + 2 + 16 + 64 | |
389 elif qName.endswith("/2"): | |
390 qName = qName[:-2] | |
391 if qStrand == "+": flag = "163" # 1 + 2 + 32 + 128 | |
392 else: flag = "147" # 1 + 2 + 16 + 128 | |
393 else: | |
394 if qStrand == "+": flag = "0" | |
395 else: flag = "16" | |
396 | |
397 editDistance = sum(1 for x, y in alignmentColumns if x != y) | |
398 # no special treatment of ambiguous bases: might be a minor bug | |
399 editDistance = "NM:i:" + str(editDistance) | |
400 | |
401 outWords = [qName, flag, rName, pos, mapq, cigar, "*\t0\t0", seq, qual] | |
402 outWords.append(editDistance) | |
403 if score: outWords.append(score) | |
404 if evalue: outWords.append(evalue) | |
405 if rg: outWords.append(rg) | |
406 print "\t".join(outWords) | |
407 | |
408 ##### Routines for converting to BLAST-like format: ##### | |
409 | |
410 def pairwiseMatchSymbol(alignmentColumn): | |
411 if isMatch(alignmentColumn): return "|" | |
412 else: return " " | |
413 | |
414 def strandText(strand): | |
415 if strand == "+": return "Plus" | |
416 else: return "Minus" | |
417 | |
418 def blastCoordinate(oneBasedCoordinate, strand, seqSize): | |
419 if strand == "-": | |
420 oneBasedCoordinate = seqSize - oneBasedCoordinate + 1 | |
421 return str(oneBasedCoordinate) | |
422 | |
423 def chunker(things, chunkSize): | |
424 for i in range(0, len(things), chunkSize): | |
425 yield things[i:i+chunkSize] | |
426 | |
427 def blastChunker(maf, lineSize, alignmentColumns): | |
428 letterSizes = mafLetterSizes(maf) | |
429 coords = maf.alnStarts | |
430 for chunkCols in chunker(alignmentColumns, lineSize): | |
431 chunkRows = zip(*chunkCols) | |
432 chunkRows = map(''.join, chunkRows) | |
433 starts = [i + 1 for i in coords] # change to 1-based coordinates | |
434 starts = map(blastCoordinate, starts, maf.strands, maf.seqSizes) | |
435 increments = map(insertionSize, chunkRows, letterSizes) | |
436 coords = map(operator.add, coords, increments) | |
437 ends = map(blastCoordinate, coords, maf.strands, maf.seqSizes) | |
438 yield chunkCols, chunkRows, starts, ends | |
439 | |
440 def writeBlast(maf, lineSize, oldQueryName): | |
441 dieUnlessPairwise(maf) | |
442 | |
443 if maf.seqNames[1] != oldQueryName: | |
444 print "Query= %s" % maf.seqNames[1] | |
445 print " (%s letters)" % maf.seqSizes[1] | |
446 print | |
447 | |
448 print ">%s" % maf.seqNames[0] | |
449 print " Length = %s" % maf.seqSizes[0] | |
450 print | |
451 | |
452 scoreLine = " Score = %s" % maf.namesAndValues["score"] | |
453 try: scoreLine += ", Expect = %s" % maf.namesAndValues["expect"] | |
454 except KeyError: pass | |
455 print scoreLine | |
456 | |
457 alignmentColumns = zip(*maf.alnStrings) | |
458 | |
459 alnSize = len(alignmentColumns) | |
460 matches = quantify(alignmentColumns, isMatch) | |
461 matchPercent = 100 * matches // alnSize # round down, like BLAST | |
462 identLine = " Identities = %s/%s (%s%%)" % (matches, alnSize, matchPercent) | |
463 gaps = alnSize - quantify(alignmentColumns, isGapless) | |
464 gapPercent = 100 * gaps // alnSize # round down, like BLAST | |
465 if gaps: identLine += ", Gaps = %s/%s (%s%%)" % (gaps, alnSize, gapPercent) | |
466 print identLine | |
467 | |
468 strands = map(strandText, maf.strands) | |
469 print " Strand = %s / %s" % (strands[1], strands[0]) | |
470 print | |
471 | |
472 for chunk in blastChunker(maf, lineSize, alignmentColumns): | |
473 cols, rows, starts, ends = chunk | |
474 startWidth = maxlen(starts) | |
475 matchSymbols = map(pairwiseMatchSymbol, cols) | |
476 matchSymbols = ''.join(matchSymbols) | |
477 print "Query: %-*s %s %s" % (startWidth, starts[1], rows[1], ends[1]) | |
478 print " %-*s %s" % (startWidth, " ", matchSymbols) | |
479 print "Sbjct: %-*s %s %s" % (startWidth, starts[0], rows[0], ends[0]) | |
480 print | |
481 | |
482 ##### Routines for converting to HTML format: ##### | |
483 | |
484 def writeHtmlHeader(): | |
485 print ''' | |
486 <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" | |
487 "http://www.w3.org/TR/html4/strict.dtd"> | |
488 <html lang="en"><head> | |
489 <meta http-equiv="Content-type" content="text/html; charset=UTF-8"> | |
490 <title>Reliable Alignments</title> | |
491 <style type="text/css"> | |
492 /* Try to force monospace, working around browser insanity: */ | |
493 pre {font-family: "Courier New", monospace, serif; font-size: 0.8125em} | |
494 .a {background-color: #3333FF} | |
495 .b {background-color: #9933FF} | |
496 .c {background-color: #FF66CC} | |
497 .d {background-color: #FF3333} | |
498 .e {background-color: #FF9933} | |
499 .f {background-color: #FFFF00} | |
500 .key {display:inline; margin-right:2em} | |
501 </style> | |
502 </head><body> | |
503 | |
504 <div style="line-height:1"> | |
505 <pre class="key"><span class="a"> </span> prob > 0.999</pre> | |
506 <pre class="key"><span class="b"> </span> prob > 0.99 </pre> | |
507 <pre class="key"><span class="c"> </span> prob > 0.95 </pre> | |
508 <pre class="key"><span class="d"> </span> prob > 0.9 </pre> | |
509 <pre class="key"><span class="e"> </span> prob > 0.5 </pre> | |
510 <pre class="key"><span class="f"> </span> prob ≤ 0.5 </pre> | |
511 </div> | |
512 ''' | |
513 | |
514 def probabilityClass(probabilityColumn): | |
515 p = ord(min(probabilityColumn)) - 33 | |
516 if p >= 30: return 'a' | |
517 elif p >= 20: return 'b' | |
518 elif p >= 13: return 'c' | |
519 elif p >= 10: return 'd' | |
520 elif p >= 3: return 'e' | |
521 else: return 'f' | |
522 | |
523 def identicalRuns(s): | |
524 """Yield (item, start, end) for each run of identical items in s.""" | |
525 beg = 0 | |
526 for k, v in groupby(s): | |
527 end = beg + len(list(v)) | |
528 yield k, beg, end | |
529 beg = end | |
530 | |
531 def htmlSpan(text, classRun): | |
532 key, beg, end = classRun | |
533 textbit = text[beg:end] | |
534 if key: return '<span class="%s">%s</span>' % (key, textbit) | |
535 else: return textbit | |
536 | |
537 def multipleMatchSymbol(alignmentColumn): | |
538 if isMatch(alignmentColumn): return "*" | |
539 else: return " " | |
540 | |
541 def writeHtml(maf, lineSize): | |
542 scoreLine = "Alignment" | |
543 try: | |
544 scoreLine += " score=%s" % maf.namesAndValues["score"] | |
545 scoreLine += ", expect=%s" % maf.namesAndValues["expect"] | |
546 except KeyError: pass | |
547 print "<h3>%s:</h3>" % scoreLine | |
548 | |
549 if maf.pLines: | |
550 probRows = [i[1] for i in maf.pLines] | |
551 probCols = izip(*probRows) | |
552 classes = imap(probabilityClass, probCols) | |
553 else: | |
554 classes = repeat(None) | |
555 | |
556 nameWidth = maxlen(maf.seqNames) | |
557 alignmentColumns = zip(*maf.alnStrings) | |
558 | |
559 print '<pre>' | |
560 for chunk in blastChunker(maf, lineSize, alignmentColumns): | |
561 cols, rows, starts, ends = chunk | |
562 startWidth = maxlen(starts) | |
563 endWidth = maxlen(ends) | |
564 matchSymbols = map(multipleMatchSymbol, cols) | |
565 matchSymbols = ''.join(matchSymbols) | |
566 classChunk = islice(classes, lineSize) | |
567 classRuns = list(identicalRuns(classChunk)) | |
568 for n, s, r, e in zip(maf.seqNames, starts, rows, ends): | |
569 spans = [htmlSpan(r, i) for i in classRuns] | |
570 spans = ''.join(spans) | |
571 formatParams = nameWidth, n, startWidth, s, spans, endWidth, e | |
572 print '%-*s %*s %s %*s' % formatParams | |
573 print ' ' * nameWidth, ' ' * startWidth, matchSymbols | |
574 print | |
575 print '</pre>' | |
576 | |
577 ##### Routines for reading MAF format: ##### | |
578 | |
579 def mafInput(lines, isKeepComments): | |
580 a = [] | |
581 s = [] | |
582 q = [] | |
583 p = [] | |
584 for i in lines: | |
585 w = i.split() | |
586 if not w: | |
587 if a: yield a, s, q, p | |
588 a = [] | |
589 s = [] | |
590 q = [] | |
591 p = [] | |
592 elif w[0] == "a": | |
593 a = w | |
594 elif w[0] == "s": | |
595 s.append(w) | |
596 elif w[0] == "q": | |
597 q.append(w) | |
598 elif w[0] == "p": | |
599 p.append(w) | |
600 elif i[0] == "#" and isKeepComments: | |
601 print i, | |
602 if a: yield a, s, q, p | |
603 | |
604 def isFormat(myString, myFormat): | |
605 return myFormat.startswith(myString) | |
606 | |
607 def mafConvert(opts, args): | |
608 format = args[0].lower() | |
609 if isFormat(format, "sam"): | |
610 if opts.dictionary: d = readSequenceLengths(fileinput.input(args[1:])) | |
611 else: d = {} | |
612 if opts.readgroup: | |
613 readGroupItems = opts.readgroup.split() | |
614 rg = "RG:Z:" + readGroupId(readGroupItems) | |
615 else: | |
616 readGroupItems = [] | |
617 rg = "" | |
618 if not opts.noheader: writeSamHeader(d, opts.dictfile, readGroupItems) | |
619 inputLines = fileinput.input(args[1:]) | |
620 if isFormat(format, "html") and not opts.noheader: writeHtmlHeader() | |
621 isKeepCommentLines = isFormat(format, "tabular") and not opts.noheader | |
622 oldQueryName = "" | |
623 for maf in mafInput(inputLines, isKeepCommentLines): | |
624 if isFormat(format, "axt"): writeAxt(Maf(*maf)) | |
625 elif isFormat(format, "blast"): | |
626 maf = Maf(*maf) | |
627 writeBlast(maf, opts.linesize, oldQueryName) | |
628 oldQueryName = maf.seqNames[1] | |
629 elif isFormat(format, "html"): writeHtml(Maf(*maf), opts.linesize) | |
630 elif isFormat(format, "psl"): writePsl(Maf(*maf), opts.protein) | |
631 elif isFormat(format, "sam"): writeSam(maf, rg) | |
632 elif isFormat(format, "tabular"): writeTab(maf) | |
633 else: raise Exception("unknown format: " + format) | |
634 if isFormat(format, "html") and not opts.noheader: print "</body></html>" | |
635 | |
636 if __name__ == "__main__": | |
637 signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message | |
638 | |
639 usage = """ | |
640 %prog --help | |
641 %prog axt mafFile(s) | |
642 %prog blast mafFile(s) | |
643 %prog html mafFile(s) | |
644 %prog psl mafFile(s) | |
645 %prog sam mafFile(s) | |
646 %prog tab mafFile(s)""" | |
647 | |
648 description = "Read MAF-format alignments & write them in another format." | |
649 | |
650 op = optparse.OptionParser(usage=usage, description=description) | |
651 op.add_option("-p", "--protein", action="store_true", | |
652 help="assume protein alignments, for psl match counts") | |
653 op.add_option("-n", "--noheader", action="store_true", | |
654 help="omit any header lines from the output") | |
655 op.add_option("-d", "--dictionary", action="store_true", | |
656 help="include dictionary of sequence lengths in sam format") | |
657 op.add_option("-f", "--dictfile", | |
658 help="get sequence dictionary from DICTFILE") | |
659 op.add_option("-r", "--readgroup", | |
660 help="read group info for sam format") | |
661 op.add_option("-l", "--linesize", type="int", default=60, #metavar="CHARS", | |
662 help="line length for blast and html formats (default: %default)") | |
663 (opts, args) = op.parse_args() | |
664 if opts.linesize <= 0: op.error("option -l: should be >= 1") | |
665 if opts.dictionary and opts.dictfile: op.error("can't use both -d and -f") | |
666 if len(args) < 1: op.error("I need a format-name and some MAF alignments") | |
667 if opts.dictionary and (len(args) == 1 or "-" in args[1:]): | |
668 op.error("need file (not pipe) with option -d") | |
669 | |
670 try: mafConvert(opts, args) | |
671 except KeyboardInterrupt: pass # avoid silly error message | |
672 except Exception, e: | |
673 prog = os.path.basename(sys.argv[0]) | |
674 sys.exit(prog + ": error: " + str(e)) |