Mercurial > repos > yutaka-saito > bsfcall
diff bin/maf-convert @ 0:06f8460885ff
migrate from GitHub
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 20:51:13 +0900 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/maf-convert Sun Apr 19 20:51:13 2015 +0900 @@ -0,0 +1,674 @@ +#! /usr/bin/env python +# Copyright 2010, 2011, 2013, 2014 Martin C. Frith +# Read MAF-format alignments: write them in other formats. +# Seems to work with Python 2.x, x>=4 + +# By "MAF" we mean "multiple alignment format" described in the UCSC +# Genome FAQ, not e.g. "MIRA assembly format". + +from itertools import * +import sys, os, fileinput, math, operator, optparse, signal, string + +def maxlen(s): + return max(map(len, s)) + +def joined(things, delimiter): + return delimiter.join(map(str, things)) + +identityTranslation = string.maketrans("", "") +def deleted(myString, deleteChars): + return myString.translate(identityTranslation, deleteChars) + +def quantify(iterable, pred=bool): + """Count how many times the predicate is true.""" + return sum(map(pred, iterable)) + +class Maf: + def __init__(self, aLine, sLines, qLines, pLines): + self.namesAndValues = dict(i.split("=") for i in aLine[1:]) + + if not sLines: raise Exception("empty alignment") + cols = zip(*sLines) + self.seqNames = cols[1] + self.alnStarts = map(int, cols[2]) + self.alnSizes = map(int, cols[3]) + self.strands = cols[4] + self.seqSizes = map(int, cols[5]) + self.alnStrings = cols[6] + + self.qLines = qLines + self.pLines = pLines + +def dieUnlessPairwise(maf): + if len(maf.alnStrings) != 2: + raise Exception("pairwise alignments only, please") + +def insertionCounts(alnString): + gaps = alnString.count("-") + forwardFrameshifts = alnString.count("\\") + reverseFrameshifts = alnString.count("/") + letters = len(alnString) - gaps - forwardFrameshifts - reverseFrameshifts + return letters, forwardFrameshifts, reverseFrameshifts + +def coordinatesPerLetter(alnString, alnSize): + letters, forwardShifts, reverseShifts = insertionCounts(alnString) + if forwardShifts or reverseShifts or letters < alnSize: return 3 + else: return 1 + +def mafLetterSizes(maf): + return map(coordinatesPerLetter, maf.alnStrings, maf.alnSizes) + +def alnLetterSizes(sLines): + return [coordinatesPerLetter(i[6], int(i[3])) for i in sLines] + +def isTranslated(sLines): + for i in sLines: + alnString = i[6] + if "\\" in alnString or "/" in alnString: return True + if len(alnString) - alnString.count("-") < int(i[3]): return True + return False + +def insertionSize(alnString, letterSize): + """Get the length of sequence included in the alnString.""" + letters, forwardShifts, reverseShifts = insertionCounts(alnString) + return letters * letterSize + forwardShifts - reverseShifts + +def symbolSize(symbol, letterSize): + if symbol == "-": return 0 + if symbol == "\\": return 1 + if symbol == "/": return -1 + return letterSize + +def isMatch(alignmentColumn): + # No special treatment of ambiguous bases/residues: same as NCBI BLAST. + first = alignmentColumn[0].upper() + for i in alignmentColumn[1:]: + if i.upper() != first: return False + return True + +def isGapless(alignmentColumn): + return "-" not in alignmentColumn + +def matchAndInsertSizes(alignmentColumns, letterSizes): + """Get sizes of gapless blocks, and of the inserts between them.""" + for k, v in groupby(alignmentColumns, isGapless): + if k: + sizeOfGaplessBlock = len(list(v)) + yield str(sizeOfGaplessBlock) + else: + blockRows = izip(*v) + blockRows = imap(''.join, blockRows) + insertSizes = imap(insertionSize, blockRows, letterSizes) + yield joined(insertSizes, ":") + +##### Routines for converting to AXT format: ##### + +axtCounter = count() + +def writeAxt(maf): + if maf.strands[0] != "+": + raise Exception("for AXT, the 1st strand in each alignment must be +") + + # Convert to AXT's 1-based coordinates: + alnStarts = imap(operator.add, maf.alnStarts, repeat(1)) + alnEnds = imap(operator.add, maf.alnStarts, maf.alnSizes) + + rows = zip(maf.seqNames, alnStarts, alnEnds, maf.strands) + head, tail = rows[0], rows[1:] + + outWords = [] + outWords.append(axtCounter.next()) + outWords.extend(head[:3]) + outWords.extend(chain(*tail)) + outWords.append(maf.namesAndValues["score"]) + + print joined(outWords, " ") + for i in maf.alnStrings: print i + print # print a blank line at the end + +##### Routines for converting to tabular format: ##### + +def writeTab(maf): + aLine, sLines, qLines, pLines = maf + + score = "0" + expect = None + mismap = None + for i in aLine: + if i.startswith("score="): score = i[6:] + elif i.startswith("expect="): expect = i[7:] + elif i.startswith("mismap="): mismap = i[7:] + + outWords = [] + outWords.append(score) + + for i in sLines: outWords.extend(i[1:6]) + + alnStrings = [i[6] for i in sLines] + alignmentColumns = zip(*alnStrings) + letterSizes = alnLetterSizes(sLines) + gapStrings = matchAndInsertSizes(alignmentColumns, letterSizes) + gapWord = ",".join(gapStrings) + outWords.append(gapWord) + + if expect: outWords.append(expect) + if mismap: outWords.append(mismap) + + print "\t".join(outWords) + +##### Routines for converting to PSL format: ##### + +def pslBlocks(alignmentColumns, alnStarts, letterSizes): + """Get sizes and start coordinates of gapless blocks in an alignment.""" + start1, start2 = alnStarts + letterSize1, letterSize2 = letterSizes + size = 0 + for x, y in alignmentColumns: + if x == "-": + if size: + yield size, start1, start2 + start1 += size * letterSize1 + start2 += size * letterSize2 + size = 0 + start2 += symbolSize(y, letterSize2) + elif y == "-": + if size: + yield size, start1, start2 + start1 += size * letterSize1 + start2 += size * letterSize2 + size = 0 + start1 += symbolSize(x, letterSize1) + else: + size += 1 + if size: yield size, start1, start2 + +def pslCommaString(things): + # UCSC software seems to prefer a trailing comma + return joined(things, ",") + "," + +def gapRunCount(letters): + """Get the number of runs of gap characters.""" + uniqLetters = map(operator.itemgetter(0), groupby(letters)) + return uniqLetters.count("-") + +def pslEndpoints(alnStart, alnSize, strand, seqSize): + alnEnd = alnStart + alnSize + if strand == "+": return alnStart, alnEnd + else: return seqSize - alnEnd, seqSize - alnStart + +def caseSensitivePairwiseMatchCounts(columns, isProtein): + # repMatches is always zero + # for proteins, nCount is always zero, because that's what BLATv34 does + standardBases = "ACGTU" + matches = mismatches = repMatches = nCount = 0 + for i in columns: + if "-" in i: continue + x, y = i + if x in standardBases and y in standardBases or isProtein: + if x == y: matches += 1 + else: mismatches += 1 + else: nCount += 1 + return matches, mismatches, repMatches, nCount + +def writePsl(maf, isProtein): + dieUnlessPairwise(maf) + + alnStrings = map(str.upper, maf.alnStrings) + alignmentColumns = zip(*alnStrings) + letterSizes = mafLetterSizes(maf) + + matchCounts = caseSensitivePairwiseMatchCounts(alignmentColumns, isProtein) + matches, mismatches, repMatches, nCount = matchCounts + numGaplessColumns = sum(matchCounts) + + qNumInsert = gapRunCount(maf.alnStrings[0]) + qBaseInsert = maf.alnSizes[1] - numGaplessColumns * letterSizes[1] + + tNumInsert = gapRunCount(maf.alnStrings[1]) + tBaseInsert = maf.alnSizes[0] - numGaplessColumns * letterSizes[0] + + strand = maf.strands[1] + if max(letterSizes) > 1: + strand += maf.strands[0] + elif maf.strands[0] != "+": + raise Exception("for non-translated PSL, the 1st strand in each alignment must be +") + + tName, qName = maf.seqNames + tSeqSize, qSeqSize = maf.seqSizes + + rows = zip(maf.alnStarts, maf.alnSizes, maf.strands, maf.seqSizes) + tStart, tEnd = pslEndpoints(*rows[0]) + qStart, qEnd = pslEndpoints(*rows[1]) + + blocks = list(pslBlocks(alignmentColumns, maf.alnStarts, letterSizes)) + blockCount = len(blocks) + blockSizes, tStarts, qStarts = map(pslCommaString, zip(*blocks)) + + outWords = (matches, mismatches, repMatches, nCount, + qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand, + qName, qSeqSize, qStart, qEnd, tName, tSeqSize, tStart, tEnd, + blockCount, blockSizes, qStarts, tStarts) + print joined(outWords, "\t") + +##### Routines for converting to SAM format: ##### + +def readGroupId(readGroupItems): + for i in readGroupItems: + if i.startswith("ID:"): + return i[3:] + raise Exception("readgroup must include ID") + +def readSequenceLengths(lines): + """Read name & length of topmost sequence in each maf block.""" + sequenceLengths = {} # an OrderedDict might be nice + isSearching = True + for line in lines: + if line.isspace(): isSearching = True + elif isSearching: + w = line.split() + if w[0] == "s": + seqName, seqSize = w[1], w[5] + sequenceLengths[seqName] = seqSize + isSearching = False + return sequenceLengths + +def naturalSortKey(s): + """Return a key that sorts strings in "natural" order.""" + return [(str, int)[k]("".join(v)) for k, v in groupby(s, str.isdigit)] + +def karyotypicSortKey(s): + """Attempt to sort chromosomes in GATK's ridiculous order.""" + if s == "chrM": return [] + if s == "MT": return ["~"] + return naturalSortKey(s) + +def writeSamHeader(sequenceLengths, dictFile, readGroupItems): + print "@HD\tVN:1.3\tSO:unsorted" + for k in sorted(sequenceLengths, key=karyotypicSortKey): + print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k]) + if dictFile: + for i in fileinput.input(dictFile): + if i.startswith("@SQ"): print i, + elif not i.startswith("@"): break + if readGroupItems: + print "@RG\t" + "\t".join(readGroupItems) + +mapqMissing = "255" +mapqMaximum = "254" +mapqMaximumNum = float(mapqMaximum) + +def mapqFromProb(probString): + try: p = float(probString) + except ValueError: raise Exception("bad probability: " + probString) + if p < 0 or p > 1: raise Exception("bad probability: " + probString) + if p == 0: return mapqMaximum + phred = -10 * math.log(p, 10) + if phred >= mapqMaximumNum: return mapqMaximum + return str(int(round(phred))) + +def cigarParts(beg, alignmentColumns, end): + if beg: yield str(beg) + "H" + + # (doesn't handle translated alignments) + isActive = True + for x, y in alignmentColumns: break + else: isActive = False + while isActive: + size = 1 + if x == "-": + for x, y in alignmentColumns: + if x != "-": break + size += 1 + else: isActive = False + yield str(size) + "I" + elif y == "-": + for x, y in alignmentColumns: + if y != "-": break + size += 1 + else: isActive = False + yield str(size) + "D" + else: + for x, y in alignmentColumns: + if x == "-" or y == "-": break + size += 1 + else: isActive = False + yield str(size) + "M" + + if end: yield str(end) + "H" + +def writeSam(maf, rg): + aLine, sLines, qLines, pLines = maf + + if isTranslated(sLines): + raise Exception("this looks like translated DNA - can't convert to SAM format") + + score = None + evalue = None + mapq = mapqMissing + for i in aLine: + if i.startswith("score="): + v = i[6:] + if v.isdigit(): score = "AS:i:" + v # it must be an integer + elif i.startswith("expect="): + evalue = "EV:Z:" + i[7:] + elif i.startswith("mismap="): + mapq = mapqFromProb(i[7:]) + + head, tail = sLines[0], sLines[1:] + + s, rName, rStart, rAlnSize, rStrand, rSeqSize, rAlnString = head + if rStrand != "+": + raise Exception("for SAM, the 1st strand in each alignment must be +") + pos = str(int(rStart) + 1) # convert to 1-based coordinate + + for s, qName, qStart, qAlnSize, qStrand, qSeqSize, qAlnString in tail: + alignmentColumns = zip(rAlnString.upper(), qAlnString.upper()) + + qStart = int(qStart) + qRevStart = int(qSeqSize) - qStart - int(qAlnSize) + cigar = "".join(cigarParts(qStart, iter(alignmentColumns), qRevStart)) + + seq = deleted(qAlnString, "-") + + qual = "*" + if qLines: + q, qualityName, qualityString = qLines[0] + # don't try to handle multiple alignments for now (YAGNI): + if len(qLines) > 1 or len(tail) > 1 or qualityName != qName: + raise Exception("can't interpret the quality data") + qual = ''.join(j for i, j in izip(qAlnString, qualityString) + if i != "-") + + # It's hard to get all the pair info, so this is very + # incomplete, but hopefully good enough. + # I'm not sure whether to add 2 and/or 8 to flag. + if qName.endswith("/1"): + qName = qName[:-2] + if qStrand == "+": flag = "99" # 1 + 2 + 32 + 64 + else: flag = "83" # 1 + 2 + 16 + 64 + elif qName.endswith("/2"): + qName = qName[:-2] + if qStrand == "+": flag = "163" # 1 + 2 + 32 + 128 + else: flag = "147" # 1 + 2 + 16 + 128 + else: + if qStrand == "+": flag = "0" + else: flag = "16" + + editDistance = sum(1 for x, y in alignmentColumns if x != y) + # no special treatment of ambiguous bases: might be a minor bug + editDistance = "NM:i:" + str(editDistance) + + outWords = [qName, flag, rName, pos, mapq, cigar, "*\t0\t0", seq, qual] + outWords.append(editDistance) + if score: outWords.append(score) + if evalue: outWords.append(evalue) + if rg: outWords.append(rg) + print "\t".join(outWords) + +##### Routines for converting to BLAST-like format: ##### + +def pairwiseMatchSymbol(alignmentColumn): + if isMatch(alignmentColumn): return "|" + else: return " " + +def strandText(strand): + if strand == "+": return "Plus" + else: return "Minus" + +def blastCoordinate(oneBasedCoordinate, strand, seqSize): + if strand == "-": + oneBasedCoordinate = seqSize - oneBasedCoordinate + 1 + return str(oneBasedCoordinate) + +def chunker(things, chunkSize): + for i in range(0, len(things), chunkSize): + yield things[i:i+chunkSize] + +def blastChunker(maf, lineSize, alignmentColumns): + letterSizes = mafLetterSizes(maf) + coords = maf.alnStarts + for chunkCols in chunker(alignmentColumns, lineSize): + chunkRows = zip(*chunkCols) + chunkRows = map(''.join, chunkRows) + starts = [i + 1 for i in coords] # change to 1-based coordinates + starts = map(blastCoordinate, starts, maf.strands, maf.seqSizes) + increments = map(insertionSize, chunkRows, letterSizes) + coords = map(operator.add, coords, increments) + ends = map(blastCoordinate, coords, maf.strands, maf.seqSizes) + yield chunkCols, chunkRows, starts, ends + +def writeBlast(maf, lineSize, oldQueryName): + dieUnlessPairwise(maf) + + if maf.seqNames[1] != oldQueryName: + print "Query= %s" % maf.seqNames[1] + print " (%s letters)" % maf.seqSizes[1] + print + + print ">%s" % maf.seqNames[0] + print " Length = %s" % maf.seqSizes[0] + print + + scoreLine = " Score = %s" % maf.namesAndValues["score"] + try: scoreLine += ", Expect = %s" % maf.namesAndValues["expect"] + except KeyError: pass + print scoreLine + + alignmentColumns = zip(*maf.alnStrings) + + alnSize = len(alignmentColumns) + matches = quantify(alignmentColumns, isMatch) + matchPercent = 100 * matches // alnSize # round down, like BLAST + identLine = " Identities = %s/%s (%s%%)" % (matches, alnSize, matchPercent) + gaps = alnSize - quantify(alignmentColumns, isGapless) + gapPercent = 100 * gaps // alnSize # round down, like BLAST + if gaps: identLine += ", Gaps = %s/%s (%s%%)" % (gaps, alnSize, gapPercent) + print identLine + + strands = map(strandText, maf.strands) + print " Strand = %s / %s" % (strands[1], strands[0]) + print + + for chunk in blastChunker(maf, lineSize, alignmentColumns): + cols, rows, starts, ends = chunk + startWidth = maxlen(starts) + matchSymbols = map(pairwiseMatchSymbol, cols) + matchSymbols = ''.join(matchSymbols) + print "Query: %-*s %s %s" % (startWidth, starts[1], rows[1], ends[1]) + print " %-*s %s" % (startWidth, " ", matchSymbols) + print "Sbjct: %-*s %s %s" % (startWidth, starts[0], rows[0], ends[0]) + print + +##### Routines for converting to HTML format: ##### + +def writeHtmlHeader(): + print ''' +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" + "http://www.w3.org/TR/html4/strict.dtd"> +<html lang="en"><head> +<meta http-equiv="Content-type" content="text/html; charset=UTF-8"> +<title>Reliable Alignments</title> +<style type="text/css"> +/* Try to force monospace, working around browser insanity: */ +pre {font-family: "Courier New", monospace, serif; font-size: 0.8125em} +.a {background-color: #3333FF} +.b {background-color: #9933FF} +.c {background-color: #FF66CC} +.d {background-color: #FF3333} +.e {background-color: #FF9933} +.f {background-color: #FFFF00} +.key {display:inline; margin-right:2em} +</style> +</head><body> + +<div style="line-height:1"> +<pre class="key"><span class="a"> </span> prob > 0.999</pre> +<pre class="key"><span class="b"> </span> prob > 0.99 </pre> +<pre class="key"><span class="c"> </span> prob > 0.95 </pre> +<pre class="key"><span class="d"> </span> prob > 0.9 </pre> +<pre class="key"><span class="e"> </span> prob > 0.5 </pre> +<pre class="key"><span class="f"> </span> prob ≤ 0.5 </pre> +</div> +''' + +def probabilityClass(probabilityColumn): + p = ord(min(probabilityColumn)) - 33 + if p >= 30: return 'a' + elif p >= 20: return 'b' + elif p >= 13: return 'c' + elif p >= 10: return 'd' + elif p >= 3: return 'e' + else: return 'f' + +def identicalRuns(s): + """Yield (item, start, end) for each run of identical items in s.""" + beg = 0 + for k, v in groupby(s): + end = beg + len(list(v)) + yield k, beg, end + beg = end + +def htmlSpan(text, classRun): + key, beg, end = classRun + textbit = text[beg:end] + if key: return '<span class="%s">%s</span>' % (key, textbit) + else: return textbit + +def multipleMatchSymbol(alignmentColumn): + if isMatch(alignmentColumn): return "*" + else: return " " + +def writeHtml(maf, lineSize): + scoreLine = "Alignment" + try: + scoreLine += " score=%s" % maf.namesAndValues["score"] + scoreLine += ", expect=%s" % maf.namesAndValues["expect"] + except KeyError: pass + print "<h3>%s:</h3>" % scoreLine + + if maf.pLines: + probRows = [i[1] for i in maf.pLines] + probCols = izip(*probRows) + classes = imap(probabilityClass, probCols) + else: + classes = repeat(None) + + nameWidth = maxlen(maf.seqNames) + alignmentColumns = zip(*maf.alnStrings) + + print '<pre>' + for chunk in blastChunker(maf, lineSize, alignmentColumns): + cols, rows, starts, ends = chunk + startWidth = maxlen(starts) + endWidth = maxlen(ends) + matchSymbols = map(multipleMatchSymbol, cols) + matchSymbols = ''.join(matchSymbols) + classChunk = islice(classes, lineSize) + classRuns = list(identicalRuns(classChunk)) + for n, s, r, e in zip(maf.seqNames, starts, rows, ends): + spans = [htmlSpan(r, i) for i in classRuns] + spans = ''.join(spans) + formatParams = nameWidth, n, startWidth, s, spans, endWidth, e + print '%-*s %*s %s %*s' % formatParams + print ' ' * nameWidth, ' ' * startWidth, matchSymbols + print + print '</pre>' + +##### Routines for reading MAF format: ##### + +def mafInput(lines, isKeepComments): + a = [] + s = [] + q = [] + p = [] + for i in lines: + w = i.split() + if not w: + if a: yield a, s, q, p + a = [] + s = [] + q = [] + p = [] + elif w[0] == "a": + a = w + elif w[0] == "s": + s.append(w) + elif w[0] == "q": + q.append(w) + elif w[0] == "p": + p.append(w) + elif i[0] == "#" and isKeepComments: + print i, + if a: yield a, s, q, p + +def isFormat(myString, myFormat): + return myFormat.startswith(myString) + +def mafConvert(opts, args): + format = args[0].lower() + if isFormat(format, "sam"): + if opts.dictionary: d = readSequenceLengths(fileinput.input(args[1:])) + else: d = {} + if opts.readgroup: + readGroupItems = opts.readgroup.split() + rg = "RG:Z:" + readGroupId(readGroupItems) + else: + readGroupItems = [] + rg = "" + if not opts.noheader: writeSamHeader(d, opts.dictfile, readGroupItems) + inputLines = fileinput.input(args[1:]) + if isFormat(format, "html") and not opts.noheader: writeHtmlHeader() + isKeepCommentLines = isFormat(format, "tabular") and not opts.noheader + oldQueryName = "" + for maf in mafInput(inputLines, isKeepCommentLines): + if isFormat(format, "axt"): writeAxt(Maf(*maf)) + elif isFormat(format, "blast"): + maf = Maf(*maf) + writeBlast(maf, opts.linesize, oldQueryName) + oldQueryName = maf.seqNames[1] + elif isFormat(format, "html"): writeHtml(Maf(*maf), opts.linesize) + elif isFormat(format, "psl"): writePsl(Maf(*maf), opts.protein) + elif isFormat(format, "sam"): writeSam(maf, rg) + elif isFormat(format, "tabular"): writeTab(maf) + else: raise Exception("unknown format: " + format) + if isFormat(format, "html") and not opts.noheader: print "</body></html>" + +if __name__ == "__main__": + signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message + + usage = """ + %prog --help + %prog axt mafFile(s) + %prog blast mafFile(s) + %prog html mafFile(s) + %prog psl mafFile(s) + %prog sam mafFile(s) + %prog tab mafFile(s)""" + + description = "Read MAF-format alignments & write them in another format." + + op = optparse.OptionParser(usage=usage, description=description) + op.add_option("-p", "--protein", action="store_true", + help="assume protein alignments, for psl match counts") + op.add_option("-n", "--noheader", action="store_true", + help="omit any header lines from the output") + op.add_option("-d", "--dictionary", action="store_true", + help="include dictionary of sequence lengths in sam format") + op.add_option("-f", "--dictfile", + help="get sequence dictionary from DICTFILE") + op.add_option("-r", "--readgroup", + help="read group info for sam format") + op.add_option("-l", "--linesize", type="int", default=60, #metavar="CHARS", + help="line length for blast and html formats (default: %default)") + (opts, args) = op.parse_args() + if opts.linesize <= 0: op.error("option -l: should be >= 1") + if opts.dictionary and opts.dictfile: op.error("can't use both -d and -f") + if len(args) < 1: op.error("I need a format-name and some MAF alignments") + if opts.dictionary and (len(args) == 1 or "-" in args[1:]): + op.error("need file (not pipe) with option -d") + + try: mafConvert(opts, args) + except KeyboardInterrupt: pass # avoid silly error message + except Exception, e: + prog = os.path.basename(sys.argv[0]) + sys.exit(prog + ": error: " + str(e))