Mercurial > repos > yutaka-saito > bsfcall
view bin/maf-convert @ 2:f274c166e738 default tip
remove comments in bsfcall_wrapper.xml
author | yutaka-saito |
---|---|
date | Sun, 19 Apr 2015 23:02:04 +0900 |
parents | 06f8460885ff |
children |
line wrap: on
line source
#! /usr/bin/env python # Copyright 2010, 2011, 2013, 2014 Martin C. Frith # Read MAF-format alignments: write them in other formats. # Seems to work with Python 2.x, x>=4 # By "MAF" we mean "multiple alignment format" described in the UCSC # Genome FAQ, not e.g. "MIRA assembly format". from itertools import * import sys, os, fileinput, math, operator, optparse, signal, string def maxlen(s): return max(map(len, s)) def joined(things, delimiter): return delimiter.join(map(str, things)) identityTranslation = string.maketrans("", "") def deleted(myString, deleteChars): return myString.translate(identityTranslation, deleteChars) def quantify(iterable, pred=bool): """Count how many times the predicate is true.""" return sum(map(pred, iterable)) class Maf: def __init__(self, aLine, sLines, qLines, pLines): self.namesAndValues = dict(i.split("=") for i in aLine[1:]) if not sLines: raise Exception("empty alignment") cols = zip(*sLines) self.seqNames = cols[1] self.alnStarts = map(int, cols[2]) self.alnSizes = map(int, cols[3]) self.strands = cols[4] self.seqSizes = map(int, cols[5]) self.alnStrings = cols[6] self.qLines = qLines self.pLines = pLines def dieUnlessPairwise(maf): if len(maf.alnStrings) != 2: raise Exception("pairwise alignments only, please") def insertionCounts(alnString): gaps = alnString.count("-") forwardFrameshifts = alnString.count("\\") reverseFrameshifts = alnString.count("/") letters = len(alnString) - gaps - forwardFrameshifts - reverseFrameshifts return letters, forwardFrameshifts, reverseFrameshifts def coordinatesPerLetter(alnString, alnSize): letters, forwardShifts, reverseShifts = insertionCounts(alnString) if forwardShifts or reverseShifts or letters < alnSize: return 3 else: return 1 def mafLetterSizes(maf): return map(coordinatesPerLetter, maf.alnStrings, maf.alnSizes) def alnLetterSizes(sLines): return [coordinatesPerLetter(i[6], int(i[3])) for i in sLines] def isTranslated(sLines): for i in sLines: alnString = i[6] if "\\" in alnString or "/" in alnString: return True if len(alnString) - alnString.count("-") < int(i[3]): return True return False def insertionSize(alnString, letterSize): """Get the length of sequence included in the alnString.""" letters, forwardShifts, reverseShifts = insertionCounts(alnString) return letters * letterSize + forwardShifts - reverseShifts def symbolSize(symbol, letterSize): if symbol == "-": return 0 if symbol == "\\": return 1 if symbol == "/": return -1 return letterSize def isMatch(alignmentColumn): # No special treatment of ambiguous bases/residues: same as NCBI BLAST. first = alignmentColumn[0].upper() for i in alignmentColumn[1:]: if i.upper() != first: return False return True def isGapless(alignmentColumn): return "-" not in alignmentColumn def matchAndInsertSizes(alignmentColumns, letterSizes): """Get sizes of gapless blocks, and of the inserts between them.""" for k, v in groupby(alignmentColumns, isGapless): if k: sizeOfGaplessBlock = len(list(v)) yield str(sizeOfGaplessBlock) else: blockRows = izip(*v) blockRows = imap(''.join, blockRows) insertSizes = imap(insertionSize, blockRows, letterSizes) yield joined(insertSizes, ":") ##### Routines for converting to AXT format: ##### axtCounter = count() def writeAxt(maf): if maf.strands[0] != "+": raise Exception("for AXT, the 1st strand in each alignment must be +") # Convert to AXT's 1-based coordinates: alnStarts = imap(operator.add, maf.alnStarts, repeat(1)) alnEnds = imap(operator.add, maf.alnStarts, maf.alnSizes) rows = zip(maf.seqNames, alnStarts, alnEnds, maf.strands) head, tail = rows[0], rows[1:] outWords = [] outWords.append(axtCounter.next()) outWords.extend(head[:3]) outWords.extend(chain(*tail)) outWords.append(maf.namesAndValues["score"]) print joined(outWords, " ") for i in maf.alnStrings: print i print # print a blank line at the end ##### Routines for converting to tabular format: ##### def writeTab(maf): aLine, sLines, qLines, pLines = maf score = "0" expect = None mismap = None for i in aLine: if i.startswith("score="): score = i[6:] elif i.startswith("expect="): expect = i[7:] elif i.startswith("mismap="): mismap = i[7:] outWords = [] outWords.append(score) for i in sLines: outWords.extend(i[1:6]) alnStrings = [i[6] for i in sLines] alignmentColumns = zip(*alnStrings) letterSizes = alnLetterSizes(sLines) gapStrings = matchAndInsertSizes(alignmentColumns, letterSizes) gapWord = ",".join(gapStrings) outWords.append(gapWord) if expect: outWords.append(expect) if mismap: outWords.append(mismap) print "\t".join(outWords) ##### Routines for converting to PSL format: ##### def pslBlocks(alignmentColumns, alnStarts, letterSizes): """Get sizes and start coordinates of gapless blocks in an alignment.""" start1, start2 = alnStarts letterSize1, letterSize2 = letterSizes size = 0 for x, y in alignmentColumns: if x == "-": if size: yield size, start1, start2 start1 += size * letterSize1 start2 += size * letterSize2 size = 0 start2 += symbolSize(y, letterSize2) elif y == "-": if size: yield size, start1, start2 start1 += size * letterSize1 start2 += size * letterSize2 size = 0 start1 += symbolSize(x, letterSize1) else: size += 1 if size: yield size, start1, start2 def pslCommaString(things): # UCSC software seems to prefer a trailing comma return joined(things, ",") + "," def gapRunCount(letters): """Get the number of runs of gap characters.""" uniqLetters = map(operator.itemgetter(0), groupby(letters)) return uniqLetters.count("-") def pslEndpoints(alnStart, alnSize, strand, seqSize): alnEnd = alnStart + alnSize if strand == "+": return alnStart, alnEnd else: return seqSize - alnEnd, seqSize - alnStart def caseSensitivePairwiseMatchCounts(columns, isProtein): # repMatches is always zero # for proteins, nCount is always zero, because that's what BLATv34 does standardBases = "ACGTU" matches = mismatches = repMatches = nCount = 0 for i in columns: if "-" in i: continue x, y = i if x in standardBases and y in standardBases or isProtein: if x == y: matches += 1 else: mismatches += 1 else: nCount += 1 return matches, mismatches, repMatches, nCount def writePsl(maf, isProtein): dieUnlessPairwise(maf) alnStrings = map(str.upper, maf.alnStrings) alignmentColumns = zip(*alnStrings) letterSizes = mafLetterSizes(maf) matchCounts = caseSensitivePairwiseMatchCounts(alignmentColumns, isProtein) matches, mismatches, repMatches, nCount = matchCounts numGaplessColumns = sum(matchCounts) qNumInsert = gapRunCount(maf.alnStrings[0]) qBaseInsert = maf.alnSizes[1] - numGaplessColumns * letterSizes[1] tNumInsert = gapRunCount(maf.alnStrings[1]) tBaseInsert = maf.alnSizes[0] - numGaplessColumns * letterSizes[0] strand = maf.strands[1] if max(letterSizes) > 1: strand += maf.strands[0] elif maf.strands[0] != "+": raise Exception("for non-translated PSL, the 1st strand in each alignment must be +") tName, qName = maf.seqNames tSeqSize, qSeqSize = maf.seqSizes rows = zip(maf.alnStarts, maf.alnSizes, maf.strands, maf.seqSizes) tStart, tEnd = pslEndpoints(*rows[0]) qStart, qEnd = pslEndpoints(*rows[1]) blocks = list(pslBlocks(alignmentColumns, maf.alnStarts, letterSizes)) blockCount = len(blocks) blockSizes, tStarts, qStarts = map(pslCommaString, zip(*blocks)) outWords = (matches, mismatches, repMatches, nCount, qNumInsert, qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSeqSize, qStart, qEnd, tName, tSeqSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts) print joined(outWords, "\t") ##### Routines for converting to SAM format: ##### def readGroupId(readGroupItems): for i in readGroupItems: if i.startswith("ID:"): return i[3:] raise Exception("readgroup must include ID") def readSequenceLengths(lines): """Read name & length of topmost sequence in each maf block.""" sequenceLengths = {} # an OrderedDict might be nice isSearching = True for line in lines: if line.isspace(): isSearching = True elif isSearching: w = line.split() if w[0] == "s": seqName, seqSize = w[1], w[5] sequenceLengths[seqName] = seqSize isSearching = False return sequenceLengths def naturalSortKey(s): """Return a key that sorts strings in "natural" order.""" return [(str, int)[k]("".join(v)) for k, v in groupby(s, str.isdigit)] def karyotypicSortKey(s): """Attempt to sort chromosomes in GATK's ridiculous order.""" if s == "chrM": return [] if s == "MT": return ["~"] return naturalSortKey(s) def writeSamHeader(sequenceLengths, dictFile, readGroupItems): print "@HD\tVN:1.3\tSO:unsorted" for k in sorted(sequenceLengths, key=karyotypicSortKey): print "@SQ\tSN:%s\tLN:%s" % (k, sequenceLengths[k]) if dictFile: for i in fileinput.input(dictFile): if i.startswith("@SQ"): print i, elif not i.startswith("@"): break if readGroupItems: print "@RG\t" + "\t".join(readGroupItems) mapqMissing = "255" mapqMaximum = "254" mapqMaximumNum = float(mapqMaximum) def mapqFromProb(probString): try: p = float(probString) except ValueError: raise Exception("bad probability: " + probString) if p < 0 or p > 1: raise Exception("bad probability: " + probString) if p == 0: return mapqMaximum phred = -10 * math.log(p, 10) if phred >= mapqMaximumNum: return mapqMaximum return str(int(round(phred))) def cigarParts(beg, alignmentColumns, end): if beg: yield str(beg) + "H" # (doesn't handle translated alignments) isActive = True for x, y in alignmentColumns: break else: isActive = False while isActive: size = 1 if x == "-": for x, y in alignmentColumns: if x != "-": break size += 1 else: isActive = False yield str(size) + "I" elif y == "-": for x, y in alignmentColumns: if y != "-": break size += 1 else: isActive = False yield str(size) + "D" else: for x, y in alignmentColumns: if x == "-" or y == "-": break size += 1 else: isActive = False yield str(size) + "M" if end: yield str(end) + "H" def writeSam(maf, rg): aLine, sLines, qLines, pLines = maf if isTranslated(sLines): raise Exception("this looks like translated DNA - can't convert to SAM format") score = None evalue = None mapq = mapqMissing for i in aLine: if i.startswith("score="): v = i[6:] if v.isdigit(): score = "AS:i:" + v # it must be an integer elif i.startswith("expect="): evalue = "EV:Z:" + i[7:] elif i.startswith("mismap="): mapq = mapqFromProb(i[7:]) head, tail = sLines[0], sLines[1:] s, rName, rStart, rAlnSize, rStrand, rSeqSize, rAlnString = head if rStrand != "+": raise Exception("for SAM, the 1st strand in each alignment must be +") pos = str(int(rStart) + 1) # convert to 1-based coordinate for s, qName, qStart, qAlnSize, qStrand, qSeqSize, qAlnString in tail: alignmentColumns = zip(rAlnString.upper(), qAlnString.upper()) qStart = int(qStart) qRevStart = int(qSeqSize) - qStart - int(qAlnSize) cigar = "".join(cigarParts(qStart, iter(alignmentColumns), qRevStart)) seq = deleted(qAlnString, "-") qual = "*" if qLines: q, qualityName, qualityString = qLines[0] # don't try to handle multiple alignments for now (YAGNI): if len(qLines) > 1 or len(tail) > 1 or qualityName != qName: raise Exception("can't interpret the quality data") qual = ''.join(j for i, j in izip(qAlnString, qualityString) if i != "-") # It's hard to get all the pair info, so this is very # incomplete, but hopefully good enough. # I'm not sure whether to add 2 and/or 8 to flag. if qName.endswith("/1"): qName = qName[:-2] if qStrand == "+": flag = "99" # 1 + 2 + 32 + 64 else: flag = "83" # 1 + 2 + 16 + 64 elif qName.endswith("/2"): qName = qName[:-2] if qStrand == "+": flag = "163" # 1 + 2 + 32 + 128 else: flag = "147" # 1 + 2 + 16 + 128 else: if qStrand == "+": flag = "0" else: flag = "16" editDistance = sum(1 for x, y in alignmentColumns if x != y) # no special treatment of ambiguous bases: might be a minor bug editDistance = "NM:i:" + str(editDistance) outWords = [qName, flag, rName, pos, mapq, cigar, "*\t0\t0", seq, qual] outWords.append(editDistance) if score: outWords.append(score) if evalue: outWords.append(evalue) if rg: outWords.append(rg) print "\t".join(outWords) ##### Routines for converting to BLAST-like format: ##### def pairwiseMatchSymbol(alignmentColumn): if isMatch(alignmentColumn): return "|" else: return " " def strandText(strand): if strand == "+": return "Plus" else: return "Minus" def blastCoordinate(oneBasedCoordinate, strand, seqSize): if strand == "-": oneBasedCoordinate = seqSize - oneBasedCoordinate + 1 return str(oneBasedCoordinate) def chunker(things, chunkSize): for i in range(0, len(things), chunkSize): yield things[i:i+chunkSize] def blastChunker(maf, lineSize, alignmentColumns): letterSizes = mafLetterSizes(maf) coords = maf.alnStarts for chunkCols in chunker(alignmentColumns, lineSize): chunkRows = zip(*chunkCols) chunkRows = map(''.join, chunkRows) starts = [i + 1 for i in coords] # change to 1-based coordinates starts = map(blastCoordinate, starts, maf.strands, maf.seqSizes) increments = map(insertionSize, chunkRows, letterSizes) coords = map(operator.add, coords, increments) ends = map(blastCoordinate, coords, maf.strands, maf.seqSizes) yield chunkCols, chunkRows, starts, ends def writeBlast(maf, lineSize, oldQueryName): dieUnlessPairwise(maf) if maf.seqNames[1] != oldQueryName: print "Query= %s" % maf.seqNames[1] print " (%s letters)" % maf.seqSizes[1] print print ">%s" % maf.seqNames[0] print " Length = %s" % maf.seqSizes[0] print scoreLine = " Score = %s" % maf.namesAndValues["score"] try: scoreLine += ", Expect = %s" % maf.namesAndValues["expect"] except KeyError: pass print scoreLine alignmentColumns = zip(*maf.alnStrings) alnSize = len(alignmentColumns) matches = quantify(alignmentColumns, isMatch) matchPercent = 100 * matches // alnSize # round down, like BLAST identLine = " Identities = %s/%s (%s%%)" % (matches, alnSize, matchPercent) gaps = alnSize - quantify(alignmentColumns, isGapless) gapPercent = 100 * gaps // alnSize # round down, like BLAST if gaps: identLine += ", Gaps = %s/%s (%s%%)" % (gaps, alnSize, gapPercent) print identLine strands = map(strandText, maf.strands) print " Strand = %s / %s" % (strands[1], strands[0]) print for chunk in blastChunker(maf, lineSize, alignmentColumns): cols, rows, starts, ends = chunk startWidth = maxlen(starts) matchSymbols = map(pairwiseMatchSymbol, cols) matchSymbols = ''.join(matchSymbols) print "Query: %-*s %s %s" % (startWidth, starts[1], rows[1], ends[1]) print " %-*s %s" % (startWidth, " ", matchSymbols) print "Sbjct: %-*s %s %s" % (startWidth, starts[0], rows[0], ends[0]) print ##### Routines for converting to HTML format: ##### def writeHtmlHeader(): print ''' <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"> <html lang="en"><head> <meta http-equiv="Content-type" content="text/html; charset=UTF-8"> <title>Reliable Alignments</title> <style type="text/css"> /* Try to force monospace, working around browser insanity: */ pre {font-family: "Courier New", monospace, serif; font-size: 0.8125em} .a {background-color: #3333FF} .b {background-color: #9933FF} .c {background-color: #FF66CC} .d {background-color: #FF3333} .e {background-color: #FF9933} .f {background-color: #FFFF00} .key {display:inline; margin-right:2em} </style> </head><body> <div style="line-height:1"> <pre class="key"><span class="a"> </span> prob > 0.999</pre> <pre class="key"><span class="b"> </span> prob > 0.99 </pre> <pre class="key"><span class="c"> </span> prob > 0.95 </pre> <pre class="key"><span class="d"> </span> prob > 0.9 </pre> <pre class="key"><span class="e"> </span> prob > 0.5 </pre> <pre class="key"><span class="f"> </span> prob ≤ 0.5 </pre> </div> ''' def probabilityClass(probabilityColumn): p = ord(min(probabilityColumn)) - 33 if p >= 30: return 'a' elif p >= 20: return 'b' elif p >= 13: return 'c' elif p >= 10: return 'd' elif p >= 3: return 'e' else: return 'f' def identicalRuns(s): """Yield (item, start, end) for each run of identical items in s.""" beg = 0 for k, v in groupby(s): end = beg + len(list(v)) yield k, beg, end beg = end def htmlSpan(text, classRun): key, beg, end = classRun textbit = text[beg:end] if key: return '<span class="%s">%s</span>' % (key, textbit) else: return textbit def multipleMatchSymbol(alignmentColumn): if isMatch(alignmentColumn): return "*" else: return " " def writeHtml(maf, lineSize): scoreLine = "Alignment" try: scoreLine += " score=%s" % maf.namesAndValues["score"] scoreLine += ", expect=%s" % maf.namesAndValues["expect"] except KeyError: pass print "<h3>%s:</h3>" % scoreLine if maf.pLines: probRows = [i[1] for i in maf.pLines] probCols = izip(*probRows) classes = imap(probabilityClass, probCols) else: classes = repeat(None) nameWidth = maxlen(maf.seqNames) alignmentColumns = zip(*maf.alnStrings) print '<pre>' for chunk in blastChunker(maf, lineSize, alignmentColumns): cols, rows, starts, ends = chunk startWidth = maxlen(starts) endWidth = maxlen(ends) matchSymbols = map(multipleMatchSymbol, cols) matchSymbols = ''.join(matchSymbols) classChunk = islice(classes, lineSize) classRuns = list(identicalRuns(classChunk)) for n, s, r, e in zip(maf.seqNames, starts, rows, ends): spans = [htmlSpan(r, i) for i in classRuns] spans = ''.join(spans) formatParams = nameWidth, n, startWidth, s, spans, endWidth, e print '%-*s %*s %s %*s' % formatParams print ' ' * nameWidth, ' ' * startWidth, matchSymbols print print '</pre>' ##### Routines for reading MAF format: ##### def mafInput(lines, isKeepComments): a = [] s = [] q = [] p = [] for i in lines: w = i.split() if not w: if a: yield a, s, q, p a = [] s = [] q = [] p = [] elif w[0] == "a": a = w elif w[0] == "s": s.append(w) elif w[0] == "q": q.append(w) elif w[0] == "p": p.append(w) elif i[0] == "#" and isKeepComments: print i, if a: yield a, s, q, p def isFormat(myString, myFormat): return myFormat.startswith(myString) def mafConvert(opts, args): format = args[0].lower() if isFormat(format, "sam"): if opts.dictionary: d = readSequenceLengths(fileinput.input(args[1:])) else: d = {} if opts.readgroup: readGroupItems = opts.readgroup.split() rg = "RG:Z:" + readGroupId(readGroupItems) else: readGroupItems = [] rg = "" if not opts.noheader: writeSamHeader(d, opts.dictfile, readGroupItems) inputLines = fileinput.input(args[1:]) if isFormat(format, "html") and not opts.noheader: writeHtmlHeader() isKeepCommentLines = isFormat(format, "tabular") and not opts.noheader oldQueryName = "" for maf in mafInput(inputLines, isKeepCommentLines): if isFormat(format, "axt"): writeAxt(Maf(*maf)) elif isFormat(format, "blast"): maf = Maf(*maf) writeBlast(maf, opts.linesize, oldQueryName) oldQueryName = maf.seqNames[1] elif isFormat(format, "html"): writeHtml(Maf(*maf), opts.linesize) elif isFormat(format, "psl"): writePsl(Maf(*maf), opts.protein) elif isFormat(format, "sam"): writeSam(maf, rg) elif isFormat(format, "tabular"): writeTab(maf) else: raise Exception("unknown format: " + format) if isFormat(format, "html") and not opts.noheader: print "</body></html>" if __name__ == "__main__": signal.signal(signal.SIGPIPE, signal.SIG_DFL) # avoid silly error message usage = """ %prog --help %prog axt mafFile(s) %prog blast mafFile(s) %prog html mafFile(s) %prog psl mafFile(s) %prog sam mafFile(s) %prog tab mafFile(s)""" description = "Read MAF-format alignments & write them in another format." op = optparse.OptionParser(usage=usage, description=description) op.add_option("-p", "--protein", action="store_true", help="assume protein alignments, for psl match counts") op.add_option("-n", "--noheader", action="store_true", help="omit any header lines from the output") op.add_option("-d", "--dictionary", action="store_true", help="include dictionary of sequence lengths in sam format") op.add_option("-f", "--dictfile", help="get sequence dictionary from DICTFILE") op.add_option("-r", "--readgroup", help="read group info for sam format") op.add_option("-l", "--linesize", type="int", default=60, #metavar="CHARS", help="line length for blast and html formats (default: %default)") (opts, args) = op.parse_args() if opts.linesize <= 0: op.error("option -l: should be >= 1") if opts.dictionary and opts.dictfile: op.error("can't use both -d and -f") if len(args) < 1: op.error("I need a format-name and some MAF alignments") if opts.dictionary and (len(args) == 1 or "-" in args[1:]): op.error("need file (not pipe) with option -d") try: mafConvert(opts, args) except KeyboardInterrupt: pass # avoid silly error message except Exception, e: prog = os.path.basename(sys.argv[0]) sys.exit(prog + ": error: " + str(e))