# HG changeset patch # User artbio # Date 1501107421 14400 # Node ID 1ed8619a5611bd69b7744a0577788ee75c00330d # Parent 796552c157dedc1dc17234a1b15dce17680992d7 planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit 0b55a106b1f76e3cc3d89932fef2cc8d3eb24e4f diff -r 796552c157de -r 1ed8619a5611 extractSplitReads_BwaMem.py --- a/extractSplitReads_BwaMem.py Mon Jul 24 08:03:17 2017 -0400 +++ b/extractSplitReads_BwaMem.py Wed Jul 26 18:17:01 2017 -0400 @@ -1,12 +1,11 @@ #!/usr/bin/env python +import re import sys -import getopt -import string from optparse import OptionParser -import re + -def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap): +def extractSplitsFromBwaMem(inFile, numSplits, includeDups, minNonOverlap): if inFile == "stdin": data = sys.stdin else: @@ -14,82 +13,89 @@ for line in data: split = 0 if line[0] == '@': - print line.strip() + print(line.strip()) continue samList = line.strip().split('\t') sam = SAM(samList) - if includeDups==0 and (1024 & sam.flag)==1024: + if includeDups == 0 and (1024 & sam.flag) == 1024: continue for el in sam.tags: if "SA:" in el: - if(len(el.split(";")))<=numSplits: + if(len(el.split(";"))) <= numSplits: split = 1 mate = el.split(",") mateCigar = mate[3] mateFlag = int(0) - if mate[2]=="-": mateFlag = int(16) + if mate[2] == "-": + mateFlag = int(16) if split: read1 = sam.flag & 64 - if read1 == 64: tag = "_1" - else: tag="_2" + if read1 == 64: + tag = "_1" + else: + tag = "_2" samList[0] = sam.query + tag readCigar = sam.cigar - readCigarOps = extractCigarOps(readCigar,sam.flag) + readCigarOps = extractCigarOps(readCigar, sam.flag) readQueryPos = calcQueryPosFromCigar(readCigarOps) - mateCigarOps = extractCigarOps(mateCigar,mateFlag) + mateCigarOps = extractCigarOps(mateCigar, mateFlag) mateQueryPos = calcQueryPosFromCigar(mateCigarOps) - overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos) + overlap = calcQueryOverlap(readQueryPos.qsPos, readQueryPos.qePos, + mateQueryPos.qsPos, mateQueryPos.qePos) nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap mno = min(nonOverlap1, nonOverlap2) if mno >= minNonOverlap: - print "\t".join(samList) + print("\t".join(samList)) -#-------------------------------------------------------------------------------------------------- +# ----------------------------------------------------------------------- # functions -#-------------------------------------------------------------------------------------------------- +# ----------------------------------------------------------------------- + class SAM (object): """ __very__ basic class for SAM input. """ - def __init__(self, samList = []): + def __init__(self, samList=[]): if len(samList) > 0: - self.query = samList[0] - self.flag = int(samList[1]) - self.ref = samList[2] - self.pos = int(samList[3]) - self.mapq = int(samList[4]) - self.cigar = samList[5] - self.matRef = samList[6] - self.matePos = int(samList[7]) - self.iSize = int(samList[8]) - self.seq = samList[9] - self.qual = samList[10] - self.tags = samList[11:]#tags is a list of each tag:vtype:value sets - self.valid = 1 + self.query = samList[0] + self.flag = int(samList[1]) + self.ref = samList[2] + self.pos = int(samList[3]) + self.mapq = int(samList[4]) + self.cigar = samList[5] + self.matRef = samList[6] + self.matePos = int(samList[7]) + self.iSize = int(samList[8]) + self.seq = samList[9] + self.qual = samList[10] + self.tags = samList[11:] + # tags is a list of each tag:vtype:value sets + self.valid = 1 else: self.valid = 0 self.query = 'null' - def extractTagValue (self, tagID): + def extractTagValue(self, tagID): for tag in self.tags: - tagParts = tag.split(':', 2); + tagParts = tag.split(':', 2) if (tagParts[0] == tagID): if (tagParts[1] == 'i'): - return int(tagParts[2]); + return int(tagParts[2]) elif (tagParts[1] == 'H'): - return int(tagParts[2],16); - return tagParts[2]; - return None; + return int(tagParts[2], 16) + return tagParts[2] + return None -#----------------------------------------------- + cigarPattern = '([0-9]+[MIDNSHP])' cigarSearch = re.compile(cigarPattern) atomicCigarPattern = '([0-9]+)([MIDNSHP])' atomicCigarSearch = re.compile(atomicCigarPattern) -def extractCigarOps(cigar,flag): + +def extractCigarOps(cigar, flag): if (cigar == "*"): cigarOps = [] elif (flag & 0x0010): @@ -104,7 +110,7 @@ cigarOps.append(cigar) cigarOps = cigarOps cigarOps.reverse() - ##do in reverse order because negative strand## + # do in reverse order because negative strand else: cigarOpStrings = cigarSearch.findall(cigar) cigarOps = [] @@ -117,34 +123,38 @@ # cigarOps = cigarOps return(cigarOps) + def calcQueryPosFromCigar(cigarOps): qsPos = 0 qePos = 0 - qLen = 0 + qLen = 0 # if first op is a H, need to shift start position - # the opPosition counter sees if the for loop is looking at the first index of the cigar object + # the opPosition counter sees if the for loop is looking + # at the first index of the cigar object opPosition = 0 for cigar in cigarOps: if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'): qsPos += cigar.length qePos += cigar.length - qLen += cigar.length + qLen += cigar.length elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'): - qLen += cigar.length + qLen += cigar.length elif cigar.op == 'M' or cigar.op == 'I': qePos += cigar.length - qLen += cigar.length + qLen += cigar.length opPosition += 1 - d = queryPos(qsPos, qePos, qLen); + d = queryPos(qsPos, qePos, qLen) return d + class cigarOp (object): """ sturct to store a discrete CIGAR operations """ def __init__(self, opLength, op): self.length = int(opLength) - self.op = op + self.op = op + class queryPos (object): """ @@ -153,50 +163,60 @@ def __init__(self, qsPos, qePos, qLen): self.qsPos = int(qsPos) self.qePos = int(qePos) - self.qLen = int(qLen) + self.qLen = int(qLen) -def calcQueryOverlap(s1,e1,s2,e2): +def calcQueryOverlap(s1, e1, s2, e2): o = 1 + min(e1, e2) - max(s1, s2) return max(0, o) ############################################### + class Usage(Exception): def __init__(self, msg): self.msg = msg + def main(): - usage = """%prog -i extractSplitReads_BwaMem v0.1.0 Author: Ira Hall -Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates. +Description: Get split-read alignments from bwa-mem in lumpy compatible +format. Ignores reads marked as duplicates. Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405. """ parser = OptionParser(usage) parser.add_option("-i", "--inFile", dest="inFile", - help="A SAM file or standard input (-i stdin).", - metavar="FILE") - parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int", - help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2", - metavar="INT") - parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0, - help="Include alignments marked as duplicates. Default=False") - parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int", - help="minimum non-overlap between split alignments on the query (default=20)", - metavar="INT") + help="A SAM file or standard input (-i stdin).", + metavar="FILE") + parser.add_option("-n", "--numSplits", dest="numSplits", default=2, + type="int", + help='''The maximum number of split-read mappings to + allow per read. Reads with more are excluded. + Default=2''', metavar="INT") + parser.add_option("-d", "--includeDups", dest="includeDups", + action="store_true", default=0, + help='''Include alignments marked as duplicates. + Default=False''') + parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", + default=20, type="int", help='''minimum non-overlap between + split alignments on the query (default=20)''', + metavar="INT") (opts, args) = parser.parse_args() if opts.inFile is None: parser.print_help() print else: try: - extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap) + extractSplitsFromBwaMem(opts.inFile, opts.numSplits, + opts.includeDups, opts.minNonOverlap) except IOError as err: - sys.stderr.write("IOError " + str(err) + "\n"); + sys.stderr.write("IOError " + str(err) + "\n") return + + if __name__ == "__main__": sys.exit(main()) diff -r 796552c157de -r 1ed8619a5611 lumpy.xml --- a/lumpy.xml Mon Jul 24 08:03:17 2017 -0400 +++ b/lumpy.xml Wed Jul 26 18:17:01 2017 -0400 @@ -1,4 +1,4 @@ - + find structural variants lumpy-sv diff -r 796552c157de -r 1ed8619a5611 pairend_distro.py --- a/pairend_distro.py Mon Jul 24 08:03:17 2017 -0400 +++ b/pairend_distro.py Wed Jul 26 18:17:01 2017 -0400 @@ -9,9 +9,9 @@ # rl6sf@virginia.edu import sys +from optparse import OptionParser + import numpy as np -from operator import itemgetter -from optparse import OptionParser # some constants for sam/bam field ids SAM_FLAG = 1 @@ -20,32 +20,16 @@ SAM_ISIZE = 8 parser = OptionParser() - -parser.add_option("-r", - "--read_length", - type="int", - dest="read_length", - help="Read length") - -parser.add_option("-X", - dest="X", - type="int", - help="Number of stdevs from mean to extend") +parser.add_option("-r", "--read_length", type="int", dest="read_length", + help="Read length") +parser.add_option("-X", dest="X", type="int", + help="Number of stdevs from mean to extend") +parser.add_option("-N", dest="N", type="int", help="Number to sample") +parser.add_option("-o", dest="output_file", help="Output file") +parser.add_option("-m", dest="mads", type="int", default=10, + help='''Outlier cutoff in # of median absolute deviations + (unscaled, upper only)''') -parser.add_option("-N", - dest="N", - type="int", - help="Number to sample") - -parser.add_option("-o", - dest="output_file", - help="Output file") - -parser.add_option("-m", - dest="mads", - type="int", - default=10, - help="Outlier cutoff in # of median absolute deviations (unscaled, upper only)") def unscaled_upper_mad(xs): """Return a tuple consisting of the median of xs followed by the @@ -96,7 +80,8 @@ # warn if very few elements in distribution min_elements = 1000 if len(L) < min_elements: - sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % (len(L), min_elements)) + sys.stderr.write("Warning: only %s elements in distribution (min: %s)\n" % + (len(L), min_elements)) mean = "NA" stdev = "NA" @@ -110,7 +95,7 @@ new_len = len(L) removed = c - new_len sys.stderr.write("Removed %d outliers with isize >= %d\n" % - (removed, upper_cutoff)) + (removed, upper_cutoff)) c = new_len mean = np.mean(L) @@ -125,7 +110,7 @@ for x in L: if (x >= start) and (x <= end): j = int(x - start) - H[j] = H[ int(x - start) ] + 1 + H[j] = H[int(x - start)] + 1 s += 1 f = open(options.output_file, 'w') @@ -133,8 +118,5 @@ for i in range(end - start): o = str(i) + "\t" + str(float(H[i])/float(s)) + "\n" f.write(o) - - f.close() - print('mean:' + str(mean) + '\tstdev:' + str(stdev))