Mercurial > repos > drosofff > lumpy
comparison extractSplitReads_BwaMem.py @ 0:8b3daa745d9b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy commit c0bfc4b2215705e1b5fd1d4e60b1d72e5da13c92
| author | drosofff |
|---|---|
| date | Tue, 06 Dec 2016 05:46:28 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:8b3daa745d9b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 import sys | |
| 4 import getopt | |
| 5 import string | |
| 6 from optparse import OptionParser | |
| 7 import re | |
| 8 | |
def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap):
    """
    Write the header lines and the split-read alignments of a SAM stream
    to standard output.

    inFile        -- path of a SAM file, or the literal string "stdin".
    numSplits     -- an alignment is kept only when splitting its SA tag on
                     ";" yields at most this many pieces.
    includeDups   -- when equal to 0 (or False), alignments with flag bit
                     1024 (marked as duplicates) are skipped.
    minNonOverlap -- minimum number of query bases that the alignment and
                     the first alignment listed in its SA tag must each
                     have outside of their common overlap.

    Kept alignments are printed tab-joined, with "_1" appended to the query
    name when flag bit 64 is set and "_2" otherwise.  Header lines (first
    character "@") are echoed unchanged apart from whitespace stripping.
    Blank lines are skipped.
    """
    if inFile == "stdin":
        data = sys.stdin
    else:
        data = open(inFile, 'r')
    try:
        for line in data:
            if line[0] == '@':
                sys.stdout.write(line.strip() + "\n")
                continue
            stripped = line.strip()
            if not stripped:
                # A blank line carries no SAM fields; parsing it would fail.
                continue
            samList = stripped.split('\t')
            sam = SAM(samList)
            if includeDups==0 and (1024 & sam.flag)==1024:
                continue
            mateCigar = None
            mateFlag = 0
            for el in sam.tags:
                # Only the SA tag itself is of interest; a substring test
                # would also fire on "SA:" appearing inside another tag.
                if el.startswith("SA:") and len(el.split(";")) <= numSplits:
                    # Comma-separated fields of the first SA entry:
                    # index 2 is the strand, index 3 is the CIGAR.
                    mate = el.split(",")
                    mateCigar = mate[3]
                    # 16 is the flag bit tested by extractCigarOps to
                    # reverse the operation order (negative strand).
                    mateFlag = 16 if mate[2] == "-" else 0
            if mateCigar is None:
                continue
            if (sam.flag & 64) == 64:
                tag = "_1"
            else:
                tag = "_2"
            samList[0] = sam.query + tag
            readQueryPos = calcQueryPosFromCigar(extractCigarOps(sam.cigar,sam.flag))
            mateQueryPos = calcQueryPosFromCigar(extractCigarOps(mateCigar,mateFlag))
            overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos)
            nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap
            nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap
            mno = min(nonOverlap1, nonOverlap2)
            if mno >= minNonOverlap:
                sys.stdout.write("\t".join(samList) + "\n")
    finally:
        # Release the file handle we opened; never close the caller's stdin.
        if data is not sys.stdin:
            data.close()
| 47 | |
| 48 #-------------------------------------------------------------------------------------------------- | |
| 49 # functions | |
| 50 #-------------------------------------------------------------------------------------------------- | |
| 51 | |
class SAM (object):
    """
    __very__ basic class for SAM input.

    Built from the list of tab-separated fields of one alignment line.
    The first eleven fields are stored as attributes (numeric ones as int);
    every remaining field is kept verbatim in ``tags``.  An empty or
    missing field list yields a placeholder record with ``valid == 0``,
    ``query == 'null'`` and no tags.
    """
    def __init__(self, samList=None):
        if samList is not None and len(samList) > 0:
            self.query = samList[0]
            self.flag = int(samList[1])
            self.ref = samList[2]
            self.pos = int(samList[3])
            self.mapq = int(samList[4])
            self.cigar = samList[5]
            self.matRef = samList[6]
            self.matePos = int(samList[7])
            self.iSize = int(samList[8])
            self.seq = samList[9]
            self.qual = samList[10]
            self.tags = samList[11:]  # tags is a list of each tag:vtype:value sets
            self.valid = 1
        else:
            self.valid = 0
            self.query = 'null'
            # Keep extractTagValue usable on a placeholder record.
            self.tags = []

    def extractTagValue (self, tagID):
        """
        Return the value of the first tag named tagID, or None if absent.

        Values of type 'i' are returned as int, values of type 'H' are
        parsed as base-16 integers, anything else is returned as a string.
        Tags that do not have the three tag:vtype:value parts are ignored.
        """
        for tag in self.tags:
            tagParts = tag.split(':', 2)
            if len(tagParts) < 3:
                # Malformed tag: there is no type/value to inspect.
                continue
            if tagParts[0] != tagID:
                continue
            if tagParts[1] == 'i':
                return int(tagParts[2])
            if tagParts[1] == 'H':
                return int(tagParts[2], 16)
            return tagParts[2]
        return None
| 85 | |
| 86 #----------------------------------------------- | |
# One CIGAR operation (a run of digits followed by an op letter),
# captured as a single string such as "20S".
cigarPattern = r'([0-9]+[MIDNSHP])'
cigarSearch = re.compile(cigarPattern)
# The same operation, captured as two groups: the digits and the op letter.
atomicCigarPattern = r'([0-9]+)([MIDNSHP])'
atomicCigarSearch = re.compile(atomicCigarPattern)
| 91 | |
def extractCigarOps(cigar,flag):
    """
    Turn a CIGAR string into a list of cigarOp objects.

    A CIGAR of "*" gives an empty list.  When bit 0x0010 of flag is set
    the operations are returned in reverse order (negative strand);
    otherwise they keep the order in which they appear in the string.
    """
    if cigar == "*":
        return []
    # Each match is a (length, op letter) pair of strings.
    parsedOps = [cigarOp(opLength, opCode)
                 for opLength, opCode in atomicCigarSearch.findall(cigar)]
    if flag & 0x0010:
        ##do in reverse order because negative strand##
        parsedOps.reverse()
    return parsedOps
| 119 | |
def calcQueryPosFromCigar(cigarOps):
    """
    Derive query start, query end and query length from a list of CIGAR
    operations and return them wrapped in a queryPos object.

    A clip ('H' or 'S') in the first position shifts both start and end;
    a clip anywhere else only adds to the length.  'M' and 'I' operations
    extend the end and the length.  All other operations are ignored.
    """
    start = 0
    end = 0
    total = 0
    for index, entry in enumerate(cigarOps):
        if entry.op in ('H', 'S'):
            if index == 0:
                # Leading clip: the aligned part begins after it.
                start += entry.length
                end += entry.length
            total += entry.length
        elif entry.op in ('M', 'I'):
            end += entry.length
            total += entry.length
    return queryPos(start, end, total)
| 140 | |
class cigarOp (object):
    """
    Struct holding one discrete CIGAR operation: its op code and its
    length (converted to int).
    """
    def __init__(self, opLength, op):
        self.op = op
        # The length may arrive as text (e.g. a regex capture); store an int.
        self.length = int(opLength)
| 148 | |
class queryPos (object):
    """
    Struct holding the query start position, query end position and query
    length computed from CIGAR operations; all three are stored as int.
    """
    def __init__(self, qsPos, qePos, qLen):
        self.qsPos, self.qePos, self.qLen = int(qsPos), int(qePos), int(qLen)
| 157 | |
| 158 | |
def calcQueryOverlap(s1,e1,s2,e2):
    """
    Return 1 + min(e1, e2) - max(s1, s2), floored at 0: the size of the
    overlap between the intervals (s1, e1) and (s2, e2).
    """
    sharedStart = max(s1, s2)
    sharedEnd = min(e1, e2)
    span = 1 + sharedEnd - sharedStart
    return span if span > 0 else 0
| 162 | |
| 163 ############################################### | |
| 164 | |
class Usage(Exception):
    """
    Exception that keeps the message it was created with in ``msg``.
    """
    def __init__(self, msg):
        self.msg = msg
| 168 | |
def main():
    """
    Command-line entry point: parse the options and run
    extractSplitsFromBwaMem on the requested input.

    Returns None after a normal run, or after printing the help text when
    no input was given.  Returns 1 when an IOError was reported on stderr,
    so that a caller passing the result to sys.exit() gets a failing exit
    status.
    """
    usage = """%prog -i <file>

extractSplitReads_BwaMem v0.1.0
Author: Ira Hall
Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates.
Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405.
"""
    parser = OptionParser(usage)

    parser.add_option("-i", "--inFile", dest="inFile",
        help="A SAM file or standard input (-i stdin).",
        metavar="FILE")
    parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int",
        help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2",
        metavar="INT")
    parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=False,
        help="Include alignments marked as duplicates. Default=False")
    parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int",
        help="minimum non-overlap between split alignments on the query (default=20)",
        metavar="INT")
    (opts, args) = parser.parse_args()
    if opts.inFile is None:
        parser.print_help()
        # Blank line after the help text (portable form of a bare print).
        sys.stdout.write("\n")
        return None
    try:
        extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap)
    except IOError as err:
        sys.stderr.write("IOError " + str(err) + "\n")
        # Signal the failure to the caller instead of exiting with status 0.
        return 1
    return None
# Run the command-line entry point only when executed as a script, and
# hand its return value to the interpreter as the exit status.
if __name__ == "__main__":
    exitStatus = main()
    sys.exit(exitStatus)
