Mercurial > repos > schang > frp_tool
comparison SAM_parser.py @ 6:d4ac3899145b draft
Uploaded
| author | schang |
|---|---|
| date | Tue, 02 Dec 2014 19:54:26 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 5:70e76f6bfcfb | 6:d4ac3899145b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Parse SAM file format from Bowtie2. | |
| 4 | |
| 5 Features: | |
| 6 - remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal | |
| 7 - the refseq field is assumed to look like: | |
| 8 gi|228472346|ref|NZ_ACLQ01000011.1| | |
| 9 which is common in HMP accessions to SRA. In this case, the string | |
| 10 starting with "NZ_" is treated as the refseq | |
| 11 """ | |
| 12 | |
| 13 from __future__ import division | |
| 14 import sys, getopt | |
| 15 | |
| 16 class SAMFragment: | |
| 17 """ | |
| 18 Represents a fragment match to a reference genome. | |
| 19 """ | |
| 20 | |
| 21 def __init__(self, sam_string): | |
| 22 '''Look at SAM file format documentation for more details''' | |
| 23 fields = sam_string.rstrip().split() | |
| 24 | |
| 25 self.name = fields[0] #sample run name (QNAME) | |
| 26 self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME | |
| 27 # self.refseq = fields[2] | |
| 28 self.location = int(fields[3]) # POS | |
| 29 self.cigar = fields[5] # CIGAR | |
| 30 self.sequence = fields[9] # SEQ | |
| 31 self.length = len(self.sequence) # length of sequence | |
| 32 | |
| 33 for field in fields[11:]: | |
| 34 # Parses the OPT field that contains the MD tag for identity info. | |
| 35 if field[0:5] == "MD:Z:": | |
| 36 md = field[5:] | |
| 37 # warning: finite state machine ahead! | |
| 38 # state = 0: nothing to remember | |
| 39 # state = 1: read a number, looking to see if next is a number | |
| 40 # state = 2: read a "^", looking for A, T, C, and G for deletion | |
| 41 state = 0 | |
| 42 tempstr = '' | |
| 43 matches = 0 | |
| 44 mismatches = 0 | |
| 45 deletions = 0 | |
| 46 for char in md: | |
| 47 if char in ('0', '1', '2', '3', '4', '5', '6', '7', | |
| 48 '8', '9'): | |
| 49 if state == 2: | |
| 50 deletions += len(tempstr) | |
| 51 tempstr = '' | |
| 52 state = 1 | |
| 53 tempstr += char | |
| 54 elif char == '^': | |
| 55 if state == 1: | |
| 56 matches += int(tempstr) | |
| 57 tempstr = '' | |
| 58 state = 2 | |
| 59 elif char in ('A', 'T', 'C', 'G'): | |
| 60 if state == 1: | |
| 61 matches += int(tempstr) | |
| 62 tempstr = '' | |
| 63 state = 0 | |
| 64 mismatches += 1 | |
| 65 elif state == 2: | |
| 66 tempstr += char | |
| 67 else: | |
| 68 mismatches += 1 | |
| 69 if state == 1: | |
| 70 matches += int(tempstr) | |
| 71 elif state == 2: | |
| 72 deletions += len(tempstr) | |
| 73 | |
| 74 self.matches = matches | |
| 75 self.mismatches = mismatches | |
| 76 self.deletions = deletions | |
| 77 self.identity = 100 * (matches / (matches + mismatches)) | |
| 78 | |
| 79 def __repr__(self): | |
| 80 out = [] | |
| 81 out.append("Name: %s\n" % self.name) | |
| 82 out.append("Location: %d\n" % self.location) | |
| 83 out.append("Identity: %4.1f\n" % self.identity) | |
| 84 return ''.join(out) | |
| 85 | |
| 86 | |
| 87 def SAMFile(filename): | |
| 88 """ | |
| 89 Parses a SAM format file, returning SAMFragment objects. | |
| 90 | |
| 91 This is a generator, so that there is no need to store a full list of | |
| 92 fragments in memory if it's not necessary to do so. | |
| 93 """ | |
| 94 | |
| 95 f = open(filename, 'rb') | |
| 96 | |
| 97 for line in f: | |
| 98 if line.startswith('@'): | |
| 99 continue | |
| 100 if not line.startswith("Your job"): # preprocessing error on my part | |
| 101 yield SAMFragment(line) | |
| 102 | |
| 103 f.close() | |
| 104 | |
| 105 | |
| 106 if __name__ == '__main__': | |
| 107 opts, args = getopt.getopt(sys.argv[1:], 'h') | |
| 108 | |
| 109 for o, a in opts: | |
| 110 if o == '-h': | |
| 111 print ('Usage:') | |
| 112 print (' %s <sam_file>' % sys.argv[0]) | |
| 113 sys.exit(0) | |
| 114 else: | |
| 115 print ('unhandled option') | |
| 116 sys.exit(1) | |
| 117 | |
| 118 if len(args) == 0: | |
| 119 print ('Specify a SAM file as an argument') | |
| 120 sys.exit(1) | |
| 121 | |
| 122 for frag in SAMFile(args[0]): | |
| 123 print (frag) |
