| 6 | 1 #!/usr/bin/env python | 
|  | 2 """ | 
|  | 3 Parse SAM file format from Bowtie2. | 
|  | 4 | 
|  | 5 Features: | 
|  | 6  - remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal | 
|  | 7  - the refseq field is assumed to look like: | 
|  | 8      gi|228472346|ref|NZ_ACLQ01000011.1| | 
|  | 9    which is common in HMP accessions to SRA.  In this case, the string | 
|  | 10    starting with "NZ_" is treated as the refseq | 
|  | 11 """ | 
|  | 12 | 
|  | 13 from __future__ import division | 
|  | 14 import sys, getopt | 
|  | 15 | 
|  | 16 class SAMFragment: | 
|  | 17     """ | 
|  | 18     Represents a fragment match to a reference genome. | 
|  | 19     """ | 
|  | 20 | 
|  | 21     def __init__(self, sam_string): | 
|  | 22         '''Look at SAM file format documentation for more details''' | 
|  | 23         fields = sam_string.rstrip().split() | 
|  | 24 | 
|  | 25         self.name = fields[0] #sample run name (QNAME) | 
|  | 26         self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME | 
|  | 27 #         self.refseq = fields[2] | 
|  | 28         self.location = int(fields[3]) # POS | 
|  | 29         self.cigar = fields[5] # CIGAR | 
|  | 30         self.sequence = fields[9] # SEQ | 
|  | 31         self.length = len(self.sequence) # length of sequence | 
|  | 32 | 
|  | 33         for field in fields[11:]: | 
|  | 34             # Parses the OPT field that contains the MD tag for identity info. | 
|  | 35             if field[0:5] == "MD:Z:": | 
|  | 36                 md = field[5:] | 
|  | 37                 # warning: finite state machine ahead! | 
|  | 38                 # state = 0: nothing to remember | 
|  | 39                 # state = 1: read a number, looking to see if next is a number | 
|  | 40                 # state = 2: read a "^", looking for A, T, C, and G for deletion | 
|  | 41                 state = 0 | 
|  | 42                 tempstr = '' | 
|  | 43                 matches = 0 | 
|  | 44                 mismatches = 0 | 
|  | 45                 deletions = 0 | 
|  | 46                 for char in md: | 
|  | 47                     if char in ('0', '1', '2', '3', '4', '5', '6', '7', | 
|  | 48                                 '8', '9'): | 
|  | 49                         if state == 2: | 
|  | 50                             deletions += len(tempstr) | 
|  | 51                             tempstr = '' | 
|  | 52                         state = 1 | 
|  | 53                         tempstr += char | 
|  | 54                     elif char == '^': | 
|  | 55                         if state == 1: | 
|  | 56                             matches += int(tempstr) | 
|  | 57                             tempstr = '' | 
|  | 58                         state = 2 | 
|  | 59                     elif char in ('A', 'T', 'C', 'G'): | 
|  | 60                         if state == 1: | 
|  | 61                             matches += int(tempstr) | 
|  | 62                             tempstr = '' | 
|  | 63                             state = 0 | 
|  | 64                             mismatches += 1 | 
|  | 65                         elif state == 2: | 
|  | 66                             tempstr += char | 
|  | 67                         else: | 
|  | 68                             mismatches += 1 | 
|  | 69                 if state == 1: | 
|  | 70                     matches += int(tempstr) | 
|  | 71                 elif state == 2: | 
|  | 72                     deletions += len(tempstr) | 
|  | 73 | 
|  | 74         self.matches = matches | 
|  | 75         self.mismatches = mismatches | 
|  | 76         self.deletions = deletions | 
|  | 77         self.identity = 100 * (matches / (matches + mismatches)) | 
|  | 78 | 
|  | 79     def __repr__(self): | 
|  | 80         out = [] | 
|  | 81         out.append("Name:      %s\n" % self.name) | 
|  | 82         out.append("Location:  %d\n" % self.location) | 
|  | 83         out.append("Identity:  %4.1f\n" % self.identity) | 
|  | 84         return ''.join(out) | 
|  | 85 | 
|  | 86 | 
|  | 87 def SAMFile(filename): | 
|  | 88     """ | 
|  | 89     Parses a SAM format file, returning SAMFragment objects. | 
|  | 90 | 
|  | 91     This is a generator, so that there is no need to store a full list of | 
|  | 92     fragments in memory if it's not necessary to do so. | 
|  | 93     """ | 
|  | 94 | 
|  | 95     f = open(filename, 'rb') | 
|  | 96 | 
|  | 97     for line in f: | 
|  | 98         if line.startswith('@'): | 
|  | 99             continue | 
|  | 100         if not line.startswith("Your job"):  # preprocessing error on my part | 
|  | 101             yield SAMFragment(line) | 
|  | 102 | 
|  | 103     f.close() | 
|  | 104 | 
|  | 105 | 
|  | 106 if __name__ == '__main__': | 
|  | 107     opts, args = getopt.getopt(sys.argv[1:], 'h') | 
|  | 108 | 
|  | 109     for o, a in opts: | 
|  | 110         if o == '-h': | 
|  | 111             print ('Usage:') | 
|  | 112             print ('  %s <sam_file>' % sys.argv[0]) | 
|  | 113             sys.exit(0) | 
|  | 114         else: | 
|  | 115             print ('unhandled option') | 
|  | 116             sys.exit(1) | 
|  | 117 | 
|  | 118     if len(args) == 0: | 
|  | 119         print ('Specify a SAM file as an argument') | 
|  | 120         sys.exit(1) | 
|  | 121 | 
|  | 122     for frag in SAMFile(args[0]): | 
|  | 123         print (frag) |