| 
6
 | 
     1 #!/usr/bin/env python
 | 
| 
 | 
     2 """
 | 
| 
 | 
     3 Parse SAM file format from Bowtie2.
 | 
| 
 | 
     4 
 | 
| 
 | 
     5 Features:
 | 
| 
 | 
     6  - remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal
 | 
| 
 | 
     7  - the refseq field is assumed to look like:
 | 
| 
 | 
     8      gi|228472346|ref|NZ_ACLQ01000011.1| 
 | 
| 
 | 
     9    which is common in HMP accessions to SRA.  In this case, the string
 | 
| 
 | 
    10    starting with "NZ_" is treated as the refseq
 | 
| 
 | 
    11 """
 | 
| 
 | 
    12 
 | 
| 
 | 
    13 from __future__ import division
 | 
| 
 | 
    14 import sys, getopt
 | 
| 
 | 
    15 
 | 
| 
 | 
    16 class SAMFragment:
 | 
| 
 | 
    17     """
 | 
| 
 | 
    18     Represents a fragment match to a reference genome.
 | 
| 
 | 
    19     """
 | 
| 
 | 
    20 
 | 
| 
 | 
    21     def __init__(self, sam_string):
 | 
| 
 | 
    22         '''Look at SAM file format documentation for more details'''
 | 
| 
 | 
    23         fields = sam_string.rstrip().split()
 | 
| 
 | 
    24 
 | 
| 
 | 
    25         self.name = fields[0] #sample run name (QNAME)
 | 
| 
 | 
    26         self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME
 | 
| 
 | 
    27 #         self.refseq = fields[2]
 | 
| 
 | 
    28         self.location = int(fields[3]) # POS
 | 
| 
 | 
    29         self.cigar = fields[5] # CIGAR
 | 
| 
 | 
    30         self.sequence = fields[9] # SEQ
 | 
| 
 | 
    31         self.length = len(self.sequence) # length of sequence
 | 
| 
 | 
    32 
 | 
| 
 | 
    33         for field in fields[11:]:
 | 
| 
 | 
    34             # Parses the OPT field that contains the MD tag for identity info. 
 | 
| 
 | 
    35             if field[0:5] == "MD:Z:":
 | 
| 
 | 
    36                 md = field[5:]
 | 
| 
 | 
    37                 # warning: finite state machine ahead!
 | 
| 
 | 
    38                 # state = 0: nothing to remember
 | 
| 
 | 
    39                 # state = 1: read a number, looking to see if next is a number
 | 
| 
 | 
    40                 # state = 2: read a "^", looking for A, T, C, and G for deletion
 | 
| 
 | 
    41                 state = 0
 | 
| 
 | 
    42                 tempstr = ''
 | 
| 
 | 
    43                 matches = 0
 | 
| 
 | 
    44                 mismatches = 0
 | 
| 
 | 
    45                 deletions = 0
 | 
| 
 | 
    46                 for char in md:
 | 
| 
 | 
    47                     if char in ('0', '1', '2', '3', '4', '5', '6', '7',
 | 
| 
 | 
    48                                 '8', '9'):
 | 
| 
 | 
    49                         if state == 2:
 | 
| 
 | 
    50                             deletions += len(tempstr)
 | 
| 
 | 
    51                             tempstr = ''
 | 
| 
 | 
    52                         state = 1
 | 
| 
 | 
    53                         tempstr += char
 | 
| 
 | 
    54                     elif char == '^':
 | 
| 
 | 
    55                         if state == 1:
 | 
| 
 | 
    56                             matches += int(tempstr)
 | 
| 
 | 
    57                             tempstr = ''
 | 
| 
 | 
    58                         state = 2
 | 
| 
 | 
    59                     elif char in ('A', 'T', 'C', 'G'):
 | 
| 
 | 
    60                         if state == 1:
 | 
| 
 | 
    61                             matches += int(tempstr)
 | 
| 
 | 
    62                             tempstr = ''
 | 
| 
 | 
    63                             state = 0
 | 
| 
 | 
    64                             mismatches += 1
 | 
| 
 | 
    65                         elif state == 2:
 | 
| 
 | 
    66                             tempstr += char
 | 
| 
 | 
    67                         else:
 | 
| 
 | 
    68                             mismatches += 1
 | 
| 
 | 
    69                 if state == 1:
 | 
| 
 | 
    70                     matches += int(tempstr)
 | 
| 
 | 
    71                 elif state == 2:
 | 
| 
 | 
    72                     deletions += len(tempstr)
 | 
| 
 | 
    73 
 | 
| 
 | 
    74         self.matches = matches
 | 
| 
 | 
    75         self.mismatches = mismatches
 | 
| 
 | 
    76         self.deletions = deletions
 | 
| 
 | 
    77         self.identity = 100 * (matches / (matches + mismatches))
 | 
| 
 | 
    78 
 | 
| 
 | 
    79     def __repr__(self):
 | 
| 
 | 
    80         out = []
 | 
| 
 | 
    81         out.append("Name:      %s\n" % self.name)
 | 
| 
 | 
    82         out.append("Location:  %d\n" % self.location)
 | 
| 
 | 
    83         out.append("Identity:  %4.1f\n" % self.identity)
 | 
| 
 | 
    84         return ''.join(out)
 | 
| 
 | 
    85 
 | 
| 
 | 
    86 
 | 
| 
 | 
    87 def SAMFile(filename):
 | 
| 
 | 
    88     """
 | 
| 
 | 
    89     Parses a SAM format file, returning SAMFragment objects.
 | 
| 
 | 
    90 
 | 
| 
 | 
    91     This is a generator, so that there is no need to store a full list of
 | 
| 
 | 
    92     fragments in memory if it's not necessary to do so.
 | 
| 
 | 
    93     """
 | 
| 
 | 
    94 
 | 
| 
 | 
    95     f = open(filename, 'rb')
 | 
| 
 | 
    96 
 | 
| 
 | 
    97     for line in f:
 | 
| 
 | 
    98         if line.startswith('@'):
 | 
| 
 | 
    99             continue
 | 
| 
 | 
   100         if not line.startswith("Your job"):  # preprocessing error on my part
 | 
| 
 | 
   101             yield SAMFragment(line)
 | 
| 
 | 
   102 
 | 
| 
 | 
   103     f.close()
 | 
| 
 | 
   104 
 | 
| 
 | 
   105 
 | 
| 
 | 
   106 if __name__ == '__main__':
 | 
| 
 | 
   107     opts, args = getopt.getopt(sys.argv[1:], 'h')
 | 
| 
 | 
   108 
 | 
| 
 | 
   109     for o, a in opts:
 | 
| 
 | 
   110         if o == '-h':
 | 
| 
 | 
   111             print ('Usage:')
 | 
| 
 | 
   112             print ('  %s <sam_file>' % sys.argv[0])
 | 
| 
 | 
   113             sys.exit(0)
 | 
| 
 | 
   114         else:
 | 
| 
 | 
   115             print ('unhandled option')
 | 
| 
 | 
   116             sys.exit(1)
 | 
| 
 | 
   117 
 | 
| 
 | 
   118     if len(args) == 0:
 | 
| 
 | 
   119         print ('Specify a SAM file as an argument')
 | 
| 
 | 
   120         sys.exit(1)
 | 
| 
 | 
   121 
 | 
| 
 | 
   122     for frag in SAMFile(args[0]):
 | 
| 
 | 
   123         print (frag)
 |