frp_tool: SAM_parser.py comparison

comparison SAM_parser.py @ 6:d4ac3899145b draft

Uploaded

author	schang
date	Tue, 02 Dec 2014 19:54:26 -0500
parents
children

comparison

equal deleted inserted replaced

-:70e76f6bfcfb
+:d4ac3899145b
+#!/usr/bin/env python
+"""
+Parse SAM file format from Bowtie2.
+Features:
+- remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal
+- the refseq field is assumed to look like:
+gi|228472346|ref|NZ_ACLQ01000011.1|
+which is common in HMP accessions to SRA.  In this case, the string
+starting with "NZ_" is treated as the refseq
+"""
+from __future__ import division
+import sys, getopt
+class SAMFragment:
+"""
+Represents a fragment match to a reference genome.
+"""
+def __init__(self, sam_string):
+'''Look at SAM file format documentation for more details'''
+fields = sam_string.rstrip().split()
+self.name = fields[0] #sample run name (QNAME)
+self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME
+#         self.refseq = fields[2]
+self.location = int(fields[3]) # POS
+self.cigar = fields[5] # CIGAR
+self.sequence = fields[9] # SEQ
+self.length = len(self.sequence) # length of sequence
+for field in fields[11:]:
+# Parses the OPT field that contains the MD tag for identity info.
+if field[0:5] == "MD:Z:":
+md = field[5:]
+# warning: finite state machine ahead!
+# state = 0: nothing to remember
+# state = 1: read a number, looking to see if next is a number
+# state = 2: read a "^", looking for A, T, C, and G for deletion
+state = 0
+tempstr = ''
+matches = 0
+mismatches = 0
+deletions = 0
+for char in md:
+if char in ('0', '1', '2', '3', '4', '5', '6', '7',
+'8', '9'):
+if state == 2:
+deletions += len(tempstr)
+tempstr = ''
+state = 1
+tempstr += char
+elif char == '^':
+if state == 1:
+matches += int(tempstr)
+tempstr = ''
+state = 2
+elif char in ('A', 'T', 'C', 'G'):
+if state == 1:
+matches += int(tempstr)
+tempstr = ''
+state = 0
+mismatches += 1
+elif state == 2:
+tempstr += char
+else:
+mismatches += 1
+if state == 1:
+matches += int(tempstr)
+elif state == 2:
+deletions += len(tempstr)
+self.matches = matches
+self.mismatches = mismatches
+self.deletions = deletions
+self.identity = 100 * (matches / (matches + mismatches))
+def __repr__(self):
+out = []
+out.append("Name:      %s\n" % self.name)
+out.append("Location:  %d\n" % self.location)
+out.append("Identity:  %4.1f\n" % self.identity)
+return ''.join(out)
+def SAMFile(filename):
+"""
+Parses a SAM format file, returning SAMFragment objects.
+This is a generator, so that there is no need to store a full list of
+fragments in memory if it's not necessary to do so.
+"""
+f = open(filename, 'rb')
+for line in f:
+if line.startswith('@'):
+continue
+if not line.startswith("Your job"):  # preprocessing error on my part
+yield SAMFragment(line)
+f.close()
+if __name__ == '__main__':
+opts, args = getopt.getopt(sys.argv[1:], 'h')
+for o, a in opts:
+if o == '-h':
+print ('Usage:')
+print ('  %s <sam_file>' % sys.argv[0])
+sys.exit(0)
+else:
+print ('unhandled option')
+sys.exit(1)
+if len(args) == 0:
+print ('Specify a SAM file as an argument')
+sys.exit(1)
+for frag in SAMFile(args[0]):
+print (frag)

Mercurial > repos > schang > frp_tool

comparison SAM_parser.py @ 6:d4ac3899145b draft