annotate SAM_parser.py @ 7:6d94241370cc draft default tip

Uploaded
author schang
date Tue, 02 Dec 2014 21:06:35 -0500
parents d4ac3899145b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
6
d4ac3899145b Uploaded
schang
parents:
diff changeset
1 #!/usr/bin/env python
d4ac3899145b Uploaded
schang
parents:
diff changeset
2 """
d4ac3899145b Uploaded
schang
parents:
diff changeset
3 Parse SAM file format from Bowtie2.
d4ac3899145b Uploaded
schang
parents:
diff changeset
4
d4ac3899145b Uploaded
schang
parents:
diff changeset
5 Features:
d4ac3899145b Uploaded
schang
parents:
diff changeset
6 - remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal
d4ac3899145b Uploaded
schang
parents:
diff changeset
7 - the refseq field is assumed to look like:
d4ac3899145b Uploaded
schang
parents:
diff changeset
8 gi|228472346|ref|NZ_ACLQ01000011.1|
d4ac3899145b Uploaded
schang
parents:
diff changeset
9 which is common in HMP accessions to SRA. In this case, the string
d4ac3899145b Uploaded
schang
parents:
diff changeset
10 starting with "NZ_" is treated as the refseq
d4ac3899145b Uploaded
schang
parents:
diff changeset
11 """
d4ac3899145b Uploaded
schang
parents:
diff changeset
12
d4ac3899145b Uploaded
schang
parents:
diff changeset
13 from __future__ import division
d4ac3899145b Uploaded
schang
parents:
diff changeset
14 import sys, getopt
d4ac3899145b Uploaded
schang
parents:
diff changeset
15
d4ac3899145b Uploaded
schang
parents:
diff changeset
16 class SAMFragment:
d4ac3899145b Uploaded
schang
parents:
diff changeset
17 """
d4ac3899145b Uploaded
schang
parents:
diff changeset
18 Represents a fragment match to a reference genome.
d4ac3899145b Uploaded
schang
parents:
diff changeset
19 """
d4ac3899145b Uploaded
schang
parents:
diff changeset
20
d4ac3899145b Uploaded
schang
parents:
diff changeset
21 def __init__(self, sam_string):
d4ac3899145b Uploaded
schang
parents:
diff changeset
22 '''Look at SAM file format documentation for more details'''
d4ac3899145b Uploaded
schang
parents:
diff changeset
23 fields = sam_string.rstrip().split()
d4ac3899145b Uploaded
schang
parents:
diff changeset
24
d4ac3899145b Uploaded
schang
parents:
diff changeset
25 self.name = fields[0] #sample run name (QNAME)
d4ac3899145b Uploaded
schang
parents:
diff changeset
26 self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME
d4ac3899145b Uploaded
schang
parents:
diff changeset
27 # self.refseq = fields[2]
d4ac3899145b Uploaded
schang
parents:
diff changeset
28 self.location = int(fields[3]) # POS
d4ac3899145b Uploaded
schang
parents:
diff changeset
29 self.cigar = fields[5] # CIGAR
d4ac3899145b Uploaded
schang
parents:
diff changeset
30 self.sequence = fields[9] # SEQ
d4ac3899145b Uploaded
schang
parents:
diff changeset
31 self.length = len(self.sequence) # length of sequence
d4ac3899145b Uploaded
schang
parents:
diff changeset
32
d4ac3899145b Uploaded
schang
parents:
diff changeset
33 for field in fields[11:]:
d4ac3899145b Uploaded
schang
parents:
diff changeset
34 # Parses the OPT field that contains the MD tag for identity info.
d4ac3899145b Uploaded
schang
parents:
diff changeset
35 if field[0:5] == "MD:Z:":
d4ac3899145b Uploaded
schang
parents:
diff changeset
36 md = field[5:]
d4ac3899145b Uploaded
schang
parents:
diff changeset
37 # warning: finite state machine ahead!
d4ac3899145b Uploaded
schang
parents:
diff changeset
38 # state = 0: nothing to remember
d4ac3899145b Uploaded
schang
parents:
diff changeset
39 # state = 1: read a number, looking to see if next is a number
d4ac3899145b Uploaded
schang
parents:
diff changeset
40 # state = 2: read a "^", looking for A, T, C, and G for deletion
d4ac3899145b Uploaded
schang
parents:
diff changeset
41 state = 0
d4ac3899145b Uploaded
schang
parents:
diff changeset
42 tempstr = ''
d4ac3899145b Uploaded
schang
parents:
diff changeset
43 matches = 0
d4ac3899145b Uploaded
schang
parents:
diff changeset
44 mismatches = 0
d4ac3899145b Uploaded
schang
parents:
diff changeset
45 deletions = 0
d4ac3899145b Uploaded
schang
parents:
diff changeset
46 for char in md:
d4ac3899145b Uploaded
schang
parents:
diff changeset
47 if char in ('0', '1', '2', '3', '4', '5', '6', '7',
d4ac3899145b Uploaded
schang
parents:
diff changeset
48 '8', '9'):
d4ac3899145b Uploaded
schang
parents:
diff changeset
49 if state == 2:
d4ac3899145b Uploaded
schang
parents:
diff changeset
50 deletions += len(tempstr)
d4ac3899145b Uploaded
schang
parents:
diff changeset
51 tempstr = ''
d4ac3899145b Uploaded
schang
parents:
diff changeset
52 state = 1
d4ac3899145b Uploaded
schang
parents:
diff changeset
53 tempstr += char
d4ac3899145b Uploaded
schang
parents:
diff changeset
54 elif char == '^':
d4ac3899145b Uploaded
schang
parents:
diff changeset
55 if state == 1:
d4ac3899145b Uploaded
schang
parents:
diff changeset
56 matches += int(tempstr)
d4ac3899145b Uploaded
schang
parents:
diff changeset
57 tempstr = ''
d4ac3899145b Uploaded
schang
parents:
diff changeset
58 state = 2
d4ac3899145b Uploaded
schang
parents:
diff changeset
59 elif char in ('A', 'T', 'C', 'G'):
d4ac3899145b Uploaded
schang
parents:
diff changeset
60 if state == 1:
d4ac3899145b Uploaded
schang
parents:
diff changeset
61 matches += int(tempstr)
d4ac3899145b Uploaded
schang
parents:
diff changeset
62 tempstr = ''
d4ac3899145b Uploaded
schang
parents:
diff changeset
63 state = 0
d4ac3899145b Uploaded
schang
parents:
diff changeset
64 mismatches += 1
d4ac3899145b Uploaded
schang
parents:
diff changeset
65 elif state == 2:
d4ac3899145b Uploaded
schang
parents:
diff changeset
66 tempstr += char
d4ac3899145b Uploaded
schang
parents:
diff changeset
67 else:
d4ac3899145b Uploaded
schang
parents:
diff changeset
68 mismatches += 1
d4ac3899145b Uploaded
schang
parents:
diff changeset
69 if state == 1:
d4ac3899145b Uploaded
schang
parents:
diff changeset
70 matches += int(tempstr)
d4ac3899145b Uploaded
schang
parents:
diff changeset
71 elif state == 2:
d4ac3899145b Uploaded
schang
parents:
diff changeset
72 deletions += len(tempstr)
d4ac3899145b Uploaded
schang
parents:
diff changeset
73
d4ac3899145b Uploaded
schang
parents:
diff changeset
74 self.matches = matches
d4ac3899145b Uploaded
schang
parents:
diff changeset
75 self.mismatches = mismatches
d4ac3899145b Uploaded
schang
parents:
diff changeset
76 self.deletions = deletions
d4ac3899145b Uploaded
schang
parents:
diff changeset
77 self.identity = 100 * (matches / (matches + mismatches))
d4ac3899145b Uploaded
schang
parents:
diff changeset
78
d4ac3899145b Uploaded
schang
parents:
diff changeset
79 def __repr__(self):
d4ac3899145b Uploaded
schang
parents:
diff changeset
80 out = []
d4ac3899145b Uploaded
schang
parents:
diff changeset
81 out.append("Name: %s\n" % self.name)
d4ac3899145b Uploaded
schang
parents:
diff changeset
82 out.append("Location: %d\n" % self.location)
d4ac3899145b Uploaded
schang
parents:
diff changeset
83 out.append("Identity: %4.1f\n" % self.identity)
d4ac3899145b Uploaded
schang
parents:
diff changeset
84 return ''.join(out)
d4ac3899145b Uploaded
schang
parents:
diff changeset
85
d4ac3899145b Uploaded
schang
parents:
diff changeset
86
d4ac3899145b Uploaded
schang
parents:
diff changeset
87 def SAMFile(filename):
d4ac3899145b Uploaded
schang
parents:
diff changeset
88 """
d4ac3899145b Uploaded
schang
parents:
diff changeset
89 Parses a SAM format file, returning SAMFragment objects.
d4ac3899145b Uploaded
schang
parents:
diff changeset
90
d4ac3899145b Uploaded
schang
parents:
diff changeset
91 This is a generator, so that there is no need to store a full list of
d4ac3899145b Uploaded
schang
parents:
diff changeset
92 fragments in memory if it's not necessary to do so.
d4ac3899145b Uploaded
schang
parents:
diff changeset
93 """
d4ac3899145b Uploaded
schang
parents:
diff changeset
94
d4ac3899145b Uploaded
schang
parents:
diff changeset
95 f = open(filename, 'rb')
d4ac3899145b Uploaded
schang
parents:
diff changeset
96
d4ac3899145b Uploaded
schang
parents:
diff changeset
97 for line in f:
d4ac3899145b Uploaded
schang
parents:
diff changeset
98 if line.startswith('@'):
d4ac3899145b Uploaded
schang
parents:
diff changeset
99 continue
d4ac3899145b Uploaded
schang
parents:
diff changeset
100 if not line.startswith("Your job"): # preprocessing error on my part
d4ac3899145b Uploaded
schang
parents:
diff changeset
101 yield SAMFragment(line)
d4ac3899145b Uploaded
schang
parents:
diff changeset
102
d4ac3899145b Uploaded
schang
parents:
diff changeset
103 f.close()
d4ac3899145b Uploaded
schang
parents:
diff changeset
104
d4ac3899145b Uploaded
schang
parents:
diff changeset
105
d4ac3899145b Uploaded
schang
parents:
diff changeset
106 if __name__ == '__main__':
d4ac3899145b Uploaded
schang
parents:
diff changeset
107 opts, args = getopt.getopt(sys.argv[1:], 'h')
d4ac3899145b Uploaded
schang
parents:
diff changeset
108
d4ac3899145b Uploaded
schang
parents:
diff changeset
109 for o, a in opts:
d4ac3899145b Uploaded
schang
parents:
diff changeset
110 if o == '-h':
d4ac3899145b Uploaded
schang
parents:
diff changeset
111 print ('Usage:')
d4ac3899145b Uploaded
schang
parents:
diff changeset
112 print (' %s <sam_file>' % sys.argv[0])
d4ac3899145b Uploaded
schang
parents:
diff changeset
113 sys.exit(0)
d4ac3899145b Uploaded
schang
parents:
diff changeset
114 else:
d4ac3899145b Uploaded
schang
parents:
diff changeset
115 print ('unhandled option')
d4ac3899145b Uploaded
schang
parents:
diff changeset
116 sys.exit(1)
d4ac3899145b Uploaded
schang
parents:
diff changeset
117
d4ac3899145b Uploaded
schang
parents:
diff changeset
118 if len(args) == 0:
d4ac3899145b Uploaded
schang
parents:
diff changeset
119 print ('Specify a SAM file as an argument')
d4ac3899145b Uploaded
schang
parents:
diff changeset
120 sys.exit(1)
d4ac3899145b Uploaded
schang
parents:
diff changeset
121
d4ac3899145b Uploaded
schang
parents:
diff changeset
122 for frag in SAMFile(args[0]):
d4ac3899145b Uploaded
schang
parents:
diff changeset
123 print (frag)