6
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Parse SAM file format from Bowtie2.
|
|
4
|
|
5 Features:
|
|
6 - remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal
|
|
7 - the refseq field is assumed to look like:
|
|
8 gi|228472346|ref|NZ_ACLQ01000011.1|
|
|
9 which is common in HMP accessions to SRA. In this case, the string
|
|
10 starting with "NZ_" is treated as the refseq
|
|
11 """
|
|
12
|
|
13 from __future__ import division
|
|
14 import sys, getopt
|
|
15
|
|
16 class SAMFragment:
|
|
17 """
|
|
18 Represents a fragment match to a reference genome.
|
|
19 """
|
|
20
|
|
21 def __init__(self, sam_string):
|
|
22 '''Look at SAM file format documentation for more details'''
|
|
23 fields = sam_string.rstrip().split()
|
|
24
|
|
25 self.name = fields[0] #sample run name (QNAME)
|
|
26 self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME
|
|
27 # self.refseq = fields[2]
|
|
28 self.location = int(fields[3]) # POS
|
|
29 self.cigar = fields[5] # CIGAR
|
|
30 self.sequence = fields[9] # SEQ
|
|
31 self.length = len(self.sequence) # length of sequence
|
|
32
|
|
33 for field in fields[11:]:
|
|
34 # Parses the OPT field that contains the MD tag for identity info.
|
|
35 if field[0:5] == "MD:Z:":
|
|
36 md = field[5:]
|
|
37 # warning: finite state machine ahead!
|
|
38 # state = 0: nothing to remember
|
|
39 # state = 1: read a number, looking to see if next is a number
|
|
40 # state = 2: read a "^", looking for A, T, C, and G for deletion
|
|
41 state = 0
|
|
42 tempstr = ''
|
|
43 matches = 0
|
|
44 mismatches = 0
|
|
45 deletions = 0
|
|
46 for char in md:
|
|
47 if char in ('0', '1', '2', '3', '4', '5', '6', '7',
|
|
48 '8', '9'):
|
|
49 if state == 2:
|
|
50 deletions += len(tempstr)
|
|
51 tempstr = ''
|
|
52 state = 1
|
|
53 tempstr += char
|
|
54 elif char == '^':
|
|
55 if state == 1:
|
|
56 matches += int(tempstr)
|
|
57 tempstr = ''
|
|
58 state = 2
|
|
59 elif char in ('A', 'T', 'C', 'G'):
|
|
60 if state == 1:
|
|
61 matches += int(tempstr)
|
|
62 tempstr = ''
|
|
63 state = 0
|
|
64 mismatches += 1
|
|
65 elif state == 2:
|
|
66 tempstr += char
|
|
67 else:
|
|
68 mismatches += 1
|
|
69 if state == 1:
|
|
70 matches += int(tempstr)
|
|
71 elif state == 2:
|
|
72 deletions += len(tempstr)
|
|
73
|
|
74 self.matches = matches
|
|
75 self.mismatches = mismatches
|
|
76 self.deletions = deletions
|
|
77 self.identity = 100 * (matches / (matches + mismatches))
|
|
78
|
|
79 def __repr__(self):
|
|
80 out = []
|
|
81 out.append("Name: %s\n" % self.name)
|
|
82 out.append("Location: %d\n" % self.location)
|
|
83 out.append("Identity: %4.1f\n" % self.identity)
|
|
84 return ''.join(out)
|
|
85
|
|
86
|
|
87 def SAMFile(filename):
|
|
88 """
|
|
89 Parses a SAM format file, returning SAMFragment objects.
|
|
90
|
|
91 This is a generator, so that there is no need to store a full list of
|
|
92 fragments in memory if it's not necessary to do so.
|
|
93 """
|
|
94
|
|
95 f = open(filename, 'rb')
|
|
96
|
|
97 for line in f:
|
|
98 if line.startswith('@'):
|
|
99 continue
|
|
100 if not line.startswith("Your job"): # preprocessing error on my part
|
|
101 yield SAMFragment(line)
|
|
102
|
|
103 f.close()
|
|
104
|
|
105
|
|
106 if __name__ == '__main__':
|
|
107 opts, args = getopt.getopt(sys.argv[1:], 'h')
|
|
108
|
|
109 for o, a in opts:
|
|
110 if o == '-h':
|
|
111 print ('Usage:')
|
|
112 print (' %s <sam_file>' % sys.argv[0])
|
|
113 sys.exit(0)
|
|
114 else:
|
|
115 print ('unhandled option')
|
|
116 sys.exit(1)
|
|
117
|
|
118 if len(args) == 0:
|
|
119 print ('Specify a SAM file as an argument')
|
|
120 sys.exit(1)
|
|
121
|
|
122 for frag in SAMFile(args[0]):
|
|
123 print (frag)
|