Mercurial > repos > schang > frp_tool
comparison SAM_parser.py @ 6:d4ac3899145b draft
Uploaded
author | schang |
---|---|
date | Tue, 02 Dec 2014 19:54:26 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
5:70e76f6bfcfb | 6:d4ac3899145b |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Parse SAM file format from Bowtie2. | |
4 | |
5 Features: | |
6 - remove header lines using bowtie option --no-hd, and keep only aligned reads --no-unal | |
7 - the refseq field is assumed to look like: | |
8 gi|228472346|ref|NZ_ACLQ01000011.1| | |
9 which is common in HMP accessions to SRA. In this case, the string | |
10 starting with "NZ_" is treated as the refseq | |
11 """ | |
12 | |
13 from __future__ import division | |
14 import sys, getopt | |
15 | |
16 class SAMFragment: | |
17 """ | |
18 Represents a fragment match to a reference genome. | |
19 """ | |
20 | |
21 def __init__(self, sam_string): | |
22 '''Look at SAM file format documentation for more details''' | |
23 fields = sam_string.rstrip().split() | |
24 | |
25 self.name = fields[0] #sample run name (QNAME) | |
26 self.refseq = fields[2].split('|')[3] # Reference genome name (Accession#) or RNAME | |
27 # self.refseq = fields[2] | |
28 self.location = int(fields[3]) # POS | |
29 self.cigar = fields[5] # CIGAR | |
30 self.sequence = fields[9] # SEQ | |
31 self.length = len(self.sequence) # length of sequence | |
32 | |
33 for field in fields[11:]: | |
34 # Parses the OPT field that contains the MD tag for identity info. | |
35 if field[0:5] == "MD:Z:": | |
36 md = field[5:] | |
37 # warning: finite state machine ahead! | |
38 # state = 0: nothing to remember | |
39 # state = 1: read a number, looking to see if next is a number | |
40 # state = 2: read a "^", looking for A, T, C, and G for deletion | |
41 state = 0 | |
42 tempstr = '' | |
43 matches = 0 | |
44 mismatches = 0 | |
45 deletions = 0 | |
46 for char in md: | |
47 if char in ('0', '1', '2', '3', '4', '5', '6', '7', | |
48 '8', '9'): | |
49 if state == 2: | |
50 deletions += len(tempstr) | |
51 tempstr = '' | |
52 state = 1 | |
53 tempstr += char | |
54 elif char == '^': | |
55 if state == 1: | |
56 matches += int(tempstr) | |
57 tempstr = '' | |
58 state = 2 | |
59 elif char in ('A', 'T', 'C', 'G'): | |
60 if state == 1: | |
61 matches += int(tempstr) | |
62 tempstr = '' | |
63 state = 0 | |
64 mismatches += 1 | |
65 elif state == 2: | |
66 tempstr += char | |
67 else: | |
68 mismatches += 1 | |
69 if state == 1: | |
70 matches += int(tempstr) | |
71 elif state == 2: | |
72 deletions += len(tempstr) | |
73 | |
74 self.matches = matches | |
75 self.mismatches = mismatches | |
76 self.deletions = deletions | |
77 self.identity = 100 * (matches / (matches + mismatches)) | |
78 | |
79 def __repr__(self): | |
80 out = [] | |
81 out.append("Name: %s\n" % self.name) | |
82 out.append("Location: %d\n" % self.location) | |
83 out.append("Identity: %4.1f\n" % self.identity) | |
84 return ''.join(out) | |
85 | |
86 | |
87 def SAMFile(filename): | |
88 """ | |
89 Parses a SAM format file, returning SAMFragment objects. | |
90 | |
91 This is a generator, so that there is no need to store a full list of | |
92 fragments in memory if it's not necessary to do so. | |
93 """ | |
94 | |
95 f = open(filename, 'rb') | |
96 | |
97 for line in f: | |
98 if line.startswith('@'): | |
99 continue | |
100 if not line.startswith("Your job"): # preprocessing error on my part | |
101 yield SAMFragment(line) | |
102 | |
103 f.close() | |
104 | |
105 | |
106 if __name__ == '__main__': | |
107 opts, args = getopt.getopt(sys.argv[1:], 'h') | |
108 | |
109 for o, a in opts: | |
110 if o == '-h': | |
111 print ('Usage:') | |
112 print (' %s <sam_file>' % sys.argv[0]) | |
113 sys.exit(0) | |
114 else: | |
115 print ('unhandled option') | |
116 sys.exit(1) | |
117 | |
118 if len(args) == 0: | |
119 print ('Specify a SAM file as an argument') | |
120 sys.exit(1) | |
121 | |
122 for frag in SAMFile(args[0]): | |
123 print (frag) |