annotate bam_to_psl.py @ 0:b6794f4cb1c6 draft default tip

Uploaded
author greg
date Mon, 12 Dec 2022 14:42:58 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
1 #!/usr/bin/env python
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
2
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
3 # This code is very loosely based on
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
4 # the FusionCatcher sam2psl script here:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
5 # https://github.com/ndaniel/fusioncatcher/blob/master/bin/sam2psl.py
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
6
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
7 import argparse
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
8 import sys
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
9
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
10
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
11 def parse_cigar(c):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
12 r = []
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
13 d = ''
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
14 mismatches_x = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
15 c = c.upper()
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
16 for a in c:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
17 if a.isdigit():
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
18 d = d + a
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
19 elif a in ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
20 dd = int(d)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
21 r.append((a, dd))
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
22 if a == 'X':
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
23 mismatches_x = mismatches_x + dd
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
24 d = ''
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
25 else:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
26 msg = "Error: unknown CIGAR: %s\n" % str(c)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
27 sys.exit(msg)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
28 return (r, mismatches_x)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
29
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
30
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
31 def blocks(cigar, ig=0):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
32 # Returns block of matches. Hard clipping is
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
33 # converted to soft clipping index on read.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
34 ir = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
35 rr = []
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
36 rg = []
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
37 match = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
38 mismatch = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
39 mismatch_x = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
40 mismatch_clip = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
41 insert_query = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
42 insert_query_count = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
43 insert_ref = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
44 insert_ref_count = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
45 # Sum of lengths of the M/I/S/=/X operations
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
46 # will equal the length of SEQ.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
47 seq_len = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
48 (cig, mismatch_x) = parse_cigar(cigar)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
49 mismatch = mismatch_x
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
50 for e in cig:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
51 if e[0] in ('S', 'H'):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
52 ir = ir + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
53 mismatch_clip = mismatch_clip + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
54 seq_len = seq_len + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
55 elif e[0] in ('I',):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
56 ir = ir + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
57 mismatch = mismatch + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
58 insert_query = insert_query + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
59 insert_query_count = insert_query_count + 1
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
60 seq_len = seq_len + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
61 elif e[0] in ('X'):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
62 ir = ir + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
63 ig = ig + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
64 mismatch = mismatch + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
65 mismatch_x = mismatch_x + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
66 seq_len = seq_len + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
67 elif e[0] in ('M', '='):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
68 rr.append((ir, ir + e[1]))
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
69 rg.append((ig, ig + e[1]))
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
70 ir = ir + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
71 ig = ig + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
72 match = match + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
73 seq_len = seq_len + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
74 elif e[0] in ('D', 'N', 'P'):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
75 ig = ig + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
76 insert_ref = insert_ref + e[1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
77 insert_ref_count = insert_ref_count + 1
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
78 return (rr, rg, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
79
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
80
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
81 def get_psl(sam, lens):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
82 # Returns PSL coordinates.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
83 if sam and sam[1].isdigit():
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
84 unmapped = True if int(sam[1]) & 0x4 else False
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
85 if (not unmapped) and sam[2] != '*' and sam[5] != '*' and sam[0] != '*':
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
86 # Initialize psl_items to those
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
87 # that constitute a PSL empty line.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
88 psl_items = ['0', '0', '0', '0', '0', '0', '0', '0', '+', 's', '0', '0', '0', 'r', '0', '0', '0', '0', ',', ',', ',']
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
89 # Read sequence length.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
90 psl_items[14] = lens.get(sam[2], 0)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
91 # Reference name.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
92 psl_items[13] = sam[2]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
93 # Read name.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
94 psl_items[9] = sam[0]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
95 # Strand.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
96 psl_items[8] = "-" if int(sam[1]) & 0x10 else '+'
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
97 # Start position.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
98 psl_items[15] = int(sam[3]) - 1
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
99 (interval_query, interval_ref, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len) = blocks(sam[5], ig=psl_items[15])
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
100 # Read sequence length.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
101 if sam[9] != '*' and sam[5].find('H') == -1:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
102 psl_items[10] = len(sam[9])
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
103 else:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
104 # The length of SEQ is the sum of the
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
105 # lengths of the M/I/S/=/X operations.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
106 psl_items[10] = seq_len
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
107 psl_items[4] = insert_query_count
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
108 psl_items[5] = insert_query
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
109 psl_items[6] = insert_ref_count
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
110 psl_items[7] = insert_ref
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
111 # Extract the mismatches using tag
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
112 # NM:i (NM is mismatches per reads).
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
113 tag_nm_i = [e.partition("NM:i:")[2] for e in sam[11:] if e.startswith('NM:i:')]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
114 if not tag_nm_i:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
115 # NM is not ideal because it is mismatches
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
116 # per # fragment and not per read, but is
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
117 # etter than nothing.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
118 tag_nm_i = [e.partition("nM:i:")[2] for e in sam[11:] if e.startswith('nM:i:')]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
119 tag_nm_i = int(tag_nm_i[0]) if tag_nm_i else 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
120 if tag_nm_i > float(0.90) * seq_len:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
121 tag_nm_i = 0
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
122 # Compute the matches and mismatches (include the
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
123 # clipping as mismatches).
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
124 psl_items[0] = match
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
125 psl_items[1] = mismatch
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
126 if interval_query:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
127 psl_items[11] = interval_query[0][0]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
128 psl_items[12] = interval_query[-1][1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
129 psl_items[16] = interval_ref[-1][1]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
130 psl_items[17] = len(interval_query)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
131 # BLAT always gives the coordinates as
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
132 # everything is mapped on the forwward
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
133 # strand.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
134 psl_items[18] = ','.join([str(e[1] - e[0]) for e in interval_query]) + ','
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
135 psl_items[19] = ','.join([str(e[0]) for e in interval_query]) + ','
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
136 psl_items[20] = ','.join([str(e[0]) for e in interval_ref]) + ','
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
137 return map(str, psl_items)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
138 else:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
139 return None
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
140
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
141
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
142 def to_psl(input_file, psl_file):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
143 # Convert a SAM file to PSL format.
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
144 header = dict()
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
145 with open(input_file, 'r') as fin, open(psl_file, 'w') as fou:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
146 for i, sam_line in enumerate(fin):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
147 sam_line = sam_line.rstrip('\r\n')
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
148 if sam_line:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
149 sam_items = sam_line.split('\t')
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
150 if sam_items[0].startswith('@'):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
151 if sam_items[0].startswith('@SQ') and sam_items[1].startswith('SN:') and sam_items[2].startswith('LN:'):
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
152 k = sam_items[1][3:]
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
153 v = int(sam_items[2][3:])
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
154 header[k] = v
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
155 else:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
156 psl_items = get_psl(sam_items, header)
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
157 if psl_items is not None:
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
158 fou.write('%s\n' % '\t'.join(psl_items))
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
159
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
160
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
161 if __name__ == '__main__':
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
162
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
163 parser = argparse.ArgumentParser()
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
164
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
165 parser.add_argument("--input_file", action="store", dest="input_file", help="Input file in SAM format.")
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
166 parser.add_argument("--output_file", action="store", dest="output_file", help="Output file in PSL format.")
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
167
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
168 args = parser.parse_args()
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
169
b6794f4cb1c6 Uploaded
greg
parents:
diff changeset
170 to_psl(args.input_file, args.output_file)