comparison bam_to_psl.py @ 0:b6794f4cb1c6 draft default tip

Uploaded
author greg
date Mon, 12 Dec 2022 14:42:58 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:b6794f4cb1c6
1 #!/usr/bin/env python
2
3 # This code is very loosely based on
4 # the FusionCatcher sam2psl script here:
5 # https://github.com/ndaniel/fusioncatcher/blob/master/bin/sam2psl.py
6
7 import argparse
8 import sys
9
10
11 def parse_cigar(c):
12 r = []
13 d = ''
14 mismatches_x = 0
15 c = c.upper()
16 for a in c:
17 if a.isdigit():
18 d = d + a
19 elif a in ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']:
20 dd = int(d)
21 r.append((a, dd))
22 if a == 'X':
23 mismatches_x = mismatches_x + dd
24 d = ''
25 else:
26 msg = "Error: unknown CIGAR: %s\n" % str(c)
27 sys.exit(msg)
28 return (r, mismatches_x)
29
30
31 def blocks(cigar, ig=0):
32 # Returns block of matches. Hard clipping is
33 # converted to soft clipping index on read.
34 ir = 0
35 rr = []
36 rg = []
37 match = 0
38 mismatch = 0
39 mismatch_x = 0
40 mismatch_clip = 0
41 insert_query = 0
42 insert_query_count = 0
43 insert_ref = 0
44 insert_ref_count = 0
45 # Sum of lengths of the M/I/S/=/X operations
46 # will equal the length of SEQ.
47 seq_len = 0
48 (cig, mismatch_x) = parse_cigar(cigar)
49 mismatch = mismatch_x
50 for e in cig:
51 if e[0] in ('S', 'H'):
52 ir = ir + e[1]
53 mismatch_clip = mismatch_clip + e[1]
54 seq_len = seq_len + e[1]
55 elif e[0] in ('I',):
56 ir = ir + e[1]
57 mismatch = mismatch + e[1]
58 insert_query = insert_query + e[1]
59 insert_query_count = insert_query_count + 1
60 seq_len = seq_len + e[1]
61 elif e[0] in ('X'):
62 ir = ir + e[1]
63 ig = ig + e[1]
64 mismatch = mismatch + e[1]
65 mismatch_x = mismatch_x + e[1]
66 seq_len = seq_len + e[1]
67 elif e[0] in ('M', '='):
68 rr.append((ir, ir + e[1]))
69 rg.append((ig, ig + e[1]))
70 ir = ir + e[1]
71 ig = ig + e[1]
72 match = match + e[1]
73 seq_len = seq_len + e[1]
74 elif e[0] in ('D', 'N', 'P'):
75 ig = ig + e[1]
76 insert_ref = insert_ref + e[1]
77 insert_ref_count = insert_ref_count + 1
78 return (rr, rg, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len)
79
80
81 def get_psl(sam, lens):
82 # Returns PSL coordinates.
83 if sam and sam[1].isdigit():
84 unmapped = True if int(sam[1]) & 0x4 else False
85 if (not unmapped) and sam[2] != '*' and sam[5] != '*' and sam[0] != '*':
86 # Initialize psl_items to those
87 # that constitute a PSL empty line.
88 psl_items = ['0', '0', '0', '0', '0', '0', '0', '0', '+', 's', '0', '0', '0', 'r', '0', '0', '0', '0', ',', ',', ',']
89 # Read sequence length.
90 psl_items[14] = lens.get(sam[2], 0)
91 # Reference name.
92 psl_items[13] = sam[2]
93 # Read name.
94 psl_items[9] = sam[0]
95 # Strand.
96 psl_items[8] = "-" if int(sam[1]) & 0x10 else '+'
97 # Start position.
98 psl_items[15] = int(sam[3]) - 1
99 (interval_query, interval_ref, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len) = blocks(sam[5], ig=psl_items[15])
100 # Read sequence length.
101 if sam[9] != '*' and sam[5].find('H') == -1:
102 psl_items[10] = len(sam[9])
103 else:
104 # The length of SEQ is the sum of the
105 # lengths of the M/I/S/=/X operations.
106 psl_items[10] = seq_len
107 psl_items[4] = insert_query_count
108 psl_items[5] = insert_query
109 psl_items[6] = insert_ref_count
110 psl_items[7] = insert_ref
111 # Extract the mismatches using tag
112 # NM:i (NM is mismatches per reads).
113 tag_nm_i = [e.partition("NM:i:")[2] for e in sam[11:] if e.startswith('NM:i:')]
114 if not tag_nm_i:
115 # NM is not ideal because it is mismatches
116 # per # fragment and not per read, but is
117 # etter than nothing.
118 tag_nm_i = [e.partition("nM:i:")[2] for e in sam[11:] if e.startswith('nM:i:')]
119 tag_nm_i = int(tag_nm_i[0]) if tag_nm_i else 0
120 if tag_nm_i > float(0.90) * seq_len:
121 tag_nm_i = 0
122 # Compute the matches and mismatches (include the
123 # clipping as mismatches).
124 psl_items[0] = match
125 psl_items[1] = mismatch
126 if interval_query:
127 psl_items[11] = interval_query[0][0]
128 psl_items[12] = interval_query[-1][1]
129 psl_items[16] = interval_ref[-1][1]
130 psl_items[17] = len(interval_query)
131 # BLAT always gives the coordinates as
132 # everything is mapped on the forwward
133 # strand.
134 psl_items[18] = ','.join([str(e[1] - e[0]) for e in interval_query]) + ','
135 psl_items[19] = ','.join([str(e[0]) for e in interval_query]) + ','
136 psl_items[20] = ','.join([str(e[0]) for e in interval_ref]) + ','
137 return map(str, psl_items)
138 else:
139 return None
140
141
142 def to_psl(input_file, psl_file):
143 # Convert a SAM file to PSL format.
144 header = dict()
145 with open(input_file, 'r') as fin, open(psl_file, 'w') as fou:
146 for i, sam_line in enumerate(fin):
147 sam_line = sam_line.rstrip('\r\n')
148 if sam_line:
149 sam_items = sam_line.split('\t')
150 if sam_items[0].startswith('@'):
151 if sam_items[0].startswith('@SQ') and sam_items[1].startswith('SN:') and sam_items[2].startswith('LN:'):
152 k = sam_items[1][3:]
153 v = int(sam_items[2][3:])
154 header[k] = v
155 else:
156 psl_items = get_psl(sam_items, header)
157 if psl_items is not None:
158 fou.write('%s\n' % '\t'.join(psl_items))
159
160
161 if __name__ == '__main__':
162
163 parser = argparse.ArgumentParser()
164
165 parser.add_argument("--input_file", action="store", dest="input_file", help="Input file in SAM format.")
166 parser.add_argument("--output_file", action="store", dest="output_file", help="Output file in PSL format.")
167
168 args = parser.parse_args()
169
170 to_psl(args.input_file, args.output_file)