0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # This code is very loosely based on
|
|
4 # the FusionCatcher sam2psl script here:
|
|
5 # https://github.com/ndaniel/fusioncatcher/blob/master/bin/sam2psl.py
|
|
6
|
|
7 import argparse
|
|
8 import sys
|
|
9
|
|
10
|
|
11 def parse_cigar(c):
|
|
12 r = []
|
|
13 d = ''
|
|
14 mismatches_x = 0
|
|
15 c = c.upper()
|
|
16 for a in c:
|
|
17 if a.isdigit():
|
|
18 d = d + a
|
|
19 elif a in ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']:
|
|
20 dd = int(d)
|
|
21 r.append((a, dd))
|
|
22 if a == 'X':
|
|
23 mismatches_x = mismatches_x + dd
|
|
24 d = ''
|
|
25 else:
|
|
26 msg = "Error: unknown CIGAR: %s\n" % str(c)
|
|
27 sys.exit(msg)
|
|
28 return (r, mismatches_x)
|
|
29
|
|
30
|
|
31 def blocks(cigar, ig=0):
|
|
32 # Returns block of matches. Hard clipping is
|
|
33 # converted to soft clipping index on read.
|
|
34 ir = 0
|
|
35 rr = []
|
|
36 rg = []
|
|
37 match = 0
|
|
38 mismatch = 0
|
|
39 mismatch_x = 0
|
|
40 mismatch_clip = 0
|
|
41 insert_query = 0
|
|
42 insert_query_count = 0
|
|
43 insert_ref = 0
|
|
44 insert_ref_count = 0
|
|
45 # Sum of lengths of the M/I/S/=/X operations
|
|
46 # will equal the length of SEQ.
|
|
47 seq_len = 0
|
|
48 (cig, mismatch_x) = parse_cigar(cigar)
|
|
49 mismatch = mismatch_x
|
|
50 for e in cig:
|
|
51 if e[0] in ('S', 'H'):
|
|
52 ir = ir + e[1]
|
|
53 mismatch_clip = mismatch_clip + e[1]
|
|
54 seq_len = seq_len + e[1]
|
|
55 elif e[0] in ('I',):
|
|
56 ir = ir + e[1]
|
|
57 mismatch = mismatch + e[1]
|
|
58 insert_query = insert_query + e[1]
|
|
59 insert_query_count = insert_query_count + 1
|
|
60 seq_len = seq_len + e[1]
|
|
61 elif e[0] in ('X'):
|
|
62 ir = ir + e[1]
|
|
63 ig = ig + e[1]
|
|
64 mismatch = mismatch + e[1]
|
|
65 mismatch_x = mismatch_x + e[1]
|
|
66 seq_len = seq_len + e[1]
|
|
67 elif e[0] in ('M', '='):
|
|
68 rr.append((ir, ir + e[1]))
|
|
69 rg.append((ig, ig + e[1]))
|
|
70 ir = ir + e[1]
|
|
71 ig = ig + e[1]
|
|
72 match = match + e[1]
|
|
73 seq_len = seq_len + e[1]
|
|
74 elif e[0] in ('D', 'N', 'P'):
|
|
75 ig = ig + e[1]
|
|
76 insert_ref = insert_ref + e[1]
|
|
77 insert_ref_count = insert_ref_count + 1
|
|
78 return (rr, rg, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len)
|
|
79
|
|
80
|
|
81 def get_psl(sam, lens):
|
|
82 # Returns PSL coordinates.
|
|
83 if sam and sam[1].isdigit():
|
|
84 unmapped = True if int(sam[1]) & 0x4 else False
|
|
85 if (not unmapped) and sam[2] != '*' and sam[5] != '*' and sam[0] != '*':
|
|
86 # Initialize psl_items to those
|
|
87 # that constitute a PSL empty line.
|
|
88 psl_items = ['0', '0', '0', '0', '0', '0', '0', '0', '+', 's', '0', '0', '0', 'r', '0', '0', '0', '0', ',', ',', ',']
|
|
89 # Read sequence length.
|
|
90 psl_items[14] = lens.get(sam[2], 0)
|
|
91 # Reference name.
|
|
92 psl_items[13] = sam[2]
|
|
93 # Read name.
|
|
94 psl_items[9] = sam[0]
|
|
95 # Strand.
|
|
96 psl_items[8] = "-" if int(sam[1]) & 0x10 else '+'
|
|
97 # Start position.
|
|
98 psl_items[15] = int(sam[3]) - 1
|
|
99 (interval_query, interval_ref, match, mismatch, mismatch_clip, mismatch_x, insert_ref, insert_ref_count, insert_query, insert_query_count, seq_len) = blocks(sam[5], ig=psl_items[15])
|
|
100 # Read sequence length.
|
|
101 if sam[9] != '*' and sam[5].find('H') == -1:
|
|
102 psl_items[10] = len(sam[9])
|
|
103 else:
|
|
104 # The length of SEQ is the sum of the
|
|
105 # lengths of the M/I/S/=/X operations.
|
|
106 psl_items[10] = seq_len
|
|
107 psl_items[4] = insert_query_count
|
|
108 psl_items[5] = insert_query
|
|
109 psl_items[6] = insert_ref_count
|
|
110 psl_items[7] = insert_ref
|
|
111 # Extract the mismatches using tag
|
|
112 # NM:i (NM is mismatches per reads).
|
|
113 tag_nm_i = [e.partition("NM:i:")[2] for e in sam[11:] if e.startswith('NM:i:')]
|
|
114 if not tag_nm_i:
|
|
115 # NM is not ideal because it is mismatches
|
|
116 # per # fragment and not per read, but is
|
|
117 # etter than nothing.
|
|
118 tag_nm_i = [e.partition("nM:i:")[2] for e in sam[11:] if e.startswith('nM:i:')]
|
|
119 tag_nm_i = int(tag_nm_i[0]) if tag_nm_i else 0
|
|
120 if tag_nm_i > float(0.90) * seq_len:
|
|
121 tag_nm_i = 0
|
|
122 # Compute the matches and mismatches (include the
|
|
123 # clipping as mismatches).
|
|
124 psl_items[0] = match
|
|
125 psl_items[1] = mismatch
|
|
126 if interval_query:
|
|
127 psl_items[11] = interval_query[0][0]
|
|
128 psl_items[12] = interval_query[-1][1]
|
|
129 psl_items[16] = interval_ref[-1][1]
|
|
130 psl_items[17] = len(interval_query)
|
|
131 # BLAT always gives the coordinates as
|
|
132 # everything is mapped on the forwward
|
|
133 # strand.
|
|
134 psl_items[18] = ','.join([str(e[1] - e[0]) for e in interval_query]) + ','
|
|
135 psl_items[19] = ','.join([str(e[0]) for e in interval_query]) + ','
|
|
136 psl_items[20] = ','.join([str(e[0]) for e in interval_ref]) + ','
|
|
137 return map(str, psl_items)
|
|
138 else:
|
|
139 return None
|
|
140
|
|
141
|
|
142 def to_psl(input_file, psl_file):
|
|
143 # Convert a SAM file to PSL format.
|
|
144 header = dict()
|
|
145 with open(input_file, 'r') as fin, open(psl_file, 'w') as fou:
|
|
146 for i, sam_line in enumerate(fin):
|
|
147 sam_line = sam_line.rstrip('\r\n')
|
|
148 if sam_line:
|
|
149 sam_items = sam_line.split('\t')
|
|
150 if sam_items[0].startswith('@'):
|
|
151 if sam_items[0].startswith('@SQ') and sam_items[1].startswith('SN:') and sam_items[2].startswith('LN:'):
|
|
152 k = sam_items[1][3:]
|
|
153 v = int(sam_items[2][3:])
|
|
154 header[k] = v
|
|
155 else:
|
|
156 psl_items = get_psl(sam_items, header)
|
|
157 if psl_items is not None:
|
|
158 fou.write('%s\n' % '\t'.join(psl_items))
|
|
159
|
|
160
|
|
161 if __name__ == '__main__':
|
|
162
|
|
163 parser = argparse.ArgumentParser()
|
|
164
|
|
165 parser.add_argument("--input_file", action="store", dest="input_file", help="Input file in SAM format.")
|
|
166 parser.add_argument("--output_file", action="store", dest="output_file", help="Output file in PSL format.")
|
|
167
|
|
168 args = parser.parse_args()
|
|
169
|
|
170 to_psl(args.input_file, args.output_file)
|