Mercurial > repos > xuebing > sharplabtool
comparison tools/indels/indel_analysis.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Given an input sam file, provides analysis of the indels.. | |
5 | |
6 usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]] | |
7 -i, --input=i: The sam file to analyze | |
8 -t, --threshold=t: The deletion frequency threshold | |
9 -I, --out_ins=I: The interval output file showing insertions | |
10 -D, --out_del=D: The interval output file showing deletions | |
11 """ | |
12 | |
13 import re, sys | |
14 from galaxy import eggs | |
15 import pkg_resources; pkg_resources.require( "bx-python" ) | |
16 from bx.cookbook import doc_optparse | |
17 | |
18 | |
19 def stop_err( msg ): | |
20 sys.stderr.write( '%s\n' % msg ) | |
21 sys.exit() | |
22 | |
23 def add_to_mis_matches( mis_matches, pos, bases ): | |
24 """ | |
25 Adds the bases and counts to the mis_matches dict | |
26 """ | |
27 for j, base in enumerate( bases ): | |
28 try: | |
29 mis_matches[ pos + j ][ base ] += 1 | |
30 except KeyError: | |
31 try: | |
32 mis_matches[ pos + j ][ base ] = 1 | |
33 except KeyError: | |
34 mis_matches[ pos + j ] = { base: 1 } | |
35 | |
36 def __main__(): | |
37 #Parse Command Line | |
38 options, args = doc_optparse.parse( __doc__ ) | |
39 # prep output files | |
40 out_ins = open( options.out_ins, 'wb' ) | |
41 out_del = open( options.out_del, 'wb' ) | |
42 # patterns | |
43 pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' ) | |
44 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) | |
45 # for tracking occurences at each pos of ref | |
46 mis_matches = {} | |
47 indels = {} | |
48 multi_indel_lines = 0 | |
49 # go through all lines in input file | |
50 for i,line in enumerate( open( options.input, 'rb' ) ): | |
51 if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) : | |
52 split_line = line.split( '\t' ) | |
53 chrom = split_line[2].strip() | |
54 pos = int( split_line[3].strip() ) | |
55 cigar = split_line[5].strip() | |
56 bases = split_line[9].strip() | |
57 # if not an indel or match, exit | |
58 if chrom == '*': | |
59 continue | |
60 # find matches like 3M2D7M or 7M3I10M | |
61 match = {} | |
62 m = pat.match( cigar ) | |
63 # unprocessable CIGAR | |
64 if not m: | |
65 m = pat_multi.match( cigar ) | |
66 # skip this line if no match | |
67 if not m: | |
68 continue | |
69 # account for multiple indels or operations we don't process | |
70 else: | |
71 multi_indel_lines += 1 | |
72 # get matching parts for the indel or full match if matching | |
73 else: | |
74 if not mis_matches.has_key( chrom ): | |
75 mis_matches[ chrom ] = {} | |
76 indels[ chrom ] = { 'D': {}, 'I': {} } | |
77 parts = m.groupdict() | |
78 if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ): | |
79 match = parts | |
80 # see if matches meet filter requirements | |
81 if match: | |
82 # match/mismatch | |
83 if parts[ 'match_width' ]: | |
84 add_to_mis_matches( mis_matches[ chrom ], pos, bases ) | |
85 # indel | |
86 else: | |
87 # pieces of CIGAR string | |
88 left = int( match[ 'lmatch' ] ) | |
89 middle = int( match[ 'ins_del_width' ] ) | |
90 right = int( match[ 'rmatch' ] ) | |
91 left_bases = bases[ : left ] | |
92 if match[ 'ins_del' ] == 'I': | |
93 middle_bases = bases[ left : left + middle ] | |
94 else: | |
95 middle_bases = '' | |
96 right_bases = bases[ -right : ] | |
97 start = pos + left | |
98 # add data to ref_pos dict for match/mismatch bases on left and on right | |
99 add_to_mis_matches( mis_matches[ chrom ], pos, left_bases ) | |
100 if match[ 'ins_del' ] == 'I': | |
101 add_to_mis_matches( mis_matches[ chrom ], start, right_bases ) | |
102 else: | |
103 add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases ) | |
104 # for insertions, count instances of particular inserted bases | |
105 if match[ 'ins_del' ] == 'I': | |
106 if indels[ chrom ][ 'I' ].has_key( start ): | |
107 try: | |
108 indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1 | |
109 except KeyError: | |
110 indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1 | |
111 else: | |
112 indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 } | |
113 # for deletions, count number of deletions bases | |
114 else: | |
115 if indels[ chrom ][ 'D' ].has_key( start ): | |
116 try: | |
117 indels[ chrom ][ 'D' ][ start ][ middle ] += 1 | |
118 except KeyError: | |
119 indels[ chrom ][ 'D' ][ start ][ middle ] = 1 | |
120 else: | |
121 indels[ chrom ][ 'D' ][ start ] = { middle: 1 } | |
122 # compute deletion frequencies and insertion frequencies for checking against threshold | |
123 freqs = {} | |
124 ins_freqs = {} | |
125 chroms = mis_matches.keys() | |
126 chroms.sort() | |
127 for chrom in chroms: | |
128 freqs[ chrom ] = {} | |
129 ins_freqs[ chrom ] = {} | |
130 poses = mis_matches[ chrom ].keys() | |
131 poses.extend( indels[ chrom ][ 'D' ].keys() ) | |
132 poses.extend( indels[ chrom ][ 'I' ].keys() ) | |
133 poses = list( set( poses ) ) | |
134 for pos in poses: | |
135 # all reads touching this particular position | |
136 freqs[ chrom ][ pos ] = {} | |
137 sum_counts = 0.0 | |
138 sum_counts_end = 0.0 | |
139 # get basic counts (match/mismatch) | |
140 try: | |
141 sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) ) | |
142 except KeyError: | |
143 pass | |
144 try: | |
145 sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) ) | |
146 except KeyError: | |
147 pass | |
148 # add deletions also touching this position | |
149 try: | |
150 sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) ) | |
151 except KeyError: | |
152 pass | |
153 try: | |
154 sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) ) | |
155 except KeyError: | |
156 pass | |
157 freqs[ chrom ][ pos ][ 'total' ] = sum_counts | |
158 # calculate actual frequencies | |
159 # deletions | |
160 # frequencies for deletions | |
161 try: | |
162 for d in indels[ chrom ][ 'D' ][ pos ].keys(): | |
163 freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts | |
164 except KeyError: | |
165 pass | |
166 # frequencies for matches/mismatches | |
167 try: | |
168 for base in mis_matches[ chrom ][ pos ].keys(): | |
169 try: | |
170 prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts | |
171 freqs[ chrom ][ pos ][ base ] = prop | |
172 except ZeroDivisionError: | |
173 freqs[ chrom ][ pos ][ base ] = 0.0 | |
174 except KeyError: | |
175 pass | |
176 # insertions | |
177 try: | |
178 for bases in indels[ chrom ][ 'I' ][ pos ].keys(): | |
179 prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts ) | |
180 try: | |
181 prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end ) | |
182 except ZeroDivisionError: | |
183 prop_end = 0.0 | |
184 try: | |
185 ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ] | |
186 except KeyError: | |
187 ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] } | |
188 except KeyError, e: | |
189 pass | |
190 # output to files if meet threshold requirement | |
191 threshold = float( options.threshold ) | |
192 #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' ) | |
193 #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' ) | |
194 for chrom in chroms: | |
195 # deletions file | |
196 poses = indels[ chrom ][ 'D' ].keys() | |
197 poses.sort() | |
198 for pos in poses: | |
199 start = pos | |
200 dels = indels[ chrom ][ 'D' ][ start ].keys() | |
201 dels.sort() | |
202 for d in dels: | |
203 end = start + d | |
204 prop = freqs[ chrom ][ start ][ d ] | |
205 if prop > threshold : | |
206 out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) ) | |
207 # insertions file | |
208 poses = indels[ chrom ][ 'I' ].keys() | |
209 poses.sort() | |
210 for pos in poses: | |
211 start = pos | |
212 end = pos + 1 | |
213 ins_bases = indels[ chrom ][ 'I' ][ start ].keys() | |
214 ins_bases.sort() | |
215 for bases in ins_bases: | |
216 prop_start = ins_freqs[ chrom ][ start ][ bases ][0] | |
217 prop_end = ins_freqs[ chrom ][ start ][ bases ][1] | |
218 if prop_start > threshold or prop_end > threshold: | |
219 out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) ) | |
220 # close out files | |
221 out_del.close() | |
222 out_ins.close() | |
223 # if skipped lines because of more than one indel, output message | |
224 if multi_indel_lines > 0: | |
225 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) | |
226 | |
227 if __name__=="__main__": __main__() |