comparison tools/indels/indel_analysis.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2
3 """
4 Given an input sam file, provides analysis of the indels..
5
6 usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]]
7 -i, --input=i: The sam file to analyze
8 -t, --threshold=t: The deletion frequency threshold
9 -I, --out_ins=I: The interval output file showing insertions
10 -D, --out_del=D: The interval output file showing deletions
11 """
12
13 import re, sys
14 from galaxy import eggs
15 import pkg_resources; pkg_resources.require( "bx-python" )
16 from bx.cookbook import doc_optparse
17
18
19 def stop_err( msg ):
20 sys.stderr.write( '%s\n' % msg )
21 sys.exit()
22
23 def add_to_mis_matches( mis_matches, pos, bases ):
24 """
25 Adds the bases and counts to the mis_matches dict
26 """
27 for j, base in enumerate( bases ):
28 try:
29 mis_matches[ pos + j ][ base ] += 1
30 except KeyError:
31 try:
32 mis_matches[ pos + j ][ base ] = 1
33 except KeyError:
34 mis_matches[ pos + j ] = { base: 1 }
35
36 def __main__():
37 #Parse Command Line
38 options, args = doc_optparse.parse( __doc__ )
39 # prep output files
40 out_ins = open( options.out_ins, 'wb' )
41 out_del = open( options.out_del, 'wb' )
42 # patterns
43 pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' )
44 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
45 # for tracking occurences at each pos of ref
46 mis_matches = {}
47 indels = {}
48 multi_indel_lines = 0
49 # go through all lines in input file
50 for i,line in enumerate( open( options.input, 'rb' ) ):
51 if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) :
52 split_line = line.split( '\t' )
53 chrom = split_line[2].strip()
54 pos = int( split_line[3].strip() )
55 cigar = split_line[5].strip()
56 bases = split_line[9].strip()
57 # if not an indel or match, exit
58 if chrom == '*':
59 continue
60 # find matches like 3M2D7M or 7M3I10M
61 match = {}
62 m = pat.match( cigar )
63 # unprocessable CIGAR
64 if not m:
65 m = pat_multi.match( cigar )
66 # skip this line if no match
67 if not m:
68 continue
69 # account for multiple indels or operations we don't process
70 else:
71 multi_indel_lines += 1
72 # get matching parts for the indel or full match if matching
73 else:
74 if not mis_matches.has_key( chrom ):
75 mis_matches[ chrom ] = {}
76 indels[ chrom ] = { 'D': {}, 'I': {} }
77 parts = m.groupdict()
78 if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ):
79 match = parts
80 # see if matches meet filter requirements
81 if match:
82 # match/mismatch
83 if parts[ 'match_width' ]:
84 add_to_mis_matches( mis_matches[ chrom ], pos, bases )
85 # indel
86 else:
87 # pieces of CIGAR string
88 left = int( match[ 'lmatch' ] )
89 middle = int( match[ 'ins_del_width' ] )
90 right = int( match[ 'rmatch' ] )
91 left_bases = bases[ : left ]
92 if match[ 'ins_del' ] == 'I':
93 middle_bases = bases[ left : left + middle ]
94 else:
95 middle_bases = ''
96 right_bases = bases[ -right : ]
97 start = pos + left
98 # add data to ref_pos dict for match/mismatch bases on left and on right
99 add_to_mis_matches( mis_matches[ chrom ], pos, left_bases )
100 if match[ 'ins_del' ] == 'I':
101 add_to_mis_matches( mis_matches[ chrom ], start, right_bases )
102 else:
103 add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases )
104 # for insertions, count instances of particular inserted bases
105 if match[ 'ins_del' ] == 'I':
106 if indels[ chrom ][ 'I' ].has_key( start ):
107 try:
108 indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1
109 except KeyError:
110 indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1
111 else:
112 indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 }
113 # for deletions, count number of deletions bases
114 else:
115 if indels[ chrom ][ 'D' ].has_key( start ):
116 try:
117 indels[ chrom ][ 'D' ][ start ][ middle ] += 1
118 except KeyError:
119 indels[ chrom ][ 'D' ][ start ][ middle ] = 1
120 else:
121 indels[ chrom ][ 'D' ][ start ] = { middle: 1 }
122 # compute deletion frequencies and insertion frequencies for checking against threshold
123 freqs = {}
124 ins_freqs = {}
125 chroms = mis_matches.keys()
126 chroms.sort()
127 for chrom in chroms:
128 freqs[ chrom ] = {}
129 ins_freqs[ chrom ] = {}
130 poses = mis_matches[ chrom ].keys()
131 poses.extend( indels[ chrom ][ 'D' ].keys() )
132 poses.extend( indels[ chrom ][ 'I' ].keys() )
133 poses = list( set( poses ) )
134 for pos in poses:
135 # all reads touching this particular position
136 freqs[ chrom ][ pos ] = {}
137 sum_counts = 0.0
138 sum_counts_end = 0.0
139 # get basic counts (match/mismatch)
140 try:
141 sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) )
142 except KeyError:
143 pass
144 try:
145 sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) )
146 except KeyError:
147 pass
148 # add deletions also touching this position
149 try:
150 sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) )
151 except KeyError:
152 pass
153 try:
154 sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
155 except KeyError:
156 pass
157 freqs[ chrom ][ pos ][ 'total' ] = sum_counts
158 # calculate actual frequencies
159 # deletions
160 # frequencies for deletions
161 try:
162 for d in indels[ chrom ][ 'D' ][ pos ].keys():
163 freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
164 except KeyError:
165 pass
166 # frequencies for matches/mismatches
167 try:
168 for base in mis_matches[ chrom ][ pos ].keys():
169 try:
170 prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts
171 freqs[ chrom ][ pos ][ base ] = prop
172 except ZeroDivisionError:
173 freqs[ chrom ][ pos ][ base ] = 0.0
174 except KeyError:
175 pass
176 # insertions
177 try:
178 for bases in indels[ chrom ][ 'I' ][ pos ].keys():
179 prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts )
180 try:
181 prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end )
182 except ZeroDivisionError:
183 prop_end = 0.0
184 try:
185 ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ]
186 except KeyError:
187 ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] }
188 except KeyError, e:
189 pass
190 # output to files if meet threshold requirement
191 threshold = float( options.threshold )
192 #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' )
193 #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' )
194 for chrom in chroms:
195 # deletions file
196 poses = indels[ chrom ][ 'D' ].keys()
197 poses.sort()
198 for pos in poses:
199 start = pos
200 dels = indels[ chrom ][ 'D' ][ start ].keys()
201 dels.sort()
202 for d in dels:
203 end = start + d
204 prop = freqs[ chrom ][ start ][ d ]
205 if prop > threshold :
206 out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
207 # insertions file
208 poses = indels[ chrom ][ 'I' ].keys()
209 poses.sort()
210 for pos in poses:
211 start = pos
212 end = pos + 1
213 ins_bases = indels[ chrom ][ 'I' ][ start ].keys()
214 ins_bases.sort()
215 for bases in ins_bases:
216 prop_start = ins_freqs[ chrom ][ start ][ bases ][0]
217 prop_end = ins_freqs[ chrom ][ start ][ bases ][1]
218 if prop_start > threshold or prop_end > threshold:
219 out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) )
220 # close out files
221 out_del.close()
222 out_ins.close()
223 # if skipped lines because of more than one indel, output message
224 if multi_indel_lines > 0:
225 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
226
227 if __name__=="__main__": __main__()