0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Given an input sam file, provides analysis of the indels..
|
|
5
|
|
6 usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]]
|
|
7 -i, --input=i: The sam file to analyze
|
|
8 -t, --threshold=t: The deletion frequency threshold
|
|
9 -I, --out_ins=I: The interval output file showing insertions
|
|
10 -D, --out_del=D: The interval output file showing deletions
|
|
11 """
|
|
12
|
|
13 import re, sys
|
|
14 from galaxy import eggs
|
|
15 import pkg_resources; pkg_resources.require( "bx-python" )
|
|
16 from bx.cookbook import doc_optparse
|
|
17
|
|
18
|
|
19 def stop_err( msg ):
|
|
20 sys.stderr.write( '%s\n' % msg )
|
|
21 sys.exit()
|
|
22
|
|
23 def add_to_mis_matches( mis_matches, pos, bases ):
|
|
24 """
|
|
25 Adds the bases and counts to the mis_matches dict
|
|
26 """
|
|
27 for j, base in enumerate( bases ):
|
|
28 try:
|
|
29 mis_matches[ pos + j ][ base ] += 1
|
|
30 except KeyError:
|
|
31 try:
|
|
32 mis_matches[ pos + j ][ base ] = 1
|
|
33 except KeyError:
|
|
34 mis_matches[ pos + j ] = { base: 1 }
|
|
35
|
|
36 def __main__():
|
|
37 #Parse Command Line
|
|
38 options, args = doc_optparse.parse( __doc__ )
|
|
39 # prep output files
|
|
40 out_ins = open( options.out_ins, 'wb' )
|
|
41 out_del = open( options.out_del, 'wb' )
|
|
42 # patterns
|
|
43 pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' )
|
|
44 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
|
|
45 # for tracking occurences at each pos of ref
|
|
46 mis_matches = {}
|
|
47 indels = {}
|
|
48 multi_indel_lines = 0
|
|
49 # go through all lines in input file
|
|
50 for i,line in enumerate( open( options.input, 'rb' ) ):
|
|
51 if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) :
|
|
52 split_line = line.split( '\t' )
|
|
53 chrom = split_line[2].strip()
|
|
54 pos = int( split_line[3].strip() )
|
|
55 cigar = split_line[5].strip()
|
|
56 bases = split_line[9].strip()
|
|
57 # if not an indel or match, exit
|
|
58 if chrom == '*':
|
|
59 continue
|
|
60 # find matches like 3M2D7M or 7M3I10M
|
|
61 match = {}
|
|
62 m = pat.match( cigar )
|
|
63 # unprocessable CIGAR
|
|
64 if not m:
|
|
65 m = pat_multi.match( cigar )
|
|
66 # skip this line if no match
|
|
67 if not m:
|
|
68 continue
|
|
69 # account for multiple indels or operations we don't process
|
|
70 else:
|
|
71 multi_indel_lines += 1
|
|
72 # get matching parts for the indel or full match if matching
|
|
73 else:
|
|
74 if not mis_matches.has_key( chrom ):
|
|
75 mis_matches[ chrom ] = {}
|
|
76 indels[ chrom ] = { 'D': {}, 'I': {} }
|
|
77 parts = m.groupdict()
|
|
78 if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ):
|
|
79 match = parts
|
|
80 # see if matches meet filter requirements
|
|
81 if match:
|
|
82 # match/mismatch
|
|
83 if parts[ 'match_width' ]:
|
|
84 add_to_mis_matches( mis_matches[ chrom ], pos, bases )
|
|
85 # indel
|
|
86 else:
|
|
87 # pieces of CIGAR string
|
|
88 left = int( match[ 'lmatch' ] )
|
|
89 middle = int( match[ 'ins_del_width' ] )
|
|
90 right = int( match[ 'rmatch' ] )
|
|
91 left_bases = bases[ : left ]
|
|
92 if match[ 'ins_del' ] == 'I':
|
|
93 middle_bases = bases[ left : left + middle ]
|
|
94 else:
|
|
95 middle_bases = ''
|
|
96 right_bases = bases[ -right : ]
|
|
97 start = pos + left
|
|
98 # add data to ref_pos dict for match/mismatch bases on left and on right
|
|
99 add_to_mis_matches( mis_matches[ chrom ], pos, left_bases )
|
|
100 if match[ 'ins_del' ] == 'I':
|
|
101 add_to_mis_matches( mis_matches[ chrom ], start, right_bases )
|
|
102 else:
|
|
103 add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases )
|
|
104 # for insertions, count instances of particular inserted bases
|
|
105 if match[ 'ins_del' ] == 'I':
|
|
106 if indels[ chrom ][ 'I' ].has_key( start ):
|
|
107 try:
|
|
108 indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1
|
|
109 except KeyError:
|
|
110 indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1
|
|
111 else:
|
|
112 indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 }
|
|
113 # for deletions, count number of deletions bases
|
|
114 else:
|
|
115 if indels[ chrom ][ 'D' ].has_key( start ):
|
|
116 try:
|
|
117 indels[ chrom ][ 'D' ][ start ][ middle ] += 1
|
|
118 except KeyError:
|
|
119 indels[ chrom ][ 'D' ][ start ][ middle ] = 1
|
|
120 else:
|
|
121 indels[ chrom ][ 'D' ][ start ] = { middle: 1 }
|
|
122 # compute deletion frequencies and insertion frequencies for checking against threshold
|
|
123 freqs = {}
|
|
124 ins_freqs = {}
|
|
125 chroms = mis_matches.keys()
|
|
126 chroms.sort()
|
|
127 for chrom in chroms:
|
|
128 freqs[ chrom ] = {}
|
|
129 ins_freqs[ chrom ] = {}
|
|
130 poses = mis_matches[ chrom ].keys()
|
|
131 poses.extend( indels[ chrom ][ 'D' ].keys() )
|
|
132 poses.extend( indels[ chrom ][ 'I' ].keys() )
|
|
133 poses = list( set( poses ) )
|
|
134 for pos in poses:
|
|
135 # all reads touching this particular position
|
|
136 freqs[ chrom ][ pos ] = {}
|
|
137 sum_counts = 0.0
|
|
138 sum_counts_end = 0.0
|
|
139 # get basic counts (match/mismatch)
|
|
140 try:
|
|
141 sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) )
|
|
142 except KeyError:
|
|
143 pass
|
|
144 try:
|
|
145 sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) )
|
|
146 except KeyError:
|
|
147 pass
|
|
148 # add deletions also touching this position
|
|
149 try:
|
|
150 sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) )
|
|
151 except KeyError:
|
|
152 pass
|
|
153 try:
|
|
154 sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
|
|
155 except KeyError:
|
|
156 pass
|
|
157 freqs[ chrom ][ pos ][ 'total' ] = sum_counts
|
|
158 # calculate actual frequencies
|
|
159 # deletions
|
|
160 # frequencies for deletions
|
|
161 try:
|
|
162 for d in indels[ chrom ][ 'D' ][ pos ].keys():
|
|
163 freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
|
|
164 except KeyError:
|
|
165 pass
|
|
166 # frequencies for matches/mismatches
|
|
167 try:
|
|
168 for base in mis_matches[ chrom ][ pos ].keys():
|
|
169 try:
|
|
170 prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts
|
|
171 freqs[ chrom ][ pos ][ base ] = prop
|
|
172 except ZeroDivisionError:
|
|
173 freqs[ chrom ][ pos ][ base ] = 0.0
|
|
174 except KeyError:
|
|
175 pass
|
|
176 # insertions
|
|
177 try:
|
|
178 for bases in indels[ chrom ][ 'I' ][ pos ].keys():
|
|
179 prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts )
|
|
180 try:
|
|
181 prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end )
|
|
182 except ZeroDivisionError:
|
|
183 prop_end = 0.0
|
|
184 try:
|
|
185 ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ]
|
|
186 except KeyError:
|
|
187 ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] }
|
|
188 except KeyError, e:
|
|
189 pass
|
|
190 # output to files if meet threshold requirement
|
|
191 threshold = float( options.threshold )
|
|
192 #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' )
|
|
193 #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' )
|
|
194 for chrom in chroms:
|
|
195 # deletions file
|
|
196 poses = indels[ chrom ][ 'D' ].keys()
|
|
197 poses.sort()
|
|
198 for pos in poses:
|
|
199 start = pos
|
|
200 dels = indels[ chrom ][ 'D' ][ start ].keys()
|
|
201 dels.sort()
|
|
202 for d in dels:
|
|
203 end = start + d
|
|
204 prop = freqs[ chrom ][ start ][ d ]
|
|
205 if prop > threshold :
|
|
206 out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
|
|
207 # insertions file
|
|
208 poses = indels[ chrom ][ 'I' ].keys()
|
|
209 poses.sort()
|
|
210 for pos in poses:
|
|
211 start = pos
|
|
212 end = pos + 1
|
|
213 ins_bases = indels[ chrom ][ 'I' ][ start ].keys()
|
|
214 ins_bases.sort()
|
|
215 for bases in ins_bases:
|
|
216 prop_start = ins_freqs[ chrom ][ start ][ bases ][0]
|
|
217 prop_end = ins_freqs[ chrom ][ start ][ bases ][1]
|
|
218 if prop_start > threshold or prop_end > threshold:
|
|
219 out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) )
|
|
220 # close out files
|
|
221 out_del.close()
|
|
222 out_ins.close()
|
|
223 # if skipped lines because of more than one indel, output message
|
|
224 if multi_indel_lines > 0:
|
|
225 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
|
|
226
|
|
227 if __name__=="__main__": __main__()
|