annotate tools/indels/sam_indel_filter.py @ 1:cdcb0ce84a1b

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:45:15 -0500
parents 9071e359b9a3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
1 #!/usr/bin/env python
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
2
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
3 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
4 Allows user to filter out non-indels from SAM.
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
5
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
6 usage: %prog [options]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
7 -i, --input=i: Input SAM file to be filtered
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
8 -q, --quality_threshold=q: Minimum quality value for adjacent bases
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
9 -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
10 -o, --output=o: Filtered output SAM file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
11 """
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
12
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
13 import re, sys
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
14 from galaxy import eggs
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
15 import pkg_resources; pkg_resources.require( "bx-python" )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
16 from bx.cookbook import doc_optparse
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
17
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
18
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
19 def stop_err( msg ):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
20 sys.stderr.write( '%s\n' % msg )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
21 sys.exit()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
22
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
23 def __main__():
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
24 #Parse Command Line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
25 options, args = doc_optparse.parse( __doc__ )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
26 # prep output file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
27 output = open( options.output, 'wb' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
28 # patterns
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
29 pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
30 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
31 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
32 qual_thresh = int( options.quality_threshold )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
33 if qual_thresh < 0 or qual_thresh > 93:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
34 raise ValueError
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
35 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
36 stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
37 try:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
38 adj_bases = int( options.adjacent_bases )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
39 if adj_bases < 1:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
40 raise ValueError
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
41 except ValueError:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
42 stop_err( 'The number of adjacent bases should be an integer greater than 1.' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
43 # record lines skipped because of more than one indel
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
44 multi_indel_lines = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
45 # go through all lines in input file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
46 for i,line in enumerate(open( options.input, 'rb' )):
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
47 if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
48 split_line = line.split( '\t' )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
49 cigar = split_line[5].strip()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
50 # find matches like 3M2D7M or 7M3I10M
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
51 match = {}
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
52 m = pat.match( cigar )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
53 # if unprocessable CIGAR
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
54 if not m:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
55 m = pat_multi.match( cigar )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
56 # skip this line if no match
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
57 if not m:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
58 continue
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
59 # account for multiple indels or operations we don't process
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
60 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
61 multi_indel_lines += 1
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
62 # otherwise get matching parts
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
63 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
64 match = m.groupdict()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
65 # process for indels
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
66 if match:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
67 left = int( match[ 'lmatch' ] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
68 right = int( match[ 'rmatch' ] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
69 if match[ 'ins_del' ] == 'I':
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
70 middle = int( match[ 'ins_del_width' ] )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
71 else:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
72 middle = 0
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
73 # if there are enough adjacent bases to check, then do so
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
74 if left >= adj_bases and right >= adj_bases:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
75 quals = split_line[10]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
76 eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ]
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
77 qual_thresh_met = True
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
78 for q in eligible_quals:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
79 if ord( q ) - 33 < qual_thresh:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
80 qual_thresh_met = False
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
81 break
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
82 # if filter reqs met, output line
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
83 if qual_thresh_met:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
84 output.write( line )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
85 # close out file
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
86 output.close()
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
87 # if skipped lines because of more than one indel, output message
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
88 if multi_indel_lines > 0:
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
89 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
90
9071e359b9a3 Uploaded
xuebing
parents:
diff changeset
91 if __name__=="__main__": __main__()