comparison tools/indels/sam_indel_filter.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2
3 """
4 Allows user to filter out non-indels from SAM.
5
6 usage: %prog [options]
7 -i, --input=i: Input SAM file to be filtered
8 -q, --quality_threshold=q: Minimum quality value for adjacent bases
9 -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities
10 -o, --output=o: Filtered output SAM file
11 """
12
13 import re, sys
14 from galaxy import eggs
15 import pkg_resources; pkg_resources.require( "bx-python" )
16 from bx.cookbook import doc_optparse
17
18
19 def stop_err( msg ):
20 sys.stderr.write( '%s\n' % msg )
21 sys.exit()
22
23 def __main__():
24 #Parse Command Line
25 options, args = doc_optparse.parse( __doc__ )
26 # prep output file
27 output = open( options.output, 'wb' )
28 # patterns
29 pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
30 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
31 try:
32 qual_thresh = int( options.quality_threshold )
33 if qual_thresh < 0 or qual_thresh > 93:
34 raise ValueError
35 except ValueError:
36 stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' )
37 try:
38 adj_bases = int( options.adjacent_bases )
39 if adj_bases < 1:
40 raise ValueError
41 except ValueError:
42 stop_err( 'The number of adjacent bases should be an integer greater than 1.' )
43 # record lines skipped because of more than one indel
44 multi_indel_lines = 0
45 # go through all lines in input file
46 for i,line in enumerate(open( options.input, 'rb' )):
47 if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
48 split_line = line.split( '\t' )
49 cigar = split_line[5].strip()
50 # find matches like 3M2D7M or 7M3I10M
51 match = {}
52 m = pat.match( cigar )
53 # if unprocessable CIGAR
54 if not m:
55 m = pat_multi.match( cigar )
56 # skip this line if no match
57 if not m:
58 continue
59 # account for multiple indels or operations we don't process
60 else:
61 multi_indel_lines += 1
62 # otherwise get matching parts
63 else:
64 match = m.groupdict()
65 # process for indels
66 if match:
67 left = int( match[ 'lmatch' ] )
68 right = int( match[ 'rmatch' ] )
69 if match[ 'ins_del' ] == 'I':
70 middle = int( match[ 'ins_del_width' ] )
71 else:
72 middle = 0
73 # if there are enough adjacent bases to check, then do so
74 if left >= adj_bases and right >= adj_bases:
75 quals = split_line[10]
76 eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ]
77 qual_thresh_met = True
78 for q in eligible_quals:
79 if ord( q ) - 33 < qual_thresh:
80 qual_thresh_met = False
81 break
82 # if filter reqs met, output line
83 if qual_thresh_met:
84 output.write( line )
85 # close out file
86 output.close()
87 # if skipped lines because of more than one indel, output message
88 if multi_indel_lines > 0:
89 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
90
91 if __name__=="__main__": __main__()