0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Allows user to filter out non-indels from SAM.
|
|
5
|
|
6 usage: %prog [options]
|
|
7 -i, --input=i: Input SAM file to be filtered
|
|
8 -q, --quality_threshold=q: Minimum quality value for adjacent bases
|
|
9 -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities
|
|
10 -o, --output=o: Filtered output SAM file
|
|
11 """
|
|
12
|
|
13 import re, sys
|
|
14 from galaxy import eggs
|
|
15 import pkg_resources; pkg_resources.require( "bx-python" )
|
|
16 from bx.cookbook import doc_optparse
|
|
17
|
|
18
|
|
19 def stop_err( msg ):
|
|
20 sys.stderr.write( '%s\n' % msg )
|
|
21 sys.exit()
|
|
22
|
|
23 def __main__():
|
|
24 #Parse Command Line
|
|
25 options, args = doc_optparse.parse( __doc__ )
|
|
26 # prep output file
|
|
27 output = open( options.output, 'wb' )
|
|
28 # patterns
|
|
29 pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
|
|
30 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
|
|
31 try:
|
|
32 qual_thresh = int( options.quality_threshold )
|
|
33 if qual_thresh < 0 or qual_thresh > 93:
|
|
34 raise ValueError
|
|
35 except ValueError:
|
|
36 stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' )
|
|
37 try:
|
|
38 adj_bases = int( options.adjacent_bases )
|
|
39 if adj_bases < 1:
|
|
40 raise ValueError
|
|
41 except ValueError:
|
|
42 stop_err( 'The number of adjacent bases should be an integer greater than 1.' )
|
|
43 # record lines skipped because of more than one indel
|
|
44 multi_indel_lines = 0
|
|
45 # go through all lines in input file
|
|
46 for i,line in enumerate(open( options.input, 'rb' )):
|
|
47 if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
|
|
48 split_line = line.split( '\t' )
|
|
49 cigar = split_line[5].strip()
|
|
50 # find matches like 3M2D7M or 7M3I10M
|
|
51 match = {}
|
|
52 m = pat.match( cigar )
|
|
53 # if unprocessable CIGAR
|
|
54 if not m:
|
|
55 m = pat_multi.match( cigar )
|
|
56 # skip this line if no match
|
|
57 if not m:
|
|
58 continue
|
|
59 # account for multiple indels or operations we don't process
|
|
60 else:
|
|
61 multi_indel_lines += 1
|
|
62 # otherwise get matching parts
|
|
63 else:
|
|
64 match = m.groupdict()
|
|
65 # process for indels
|
|
66 if match:
|
|
67 left = int( match[ 'lmatch' ] )
|
|
68 right = int( match[ 'rmatch' ] )
|
|
69 if match[ 'ins_del' ] == 'I':
|
|
70 middle = int( match[ 'ins_del_width' ] )
|
|
71 else:
|
|
72 middle = 0
|
|
73 # if there are enough adjacent bases to check, then do so
|
|
74 if left >= adj_bases and right >= adj_bases:
|
|
75 quals = split_line[10]
|
|
76 eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ]
|
|
77 qual_thresh_met = True
|
|
78 for q in eligible_quals:
|
|
79 if ord( q ) - 33 < qual_thresh:
|
|
80 qual_thresh_met = False
|
|
81 break
|
|
82 # if filter reqs met, output line
|
|
83 if qual_thresh_met:
|
|
84 output.write( line )
|
|
85 # close out file
|
|
86 output.close()
|
|
87 # if skipped lines because of more than one indel, output message
|
|
88 if multi_indel_lines > 0:
|
|
89 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
|
|
90
|
|
91 if __name__=="__main__": __main__()
|