Mercurial > repos > xuebing > sharplabtool
comparison tools/indels/sam_indel_filter.py @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Allows user to filter out non-indels from SAM. | |
5 | |
6 usage: %prog [options] | |
7 -i, --input=i: Input SAM file to be filtered | |
8 -q, --quality_threshold=q: Minimum quality value for adjacent bases | |
9 -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities | |
10 -o, --output=o: Filtered output SAM file | |
11 """ | |
12 | |
13 import re, sys | |
14 from galaxy import eggs | |
15 import pkg_resources; pkg_resources.require( "bx-python" ) | |
16 from bx.cookbook import doc_optparse | |
17 | |
18 | |
19 def stop_err( msg ): | |
20 sys.stderr.write( '%s\n' % msg ) | |
21 sys.exit() | |
22 | |
23 def __main__(): | |
24 #Parse Command Line | |
25 options, args = doc_optparse.parse( __doc__ ) | |
26 # prep output file | |
27 output = open( options.output, 'wb' ) | |
28 # patterns | |
29 pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) | |
30 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) | |
31 try: | |
32 qual_thresh = int( options.quality_threshold ) | |
33 if qual_thresh < 0 or qual_thresh > 93: | |
34 raise ValueError | |
35 except ValueError: | |
36 stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' ) | |
37 try: | |
38 adj_bases = int( options.adjacent_bases ) | |
39 if adj_bases < 1: | |
40 raise ValueError | |
41 except ValueError: | |
42 stop_err( 'The number of adjacent bases should be an integer greater than 1.' ) | |
43 # record lines skipped because of more than one indel | |
44 multi_indel_lines = 0 | |
45 # go through all lines in input file | |
46 for i,line in enumerate(open( options.input, 'rb' )): | |
47 if line and not line.startswith( '#' ) and not line.startswith( '@' ) : | |
48 split_line = line.split( '\t' ) | |
49 cigar = split_line[5].strip() | |
50 # find matches like 3M2D7M or 7M3I10M | |
51 match = {} | |
52 m = pat.match( cigar ) | |
53 # if unprocessable CIGAR | |
54 if not m: | |
55 m = pat_multi.match( cigar ) | |
56 # skip this line if no match | |
57 if not m: | |
58 continue | |
59 # account for multiple indels or operations we don't process | |
60 else: | |
61 multi_indel_lines += 1 | |
62 # otherwise get matching parts | |
63 else: | |
64 match = m.groupdict() | |
65 # process for indels | |
66 if match: | |
67 left = int( match[ 'lmatch' ] ) | |
68 right = int( match[ 'rmatch' ] ) | |
69 if match[ 'ins_del' ] == 'I': | |
70 middle = int( match[ 'ins_del_width' ] ) | |
71 else: | |
72 middle = 0 | |
73 # if there are enough adjacent bases to check, then do so | |
74 if left >= adj_bases and right >= adj_bases: | |
75 quals = split_line[10] | |
76 eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ] | |
77 qual_thresh_met = True | |
78 for q in eligible_quals: | |
79 if ord( q ) - 33 < qual_thresh: | |
80 qual_thresh_met = False | |
81 break | |
82 # if filter reqs met, output line | |
83 if qual_thresh_met: | |
84 output.write( line ) | |
85 # close out file | |
86 output.close() | |
87 # if skipped lines because of more than one indel, output message | |
88 if multi_indel_lines > 0: | |
89 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) | |
90 | |
91 if __name__=="__main__": __main__() |