0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Allows user to filter out non-indels from SAM.
|
|
5
|
|
6 usage: %prog [options]
|
|
7 -i, --input=i: The input SAM file
|
|
8 -u, --include_base=u: Whether or not to include the base for insertions
|
|
9 -c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown
|
|
10 -o, --int_out=o: The interval output file for the converted SAM file
|
|
11 -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file
|
|
12 -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file
|
|
13 """
|
|
14
|
|
15 import re, sys
|
|
16 from galaxy import eggs
|
|
17 import pkg_resources; pkg_resources.require( "bx-python" )
|
|
18 from bx.cookbook import doc_optparse
|
|
19
|
|
20
|
|
21 def stop_err( msg ):
|
|
22 sys.stderr.write( '%s\n' % msg )
|
|
23 sys.exit()
|
|
24
|
|
25 def numeric_sort( text1, text2 ):
|
|
26 """
|
|
27 For two items containing space-separated text, compares equivalent pieces
|
|
28 numerically if both numeric or as text otherwise
|
|
29 """
|
|
30 pieces1 = text1.split()
|
|
31 pieces2 = text2.split()
|
|
32 if len( pieces1 ) == 0:
|
|
33 return 1
|
|
34 if len( pieces2 ) == 0:
|
|
35 return -1
|
|
36 for i, pc1 in enumerate( pieces1 ):
|
|
37 if i == len( pieces2 ):
|
|
38 return 1
|
|
39 if not pieces2[i].isdigit():
|
|
40 if pc1.isdigit():
|
|
41 return -1
|
|
42 else:
|
|
43 if pc1 > pieces2[i]:
|
|
44 return 1
|
|
45 elif pc1 < pieces2[i]:
|
|
46 return -1
|
|
47 else:
|
|
48 if not pc1.isdigit():
|
|
49 return 1
|
|
50 else:
|
|
51 if int( pc1 ) > int( pieces2[i] ):
|
|
52 return 1
|
|
53 elif int( pc1 ) < int( pieces2[i] ):
|
|
54 return -1
|
|
55 if i < len( pieces2 ) - 1:
|
|
56 return -1
|
|
57 return 0
|
|
58
|
|
59 def __main__():
|
|
60 #Parse Command Line
|
|
61 options, args = doc_optparse.parse( __doc__ )
|
|
62
|
|
63 # open up output files
|
|
64 output = open( options.int_out, 'wb' )
|
|
65 if options.bed_ins_out != 'None':
|
|
66 output_bed_ins = open( options.bed_ins_out, 'wb' )
|
|
67 else:
|
|
68 output_bed_ins = None
|
|
69 if options.bed_del_out != 'None':
|
|
70 output_bed_del = open( options.bed_del_out, 'wb' )
|
|
71 else:
|
|
72 output_bed_del = None
|
|
73
|
|
74 # the pattern to match, assuming just one indel per cigar string
|
|
75 pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
|
|
76 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
|
|
77
|
|
78 # go through all lines in input file
|
|
79 out_data = {}
|
|
80 multi_indel_lines = 0
|
|
81 for line in open( options.input, 'rb' ):
|
|
82 if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
|
|
83 split_line = line.split( '\t' )
|
|
84 if split_line < 12:
|
|
85 continue
|
|
86 # grab relevant pieces
|
|
87 cigar = split_line[5].strip()
|
|
88 pos = int( split_line[3] )
|
|
89 chr = split_line[2]
|
|
90 base_string = split_line[9]
|
|
91 # parse cigar string
|
|
92 m = pat_indel.match( cigar )
|
|
93 if not m:
|
|
94 m = pat_multi.match( cigar )
|
|
95 # skip this line if no match
|
|
96 if not m:
|
|
97 continue
|
|
98 # account for multiple indels or operations we don't process
|
|
99 else:
|
|
100 multi_indel_lines += 1
|
|
101 continue
|
|
102 else:
|
|
103 match = m.groupdict()
|
|
104 left = int( match[ 'lmatch' ] )
|
|
105 middle = int( match[ 'ins_del_width' ] )
|
|
106 middle_type = match[ 'ins_del' ]
|
|
107 bases = base_string[ left : left + middle ]
|
|
108 # calculate start and end positions, and output to insertion or deletion file
|
|
109 start = left + pos
|
|
110 if middle_type == 'D':
|
|
111 end = start + middle
|
|
112 data = [ chr, start, end, 'D' ]
|
|
113 if options.include_base == "true":
|
|
114 data.append( '-' )
|
|
115 else:
|
|
116 end = start + 1
|
|
117 data = [ chr, start, end, 'I' ]
|
|
118 if options.include_base == "true":
|
|
119 data.append( bases )
|
|
120 location = '\t'.join( [ '%s' % d for d in data ] )
|
|
121 try:
|
|
122 out_data[ location ] += 1
|
|
123 except KeyError:
|
|
124 out_data[ location ] = 1
|
|
125 # output to interval file
|
|
126 # get all locations and sort
|
|
127 locations = out_data.keys()
|
|
128 locations.sort( numeric_sort )
|
|
129 last_line = ''
|
|
130 # output each location, either with counts or each occurrence
|
|
131 for loc in locations:
|
|
132 sp_loc = loc.split( '\t' )
|
|
133 cur_line = '\t'.join( sp_loc[:3] )
|
|
134 if options.collapse == 'true':
|
|
135 output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) )
|
|
136 if output_bed_del and sp_loc[3] == 'D':
|
|
137 output_bed_del.write( '%s\n' % cur_line )
|
|
138 if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line:
|
|
139 output_bed_ins.write( '%s\n' % cur_line )
|
|
140 last_line = cur_line
|
|
141 else:
|
|
142 for i in range( out_data[ loc ] ):
|
|
143 output.write( '%s\n' % loc )
|
|
144 if output_bed_del or output_bed_ins:
|
|
145 if output_bed_del and sp_loc[3] == 'D':
|
|
146 output_bed_del.write( '%s\n' % cur_line )
|
|
147 if output_bed_ins and sp_loc[3] == 'I':
|
|
148 output_bed_ins.write( '%s\n' % cur_line )
|
|
149
|
|
150 # cleanup, close files
|
|
151 if output_bed_ins:
|
|
152 output_bed_ins.close()
|
|
153 if output_bed_del:
|
|
154 output_bed_del.close()
|
|
155 output.close()
|
|
156
|
|
157 # if skipped lines because of more than one indel, output message
|
|
158 if multi_indel_lines > 0:
|
|
159 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
|
|
160
|
|
161 if __name__=="__main__": __main__()
|