Mercurial > repos > xuebing > sharplabtool
comparison tools/indels/indel_sam2interval.py @ 0:9071e359b9a3
Uploaded
| author | xuebing |
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:9071e359b9a3 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 Allows user to filter out non-indels from SAM. | |
| 5 | |
| 6 usage: %prog [options] | |
| 7 -i, --input=i: The input SAM file | |
| 8 -u, --include_base=u: Whether or not to include the base for insertions | |
| 9 -c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown | |
| 10 -o, --int_out=o: The interval output file for the converted SAM file | |
| 11 -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file | |
| 12 -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file | |
| 13 """ | |
| 14 | |
| 15 import re, sys | |
| 16 from galaxy import eggs | |
| 17 import pkg_resources; pkg_resources.require( "bx-python" ) | |
| 18 from bx.cookbook import doc_optparse | |
| 19 | |
| 20 | |
| 21 def stop_err( msg ): | |
| 22 sys.stderr.write( '%s\n' % msg ) | |
| 23 sys.exit() | |
| 24 | |
| 25 def numeric_sort( text1, text2 ): | |
| 26 """ | |
| 27 For two items containing space-separated text, compares equivalent pieces | |
| 28 numerically if both numeric or as text otherwise | |
| 29 """ | |
| 30 pieces1 = text1.split() | |
| 31 pieces2 = text2.split() | |
| 32 if len( pieces1 ) == 0: | |
| 33 return 1 | |
| 34 if len( pieces2 ) == 0: | |
| 35 return -1 | |
| 36 for i, pc1 in enumerate( pieces1 ): | |
| 37 if i == len( pieces2 ): | |
| 38 return 1 | |
| 39 if not pieces2[i].isdigit(): | |
| 40 if pc1.isdigit(): | |
| 41 return -1 | |
| 42 else: | |
| 43 if pc1 > pieces2[i]: | |
| 44 return 1 | |
| 45 elif pc1 < pieces2[i]: | |
| 46 return -1 | |
| 47 else: | |
| 48 if not pc1.isdigit(): | |
| 49 return 1 | |
| 50 else: | |
| 51 if int( pc1 ) > int( pieces2[i] ): | |
| 52 return 1 | |
| 53 elif int( pc1 ) < int( pieces2[i] ): | |
| 54 return -1 | |
| 55 if i < len( pieces2 ) - 1: | |
| 56 return -1 | |
| 57 return 0 | |
| 58 | |
| 59 def __main__(): | |
| 60 #Parse Command Line | |
| 61 options, args = doc_optparse.parse( __doc__ ) | |
| 62 | |
| 63 # open up output files | |
| 64 output = open( options.int_out, 'wb' ) | |
| 65 if options.bed_ins_out != 'None': | |
| 66 output_bed_ins = open( options.bed_ins_out, 'wb' ) | |
| 67 else: | |
| 68 output_bed_ins = None | |
| 69 if options.bed_del_out != 'None': | |
| 70 output_bed_del = open( options.bed_del_out, 'wb' ) | |
| 71 else: | |
| 72 output_bed_del = None | |
| 73 | |
| 74 # the pattern to match, assuming just one indel per cigar string | |
| 75 pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) | |
| 76 pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) | |
| 77 | |
| 78 # go through all lines in input file | |
| 79 out_data = {} | |
| 80 multi_indel_lines = 0 | |
| 81 for line in open( options.input, 'rb' ): | |
| 82 if line and not line.startswith( '#' ) and not line.startswith( '@' ) : | |
| 83 split_line = line.split( '\t' ) | |
| 84 if split_line < 12: | |
| 85 continue | |
| 86 # grab relevant pieces | |
| 87 cigar = split_line[5].strip() | |
| 88 pos = int( split_line[3] ) | |
| 89 chr = split_line[2] | |
| 90 base_string = split_line[9] | |
| 91 # parse cigar string | |
| 92 m = pat_indel.match( cigar ) | |
| 93 if not m: | |
| 94 m = pat_multi.match( cigar ) | |
| 95 # skip this line if no match | |
| 96 if not m: | |
| 97 continue | |
| 98 # account for multiple indels or operations we don't process | |
| 99 else: | |
| 100 multi_indel_lines += 1 | |
| 101 continue | |
| 102 else: | |
| 103 match = m.groupdict() | |
| 104 left = int( match[ 'lmatch' ] ) | |
| 105 middle = int( match[ 'ins_del_width' ] ) | |
| 106 middle_type = match[ 'ins_del' ] | |
| 107 bases = base_string[ left : left + middle ] | |
| 108 # calculate start and end positions, and output to insertion or deletion file | |
| 109 start = left + pos | |
| 110 if middle_type == 'D': | |
| 111 end = start + middle | |
| 112 data = [ chr, start, end, 'D' ] | |
| 113 if options.include_base == "true": | |
| 114 data.append( '-' ) | |
| 115 else: | |
| 116 end = start + 1 | |
| 117 data = [ chr, start, end, 'I' ] | |
| 118 if options.include_base == "true": | |
| 119 data.append( bases ) | |
| 120 location = '\t'.join( [ '%s' % d for d in data ] ) | |
| 121 try: | |
| 122 out_data[ location ] += 1 | |
| 123 except KeyError: | |
| 124 out_data[ location ] = 1 | |
| 125 # output to interval file | |
| 126 # get all locations and sort | |
| 127 locations = out_data.keys() | |
| 128 locations.sort( numeric_sort ) | |
| 129 last_line = '' | |
| 130 # output each location, either with counts or each occurrence | |
| 131 for loc in locations: | |
| 132 sp_loc = loc.split( '\t' ) | |
| 133 cur_line = '\t'.join( sp_loc[:3] ) | |
| 134 if options.collapse == 'true': | |
| 135 output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) ) | |
| 136 if output_bed_del and sp_loc[3] == 'D': | |
| 137 output_bed_del.write( '%s\n' % cur_line ) | |
| 138 if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line: | |
| 139 output_bed_ins.write( '%s\n' % cur_line ) | |
| 140 last_line = cur_line | |
| 141 else: | |
| 142 for i in range( out_data[ loc ] ): | |
| 143 output.write( '%s\n' % loc ) | |
| 144 if output_bed_del or output_bed_ins: | |
| 145 if output_bed_del and sp_loc[3] == 'D': | |
| 146 output_bed_del.write( '%s\n' % cur_line ) | |
| 147 if output_bed_ins and sp_loc[3] == 'I': | |
| 148 output_bed_ins.write( '%s\n' % cur_line ) | |
| 149 | |
| 150 # cleanup, close files | |
| 151 if output_bed_ins: | |
| 152 output_bed_ins.close() | |
| 153 if output_bed_del: | |
| 154 output_bed_del.close() | |
| 155 output.close() | |
| 156 | |
| 157 # if skipped lines because of more than one indel, output message | |
| 158 if multi_indel_lines > 0: | |
| 159 sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) | |
| 160 | |
| 161 if __name__=="__main__": __main__() |
