0
+ − 1 #!/usr/bin/env python
+ − 2 #Guruprasad Ananda
+ − 3
+ − 4 from galaxy import eggs
+ − 5 import pkg_resources
+ − 6 pkg_resources.require( "bx-python" )
+ − 7
+ − 8 import sys, os, tempfile
+ − 9 import traceback
+ − 10 import fileinput
+ − 11 from warnings import warn
+ − 12
+ − 13 from galaxy.tools.util.galaxyops import *
+ − 14 from bx.intervals.io import *
+ − 15
+ − 16 from bx.intervals.operations import quicksect
+ − 17
+ − 18 def stop_err(msg):
+ − 19 sys.stderr.write(msg)
+ − 20 sys.exit()
+ − 21
+ − 22 def counter(node, start, end, sort_col):
+ − 23 global full, blk_len, blk_list
+ − 24 if node.start < start:
+ − 25 if node.right:
+ − 26 counter(node.right, start, end, sort_col)
+ − 27 elif start <= node.start <= end and start <= node.end <= end:
+ − 28 full += 1
+ − 29 if node.other[0] not in blk_list:
+ − 30 blk_list.append(node.other[0])
+ − 31 blk_len += int(node.other[sort_col+2])
+ − 32 if node.left and node.left.maxend > start:
+ − 33 counter(node.left, start, end, sort_col)
+ − 34 if node.right:
+ − 35 counter(node.right, start, end, sort_col)
+ − 36 elif node.start > end:
+ − 37 if node.left:
+ − 38 counter(node.left, start, end, sort_col)
+ − 39
+ − 40
+ − 41 infile = sys.argv[1]
+ − 42 fout = open(sys.argv[2],'w')
+ − 43 int_file = sys.argv[3]
+ − 44 if int_file != "None": #User has specified an interval file
+ − 45 try:
+ − 46 fint = open(int_file, 'r')
+ − 47 dbkey_i = sys.argv[4]
+ − 48 chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[5] )
+ − 49 except:
+ − 50 stop_err("Unable to open input Interval file")
+ − 51
+ − 52 def main():
+ − 53
+ − 54 for i, line in enumerate( file ( infile )):
+ − 55 line = line.rstrip('\r\n')
+ − 56 if len( line )>0 and not line.startswith( '#' ):
+ − 57 elems = line.split( '\t' )
+ − 58 break
+ − 59 if i == 30:
+ − 60 break # Hopefully we'll never get here...
+ − 61
+ − 62 if len( elems ) != 18:
+ − 63 stop_err( "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly." )
+ − 64
+ − 65 for i, line in enumerate( file ( infile )):
+ − 66 line = line.rstrip('\r\n')
+ − 67 elems = line.split('\t')
+ − 68 try:
+ − 69 assert int(elems[0])
+ − 70 assert len(elems) == 18
+ − 71 if int_file != "None":
+ − 72 if dbkey_i not in elems[3] and dbkey_i not in elems[8] and dbkey_i not in elems[13]:
+ − 73 stop_err("The species build corresponding to your interval file is not present in the Indel file.")
+ − 74 if dbkey_i in elems[3]:
+ − 75 sort_col = 4
+ − 76 elif dbkey_i in elems[8]:
+ − 77 sort_col = 9
+ − 78 elif dbkey_i in elems[13]:
+ − 79 sort_col = 14
+ − 80 else:
+ − 81 species = []
+ − 82 species.append( elems[3].split('.')[0] )
+ − 83 species.append( elems[8].split('.')[0] )
+ − 84 species.append( elems[13].split('.')[0] )
+ − 85 sort_col = 0 #Based on block numbers
+ − 86 break
+ − 87 except:
+ − 88 continue
+ − 89
+ − 90
+ − 91 fin = open(infile, 'r')
+ − 92 skipped = 0
+ − 93
+ − 94 if int_file == "None":
+ − 95 sorted_infile = tempfile.NamedTemporaryFile()
+ − 96 cmdline = "sort -n -k"+str(1)+" -o "+sorted_infile.name+" "+infile
+ − 97 try:
+ − 98 os.system(cmdline)
+ − 99 except:
+ − 100 stop_err("Encountered error while sorting the input file.")
+ − 101 print >>fout, "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate" %(species[0],species[1],species[2],species[0],species[1],species[2])
+ − 102 prev_bnum = -1
+ − 103 sorted_infile.seek(0)
+ − 104 for line in sorted_infile.readlines():
+ − 105 line = line.rstrip('\r\n')
+ − 106 elems = line.split('\t')
+ − 107 try:
+ − 108 assert int(elems[0])
+ − 109 assert len(elems) == 18
+ − 110 new_bnum = int(elems[0])
+ − 111 if new_bnum != prev_bnum:
+ − 112 if prev_bnum != -1:
+ − 113 irate = []
+ − 114 drate = []
+ − 115 for i,elem in enumerate(inserts):
+ − 116 try:
+ − 117 irate.append(str("%.2e" %(inserts[i]/blen[i])))
+ − 118 except:
+ − 119 irate.append('0')
+ − 120 try:
+ − 121 drate.append(str("%.2e" %(deletes[i]/blen[i])))
+ − 122 except:
+ − 123 drate.append('0')
+ − 124 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
+ − 125 inserts = [0.0, 0.0, 0.0]
+ − 126 deletes = [0.0, 0.0, 0.0]
+ − 127 blen = []
+ − 128 blen.append( int(elems[6]) )
+ − 129 blen.append( int(elems[11]) )
+ − 130 blen.append( int(elems[16]) )
+ − 131 line_sp = elems[1].split('.')[0]
+ − 132 sp_ind = species.index(line_sp)
+ − 133 if elems[1].endswith('insert'):
+ − 134 inserts[sp_ind] += 1
+ − 135 elif elems[1].endswith('delete'):
+ − 136 deletes[sp_ind] += 1
+ − 137 prev_bnum = new_bnum
+ − 138 except Exception, ei:
+ − 139 #print >>sys.stderr, ei
+ − 140 continue
+ − 141 irate = []
+ − 142 drate = []
+ − 143 for i,elem in enumerate(inserts):
+ − 144 try:
+ − 145 irate.append(str("%.2e" %(inserts[i]/blen[i])))
+ − 146 except:
+ − 147 irate.append('0')
+ − 148 try:
+ − 149 drate.append(str("%.2e" %(deletes[i]/blen[i])))
+ − 150 except:
+ − 151 drate.append('0')
+ − 152 print >>fout, "%s\t%s\t%s" %(prev_bnum, '\t'.join(irate) , '\t'.join(drate))
+ − 153 sys.exit()
+ − 154
+ − 155
+ − 156 inf = open(infile, 'r')
+ − 157 start_met = False
+ − 158 end_met = False
+ − 159 sp_file = tempfile.NamedTemporaryFile()
+ − 160 for n, line in enumerate(inf):
+ − 161 line = line.rstrip('\r\n')
+ − 162 elems = line.split('\t')
+ − 163 try:
+ − 164 assert int(elems[0])
+ − 165 assert len(elems) == 18
+ − 166 if dbkey_i not in elems[1]:
+ − 167 if not(start_met):
+ − 168 continue
+ − 169 else:
+ − 170 sp_end = n
+ − 171 break
+ − 172 else:
+ − 173 print >>sp_file, line
+ − 174 if not(start_met):
+ − 175 start_met = True
+ − 176 sp_start = n
+ − 177 except:
+ − 178 continue
+ − 179
+ − 180 try:
+ − 181 assert sp_end
+ − 182 except:
+ − 183 sp_end = n+1
+ − 184
+ − 185 sp_file.seek(0)
+ − 186 win = NiceReaderWrapper( fileinput.FileInput( int_file ),
+ − 187 chrom_col=chr_col_i,
+ − 188 start_col=start_col_i,
+ − 189 end_col=end_col_i,
+ − 190 strand_col=strand_col_i,
+ − 191 fix_strand=True)
+ − 192
+ − 193 indel = NiceReaderWrapper( fileinput.FileInput( sp_file.name ),
+ − 194 chrom_col=1,
+ − 195 start_col=sort_col,
+ − 196 end_col=sort_col+1,
+ − 197 strand_col=-1,
+ − 198 fix_strand=True)
+ − 199
+ − 200 indelTree = quicksect.IntervalTree()
+ − 201 for item in indel:
+ − 202 if type( item ) is GenomicInterval:
+ − 203 indelTree.insert( item, indel.linenum, item.fields )
+ − 204 result=[]
+ − 205
+ − 206 global full, blk_len, blk_list
+ − 207 for interval in win:
+ − 208 if type( interval ) is Header:
+ − 209 pass
+ − 210 if type( interval ) is Comment:
+ − 211 pass
+ − 212 elif type( interval ) == GenomicInterval:
+ − 213 chrom = interval.chrom
+ − 214 start = int(interval.start)
+ − 215 end = int(interval.end)
+ − 216 if start > end:
+ − 217 warn( "Interval start after end!" )
+ − 218 ins_chr = "%s.%s_insert" %(dbkey_i,chrom)
+ − 219 del_chr = "%s.%s_delete" %(dbkey_i,chrom)
+ − 220 irate = 0
+ − 221 drate = 0
+ − 222 if ins_chr not in indelTree.chroms and del_chr not in indelTree.chroms:
+ − 223 pass
+ − 224 else:
+ − 225 if ins_chr in indelTree.chroms:
+ − 226 full = 0.0
+ − 227 blk_len = 0
+ − 228 blk_list = []
+ − 229 root = indelTree.chroms[ins_chr] #root node for the chrom insertion tree
+ − 230 counter(root, start, end, sort_col)
+ − 231 if blk_len:
+ − 232 irate = full/blk_len
+ − 233
+ − 234 if del_chr in indelTree.chroms:
+ − 235 full = 0.0
+ − 236 blk_len = 0
+ − 237 blk_list = []
+ − 238 root = indelTree.chroms[del_chr] #root node for the chrom insertion tree
+ − 239 counter(root, start, end, sort_col)
+ − 240 if blk_len:
+ − 241 drate = full/blk_len
+ − 242
+ − 243 interval.fields.append(str("%.2e" %irate))
+ − 244 interval.fields.append(str("%.2e" %drate))
+ − 245 print >>fout, "\t".join(interval.fields)
+ − 246 fout.flush()
+ − 247
+ − 248 if __name__ == "__main__":
+ − 249 main()