0
|
1 #!/usr/bin/env python
|
|
2 #By: Guruprasad Ananda
|
|
3 """
|
|
4 Fetch closest up/downstream interval from features corresponding to every interval in primary
|
|
5
|
|
6 usage: %prog primary_file features_file out_file direction
|
|
7 -1, --cols1=N,N,N,N: Columns for start, end, strand in first file
|
|
8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
|
|
9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
|
|
10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
|
|
11 """
|
|
12
|
|
13 import sys, traceback, fileinput
|
|
14 from warnings import warn
|
|
15 from bx.cookbook import doc_optparse
|
|
16 from galaxy.tools.util.galaxyops import *
|
|
17 from bx.intervals.io import *
|
|
18 from bx.intervals.operations import quicksect
|
|
19 from utils.gff_util import *
|
|
20
|
|
21 assert sys.version_info[:2] >= ( 2, 4 )
|
|
22
|
|
23 def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
|
|
24 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
|
|
25 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
|
|
26 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
|
|
27 if direction == 1:
|
|
28 if node.maxend <= threshold_up:
|
|
29 if node.end == node.maxend:
|
|
30 report_func_up(node)
|
|
31 elif node.right and node.left:
|
|
32 if node.right.maxend == node.maxend:
|
|
33 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
34 elif node.left.maxend == node.maxend:
|
|
35 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
36 elif node.right and node.right.maxend == node.maxend:
|
|
37 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
38 elif node.left and node.left.maxend == node.maxend:
|
|
39 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
40 elif node.minend <= threshold_up:
|
|
41 if node.end <= threshold_up:
|
|
42 report_func_up(node)
|
|
43 if node.left and node.right:
|
|
44 if node.right.minend <= threshold_up:
|
|
45 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
46 if node.left.minend <= threshold_up:
|
|
47 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
48 elif node.left:
|
|
49 if node.left.minend <= threshold_up:
|
|
50 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
51 elif node.right:
|
|
52 if node.right.minend <= threshold_up:
|
|
53 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
54 elif direction == 0:
|
|
55 if node.start > threshold_down:
|
|
56 report_func_down(node)
|
|
57 if node.left:
|
|
58 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
59 else:
|
|
60 if node.right:
|
|
61 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
|
|
62
|
|
63 def proximal_region_finder(readers, region, comments=True):
|
|
64 """
|
|
65 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ].
|
|
66 Intervals are GenomicInterval objects.
|
|
67 """
|
|
68 primary = readers[0]
|
|
69 features = readers[1]
|
|
70 either = False
|
|
71 if region == 'Upstream':
|
|
72 up, down = True, False
|
|
73 elif region == 'Downstream':
|
|
74 up, down = False, True
|
|
75 else:
|
|
76 up, down = True, True
|
|
77 if region == 'Either':
|
|
78 either = True
|
|
79
|
|
80 # Read features into memory:
|
|
81 rightTree = quicksect.IntervalTree()
|
|
82 for item in features:
|
|
83 if type( item ) is GenomicInterval:
|
|
84 rightTree.insert( item, features.linenum, item )
|
|
85
|
|
86 for interval in primary:
|
|
87 if type( interval ) is Header:
|
|
88 yield interval
|
|
89 if type( interval ) is Comment and comments:
|
|
90 yield interval
|
|
91 elif type( interval ) == GenomicInterval:
|
|
92 chrom = interval.chrom
|
|
93 start = int(interval.start)
|
|
94 end = int(interval.end)
|
|
95 strand = interval.strand
|
|
96 if chrom not in rightTree.chroms:
|
|
97 continue
|
|
98 else:
|
|
99 root = rightTree.chroms[chrom] #root node for the chrom tree
|
|
100 result_up = []
|
|
101 result_down = []
|
|
102 if (strand == '+' and up) or (strand == '-' and down):
|
|
103 #upstream +ve strand and downstream -ve strand cases
|
|
104 get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
|
|
105
|
|
106 if (strand == '+' and down) or (strand == '-' and up):
|
|
107 #downstream +ve strand and upstream -ve strand case
|
|
108 get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node ))
|
|
109
|
|
110 if result_up:
|
|
111 if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval.
|
|
112 ends = []
|
|
113 for n in result_up:
|
|
114 ends.append(n.end)
|
|
115 res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
|
|
116 else:
|
|
117 res_ind = 0
|
|
118 if not(either):
|
|
119 yield [ interval, result_up[res_ind].other ]
|
|
120
|
|
121 if result_down:
|
|
122 if not(either):
|
|
123 #The last element of result_down will be the closest element to the given interval
|
|
124 yield [ interval, result_down[-1].other ]
|
|
125
|
|
126 if either and (result_up or result_down):
|
|
127 iter_val = []
|
|
128 if result_up and result_down:
|
|
129 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
|
|
130 iter_val = [ interval, result_up[res_ind].other ]
|
|
131 else:
|
|
132 #The last element of result_down will be the closest element to the given interval
|
|
133 iter_val = [ interval, result_down[-1].other ]
|
|
134 elif result_up:
|
|
135 iter_val = [ interval, result_up[res_ind].other ]
|
|
136 elif result_down:
|
|
137 #The last element of result_down will be the closest element to the given interval
|
|
138 iter_val = [ interval, result_down[-1].other ]
|
|
139 yield iter_val
|
|
140
|
|
141 def main():
|
|
142 options, args = doc_optparse.parse( __doc__ )
|
|
143 try:
|
|
144 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
|
|
145 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
|
|
146 in1_gff_format = bool( options.gff1 )
|
|
147 in2_gff_format = bool( options.gff2 )
|
|
148 in_fname, in2_fname, out_fname, direction = args
|
|
149 except:
|
|
150 doc_optparse.exception()
|
|
151
|
|
152 # Set readers to handle either GFF or default format.
|
|
153 if in1_gff_format:
|
|
154 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
|
|
155 else:
|
|
156 in1_reader_wrapper = NiceReaderWrapper
|
|
157 if in2_gff_format:
|
|
158 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
|
|
159 else:
|
|
160 in2_reader_wrapper = NiceReaderWrapper
|
|
161
|
|
162 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
|
|
163 chrom_col=chr_col_1,
|
|
164 start_col=start_col_1,
|
|
165 end_col=end_col_1,
|
|
166 strand_col=strand_col_1,
|
|
167 fix_strand=True )
|
|
168 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
|
|
169 chrom_col=chr_col_2,
|
|
170 start_col=start_col_2,
|
|
171 end_col=end_col_2,
|
|
172 strand_col=strand_col_2,
|
|
173 fix_strand=True )
|
|
174
|
|
175 # Find flanking features.
|
|
176 out_file = open( out_fname, "w" )
|
|
177 try:
|
|
178 for result in proximal_region_finder([g1,g2], direction):
|
|
179 if type( result ) is list:
|
|
180 line, closest_feature = result
|
|
181 # Need to join outputs differently depending on file types.
|
|
182 if in1_gff_format:
|
|
183 # Output is GFF with added attribute 'closest feature.'
|
|
184
|
|
185 # Invervals are in BED coordinates; need to convert to GFF.
|
|
186 line = convert_bed_coords_to_gff( line )
|
|
187 closest_feature = convert_bed_coords_to_gff( closest_feature )
|
|
188
|
|
189 # Replace double quotes with single quotes in closest feature's attributes.
|
|
190 out_file.write( "%s closest_feature \"%s\" \n" %
|
|
191 ( "\t".join( line.fields ), \
|
|
192 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
|
|
193 ) )
|
|
194 else:
|
|
195 # Output is BED + closest feature fields.
|
|
196 output_line_fields = []
|
|
197 output_line_fields.extend( line.fields )
|
|
198 output_line_fields.extend( closest_feature.fields )
|
|
199 out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
|
|
200 else:
|
|
201 out_file.write( "%s\n" % result )
|
|
202 except ParseError, exc:
|
|
203 fail( "Invalid file format: %s" % str( exc ) )
|
|
204
|
|
205 print "Direction: %s" %(direction)
|
|
206 if g1.skipped > 0:
|
|
207 print skipped( g1, filedesc=" of 1st dataset" )
|
|
208 if g2.skipped > 0:
|
|
209 print skipped( g2, filedesc=" of 2nd dataset" )
|
|
210
|
|
211 if __name__ == "__main__":
|
|
212 main()
|