comparison tools/new_operations/flanking_features.py @ 0:9071e359b9a3

Uploaded
author xuebing
date Fri, 09 Mar 2012 19:37:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:9071e359b9a3
1 #!/usr/bin/env python
2 #By: Guruprasad Ananda
3 """
4 Fetch closest up/downstream interval from features corresponding to every interval in primary
5
6 usage: %prog primary_file features_file out_file direction
7 -1, --cols1=N,N,N,N: Columns for start, end, strand in first file
8 -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
9 -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
10 -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
11 """
12 from galaxy import eggs
13 import pkg_resources
14 pkg_resources.require( "bx-python" )
15 import sys, traceback, fileinput
16 from warnings import warn
17 from bx.cookbook import doc_optparse
18 from galaxy.tools.util.galaxyops import *
19 from bx.intervals.io import *
20 from bx.intervals.operations import quicksect
21 from galaxy.datatypes.util.gff_util import *
22
23 assert sys.version_info[:2] >= ( 2, 4 )
24
25 def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
26 #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
27 #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
28 #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
29 if direction == 1:
30 if node.maxend <= threshold_up:
31 if node.end == node.maxend:
32 report_func_up(node)
33 elif node.right and node.left:
34 if node.right.maxend == node.maxend:
35 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
36 elif node.left.maxend == node.maxend:
37 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
38 elif node.right and node.right.maxend == node.maxend:
39 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
40 elif node.left and node.left.maxend == node.maxend:
41 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
42 elif node.minend <= threshold_up:
43 if node.end <= threshold_up:
44 report_func_up(node)
45 if node.left and node.right:
46 if node.right.minend <= threshold_up:
47 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
48 if node.left.minend <= threshold_up:
49 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
50 elif node.left:
51 if node.left.minend <= threshold_up:
52 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
53 elif node.right:
54 if node.right.minend <= threshold_up:
55 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
56 elif direction == 0:
57 if node.start > threshold_down:
58 report_func_down(node)
59 if node.left:
60 get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
61 else:
62 if node.right:
63 get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
64
65 def proximal_region_finder(readers, region, comments=True):
66 """
67 Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ].
68 Intervals are GenomicInterval objects.
69 """
70 primary = readers[0]
71 features = readers[1]
72 either = False
73 if region == 'Upstream':
74 up, down = True, False
75 elif region == 'Downstream':
76 up, down = False, True
77 else:
78 up, down = True, True
79 if region == 'Either':
80 either = True
81
82 # Read features into memory:
83 rightTree = quicksect.IntervalTree()
84 for item in features:
85 if type( item ) is GenomicInterval:
86 rightTree.insert( item, features.linenum, item )
87
88 for interval in primary:
89 if type( interval ) is Header:
90 yield interval
91 if type( interval ) is Comment and comments:
92 yield interval
93 elif type( interval ) == GenomicInterval:
94 chrom = interval.chrom
95 start = int(interval.start)
96 end = int(interval.end)
97 strand = interval.strand
98 if chrom not in rightTree.chroms:
99 continue
100 else:
101 root = rightTree.chroms[chrom] #root node for the chrom tree
102 result_up = []
103 result_down = []
104 if (strand == '+' and up) or (strand == '-' and down):
105 #upstream +ve strand and downstream -ve strand cases
106 get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
107
108 if (strand == '+' and down) or (strand == '-' and up):
109 #downstream +ve strand and upstream -ve strand case
110 get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node ))
111
112 if result_up:
113 if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval.
114 ends = []
115 for n in result_up:
116 ends.append(n.end)
117 res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
118 else:
119 res_ind = 0
120 if not(either):
121 yield [ interval, result_up[res_ind].other ]
122
123 if result_down:
124 if not(either):
125 #The last element of result_down will be the closest element to the given interval
126 yield [ interval, result_down[-1].other ]
127
128 if either and (result_up or result_down):
129 iter_val = []
130 if result_up and result_down:
131 if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
132 iter_val = [ interval, result_up[res_ind].other ]
133 else:
134 #The last element of result_down will be the closest element to the given interval
135 iter_val = [ interval, result_down[-1].other ]
136 elif result_up:
137 iter_val = [ interval, result_up[res_ind].other ]
138 elif result_down:
139 #The last element of result_down will be the closest element to the given interval
140 iter_val = [ interval, result_down[-1].other ]
141 yield iter_val
142
143 def main():
144 options, args = doc_optparse.parse( __doc__ )
145 try:
146 chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
147 chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
148 in1_gff_format = bool( options.gff1 )
149 in2_gff_format = bool( options.gff2 )
150 in_fname, in2_fname, out_fname, direction = args
151 except:
152 doc_optparse.exception()
153
154 # Set readers to handle either GFF or default format.
155 if in1_gff_format:
156 in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
157 else:
158 in1_reader_wrapper = NiceReaderWrapper
159 if in2_gff_format:
160 in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
161 else:
162 in2_reader_wrapper = NiceReaderWrapper
163
164 g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
165 chrom_col=chr_col_1,
166 start_col=start_col_1,
167 end_col=end_col_1,
168 strand_col=strand_col_1,
169 fix_strand=True )
170 g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
171 chrom_col=chr_col_2,
172 start_col=start_col_2,
173 end_col=end_col_2,
174 strand_col=strand_col_2,
175 fix_strand=True )
176
177 # Find flanking features.
178 out_file = open( out_fname, "w" )
179 try:
180 for result in proximal_region_finder([g1,g2], direction):
181 if type( result ) is list:
182 line, closest_feature = result
183 # Need to join outputs differently depending on file types.
184 if in1_gff_format:
185 # Output is GFF with added attribute 'closest feature.'
186
187 # Invervals are in BED coordinates; need to convert to GFF.
188 line = convert_bed_coords_to_gff( line )
189 closest_feature = convert_bed_coords_to_gff( closest_feature )
190
191 # Replace double quotes with single quotes in closest feature's attributes.
192 out_file.write( "%s closest_feature \"%s\" \n" %
193 ( "\t".join( line.fields ), \
194 "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
195 ) )
196 else:
197 # Output is BED + closest feature fields.
198 output_line_fields = []
199 output_line_fields.extend( line.fields )
200 output_line_fields.extend( closest_feature.fields )
201 out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
202 else:
203 out_file.write( "%s\n" % result )
204 except ParseError, exc:
205 fail( "Invalid file format: %s" % str( exc ) )
206
207 print "Direction: %s" %(direction)
208 if g1.skipped > 0:
209 print skipped( g1, filedesc=" of 1st dataset" )
210 if g2.skipped > 0:
211 print skipped( g2, filedesc=" of 2nd dataset" )
212
213 if __name__ == "__main__":
214 main()