0
|
1 """
|
|
2 Provides utilities for working with GFF files.
|
|
3 """
|
|
4
|
|
5 import copy
|
|
6 from bx.intervals.io import *
|
|
7 from bx.tabular.io import Header, Comment
|
|
8 from utils.odict import odict
|
|
9
|
|
10 class GFFInterval( GenomicInterval ):
|
|
11 """
|
|
12 A GFF interval, including attributes. If file is strictly a GFF file,
|
|
13 only attribute is 'group.'
|
|
14 """
|
|
15 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
|
|
16 strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
|
|
17 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
|
|
18 # temporarily set strand and then unset after initing GenomicInterval.
|
|
19 unknown_strand = False
|
|
20 if not fix_strand and fields[ strand_col ] == '.':
|
|
21 unknown_strand = True
|
|
22 fields[ strand_col ] = '+'
|
|
23 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
|
|
24 default_strand, fix_strand=fix_strand )
|
|
25 if unknown_strand:
|
|
26 self.strand = '.'
|
|
27 self.fields[ strand_col ] = '.'
|
|
28
|
|
29 # Handle feature, score column.
|
|
30 self.feature_col = feature_col
|
|
31 if self.feature_col >= self.nfields:
|
|
32 raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
|
|
33 self.feature = self.fields[ self.feature_col ]
|
|
34 self.score_col = score_col
|
|
35 if self.score_col >= self.nfields:
|
|
36 raise MissingFieldError( "No field for score_col (%d)" % score_col )
|
|
37 self.score = self.fields[ self.score_col ]
|
|
38
|
|
39 # GFF attributes.
|
|
40 self.attributes = parse_gff_attributes( fields[8] )
|
|
41
|
|
42 def copy( self ):
|
|
43 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
|
|
44 self.end_col, self.strand_col, self.score_col, self.strand)
|
|
45
|
|
46 class GFFFeature( GFFInterval ):
|
|
47 """
|
|
48 A GFF feature, which can include multiple intervals.
|
|
49 """
|
|
50 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \
|
|
51 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \
|
|
52 raw_size=0 ):
|
|
53 # Use copy so that first interval and feature do not share fields.
|
|
54 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \
|
|
55 start_col, end_col, strand_col, score_col, default_strand, \
|
|
56 fix_strand=fix_strand )
|
|
57 self.intervals = intervals
|
|
58 self.raw_size = raw_size
|
|
59 # Use intervals to set feature attributes.
|
|
60 for interval in self.intervals:
|
|
61 # Error checking. NOTE: intervals need not share the same strand.
|
|
62 if interval.chrom != self.chrom:
|
|
63 raise ValueError( "interval chrom does not match self chrom: %s != %s" % \
|
|
64 ( interval.chrom, self.chrom ) )
|
|
65 # Set start, end of interval.
|
|
66 if interval.start < self.start:
|
|
67 self.start = interval.start
|
|
68 if interval.end > self.end:
|
|
69 self.end = interval.end
|
|
70
|
|
71 def name( self ):
|
|
72 """ Returns feature's name. """
|
|
73 name = None
|
|
74 # Preference for name: GTF, GFF3, GFF.
|
|
75 for attr_name in [
|
|
76 # GTF:
|
|
77 'gene_id', 'transcript_id',
|
|
78 # GFF3:
|
|
79 'ID', 'id',
|
|
80 # GFF (TODO):
|
|
81 'group' ]:
|
|
82 name = self.attributes.get( attr_name, None )
|
|
83 if name is not None:
|
|
84 break
|
|
85 return name
|
|
86
|
|
87 def copy( self ):
|
|
88 intervals_copy = []
|
|
89 for interval in self.intervals:
|
|
90 intervals_copy.append( interval.copy() )
|
|
91 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col,
|
|
92 self.score_col, self.strand, intervals=intervals_copy )
|
|
93
|
|
94 def lines( self ):
|
|
95 lines = []
|
|
96 for interval in self.intervals:
|
|
97 lines.append( '\t'.join( interval.fields ) )
|
|
98 return lines
|
|
99
|
|
100
|
|
101 class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
|
|
102 """
|
|
103 Reader wrapper that reads GFF intervals/lines and automatically converts
|
|
104 them to BED format.
|
|
105 """
|
|
106
|
|
107 def parse_row( self, line ):
|
|
108 # HACK: this should return a GFF interval, but bx-python operations
|
|
109 # require GenomicInterval objects and subclasses will not work.
|
|
110 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
|
|
111 self.end_col, self.strand_col, self.default_strand, \
|
|
112 fix_strand=self.fix_strand )
|
|
113 interval = convert_gff_coords_to_bed( interval )
|
|
114 return interval
|
|
115
|
|
116 class GFFReaderWrapper( NiceReaderWrapper ):
|
|
117 """
|
|
118 Reader wrapper for GFF files.
|
|
119
|
|
120 Wrapper has two major functions:
|
|
121
|
|
122 1. group entries for GFF file (via group column), GFF3 (via id attribute),
|
|
123 or GTF (via gene_id/transcript id);
|
|
124 2. convert coordinates from GFF format--starting and ending coordinates
|
|
125 are 1-based, closed--to the 'traditional'/BED interval format--0 based,
|
|
126 half-open. This is useful when using GFF files as inputs to tools that
|
|
127 expect traditional interval format.
|
|
128 """
|
|
129
|
|
130 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
|
|
131 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
|
|
132 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
|
|
133 strand_col=strand_col, fix_strand=fix_strand, **kwargs )
|
|
134 self.feature_col = feature_col
|
|
135 self.score_col = score_col
|
|
136 self.convert_to_bed_coord = convert_to_bed_coord
|
|
137 self.last_line = None
|
|
138 self.cur_offset = 0
|
|
139 self.seed_interval = None
|
|
140 self.seed_interval_line_len = 0
|
|
141
|
|
142 def parse_row( self, line ):
|
|
143 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
|
|
144 self.start_col, self.end_col, self.strand_col, self.score_col, \
|
|
145 self.default_strand, fix_strand=self.fix_strand )
|
|
146 return interval
|
|
147
|
|
148 def next( self ):
|
|
149 """ Returns next GFFFeature. """
|
|
150
|
|
151 #
|
|
152 # Helper function.
|
|
153 #
|
|
154
|
|
155 def handle_parse_error( parse_error ):
|
|
156 """ Actions to take when ParseError found. """
|
|
157 if self.outstream:
|
|
158 if self.print_delegate and hasattr(self.print_delegate,"__call__"):
|
|
159 self.print_delegate( self.outstream, e, self )
|
|
160 self.skipped += 1
|
|
161 # no reason to stuff an entire bad file into memmory
|
|
162 if self.skipped < 10:
|
|
163 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
|
|
164
|
|
165 # For debugging, uncomment this to propogate parsing exceptions up.
|
|
166 # I.e. the underlying reason for an unexpected StopIteration exception
|
|
167 # can be found by uncommenting this.
|
|
168 # raise e
|
|
169
|
|
170 #
|
|
171 # Get next GFFFeature
|
|
172 #
|
|
173 raw_size = self.seed_interval_line_len
|
|
174
|
|
175 # If there is no seed interval, set one. Also, if there are no more
|
|
176 # intervals to read, this is where iterator dies.
|
|
177 if not self.seed_interval:
|
|
178 while not self.seed_interval:
|
|
179 try:
|
|
180 self.seed_interval = GenomicIntervalReader.next( self )
|
|
181 except ParseError, e:
|
|
182 handle_parse_error( e )
|
|
183 # TODO: When no longer supporting python 2.4 use finally:
|
|
184 #finally:
|
|
185 raw_size += len( self.current_line )
|
|
186
|
|
187 # If header or comment, clear seed interval and return it with its size.
|
|
188 if isinstance( self.seed_interval, ( Header, Comment ) ):
|
|
189 return_val = self.seed_interval
|
|
190 return_val.raw_size = len( self.current_line )
|
|
191 self.seed_interval = None
|
|
192 self.seed_interval_line_len = 0
|
|
193 return return_val
|
|
194
|
|
195 # Initialize feature identifier from seed.
|
|
196 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
|
|
197 # For GFF3
|
|
198 feature_id = self.seed_interval.attributes.get( 'ID', None )
|
|
199 feature_parent_id = self.seed_interval.attributes.get( 'Parent', None )
|
|
200 # For GTF.
|
|
201 feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None )
|
|
202 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
|
|
203
|
|
204 # Read all intervals associated with seed.
|
|
205 feature_intervals = []
|
|
206 feature_intervals.append( self.seed_interval )
|
|
207 while True:
|
|
208 try:
|
|
209 interval = GenomicIntervalReader.next( self )
|
|
210 raw_size += len( self.current_line )
|
|
211 except StopIteration, e:
|
|
212 # No more intervals to read, but last feature needs to be
|
|
213 # returned.
|
|
214 interval = None
|
|
215 raw_size += len( self.current_line )
|
|
216 break
|
|
217 except ParseError, e:
|
|
218 handle_parse_error( e )
|
|
219 raw_size += len( self.current_line )
|
|
220 continue
|
|
221 # TODO: When no longer supporting python 2.4 use finally:
|
|
222 #finally:
|
|
223 #raw_size += len( self.current_line )
|
|
224
|
|
225 # Ignore comments.
|
|
226 if isinstance( interval, Comment ):
|
|
227 continue
|
|
228
|
|
229 # Determine if interval is part of feature.
|
|
230 part_of = False
|
|
231 group = interval.attributes.get( 'group', None )
|
|
232 # GFF test:
|
|
233 if group and feature_group == group:
|
|
234 part_of = True
|
|
235 # GFF3 test:
|
|
236 parent_id = interval.attributes.get( 'Parent', None )
|
|
237 cur_id = interval.attributes.get( 'ID', None )
|
|
238 if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ):
|
|
239 part_of = True
|
|
240 # GTF test:
|
|
241 transcript_id = interval.attributes.get( 'transcript_id', None )
|
|
242 if transcript_id and transcript_id == feature_transcript_id:
|
|
243 part_of = True
|
|
244
|
|
245 # If interval is not part of feature, clean up and break.
|
|
246 if not part_of:
|
|
247 # Adjust raw size because current line is not part of feature.
|
|
248 raw_size -= len( self.current_line )
|
|
249 break
|
|
250
|
|
251 # Interval associated with feature.
|
|
252 feature_intervals.append( interval )
|
|
253
|
|
254 # Last interval read is the seed for the next interval.
|
|
255 self.seed_interval = interval
|
|
256 self.seed_interval_line_len = len( self.current_line )
|
|
257
|
|
258 # Return feature.
|
|
259 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
|
|
260 self.end_col, self.strand_col, self.score_col, \
|
|
261 self.default_strand, fix_strand=self.fix_strand, \
|
|
262 intervals=feature_intervals, raw_size=raw_size )
|
|
263
|
|
264 # Convert to BED coords?
|
|
265 if self.convert_to_bed_coord:
|
|
266 convert_gff_coords_to_bed( feature )
|
|
267
|
|
268 return feature
|
|
269
|
|
270 def convert_bed_coords_to_gff( interval ):
|
|
271 """
|
|
272 Converts an interval object's coordinates from BED format to GFF format.
|
|
273 Accepted object types include GenomicInterval and list (where the first
|
|
274 element in the list is the interval's start, and the second element is
|
|
275 the interval's end).
|
|
276 """
|
|
277 if isinstance( interval, GenomicInterval ):
|
|
278 interval.start += 1
|
|
279 if isinstance( interval, GFFFeature ):
|
|
280 for subinterval in interval.intervals:
|
|
281 convert_bed_coords_to_gff( subinterval )
|
|
282 elif type ( interval ) is list:
|
|
283 interval[ 0 ] += 1
|
|
284 return interval
|
|
285
|
|
286 def convert_gff_coords_to_bed( interval ):
|
|
287 """
|
|
288 Converts an interval object's coordinates from GFF format to BED format.
|
|
289 Accepted object types include GFFFeature, GenomicInterval, and list (where
|
|
290 the first element in the list is the interval's start, and the second
|
|
291 element is the interval's end).
|
|
292 """
|
|
293 if isinstance( interval, GenomicInterval ):
|
|
294 interval.start -= 1
|
|
295 if isinstance( interval, GFFFeature ):
|
|
296 for subinterval in interval.intervals:
|
|
297 convert_gff_coords_to_bed( subinterval )
|
|
298 elif type ( interval ) is list:
|
|
299 interval[ 0 ] -= 1
|
|
300 return interval
|
|
301
|
|
302 def parse_gff_attributes( attr_str ):
|
|
303 """
|
|
304 Parses a GFF/GTF attribute string and returns a dictionary of name-value
|
|
305 pairs. The general format for a GFF3 attributes string is
|
|
306
|
|
307 name1=value1;name2=value2
|
|
308
|
|
309 The general format for a GTF attribute string is
|
|
310
|
|
311 name1 "value1" ; name2 "value2"
|
|
312
|
|
313 The general format for a GFF attribute string is a single string that
|
|
314 denotes the interval's group; in this case, method returns a dictionary
|
|
315 with a single key-value pair, and key name is 'group'
|
|
316 """
|
|
317 attributes_list = attr_str.split(";")
|
|
318 attributes = {}
|
|
319 for name_value_pair in attributes_list:
|
|
320 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
|
|
321 # attribute; next, try double quotes for GTF.
|
|
322 pair = name_value_pair.strip().split("=")
|
|
323 if len( pair ) == 1:
|
|
324 pair = name_value_pair.strip().split("\"")
|
|
325 if len( pair ) == 1:
|
|
326 # Could not split for some reason -- raise exception?
|
|
327 continue
|
|
328 if pair == '':
|
|
329 continue
|
|
330 name = pair[0].strip()
|
|
331 if name == '':
|
|
332 continue
|
|
333 # Need to strip double quote from values
|
|
334 value = pair[1].strip(" \"")
|
|
335 attributes[ name ] = value
|
|
336
|
|
337 if len( attributes ) == 0:
|
|
338 # Could not split attributes string, so entire string must be
|
|
339 # 'group' attribute. This is the case for strictly GFF files.
|
|
340 attributes['group'] = attr_str
|
|
341 return attributes
|
|
342
|
|
343 def gff_attributes_to_str( attrs, gff_format ):
|
|
344 """
|
|
345 Convert GFF attributes to string. Supported formats are GFF3, GTF.
|
|
346 """
|
|
347 if gff_format == 'GTF':
|
|
348 format_string = '%s "%s"'
|
|
349 # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
|
|
350 id_attr = None
|
|
351 if 'group' in attrs:
|
|
352 id_attr = 'group'
|
|
353 elif 'ID' in attrs:
|
|
354 id_attr = 'ID'
|
|
355 elif 'Parent' in attrs:
|
|
356 id_attr = 'Parent'
|
|
357 if id_attr:
|
|
358 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
|
|
359 elif gff_format == 'GFF3':
|
|
360 format_string = '%s=%s'
|
|
361 attrs_strs = []
|
|
362 for name, value in attrs.items():
|
|
363 attrs_strs.append( format_string % ( name, value ) )
|
|
364 return " ; ".join( attrs_strs )
|
|
365
|
|
366 def read_unordered_gtf( iterator, strict=False ):
|
|
367 """
|
|
368 Returns GTF features found in an iterator. GTF lines need not be ordered
|
|
369 or clustered for reader to work. Reader returns GFFFeature objects sorted
|
|
370 by transcript_id, chrom, and start position.
|
|
371 """
|
|
372
|
|
373 # -- Get function that generates line/feature key. --
|
|
374
|
|
375 get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ]
|
|
376 if strict:
|
|
377 # Strict GTF parsing uses transcript_id only to group lines into feature.
|
|
378 key_fn = get_transcript_id
|
|
379 else:
|
|
380 # Use lenient parsing where chromosome + transcript_id is the key. This allows
|
|
381 # transcripts with same ID on different chromosomes; this occurs in some popular
|
|
382 # datasources, such as RefGenes in UCSC.
|
|
383 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
|
|
384
|
|
385
|
|
386 # Aggregate intervals by transcript_id and collect comments.
|
|
387 feature_intervals = odict()
|
|
388 comments = []
|
|
389 for count, line in enumerate( iterator ):
|
|
390 if line.startswith( '#' ):
|
|
391 comments.append( Comment( line ) )
|
|
392 continue
|
|
393
|
|
394 line_key = key_fn( line.split('\t') )
|
|
395 if line_key in feature_intervals:
|
|
396 feature = feature_intervals[ line_key ]
|
|
397 else:
|
|
398 feature = []
|
|
399 feature_intervals[ line_key ] = feature
|
|
400 feature.append( GFFInterval( None, line.split( '\t' ) ) )
|
|
401
|
|
402 # Create features.
|
|
403 chroms_features = {}
|
|
404 for count, intervals in enumerate( feature_intervals.values() ):
|
|
405 # Sort intervals by start position.
|
|
406 intervals.sort( lambda a,b: cmp( a.start, b.start ) )
|
|
407 feature = GFFFeature( None, intervals=intervals )
|
|
408 if feature.chrom not in chroms_features:
|
|
409 chroms_features[ feature.chrom ] = []
|
|
410 chroms_features[ feature.chrom ].append( feature )
|
|
411
|
|
412 # Sort features by chrom, start position.
|
|
413 chroms_features_sorted = []
|
|
414 for chrom_features in chroms_features.values():
|
|
415 chroms_features_sorted.append( chrom_features )
|
|
416 chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) )
|
|
417 for features in chroms_features_sorted:
|
|
418 features.sort( lambda a,b: cmp( a.start, b.start ) )
|
|
419
|
|
420 # Yield comments first, then features.
|
|
421 # FIXME: comments can appear anywhere in file, not just the beginning.
|
|
422 # Ideally, then comments would be associated with features and output
|
|
423 # just before feature/line.
|
|
424 for comment in comments:
|
|
425 yield comment
|
|
426
|
|
427 for chrom_features in chroms_features_sorted:
|
|
428 for feature in chrom_features:
|
|
429 yield feature
|
|
430
|