comparison utils/gff_util.py @ 4:7a2a604ae9c8 draft

planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/subtract commit 4e2052686dfe8003f867449e0affff96398b2a62
author devteam
date Thu, 11 Feb 2016 12:11:59 -0500
parents
children 0145969324c4
comparison
equal deleted inserted replaced
3:ecb36112b056 4:7a2a604ae9c8
1 """
2 Provides utilities for working with GFF files.
3 """
4
5 import copy
6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
7 from bx.tabular.io import Header, Comment, ParseError
8 from utils.odict import odict
9
10
11 class GFFInterval( GenomicInterval ):
12 """
13 A GFF interval, including attributes. If file is strictly a GFF file,
14 only attribute is 'group.'
15 """
16 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
17 strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
18 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
19 # temporarily set strand and then unset after initing GenomicInterval.
20 unknown_strand = False
21 if not fix_strand and fields[ strand_col ] == '.':
22 unknown_strand = True
23 fields[ strand_col ] = '+'
24 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
25 default_strand, fix_strand=fix_strand )
26 if unknown_strand:
27 self.strand = '.'
28 self.fields[ strand_col ] = '.'
29
30 # Handle feature, score column.
31 self.feature_col = feature_col
32 if self.feature_col >= self.nfields:
33 raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
34 self.feature = self.fields[ self.feature_col ]
35 self.score_col = score_col
36 if self.score_col >= self.nfields:
37 raise MissingFieldError( "No field for score_col (%d)" % score_col )
38 self.score = self.fields[ self.score_col ]
39
40 # GFF attributes.
41 self.attributes = parse_gff_attributes( fields[8] )
42
43 def copy( self ):
44 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
45 self.end_col, self.strand_col, self.score_col, self.strand)
46
47
48 class GFFFeature( GFFInterval ):
49 """
50 A GFF feature, which can include multiple intervals.
51 """
52 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
53 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
54 raw_size=0 ):
55 # Use copy so that first interval and feature do not share fields.
56 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
57 start_col, end_col, strand_col, score_col, default_strand,
58 fix_strand=fix_strand )
59 self.intervals = intervals
60 self.raw_size = raw_size
61 # Use intervals to set feature attributes.
62 for interval in self.intervals:
63 # Error checking. NOTE: intervals need not share the same strand.
64 if interval.chrom != self.chrom:
65 raise ValueError( "interval chrom does not match self chrom: %s != %s" %
66 ( interval.chrom, self.chrom ) )
67 # Set start, end of interval.
68 if interval.start < self.start:
69 self.start = interval.start
70 if interval.end > self.end:
71 self.end = interval.end
72
73 def name( self ):
74 """ Returns feature's name. """
75 name = None
76 # Preference for name: GTF, GFF3, GFF.
77 for attr_name in ['gene_id', 'transcript_id', # GTF
78 'ID', 'id', # GFF3
79 'group' ]: # GFF (TODO)
80 name = self.attributes.get( attr_name, None )
81 if name is not None:
82 break
83 return name
84
85 def copy( self ):
86 intervals_copy = []
87 for interval in self.intervals:
88 intervals_copy.append( interval.copy() )
89 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col,
90 self.score_col, self.strand, intervals=intervals_copy )
91
92 def lines( self ):
93 lines = []
94 for interval in self.intervals:
95 lines.append( '\t'.join( interval.fields ) )
96 return lines
97
98
99 class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
100 """
101 Reader wrapper that reads GFF intervals/lines and automatically converts
102 them to BED format.
103 """
104
105 def parse_row( self, line ):
106 # HACK: this should return a GFF interval, but bx-python operations
107 # require GenomicInterval objects and subclasses will not work.
108 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
109 self.end_col, self.strand_col, self.default_strand,
110 fix_strand=self.fix_strand )
111 interval = convert_gff_coords_to_bed( interval )
112 return interval
113
114
115 class GFFReaderWrapper( NiceReaderWrapper ):
116 """
117 Reader wrapper for GFF files.
118
119 Wrapper has two major functions:
120
121 1. group entries for GFF file (via group column), GFF3 (via id attribute),
122 or GTF (via gene_id/transcript id);
123 2. convert coordinates from GFF format--starting and ending coordinates
124 are 1-based, closed--to the 'traditional'/BED interval format--0 based,
125 half-open. This is useful when using GFF files as inputs to tools that
126 expect traditional interval format.
127 """
128
129 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
130 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
131 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
132 strand_col=strand_col, fix_strand=fix_strand, **kwargs )
133 self.feature_col = feature_col
134 self.score_col = score_col
135 self.convert_to_bed_coord = convert_to_bed_coord
136 self.last_line = None
137 self.cur_offset = 0
138 self.seed_interval = None
139 self.seed_interval_line_len = 0
140
141 def parse_row( self, line ):
142 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
143 self.start_col, self.end_col, self.strand_col, self.score_col,
144 self.default_strand, fix_strand=self.fix_strand )
145 return interval
146
147 def next( self ):
148 """ Returns next GFFFeature. """
149
150 #
151 # Helper function.
152 #
153
154 def handle_parse_error( parse_error ):
155 """ Actions to take when ParseError found. """
156 if self.outstream:
157 if self.print_delegate and hasattr(self.print_delegate, "__call__"):
158 self.print_delegate( self.outstream, e, self )
159 self.skipped += 1
160 # no reason to stuff an entire bad file into memmory
161 if self.skipped < 10:
162 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
163
164 # For debugging, uncomment this to propogate parsing exceptions up.
165 # I.e. the underlying reason for an unexpected StopIteration exception
166 # can be found by uncommenting this.
167 # raise e
168
169 #
170 # Get next GFFFeature
171 #
172 raw_size = self.seed_interval_line_len
173
174 # If there is no seed interval, set one. Also, if there are no more
175 # intervals to read, this is where iterator dies.
176 if not self.seed_interval:
177 while not self.seed_interval:
178 try:
179 self.seed_interval = GenomicIntervalReader.next( self )
180 except ParseError, e:
181 handle_parse_error( e )
182 # TODO: When no longer supporting python 2.4 use finally:
183 #finally:
184 raw_size += len( self.current_line )
185
186 # If header or comment, clear seed interval and return it with its size.
187 if isinstance( self.seed_interval, ( Header, Comment ) ):
188 return_val = self.seed_interval
189 return_val.raw_size = len( self.current_line )
190 self.seed_interval = None
191 self.seed_interval_line_len = 0
192 return return_val
193
194 # Initialize feature identifier from seed.
195 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
196 # For GFF3
197 feature_id = self.seed_interval.attributes.get( 'ID', None )
198 # For GTF.
199 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
200
201 # Read all intervals associated with seed.
202 feature_intervals = []
203 feature_intervals.append( self.seed_interval )
204 while True:
205 try:
206 interval = GenomicIntervalReader.next( self )
207 raw_size += len( self.current_line )
208 except StopIteration, e:
209 # No more intervals to read, but last feature needs to be
210 # returned.
211 interval = None
212 raw_size += len( self.current_line )
213 break
214 except ParseError, e:
215 handle_parse_error( e )
216 raw_size += len( self.current_line )
217 continue
218 # TODO: When no longer supporting python 2.4 use finally:
219 #finally:
220 #raw_size += len( self.current_line )
221
222 # Ignore comments.
223 if isinstance( interval, Comment ):
224 continue
225
226 # Determine if interval is part of feature.
227 part_of = False
228 group = interval.attributes.get( 'group', None )
229 # GFF test:
230 if group and feature_group == group:
231 part_of = True
232 # GFF3 test:
233 parent_id = interval.attributes.get( 'Parent', None )
234 cur_id = interval.attributes.get( 'ID', None )
235 if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ):
236 part_of = True
237 # GTF test:
238 transcript_id = interval.attributes.get( 'transcript_id', None )
239 if transcript_id and transcript_id == feature_transcript_id:
240 part_of = True
241
242 # If interval is not part of feature, clean up and break.
243 if not part_of:
244 # Adjust raw size because current line is not part of feature.
245 raw_size -= len( self.current_line )
246 break
247
248 # Interval associated with feature.
249 feature_intervals.append( interval )
250
251 # Last interval read is the seed for the next interval.
252 self.seed_interval = interval
253 self.seed_interval_line_len = len( self.current_line )
254
255 # Return feature.
256 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
257 self.end_col, self.strand_col, self.score_col,
258 self.default_strand, fix_strand=self.fix_strand,
259 intervals=feature_intervals, raw_size=raw_size )
260
261 # Convert to BED coords?
262 if self.convert_to_bed_coord:
263 convert_gff_coords_to_bed( feature )
264
265 return feature
266
267
268 def convert_bed_coords_to_gff( interval ):
269 """
270 Converts an interval object's coordinates from BED format to GFF format.
271 Accepted object types include GenomicInterval and list (where the first
272 element in the list is the interval's start, and the second element is
273 the interval's end).
274 """
275 if isinstance( interval, GenomicInterval ):
276 interval.start += 1
277 if isinstance( interval, GFFFeature ):
278 for subinterval in interval.intervals:
279 convert_bed_coords_to_gff( subinterval )
280 elif type( interval ) is list:
281 interval[ 0 ] += 1
282 return interval
283
284
285 def convert_gff_coords_to_bed( interval ):
286 """
287 Converts an interval object's coordinates from GFF format to BED format.
288 Accepted object types include GFFFeature, GenomicInterval, and list (where
289 the first element in the list is the interval's start, and the second
290 element is the interval's end).
291 """
292 if isinstance( interval, GenomicInterval ):
293 interval.start -= 1
294 if isinstance( interval, GFFFeature ):
295 for subinterval in interval.intervals:
296 convert_gff_coords_to_bed( subinterval )
297 elif type( interval ) is list:
298 interval[ 0 ] -= 1
299 return interval
300
301
302 def parse_gff_attributes( attr_str ):
303 """
304 Parses a GFF/GTF attribute string and returns a dictionary of name-value
305 pairs. The general format for a GFF3 attributes string is
306
307 name1=value1;name2=value2
308
309 The general format for a GTF attribute string is
310
311 name1 "value1" ; name2 "value2"
312
313 The general format for a GFF attribute string is a single string that
314 denotes the interval's group; in this case, method returns a dictionary
315 with a single key-value pair, and key name is 'group'
316 """
317 attributes_list = attr_str.split(";")
318 attributes = {}
319 for name_value_pair in attributes_list:
320 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
321 # attribute; next, try double quotes for GTF.
322 pair = name_value_pair.strip().split("=")
323 if len( pair ) == 1:
324 pair = name_value_pair.strip().split("\"")
325 if len( pair ) == 1:
326 # Could not split for some reason -- raise exception?
327 continue
328 if pair == '':
329 continue
330 name = pair[0].strip()
331 if name == '':
332 continue
333 # Need to strip double quote from values
334 value = pair[1].strip(" \"")
335 attributes[ name ] = value
336
337 if len( attributes ) == 0:
338 # Could not split attributes string, so entire string must be
339 # 'group' attribute. This is the case for strictly GFF files.
340 attributes['group'] = attr_str
341 return attributes
342
343
344 def gff_attributes_to_str( attrs, gff_format ):
345 """
346 Convert GFF attributes to string. Supported formats are GFF3, GTF.
347 """
348 if gff_format == 'GTF':
349 format_string = '%s "%s"'
350 # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
351 id_attr = None
352 if 'group' in attrs:
353 id_attr = 'group'
354 elif 'ID' in attrs:
355 id_attr = 'ID'
356 elif 'Parent' in attrs:
357 id_attr = 'Parent'
358 if id_attr:
359 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
360 elif gff_format == 'GFF3':
361 format_string = '%s=%s'
362 attrs_strs = []
363 for name, value in attrs.items():
364 attrs_strs.append( format_string % ( name, value ) )
365 return " ; ".join( attrs_strs )
366
367
368 def read_unordered_gtf( iterator, strict=False ):
369 """
370 Returns GTF features found in an iterator. GTF lines need not be ordered
371 or clustered for reader to work. Reader returns GFFFeature objects sorted
372 by transcript_id, chrom, and start position.
373 """
374
375 # -- Get function that generates line/feature key. --
376
377 get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ]
378 if strict:
379 # Strict GTF parsing uses transcript_id only to group lines into feature.
380 key_fn = get_transcript_id
381 else:
382 # Use lenient parsing where chromosome + transcript_id is the key. This allows
383 # transcripts with same ID on different chromosomes; this occurs in some popular
384 # datasources, such as RefGenes in UCSC.
385 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
386
387 # Aggregate intervals by transcript_id and collect comments.
388 feature_intervals = odict()
389 comments = []
390 for count, line in enumerate( iterator ):
391 if line.startswith( '#' ):
392 comments.append( Comment( line ) )
393 continue
394
395 line_key = key_fn( line.split('\t') )
396 if line_key in feature_intervals:
397 feature = feature_intervals[ line_key ]
398 else:
399 feature = []
400 feature_intervals[ line_key ] = feature
401 feature.append( GFFInterval( None, line.split( '\t' ) ) )
402
403 # Create features.
404 chroms_features = {}
405 for count, intervals in enumerate( feature_intervals.values() ):
406 # Sort intervals by start position.
407 intervals.sort( lambda a, b: cmp( a.start, b.start ) )
408 feature = GFFFeature( None, intervals=intervals )
409 if feature.chrom not in chroms_features:
410 chroms_features[ feature.chrom ] = []
411 chroms_features[ feature.chrom ].append( feature )
412
413 # Sort features by chrom, start position.
414 chroms_features_sorted = []
415 for chrom_features in chroms_features.values():
416 chroms_features_sorted.append( chrom_features )
417 chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
418 for features in chroms_features_sorted:
419 features.sort( lambda a, b: cmp( a.start, b.start ) )
420
421 # Yield comments first, then features.
422 # FIXME: comments can appear anywhere in file, not just the beginning.
423 # Ideally, then comments would be associated with features and output
424 # just before feature/line.
425 for comment in comments:
426 yield comment
427
428 for chrom_features in chroms_features_sorted:
429 for feature in chrom_features:
430 yield feature