Mercurial > repos > devteam > flanking_features
comparison utils/gff_util.py @ 1:8307665c4b6c draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
| author | devteam | 
|---|---|
| date | Wed, 11 Nov 2015 12:48:18 -0500 | 
| parents | 90100b587723 | 
| children | a09d13b108fd | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:90100b587723 | 1:8307665c4b6c | 
|---|---|
| 1 """ | 1 """ | 
| 2 Provides utilities for working with GFF files. | 2 Provides utilities for working with GFF files. | 
| 3 """ | 3 """ | 
| 4 | 4 | 
| 5 import copy | 5 import copy | 
| 6 from bx.intervals.io import * | 6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper | 
| 7 from bx.tabular.io import Header, Comment | 7 from bx.tabular.io import Header, Comment, ParseError | 
| 8 from utils.odict import odict | 8 from utils.odict import odict | 
| 9 | |
| 9 | 10 | 
| 10 class GFFInterval( GenomicInterval ): | 11 class GFFInterval( GenomicInterval ): | 
| 11 """ | 12 """ | 
| 12 A GFF interval, including attributes. If file is strictly a GFF file, | 13 A GFF interval, including attributes. If file is strictly a GFF file, | 
| 13 only attribute is 'group.' | 14 only attribute is 'group.' | 
| 14 """ | 15 """ | 
| 15 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ | 16 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, | 
| 16 strand_col=6, score_col=5, default_strand='.', fix_strand=False ): | 17 strand_col=6, score_col=5, default_strand='.', fix_strand=False ): | 
| 17 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, | 18 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, | 
| 18 # temporarily set strand and then unset after initing GenomicInterval. | 19 # temporarily set strand and then unset after initing GenomicInterval. | 
| 19 unknown_strand = False | 20 unknown_strand = False | 
| 20 if not fix_strand and fields[ strand_col ] == '.': | 21 if not fix_strand and fields[ strand_col ] == '.': | 
| 21 unknown_strand = True | 22 unknown_strand = True | 
| 22 fields[ strand_col ] = '+' | 23 fields[ strand_col ] = '+' | 
| 23 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ | 24 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, | 
| 24 default_strand, fix_strand=fix_strand ) | 25 default_strand, fix_strand=fix_strand ) | 
| 25 if unknown_strand: | 26 if unknown_strand: | 
| 26 self.strand = '.' | 27 self.strand = '.' | 
| 27 self.fields[ strand_col ] = '.' | 28 self.fields[ strand_col ] = '.' | 
| 28 | 29 | 
| 41 | 42 | 
| 42 def copy( self ): | 43 def copy( self ): | 
| 43 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, | 44 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, | 
| 44 self.end_col, self.strand_col, self.score_col, self.strand) | 45 self.end_col, self.strand_col, self.score_col, self.strand) | 
| 45 | 46 | 
| 47 | |
| 46 class GFFFeature( GFFInterval ): | 48 class GFFFeature( GFFInterval ): | 
| 47 """ | 49 """ | 
| 48 A GFF feature, which can include multiple intervals. | 50 A GFF feature, which can include multiple intervals. | 
| 49 """ | 51 """ | 
| 50 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ | 52 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, | 
| 51 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \ | 53 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], | 
| 52 raw_size=0 ): | 54 raw_size=0 ): | 
| 53 # Use copy so that first interval and feature do not share fields. | 55 # Use copy so that first interval and feature do not share fields. | 
| 54 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \ | 56 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, | 
| 55 start_col, end_col, strand_col, score_col, default_strand, \ | 57 start_col, end_col, strand_col, score_col, default_strand, | 
| 56 fix_strand=fix_strand ) | 58 fix_strand=fix_strand ) | 
| 57 self.intervals = intervals | 59 self.intervals = intervals | 
| 58 self.raw_size = raw_size | 60 self.raw_size = raw_size | 
| 59 # Use intervals to set feature attributes. | 61 # Use intervals to set feature attributes. | 
| 60 for interval in self.intervals: | 62 for interval in self.intervals: | 
| 61 # Error checking. NOTE: intervals need not share the same strand. | 63 # Error checking. NOTE: intervals need not share the same strand. | 
| 62 if interval.chrom != self.chrom: | 64 if interval.chrom != self.chrom: | 
| 63 raise ValueError( "interval chrom does not match self chrom: %s != %s" % \ | 65 raise ValueError( "interval chrom does not match self chrom: %s != %s" % | 
| 64 ( interval.chrom, self.chrom ) ) | 66 ( interval.chrom, self.chrom ) ) | 
| 65 # Set start, end of interval. | 67 # Set start, end of interval. | 
| 66 if interval.start < self.start: | 68 if interval.start < self.start: | 
| 67 self.start = interval.start | 69 self.start = interval.start | 
| 68 if interval.end > self.end: | 70 if interval.end > self.end: | 
| 70 | 72 | 
| 71 def name( self ): | 73 def name( self ): | 
| 72 """ Returns feature's name. """ | 74 """ Returns feature's name. """ | 
| 73 name = None | 75 name = None | 
| 74 # Preference for name: GTF, GFF3, GFF. | 76 # Preference for name: GTF, GFF3, GFF. | 
| 75 for attr_name in [ | 77 for attr_name in ['gene_id', 'transcript_id', # GTF | 
| 76 # GTF: | 78 'ID', 'id', # GFF3 | 
| 77 'gene_id', 'transcript_id', | 79 'group' ]: # GFF (TODO) | 
| 78 # GFF3: | |
| 79 'ID', 'id', | |
| 80 # GFF (TODO): | |
| 81 'group' ]: | |
| 82 name = self.attributes.get( attr_name, None ) | 80 name = self.attributes.get( attr_name, None ) | 
| 83 if name is not None: | 81 if name is not None: | 
| 84 break | 82 break | 
| 85 return name | 83 return name | 
| 86 | 84 | 
| 105 """ | 103 """ | 
| 106 | 104 | 
| 107 def parse_row( self, line ): | 105 def parse_row( self, line ): | 
| 108 # HACK: this should return a GFF interval, but bx-python operations | 106 # HACK: this should return a GFF interval, but bx-python operations | 
| 109 # require GenomicInterval objects and subclasses will not work. | 107 # require GenomicInterval objects and subclasses will not work. | 
| 110 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ | 108 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, | 
| 111 self.end_col, self.strand_col, self.default_strand, \ | 109 self.end_col, self.strand_col, self.default_strand, | 
| 112 fix_strand=self.fix_strand ) | 110 fix_strand=self.fix_strand ) | 
| 113 interval = convert_gff_coords_to_bed( interval ) | 111 interval = convert_gff_coords_to_bed( interval ) | 
| 114 return interval | 112 return interval | 
| 113 | |
| 115 | 114 | 
| 116 class GFFReaderWrapper( NiceReaderWrapper ): | 115 class GFFReaderWrapper( NiceReaderWrapper ): | 
| 117 """ | 116 """ | 
| 118 Reader wrapper for GFF files. | 117 Reader wrapper for GFF files. | 
| 119 | 118 | 
| 125 are 1-based, closed--to the 'traditional'/BED interval format--0 based, | 124 are 1-based, closed--to the 'traditional'/BED interval format--0 based, | 
| 126 half-open. This is useful when using GFF files as inputs to tools that | 125 half-open. This is useful when using GFF files as inputs to tools that | 
| 127 expect traditional interval format. | 126 expect traditional interval format. | 
| 128 """ | 127 """ | 
| 129 | 128 | 
| 130 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \ | 129 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, | 
| 131 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): | 130 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): | 
| 132 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \ | 131 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, | 
| 133 strand_col=strand_col, fix_strand=fix_strand, **kwargs ) | 132 strand_col=strand_col, fix_strand=fix_strand, **kwargs ) | 
| 134 self.feature_col = feature_col | 133 self.feature_col = feature_col | 
| 135 self.score_col = score_col | 134 self.score_col = score_col | 
| 136 self.convert_to_bed_coord = convert_to_bed_coord | 135 self.convert_to_bed_coord = convert_to_bed_coord | 
| 137 self.last_line = None | 136 self.last_line = None | 
| 138 self.cur_offset = 0 | 137 self.cur_offset = 0 | 
| 139 self.seed_interval = None | 138 self.seed_interval = None | 
| 140 self.seed_interval_line_len = 0 | 139 self.seed_interval_line_len = 0 | 
| 141 | 140 | 
| 142 def parse_row( self, line ): | 141 def parse_row( self, line ): | 
| 143 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \ | 142 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, | 
| 144 self.start_col, self.end_col, self.strand_col, self.score_col, \ | 143 self.start_col, self.end_col, self.strand_col, self.score_col, | 
| 145 self.default_strand, fix_strand=self.fix_strand ) | 144 self.default_strand, fix_strand=self.fix_strand ) | 
| 146 return interval | 145 return interval | 
| 147 | 146 | 
| 148 def next( self ): | 147 def next( self ): | 
| 149 """ Returns next GFFFeature. """ | 148 """ Returns next GFFFeature. """ | 
| 153 # | 152 # | 
| 154 | 153 | 
| 155 def handle_parse_error( parse_error ): | 154 def handle_parse_error( parse_error ): | 
| 156 """ Actions to take when ParseError found. """ | 155 """ Actions to take when ParseError found. """ | 
| 157 if self.outstream: | 156 if self.outstream: | 
| 158 if self.print_delegate and hasattr(self.print_delegate,"__call__"): | 157 if self.print_delegate and hasattr(self.print_delegate, "__call__"): | 
| 159 self.print_delegate( self.outstream, e, self ) | 158 self.print_delegate( self.outstream, e, self ) | 
| 160 self.skipped += 1 | 159 self.skipped += 1 | 
| 161 # no reason to stuff an entire bad file into memmory | 160 # no reason to stuff an entire bad file into memmory | 
| 162 if self.skipped < 10: | 161 if self.skipped < 10: | 
| 163 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) | 162 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) | 
| 164 | 163 | 
| 165 # For debugging, uncomment this to propogate parsing exceptions up. | 164 # For debugging, uncomment this to propogate parsing exceptions up. | 
| 166 # I.e. the underlying reason for an unexpected StopIteration exception | 165 # I.e. the underlying reason for an unexpected StopIteration exception | 
| 167 # can be found by uncommenting this. | 166 # can be found by uncommenting this. | 
| 168 # raise e | 167 # raise e | 
| 191 self.seed_interval = None | 190 self.seed_interval = None | 
| 192 self.seed_interval_line_len = 0 | 191 self.seed_interval_line_len = 0 | 
| 193 return return_val | 192 return return_val | 
| 194 | 193 | 
| 195 # Initialize feature identifier from seed. | 194 # Initialize feature identifier from seed. | 
| 196 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF | 195 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF | 
| 197 # For GFF3 | 196 # For GFF3 | 
| 198 feature_id = self.seed_interval.attributes.get( 'ID', None ) | 197 feature_id = self.seed_interval.attributes.get( 'ID', None ) | 
| 199 feature_parent_id = self.seed_interval.attributes.get( 'Parent', None ) | |
| 200 # For GTF. | 198 # For GTF. | 
| 201 feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) | |
| 202 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) | 199 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) | 
| 203 | 200 | 
| 204 # Read all intervals associated with seed. | 201 # Read all intervals associated with seed. | 
| 205 feature_intervals = [] | 202 feature_intervals = [] | 
| 206 feature_intervals.append( self.seed_interval ) | 203 feature_intervals.append( self.seed_interval ) | 
| 254 # Last interval read is the seed for the next interval. | 251 # Last interval read is the seed for the next interval. | 
| 255 self.seed_interval = interval | 252 self.seed_interval = interval | 
| 256 self.seed_interval_line_len = len( self.current_line ) | 253 self.seed_interval_line_len = len( self.current_line ) | 
| 257 | 254 | 
| 258 # Return feature. | 255 # Return feature. | 
| 259 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \ | 256 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, | 
| 260 self.end_col, self.strand_col, self.score_col, \ | 257 self.end_col, self.strand_col, self.score_col, | 
| 261 self.default_strand, fix_strand=self.fix_strand, \ | 258 self.default_strand, fix_strand=self.fix_strand, | 
| 262 intervals=feature_intervals, raw_size=raw_size ) | 259 intervals=feature_intervals, raw_size=raw_size ) | 
| 263 | 260 | 
| 264 # Convert to BED coords? | 261 # Convert to BED coords? | 
| 265 if self.convert_to_bed_coord: | 262 if self.convert_to_bed_coord: | 
| 266 convert_gff_coords_to_bed( feature ) | 263 convert_gff_coords_to_bed( feature ) | 
| 267 | 264 | 
| 268 return feature | 265 return feature | 
| 266 | |
| 269 | 267 | 
| 270 def convert_bed_coords_to_gff( interval ): | 268 def convert_bed_coords_to_gff( interval ): | 
| 271 """ | 269 """ | 
| 272 Converts an interval object's coordinates from BED format to GFF format. | 270 Converts an interval object's coordinates from BED format to GFF format. | 
| 273 Accepted object types include GenomicInterval and list (where the first | 271 Accepted object types include GenomicInterval and list (where the first | 
| 277 if isinstance( interval, GenomicInterval ): | 275 if isinstance( interval, GenomicInterval ): | 
| 278 interval.start += 1 | 276 interval.start += 1 | 
| 279 if isinstance( interval, GFFFeature ): | 277 if isinstance( interval, GFFFeature ): | 
| 280 for subinterval in interval.intervals: | 278 for subinterval in interval.intervals: | 
| 281 convert_bed_coords_to_gff( subinterval ) | 279 convert_bed_coords_to_gff( subinterval ) | 
| 282 elif type ( interval ) is list: | 280 elif type( interval ) is list: | 
| 283 interval[ 0 ] += 1 | 281 interval[ 0 ] += 1 | 
| 284 return interval | 282 return interval | 
| 283 | |
| 285 | 284 | 
| 286 def convert_gff_coords_to_bed( interval ): | 285 def convert_gff_coords_to_bed( interval ): | 
| 287 """ | 286 """ | 
| 288 Converts an interval object's coordinates from GFF format to BED format. | 287 Converts an interval object's coordinates from GFF format to BED format. | 
| 289 Accepted object types include GFFFeature, GenomicInterval, and list (where | 288 Accepted object types include GFFFeature, GenomicInterval, and list (where | 
| 293 if isinstance( interval, GenomicInterval ): | 292 if isinstance( interval, GenomicInterval ): | 
| 294 interval.start -= 1 | 293 interval.start -= 1 | 
| 295 if isinstance( interval, GFFFeature ): | 294 if isinstance( interval, GFFFeature ): | 
| 296 for subinterval in interval.intervals: | 295 for subinterval in interval.intervals: | 
| 297 convert_gff_coords_to_bed( subinterval ) | 296 convert_gff_coords_to_bed( subinterval ) | 
| 298 elif type ( interval ) is list: | 297 elif type( interval ) is list: | 
| 299 interval[ 0 ] -= 1 | 298 interval[ 0 ] -= 1 | 
| 300 return interval | 299 return interval | 
| 300 | |
| 301 | 301 | 
| 302 def parse_gff_attributes( attr_str ): | 302 def parse_gff_attributes( attr_str ): | 
| 303 """ | 303 """ | 
| 304 Parses a GFF/GTF attribute string and returns a dictionary of name-value | 304 Parses a GFF/GTF attribute string and returns a dictionary of name-value | 
| 305 pairs. The general format for a GFF3 attributes string is | 305 pairs. The general format for a GFF3 attributes string is | 
| 338 # Could not split attributes string, so entire string must be | 338 # Could not split attributes string, so entire string must be | 
| 339 # 'group' attribute. This is the case for strictly GFF files. | 339 # 'group' attribute. This is the case for strictly GFF files. | 
| 340 attributes['group'] = attr_str | 340 attributes['group'] = attr_str | 
| 341 return attributes | 341 return attributes | 
| 342 | 342 | 
| 343 | |
| 343 def gff_attributes_to_str( attrs, gff_format ): | 344 def gff_attributes_to_str( attrs, gff_format ): | 
| 344 """ | 345 """ | 
| 345 Convert GFF attributes to string. Supported formats are GFF3, GTF. | 346 Convert GFF attributes to string. Supported formats are GFF3, GTF. | 
| 346 """ | 347 """ | 
| 347 if gff_format == 'GTF': | 348 if gff_format == 'GTF': | 
| 361 attrs_strs = [] | 362 attrs_strs = [] | 
| 362 for name, value in attrs.items(): | 363 for name, value in attrs.items(): | 
| 363 attrs_strs.append( format_string % ( name, value ) ) | 364 attrs_strs.append( format_string % ( name, value ) ) | 
| 364 return " ; ".join( attrs_strs ) | 365 return " ; ".join( attrs_strs ) | 
| 365 | 366 | 
| 367 | |
| 366 def read_unordered_gtf( iterator, strict=False ): | 368 def read_unordered_gtf( iterator, strict=False ): | 
| 367 """ | 369 """ | 
| 368 Returns GTF features found in an iterator. GTF lines need not be ordered | 370 Returns GTF features found in an iterator. GTF lines need not be ordered | 
| 369 or clustered for reader to work. Reader returns GFFFeature objects sorted | 371 or clustered for reader to work. Reader returns GFFFeature objects sorted | 
| 370 by transcript_id, chrom, and start position. | 372 by transcript_id, chrom, and start position. | 
| 380 # Use lenient parsing where chromosome + transcript_id is the key. This allows | 382 # Use lenient parsing where chromosome + transcript_id is the key. This allows | 
| 381 # transcripts with same ID on different chromosomes; this occurs in some popular | 383 # transcripts with same ID on different chromosomes; this occurs in some popular | 
| 382 # datasources, such as RefGenes in UCSC. | 384 # datasources, such as RefGenes in UCSC. | 
| 383 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) | 385 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) | 
| 384 | 386 | 
| 385 | |
| 386 # Aggregate intervals by transcript_id and collect comments. | 387 # Aggregate intervals by transcript_id and collect comments. | 
| 387 feature_intervals = odict() | 388 feature_intervals = odict() | 
| 388 comments = [] | 389 comments = [] | 
| 389 for count, line in enumerate( iterator ): | 390 for count, line in enumerate( iterator ): | 
| 390 if line.startswith( '#' ): | 391 if line.startswith( '#' ): | 
| 401 | 402 | 
| 402 # Create features. | 403 # Create features. | 
| 403 chroms_features = {} | 404 chroms_features = {} | 
| 404 for count, intervals in enumerate( feature_intervals.values() ): | 405 for count, intervals in enumerate( feature_intervals.values() ): | 
| 405 # Sort intervals by start position. | 406 # Sort intervals by start position. | 
| 406 intervals.sort( lambda a,b: cmp( a.start, b.start ) ) | 407 intervals.sort( lambda a, b: cmp( a.start, b.start ) ) | 
| 407 feature = GFFFeature( None, intervals=intervals ) | 408 feature = GFFFeature( None, intervals=intervals ) | 
| 408 if feature.chrom not in chroms_features: | 409 if feature.chrom not in chroms_features: | 
| 409 chroms_features[ feature.chrom ] = [] | 410 chroms_features[ feature.chrom ] = [] | 
| 410 chroms_features[ feature.chrom ].append( feature ) | 411 chroms_features[ feature.chrom ].append( feature ) | 
| 411 | 412 | 
| 412 # Sort features by chrom, start position. | 413 # Sort features by chrom, start position. | 
| 413 chroms_features_sorted = [] | 414 chroms_features_sorted = [] | 
| 414 for chrom_features in chroms_features.values(): | 415 for chrom_features in chroms_features.values(): | 
| 415 chroms_features_sorted.append( chrom_features ) | 416 chroms_features_sorted.append( chrom_features ) | 
| 416 chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) ) | 417 chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) ) | 
| 417 for features in chroms_features_sorted: | 418 for features in chroms_features_sorted: | 
| 418 features.sort( lambda a,b: cmp( a.start, b.start ) ) | 419 features.sort( lambda a, b: cmp( a.start, b.start ) ) | 
| 419 | 420 | 
| 420 # Yield comments first, then features. | 421 # Yield comments first, then features. | 
| 421 # FIXME: comments can appear anywhere in file, not just the beginning. | 422 # FIXME: comments can appear anywhere in file, not just the beginning. | 
| 422 # Ideally, then comments would be associated with features and output | 423 # Ideally, then comments would be associated with features and output | 
| 423 # just before feature/line. | 424 # just before feature/line. | 
| 425 yield comment | 426 yield comment | 
| 426 | 427 | 
| 427 for chrom_features in chroms_features_sorted: | 428 for chrom_features in chroms_features_sorted: | 
| 428 for feature in chrom_features: | 429 for feature in chrom_features: | 
| 429 yield feature | 430 yield feature | 
| 430 | 
