| 0 | 1 """ | 
|  | 2 Provides utilities for working with GFF files. | 
|  | 3 """ | 
|  | 4 | 
|  | 5 import copy | 
|  | 6 from bx.intervals.io import * | 
|  | 7 from bx.tabular.io import Header, Comment | 
|  | 8 from utils.odict import odict | 
|  | 9 | 
|  | 10 class GFFInterval( GenomicInterval ): | 
|  | 11     """ | 
|  | 12     A GFF interval, including attributes. If file is strictly a GFF file, | 
|  | 13     only attribute is 'group.' | 
|  | 14     """ | 
|  | 15     def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ | 
|  | 16                   strand_col=6, score_col=5, default_strand='.', fix_strand=False ): | 
|  | 17         # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, | 
|  | 18         # temporarily set strand and then unset after initing GenomicInterval. | 
|  | 19         unknown_strand = False | 
|  | 20         if not fix_strand and fields[ strand_col ] == '.': | 
|  | 21             unknown_strand = True | 
|  | 22             fields[ strand_col ] = '+' | 
|  | 23         GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ | 
|  | 24                                   default_strand, fix_strand=fix_strand ) | 
|  | 25         if unknown_strand: | 
|  | 26             self.strand = '.' | 
|  | 27             self.fields[ strand_col ] = '.' | 
|  | 28 | 
|  | 29         # Handle feature, score column. | 
|  | 30         self.feature_col = feature_col | 
|  | 31         if self.feature_col >= self.nfields: | 
|  | 32             raise MissingFieldError( "No field for feature_col (%d)" % feature_col ) | 
|  | 33         self.feature = self.fields[ self.feature_col ] | 
|  | 34         self.score_col = score_col | 
|  | 35         if self.score_col >= self.nfields: | 
|  | 36             raise MissingFieldError( "No field for score_col (%d)" % score_col ) | 
|  | 37         self.score = self.fields[ self.score_col ] | 
|  | 38 | 
|  | 39         # GFF attributes. | 
|  | 40         self.attributes = parse_gff_attributes( fields[8] ) | 
|  | 41 | 
|  | 42     def copy( self ): | 
|  | 43         return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, | 
|  | 44                            self.end_col, self.strand_col, self.score_col, self.strand) | 
|  | 45 | 
|  | 46 class GFFFeature( GFFInterval ): | 
|  | 47     """ | 
|  | 48     A GFF feature, which can include multiple intervals. | 
|  | 49     """ | 
|  | 50     def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ | 
|  | 51                   strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \ | 
|  | 52                   raw_size=0 ): | 
|  | 53         # Use copy so that first interval and feature do not share fields. | 
|  | 54         GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \ | 
|  | 55                               start_col, end_col, strand_col, score_col, default_strand, \ | 
|  | 56                               fix_strand=fix_strand ) | 
|  | 57         self.intervals = intervals | 
|  | 58         self.raw_size = raw_size | 
|  | 59         # Use intervals to set feature attributes. | 
|  | 60         for interval in self.intervals: | 
|  | 61             # Error checking. NOTE: intervals need not share the same strand. | 
|  | 62             if interval.chrom != self.chrom: | 
|  | 63                 raise ValueError( "interval chrom does not match self chrom: %s != %s" % \ | 
|  | 64                                   ( interval.chrom, self.chrom ) ) | 
|  | 65             # Set start, end of interval. | 
|  | 66             if interval.start < self.start: | 
|  | 67                 self.start = interval.start | 
|  | 68             if interval.end > self.end: | 
|  | 69                 self.end = interval.end | 
|  | 70 | 
|  | 71     def name( self ): | 
|  | 72         """ Returns feature's name. """ | 
|  | 73         name = None | 
|  | 74         # Preference for name: GTF, GFF3, GFF. | 
|  | 75         for attr_name in [ | 
|  | 76                            # GTF: | 
|  | 77                            'gene_id', 'transcript_id', | 
|  | 78                            # GFF3: | 
|  | 79                            'ID', 'id', | 
|  | 80                            # GFF (TODO): | 
|  | 81                            'group' ]: | 
|  | 82             name = self.attributes.get( attr_name, None ) | 
|  | 83             if name is not None: | 
|  | 84                 break | 
|  | 85         return name | 
|  | 86 | 
|  | 87     def copy( self ): | 
|  | 88         intervals_copy = [] | 
|  | 89         for interval in self.intervals: | 
|  | 90             intervals_copy.append( interval.copy() ) | 
|  | 91         return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col, | 
|  | 92                           self.score_col, self.strand, intervals=intervals_copy ) | 
|  | 93 | 
|  | 94     def lines( self ): | 
|  | 95         lines = [] | 
|  | 96         for interval in self.intervals: | 
|  | 97             lines.append( '\t'.join( interval.fields ) ) | 
|  | 98         return lines | 
|  | 99 | 
|  | 100 | 
|  | 101 class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): | 
|  | 102     """ | 
|  | 103     Reader wrapper that reads GFF intervals/lines and automatically converts | 
|  | 104     them to BED format. | 
|  | 105     """ | 
|  | 106 | 
|  | 107     def parse_row( self, line ): | 
|  | 108         # HACK: this should return a GFF interval, but bx-python operations | 
|  | 109         # require GenomicInterval objects and subclasses will not work. | 
|  | 110         interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ | 
|  | 111                                     self.end_col, self.strand_col, self.default_strand, \ | 
|  | 112                                     fix_strand=self.fix_strand ) | 
|  | 113         interval = convert_gff_coords_to_bed( interval ) | 
|  | 114         return interval | 
|  | 115 | 
|  | 116 class GFFReaderWrapper( NiceReaderWrapper ): | 
|  | 117     """ | 
|  | 118     Reader wrapper for GFF files. | 
|  | 119 | 
|  | 120     Wrapper has two major functions: | 
|  | 121 | 
|  | 122     1. group entries for GFF file (via group column), GFF3 (via id attribute), | 
|  | 123        or GTF (via gene_id/transcript id); | 
|  | 124     2. convert coordinates from GFF format--starting and ending coordinates | 
|  | 125        are 1-based, closed--to the 'traditional'/BED interval format--0 based, | 
|  | 126        half-open. This is useful when using GFF files as inputs to tools that | 
|  | 127        expect traditional interval format. | 
|  | 128     """ | 
|  | 129 | 
|  | 130     def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \ | 
|  | 131                   end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): | 
|  | 132         NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \ | 
|  | 133                                     strand_col=strand_col, fix_strand=fix_strand, **kwargs ) | 
|  | 134         self.feature_col = feature_col | 
|  | 135         self.score_col = score_col | 
|  | 136         self.convert_to_bed_coord = convert_to_bed_coord | 
|  | 137         self.last_line = None | 
|  | 138         self.cur_offset = 0 | 
|  | 139         self.seed_interval = None | 
|  | 140         self.seed_interval_line_len = 0 | 
|  | 141 | 
|  | 142     def parse_row( self, line ): | 
|  | 143         interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \ | 
|  | 144                                 self.start_col, self.end_col, self.strand_col, self.score_col, \ | 
|  | 145                                 self.default_strand, fix_strand=self.fix_strand ) | 
|  | 146         return interval | 
|  | 147 | 
|  | 148     def next( self ): | 
|  | 149         """ Returns next GFFFeature. """ | 
|  | 150 | 
|  | 151         # | 
|  | 152         # Helper function. | 
|  | 153         # | 
|  | 154 | 
|  | 155         def handle_parse_error( parse_error ): | 
|  | 156             """ Actions to take when ParseError found. """ | 
|  | 157             if self.outstream: | 
|  | 158                if self.print_delegate and hasattr(self.print_delegate,"__call__"): | 
|  | 159                    self.print_delegate( self.outstream, e, self ) | 
|  | 160             self.skipped += 1 | 
|  | 161             # no reason to stuff an entire bad file into memmory | 
|  | 162             if self.skipped < 10: | 
|  | 163                self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) | 
|  | 164 | 
|  | 165             # For debugging, uncomment this to propogate parsing exceptions up. | 
|  | 166             # I.e. the underlying reason for an unexpected StopIteration exception | 
|  | 167             # can be found by uncommenting this. | 
|  | 168             # raise e | 
|  | 169 | 
|  | 170         # | 
|  | 171         # Get next GFFFeature | 
|  | 172         # | 
|  | 173         raw_size = self.seed_interval_line_len | 
|  | 174 | 
|  | 175         # If there is no seed interval, set one. Also, if there are no more | 
|  | 176         # intervals to read, this is where iterator dies. | 
|  | 177         if not self.seed_interval: | 
|  | 178             while not self.seed_interval: | 
|  | 179                 try: | 
|  | 180                     self.seed_interval = GenomicIntervalReader.next( self ) | 
|  | 181                 except ParseError, e: | 
|  | 182                     handle_parse_error( e ) | 
|  | 183                 # TODO: When no longer supporting python 2.4 use finally: | 
|  | 184                 #finally: | 
|  | 185                 raw_size += len( self.current_line ) | 
|  | 186 | 
|  | 187         # If header or comment, clear seed interval and return it with its size. | 
|  | 188         if isinstance( self.seed_interval, ( Header, Comment ) ): | 
|  | 189             return_val = self.seed_interval | 
|  | 190             return_val.raw_size = len( self.current_line ) | 
|  | 191             self.seed_interval = None | 
|  | 192             self.seed_interval_line_len = 0 | 
|  | 193             return return_val | 
|  | 194 | 
|  | 195         # Initialize feature identifier from seed. | 
|  | 196         feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF | 
|  | 197         # For GFF3 | 
|  | 198         feature_id = self.seed_interval.attributes.get( 'ID', None ) | 
|  | 199         feature_parent_id = self.seed_interval.attributes.get( 'Parent', None ) | 
|  | 200         # For GTF. | 
|  | 201         feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) | 
|  | 202         feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) | 
|  | 203 | 
|  | 204         # Read all intervals associated with seed. | 
|  | 205         feature_intervals = [] | 
|  | 206         feature_intervals.append( self.seed_interval ) | 
|  | 207         while True: | 
|  | 208             try: | 
|  | 209                 interval = GenomicIntervalReader.next( self ) | 
|  | 210                 raw_size += len( self.current_line ) | 
|  | 211             except StopIteration, e: | 
|  | 212                 # No more intervals to read, but last feature needs to be | 
|  | 213                 # returned. | 
|  | 214                 interval = None | 
|  | 215                 raw_size += len( self.current_line ) | 
|  | 216                 break | 
|  | 217             except ParseError, e: | 
|  | 218                 handle_parse_error( e ) | 
|  | 219                 raw_size += len( self.current_line ) | 
|  | 220                 continue | 
|  | 221             # TODO: When no longer supporting python 2.4 use finally: | 
|  | 222             #finally: | 
|  | 223             #raw_size += len( self.current_line ) | 
|  | 224 | 
|  | 225             # Ignore comments. | 
|  | 226             if isinstance( interval, Comment ): | 
|  | 227                 continue | 
|  | 228 | 
|  | 229             # Determine if interval is part of feature. | 
|  | 230             part_of = False | 
|  | 231             group = interval.attributes.get( 'group', None ) | 
|  | 232             # GFF test: | 
|  | 233             if group and feature_group == group: | 
|  | 234                 part_of = True | 
|  | 235             # GFF3 test: | 
|  | 236             parent_id = interval.attributes.get( 'Parent', None ) | 
|  | 237             cur_id = interval.attributes.get( 'ID', None ) | 
|  | 238             if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ): | 
|  | 239                 part_of = True | 
|  | 240             # GTF test: | 
|  | 241             transcript_id = interval.attributes.get( 'transcript_id', None ) | 
|  | 242             if transcript_id and transcript_id == feature_transcript_id: | 
|  | 243                 part_of = True | 
|  | 244 | 
|  | 245             # If interval is not part of feature, clean up and break. | 
|  | 246             if not part_of: | 
|  | 247                 # Adjust raw size because current line is not part of feature. | 
|  | 248                 raw_size -= len( self.current_line ) | 
|  | 249                 break | 
|  | 250 | 
|  | 251             # Interval associated with feature. | 
|  | 252             feature_intervals.append( interval ) | 
|  | 253 | 
|  | 254         # Last interval read is the seed for the next interval. | 
|  | 255         self.seed_interval = interval | 
|  | 256         self.seed_interval_line_len = len( self.current_line ) | 
|  | 257 | 
|  | 258         # Return feature. | 
|  | 259         feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \ | 
|  | 260                               self.end_col, self.strand_col, self.score_col, \ | 
|  | 261                               self.default_strand, fix_strand=self.fix_strand, \ | 
|  | 262                               intervals=feature_intervals, raw_size=raw_size ) | 
|  | 263 | 
|  | 264         # Convert to BED coords? | 
|  | 265         if self.convert_to_bed_coord: | 
|  | 266             convert_gff_coords_to_bed( feature ) | 
|  | 267 | 
|  | 268         return feature | 
|  | 269 | 
|  | 270 def convert_bed_coords_to_gff( interval ): | 
|  | 271     """ | 
|  | 272     Converts an interval object's coordinates from BED format to GFF format. | 
|  | 273     Accepted object types include GenomicInterval and list (where the first | 
|  | 274     element in the list is the interval's start, and the second element is | 
|  | 275     the interval's end). | 
|  | 276     """ | 
|  | 277     if isinstance( interval, GenomicInterval ): | 
|  | 278         interval.start += 1 | 
|  | 279         if isinstance( interval, GFFFeature ): | 
|  | 280             for subinterval in interval.intervals: | 
|  | 281                 convert_bed_coords_to_gff( subinterval ) | 
|  | 282     elif type ( interval ) is list: | 
|  | 283         interval[ 0 ] += 1 | 
|  | 284     return interval | 
|  | 285 | 
|  | 286 def convert_gff_coords_to_bed( interval ): | 
|  | 287     """ | 
|  | 288     Converts an interval object's coordinates from GFF format to BED format. | 
|  | 289     Accepted object types include GFFFeature, GenomicInterval, and list (where | 
|  | 290     the first element in the list is the interval's start, and the second | 
|  | 291     element is the interval's end). | 
|  | 292     """ | 
|  | 293     if isinstance( interval, GenomicInterval ): | 
|  | 294         interval.start -= 1 | 
|  | 295         if isinstance( interval, GFFFeature ): | 
|  | 296             for subinterval in interval.intervals: | 
|  | 297                 convert_gff_coords_to_bed( subinterval ) | 
|  | 298     elif type ( interval ) is list: | 
|  | 299         interval[ 0 ] -= 1 | 
|  | 300     return interval | 
|  | 301 | 
|  | 302 def parse_gff_attributes( attr_str ): | 
|  | 303     """ | 
|  | 304     Parses a GFF/GTF attribute string and returns a dictionary of name-value | 
|  | 305     pairs. The general format for a GFF3 attributes string is | 
|  | 306 | 
|  | 307         name1=value1;name2=value2 | 
|  | 308 | 
|  | 309     The general format for a GTF attribute string is | 
|  | 310 | 
|  | 311         name1 "value1" ; name2 "value2" | 
|  | 312 | 
|  | 313     The general format for a GFF attribute string is a single string that | 
|  | 314     denotes the interval's group; in this case, method returns a dictionary | 
|  | 315     with a single key-value pair, and key name is 'group' | 
|  | 316     """ | 
|  | 317     attributes_list = attr_str.split(";") | 
|  | 318     attributes = {} | 
|  | 319     for name_value_pair in attributes_list: | 
|  | 320         # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3 | 
|  | 321         # attribute; next, try double quotes for GTF. | 
|  | 322         pair = name_value_pair.strip().split("=") | 
|  | 323         if len( pair ) == 1: | 
|  | 324             pair = name_value_pair.strip().split("\"") | 
|  | 325         if len( pair ) == 1: | 
|  | 326             # Could not split for some reason -- raise exception? | 
|  | 327             continue | 
|  | 328         if pair == '': | 
|  | 329             continue | 
|  | 330         name = pair[0].strip() | 
|  | 331         if name == '': | 
|  | 332             continue | 
|  | 333         # Need to strip double quote from values | 
|  | 334         value = pair[1].strip(" \"") | 
|  | 335         attributes[ name ] = value | 
|  | 336 | 
|  | 337     if len( attributes ) == 0: | 
|  | 338         # Could not split attributes string, so entire string must be | 
|  | 339         # 'group' attribute. This is the case for strictly GFF files. | 
|  | 340         attributes['group'] = attr_str | 
|  | 341     return attributes | 
|  | 342 | 
|  | 343 def gff_attributes_to_str( attrs, gff_format ): | 
|  | 344     """ | 
|  | 345     Convert GFF attributes to string. Supported formats are GFF3, GTF. | 
|  | 346     """ | 
|  | 347     if gff_format == 'GTF': | 
|  | 348         format_string = '%s "%s"' | 
|  | 349         # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id | 
|  | 350         id_attr = None | 
|  | 351         if 'group' in attrs: | 
|  | 352             id_attr = 'group' | 
|  | 353         elif 'ID' in attrs: | 
|  | 354             id_attr = 'ID' | 
|  | 355         elif 'Parent' in attrs: | 
|  | 356             id_attr = 'Parent' | 
|  | 357         if id_attr: | 
|  | 358             attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr] | 
|  | 359     elif gff_format == 'GFF3': | 
|  | 360         format_string = '%s=%s' | 
|  | 361     attrs_strs = [] | 
|  | 362     for name, value in attrs.items(): | 
|  | 363         attrs_strs.append( format_string % ( name, value ) ) | 
|  | 364     return " ; ".join( attrs_strs ) | 
|  | 365 | 
|  | 366 def read_unordered_gtf( iterator, strict=False ): | 
|  | 367     """ | 
|  | 368     Returns GTF features found in an iterator. GTF lines need not be ordered | 
|  | 369     or clustered for reader to work. Reader returns GFFFeature objects sorted | 
|  | 370     by transcript_id, chrom, and start position. | 
|  | 371     """ | 
|  | 372 | 
|  | 373     # -- Get function that generates line/feature key. -- | 
|  | 374 | 
|  | 375     get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ] | 
|  | 376     if strict: | 
|  | 377         # Strict GTF parsing uses transcript_id only to group lines into feature. | 
|  | 378         key_fn = get_transcript_id | 
|  | 379     else: | 
|  | 380         # Use lenient parsing where chromosome + transcript_id is the key. This allows | 
|  | 381         # transcripts with same ID on different chromosomes; this occurs in some popular | 
|  | 382         # datasources, such as RefGenes in UCSC. | 
|  | 383         key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) | 
|  | 384 | 
|  | 385 | 
|  | 386     # Aggregate intervals by transcript_id and collect comments. | 
|  | 387     feature_intervals = odict() | 
|  | 388     comments = [] | 
|  | 389     for count, line in enumerate( iterator ): | 
|  | 390         if line.startswith( '#' ): | 
|  | 391             comments.append( Comment( line ) ) | 
|  | 392             continue | 
|  | 393 | 
|  | 394         line_key = key_fn( line.split('\t') ) | 
|  | 395         if line_key in feature_intervals: | 
|  | 396             feature = feature_intervals[ line_key ] | 
|  | 397         else: | 
|  | 398             feature = [] | 
|  | 399             feature_intervals[ line_key ] = feature | 
|  | 400         feature.append( GFFInterval( None, line.split( '\t' ) ) ) | 
|  | 401 | 
|  | 402     # Create features. | 
|  | 403     chroms_features = {} | 
|  | 404     for count, intervals in enumerate( feature_intervals.values() ): | 
|  | 405         # Sort intervals by start position. | 
|  | 406         intervals.sort( lambda a,b: cmp( a.start, b.start ) ) | 
|  | 407         feature = GFFFeature( None, intervals=intervals ) | 
|  | 408         if feature.chrom not in chroms_features: | 
|  | 409             chroms_features[ feature.chrom ] = [] | 
|  | 410         chroms_features[ feature.chrom ].append( feature ) | 
|  | 411 | 
|  | 412     # Sort features by chrom, start position. | 
|  | 413     chroms_features_sorted = [] | 
|  | 414     for chrom_features in chroms_features.values(): | 
|  | 415         chroms_features_sorted.append( chrom_features ) | 
|  | 416     chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) ) | 
|  | 417     for features in chroms_features_sorted: | 
|  | 418         features.sort( lambda a,b: cmp( a.start, b.start ) ) | 
|  | 419 | 
|  | 420     # Yield comments first, then features. | 
|  | 421     # FIXME: comments can appear anywhere in file, not just the beginning. | 
|  | 422     # Ideally, then comments would be associated with features and output | 
|  | 423     # just before feature/line. | 
|  | 424     for comment in comments: | 
|  | 425         yield comment | 
|  | 426 | 
|  | 427     for chrom_features in chroms_features_sorted: | 
|  | 428         for feature in chrom_features: | 
|  | 429             yield feature | 
|  | 430 |