# HG changeset patch
# User devteam
# Date 1455210685 18000
# Node ID 8ddabc73af9281adf109d2c12f2eb63efa7ff52c
# Parent 5f72be09cfd3c289b872d6008b87666ac873f90a
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/intersect commit 4e2052686dfe8003f867449e0affff96398b2a62
diff -r 5f72be09cfd3 -r 8ddabc73af92 tool_dependencies.xml
--- a/tool_dependencies.xml Wed Nov 11 12:48:44 2015 -0500
+++ b/tool_dependencies.xml Thu Feb 11 12:11:25 2016 -0500
@@ -4,6 +4,6 @@
-
+
diff -r 5f72be09cfd3 -r 8ddabc73af92 utils/__init__.py
diff -r 5f72be09cfd3 -r 8ddabc73af92 utils/gff_util.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/gff_util.py Thu Feb 11 12:11:25 2016 -0500
@@ -0,0 +1,430 @@
+"""
+Provides utilities for working with GFF files.
+"""
+
+import copy
+from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper
+from bx.tabular.io import Header, Comment, ParseError
+from utils.odict import odict
+
+
+class GFFInterval( GenomicInterval ):
+ """
+ A GFF interval, including attributes. If file is strictly a GFF file,
+ only attribute is 'group.'
+ """
+ def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
+ strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
+ # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this,
+ # temporarily set strand and then unset after initing GenomicInterval.
+ unknown_strand = False
+ if not fix_strand and fields[ strand_col ] == '.':
+ unknown_strand = True
+ fields[ strand_col ] = '+'
+ GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
+ default_strand, fix_strand=fix_strand )
+ if unknown_strand:
+ self.strand = '.'
+ self.fields[ strand_col ] = '.'
+
+ # Handle feature, score column.
+ self.feature_col = feature_col
+ if self.feature_col >= self.nfields:
+ raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
+ self.feature = self.fields[ self.feature_col ]
+ self.score_col = score_col
+ if self.score_col >= self.nfields:
+ raise MissingFieldError( "No field for score_col (%d)" % score_col )
+ self.score = self.fields[ self.score_col ]
+
+ # GFF attributes.
+ self.attributes = parse_gff_attributes( fields[8] )
+
+ def copy( self ):
+ return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col,
+ self.end_col, self.strand_col, self.score_col, self.strand)
+
+
+class GFFFeature( GFFInterval ):
+ """
+ A GFF feature, which can include multiple intervals.
+ """
+ def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4,
+ strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[],
+ raw_size=0 ):
+ # Use copy so that first interval and feature do not share fields.
+ GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col,
+ start_col, end_col, strand_col, score_col, default_strand,
+ fix_strand=fix_strand )
+ self.intervals = intervals
+ self.raw_size = raw_size
+ # Use intervals to set feature attributes.
+ for interval in self.intervals:
+ # Error checking. NOTE: intervals need not share the same strand.
+ if interval.chrom != self.chrom:
+ raise ValueError( "interval chrom does not match self chrom: %s != %s" %
+ ( interval.chrom, self.chrom ) )
+ # Set start, end of interval.
+ if interval.start < self.start:
+ self.start = interval.start
+ if interval.end > self.end:
+ self.end = interval.end
+
+ def name( self ):
+ """ Returns feature's name. """
+ name = None
+ # Preference for name: GTF, GFF3, GFF.
+ for attr_name in ['gene_id', 'transcript_id', # GTF
+ 'ID', 'id', # GFF3
+ 'group' ]: # GFF (TODO)
+ name = self.attributes.get( attr_name, None )
+ if name is not None:
+ break
+ return name
+
+ def copy( self ):
+ intervals_copy = []
+ for interval in self.intervals:
+ intervals_copy.append( interval.copy() )
+ return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col,
+ self.score_col, self.strand, intervals=intervals_copy )
+
+ def lines( self ):
+ lines = []
+ for interval in self.intervals:
+ lines.append( '\t'.join( interval.fields ) )
+ return lines
+
+
+class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
+ """
+ Reader wrapper that reads GFF intervals/lines and automatically converts
+ them to BED format.
+ """
+
+ def parse_row( self, line ):
+ # HACK: this should return a GFF interval, but bx-python operations
+ # require GenomicInterval objects and subclasses will not work.
+ interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col,
+ self.end_col, self.strand_col, self.default_strand,
+ fix_strand=self.fix_strand )
+ interval = convert_gff_coords_to_bed( interval )
+ return interval
+
+
+class GFFReaderWrapper( NiceReaderWrapper ):
+ """
+ Reader wrapper for GFF files.
+
+ Wrapper has two major functions:
+
+ 1. group entries for GFF file (via group column), GFF3 (via id attribute),
+ or GTF (via gene_id/transcript id);
+ 2. convert coordinates from GFF format--starting and ending coordinates
+ are 1-based, closed--to the 'traditional'/BED interval format--0 based,
+ half-open. This is useful when using GFF files as inputs to tools that
+ expect traditional interval format.
+ """
+
+ def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3,
+ end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ):
+ NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col,
+ strand_col=strand_col, fix_strand=fix_strand, **kwargs )
+ self.feature_col = feature_col
+ self.score_col = score_col
+ self.convert_to_bed_coord = convert_to_bed_coord
+ self.last_line = None
+ self.cur_offset = 0
+ self.seed_interval = None
+ self.seed_interval_line_len = 0
+
+ def parse_row( self, line ):
+ interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col,
+ self.start_col, self.end_col, self.strand_col, self.score_col,
+ self.default_strand, fix_strand=self.fix_strand )
+ return interval
+
+ def next( self ):
+ """ Returns next GFFFeature. """
+
+ #
+ # Helper function.
+ #
+
+ def handle_parse_error( parse_error ):
+ """ Actions to take when ParseError found. """
+ if self.outstream:
+ if self.print_delegate and hasattr(self.print_delegate, "__call__"):
+ self.print_delegate( self.outstream, e, self )
+ self.skipped += 1
+ # no reason to stuff an entire bad file into memmory
+ if self.skipped < 10:
+ self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
+
+ # For debugging, uncomment this to propogate parsing exceptions up.
+ # I.e. the underlying reason for an unexpected StopIteration exception
+ # can be found by uncommenting this.
+ # raise e
+
+ #
+ # Get next GFFFeature
+ #
+ raw_size = self.seed_interval_line_len
+
+ # If there is no seed interval, set one. Also, if there are no more
+ # intervals to read, this is where iterator dies.
+ if not self.seed_interval:
+ while not self.seed_interval:
+ try:
+ self.seed_interval = GenomicIntervalReader.next( self )
+ except ParseError, e:
+ handle_parse_error( e )
+ # TODO: When no longer supporting python 2.4 use finally:
+ #finally:
+ raw_size += len( self.current_line )
+
+ # If header or comment, clear seed interval and return it with its size.
+ if isinstance( self.seed_interval, ( Header, Comment ) ):
+ return_val = self.seed_interval
+ return_val.raw_size = len( self.current_line )
+ self.seed_interval = None
+ self.seed_interval_line_len = 0
+ return return_val
+
+ # Initialize feature identifier from seed.
+ feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
+ # For GFF3
+ feature_id = self.seed_interval.attributes.get( 'ID', None )
+ # For GTF.
+ feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None )
+
+ # Read all intervals associated with seed.
+ feature_intervals = []
+ feature_intervals.append( self.seed_interval )
+ while True:
+ try:
+ interval = GenomicIntervalReader.next( self )
+ raw_size += len( self.current_line )
+ except StopIteration, e:
+ # No more intervals to read, but last feature needs to be
+ # returned.
+ interval = None
+ raw_size += len( self.current_line )
+ break
+ except ParseError, e:
+ handle_parse_error( e )
+ raw_size += len( self.current_line )
+ continue
+ # TODO: When no longer supporting python 2.4 use finally:
+ #finally:
+ #raw_size += len( self.current_line )
+
+ # Ignore comments.
+ if isinstance( interval, Comment ):
+ continue
+
+ # Determine if interval is part of feature.
+ part_of = False
+ group = interval.attributes.get( 'group', None )
+ # GFF test:
+ if group and feature_group == group:
+ part_of = True
+ # GFF3 test:
+ parent_id = interval.attributes.get( 'Parent', None )
+ cur_id = interval.attributes.get( 'ID', None )
+ if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ):
+ part_of = True
+ # GTF test:
+ transcript_id = interval.attributes.get( 'transcript_id', None )
+ if transcript_id and transcript_id == feature_transcript_id:
+ part_of = True
+
+ # If interval is not part of feature, clean up and break.
+ if not part_of:
+ # Adjust raw size because current line is not part of feature.
+ raw_size -= len( self.current_line )
+ break
+
+ # Interval associated with feature.
+ feature_intervals.append( interval )
+
+ # Last interval read is the seed for the next interval.
+ self.seed_interval = interval
+ self.seed_interval_line_len = len( self.current_line )
+
+ # Return feature.
+ feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
+ self.end_col, self.strand_col, self.score_col,
+ self.default_strand, fix_strand=self.fix_strand,
+ intervals=feature_intervals, raw_size=raw_size )
+
+ # Convert to BED coords?
+ if self.convert_to_bed_coord:
+ convert_gff_coords_to_bed( feature )
+
+ return feature
+
+
+def convert_bed_coords_to_gff( interval ):
+ """
+ Converts an interval object's coordinates from BED format to GFF format.
+ Accepted object types include GenomicInterval and list (where the first
+ element in the list is the interval's start, and the second element is
+ the interval's end).
+ """
+ if isinstance( interval, GenomicInterval ):
+ interval.start += 1
+ if isinstance( interval, GFFFeature ):
+ for subinterval in interval.intervals:
+ convert_bed_coords_to_gff( subinterval )
+ elif type( interval ) is list:
+ interval[ 0 ] += 1
+ return interval
+
+
+def convert_gff_coords_to_bed( interval ):
+ """
+ Converts an interval object's coordinates from GFF format to BED format.
+ Accepted object types include GFFFeature, GenomicInterval, and list (where
+ the first element in the list is the interval's start, and the second
+ element is the interval's end).
+ """
+ if isinstance( interval, GenomicInterval ):
+ interval.start -= 1
+ if isinstance( interval, GFFFeature ):
+ for subinterval in interval.intervals:
+ convert_gff_coords_to_bed( subinterval )
+ elif type( interval ) is list:
+ interval[ 0 ] -= 1
+ return interval
+
+
+def parse_gff_attributes( attr_str ):
+ """
+ Parses a GFF/GTF attribute string and returns a dictionary of name-value
+ pairs. The general format for a GFF3 attributes string is
+
+ name1=value1;name2=value2
+
+ The general format for a GTF attribute string is
+
+ name1 "value1" ; name2 "value2"
+
+ The general format for a GFF attribute string is a single string that
+ denotes the interval's group; in this case, method returns a dictionary
+ with a single key-value pair, and key name is 'group'
+ """
+ attributes_list = attr_str.split(";")
+ attributes = {}
+ for name_value_pair in attributes_list:
+ # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3
+ # attribute; next, try double quotes for GTF.
+ pair = name_value_pair.strip().split("=")
+ if len( pair ) == 1:
+ pair = name_value_pair.strip().split("\"")
+ if len( pair ) == 1:
+ # Could not split for some reason -- raise exception?
+ continue
+ if pair == '':
+ continue
+ name = pair[0].strip()
+ if name == '':
+ continue
+ # Need to strip double quote from values
+ value = pair[1].strip(" \"")
+ attributes[ name ] = value
+
+ if len( attributes ) == 0:
+ # Could not split attributes string, so entire string must be
+ # 'group' attribute. This is the case for strictly GFF files.
+ attributes['group'] = attr_str
+ return attributes
+
+
+def gff_attributes_to_str( attrs, gff_format ):
+ """
+ Convert GFF attributes to string. Supported formats are GFF3, GTF.
+ """
+ if gff_format == 'GTF':
+ format_string = '%s "%s"'
+ # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id
+ id_attr = None
+ if 'group' in attrs:
+ id_attr = 'group'
+ elif 'ID' in attrs:
+ id_attr = 'ID'
+ elif 'Parent' in attrs:
+ id_attr = 'Parent'
+ if id_attr:
+ attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr]
+ elif gff_format == 'GFF3':
+ format_string = '%s=%s'
+ attrs_strs = []
+ for name, value in attrs.items():
+ attrs_strs.append( format_string % ( name, value ) )
+ return " ; ".join( attrs_strs )
+
+
+def read_unordered_gtf( iterator, strict=False ):
+ """
+ Returns GTF features found in an iterator. GTF lines need not be ordered
+ or clustered for reader to work. Reader returns GFFFeature objects sorted
+ by transcript_id, chrom, and start position.
+ """
+
+ # -- Get function that generates line/feature key. --
+
+ get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ]
+ if strict:
+ # Strict GTF parsing uses transcript_id only to group lines into feature.
+ key_fn = get_transcript_id
+ else:
+ # Use lenient parsing where chromosome + transcript_id is the key. This allows
+ # transcripts with same ID on different chromosomes; this occurs in some popular
+ # datasources, such as RefGenes in UCSC.
+ key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields )
+
+ # Aggregate intervals by transcript_id and collect comments.
+ feature_intervals = odict()
+ comments = []
+ for count, line in enumerate( iterator ):
+ if line.startswith( '#' ):
+ comments.append( Comment( line ) )
+ continue
+
+ line_key = key_fn( line.split('\t') )
+ if line_key in feature_intervals:
+ feature = feature_intervals[ line_key ]
+ else:
+ feature = []
+ feature_intervals[ line_key ] = feature
+ feature.append( GFFInterval( None, line.split( '\t' ) ) )
+
+ # Create features.
+ chroms_features = {}
+ for count, intervals in enumerate( feature_intervals.values() ):
+ # Sort intervals by start position.
+ intervals.sort( lambda a, b: cmp( a.start, b.start ) )
+ feature = GFFFeature( None, intervals=intervals )
+ if feature.chrom not in chroms_features:
+ chroms_features[ feature.chrom ] = []
+ chroms_features[ feature.chrom ].append( feature )
+
+ # Sort features by chrom, start position.
+ chroms_features_sorted = []
+ for chrom_features in chroms_features.values():
+ chroms_features_sorted.append( chrom_features )
+ chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) )
+ for features in chroms_features_sorted:
+ features.sort( lambda a, b: cmp( a.start, b.start ) )
+
+ # Yield comments first, then features.
+ # FIXME: comments can appear anywhere in file, not just the beginning.
+ # Ideally, then comments would be associated with features and output
+ # just before feature/line.
+ for comment in comments:
+ yield comment
+
+ for chrom_features in chroms_features_sorted:
+ for feature in chrom_features:
+ yield feature
diff -r 5f72be09cfd3 -r 8ddabc73af92 utils/odict.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils/odict.py Thu Feb 11 12:11:25 2016 -0500
@@ -0,0 +1,86 @@
+"""
+Ordered dictionary implementation.
+"""
+
+from UserDict import UserDict
+
+
+class odict(UserDict):
+ """
+ http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747
+
+ This dictionary class extends UserDict to record the order in which items are
+ added. Calling keys(), values(), items(), etc. will return results in this
+ order.
+ """
+ def __init__( self, dict=None ):
+ self._keys = []
+ UserDict.__init__( self, dict )
+
+ def __delitem__( self, key ):
+ UserDict.__delitem__( self, key )
+ self._keys.remove( key )
+
+ def __setitem__( self, key, item ):
+ UserDict.__setitem__( self, key, item )
+ if key not in self._keys:
+ self._keys.append( key )
+
+ def clear( self ):
+ UserDict.clear( self )
+ self._keys = []
+
+ def copy(self):
+ new = odict()
+ new.update( self )
+ return new
+
+ def items( self ):
+ return zip( self._keys, self.values() )
+
+ def keys( self ):
+ return self._keys[:]
+
+ def popitem( self ):
+ try:
+ key = self._keys[-1]
+ except IndexError:
+ raise KeyError( 'dictionary is empty' )
+ val = self[ key ]
+ del self[ key ]
+ return ( key, val )
+
+ def setdefault( self, key, failobj=None ):
+ if key not in self._keys:
+ self._keys.append( key )
+ return UserDict.setdefault( self, key, failobj )
+
+ def update( self, dict ):
+ for ( key, val ) in dict.items():
+ self.__setitem__( key, val )
+
+ def values( self ):
+ return map( self.get, self._keys )
+
+ def iterkeys( self ):
+ return iter( self._keys )
+
+ def itervalues( self ):
+ for key in self._keys:
+ yield self.get( key )
+
+ def iteritems( self ):
+ for key in self._keys:
+ yield key, self.get( key )
+
+ def __iter__( self ):
+ for key in self._keys:
+ yield key
+
+ def reverse( self ):
+ self._keys.reverse()
+
+ def insert( self, index, key, item ):
+ if key not in self._keys:
+ self._keys.insert( index, key )
+ UserDict.__setitem__( self, key, item )