Mercurial > repos > devteam > flanking_features
annotate utils/gff_util.py @ 1:8307665c4b6c draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
| author | devteam |
|---|---|
| date | Wed, 11 Nov 2015 12:48:18 -0500 |
| parents | 90100b587723 |
| children | a09d13b108fd |
| rev | line source |
|---|---|
| 0 | 1 """ |
| 2 Provides utilities for working with GFF files. | |
| 3 """ | |
| 4 | |
| 5 import copy | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
7 from bx.tabular.io import Header, Comment, ParseError |
| 0 | 8 from utils.odict import odict |
| 9 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
10 |
| 0 | 11 class GFFInterval( GenomicInterval ): |
| 12 """ | |
| 13 A GFF interval, including attributes. If file is strictly a GFF file, | |
| 14 only attribute is 'group.' | |
| 15 """ | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
16 def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4, |
| 0 | 17 strand_col=6, score_col=5, default_strand='.', fix_strand=False ): |
| 18 # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, | |
| 19 # temporarily set strand and then unset after initing GenomicInterval. | |
| 20 unknown_strand = False | |
| 21 if not fix_strand and fields[ strand_col ] == '.': | |
| 22 unknown_strand = True | |
| 23 fields[ strand_col ] = '+' | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
24 GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, |
| 0 | 25 default_strand, fix_strand=fix_strand ) |
| 26 if unknown_strand: | |
| 27 self.strand = '.' | |
| 28 self.fields[ strand_col ] = '.' | |
| 29 | |
| 30 # Handle feature, score column. | |
| 31 self.feature_col = feature_col | |
| 32 if self.feature_col >= self.nfields: | |
| 33 raise MissingFieldError( "No field for feature_col (%d)" % feature_col ) | |
| 34 self.feature = self.fields[ self.feature_col ] | |
| 35 self.score_col = score_col | |
| 36 if self.score_col >= self.nfields: | |
| 37 raise MissingFieldError( "No field for score_col (%d)" % score_col ) | |
| 38 self.score = self.fields[ self.score_col ] | |
| 39 | |
| 40 # GFF attributes. | |
| 41 self.attributes = parse_gff_attributes( fields[8] ) | |
| 42 | |
| 43 def copy( self ): | |
| 44 return GFFInterval(self.reader, list( self.fields ), self.chrom_col, self.feature_col, self.start_col, | |
| 45 self.end_col, self.strand_col, self.score_col, self.strand) | |
| 46 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
47 |
| 0 | 48 class GFFFeature( GFFInterval ): |
| 49 """ | |
| 50 A GFF feature, which can include multiple intervals. | |
| 51 """ | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
52 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
53 strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], |
| 0 | 54 raw_size=0 ): |
| 55 # Use copy so that first interval and feature do not share fields. | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
56 GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
57 start_col, end_col, strand_col, score_col, default_strand, |
| 0 | 58 fix_strand=fix_strand ) |
| 59 self.intervals = intervals | |
| 60 self.raw_size = raw_size | |
| 61 # Use intervals to set feature attributes. | |
| 62 for interval in self.intervals: | |
| 63 # Error checking. NOTE: intervals need not share the same strand. | |
| 64 if interval.chrom != self.chrom: | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
65 raise ValueError( "interval chrom does not match self chrom: %s != %s" % |
| 0 | 66 ( interval.chrom, self.chrom ) ) |
| 67 # Set start, end of interval. | |
| 68 if interval.start < self.start: | |
| 69 self.start = interval.start | |
| 70 if interval.end > self.end: | |
| 71 self.end = interval.end | |
| 72 | |
| 73 def name( self ): | |
| 74 """ Returns feature's name. """ | |
| 75 name = None | |
| 76 # Preference for name: GTF, GFF3, GFF. | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
77 for attr_name in ['gene_id', 'transcript_id', # GTF |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
78 'ID', 'id', # GFF3 |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
79 'group' ]: # GFF (TODO) |
| 0 | 80 name = self.attributes.get( attr_name, None ) |
| 81 if name is not None: | |
| 82 break | |
| 83 return name | |
| 84 | |
| 85 def copy( self ): | |
| 86 intervals_copy = [] | |
| 87 for interval in self.intervals: | |
| 88 intervals_copy.append( interval.copy() ) | |
| 89 return GFFFeature(self.reader, self.chrom_col, self.feature_col, self.start_col, self.end_col, self.strand_col, | |
| 90 self.score_col, self.strand, intervals=intervals_copy ) | |
| 91 | |
| 92 def lines( self ): | |
| 93 lines = [] | |
| 94 for interval in self.intervals: | |
| 95 lines.append( '\t'.join( interval.fields ) ) | |
| 96 return lines | |
| 97 | |
| 98 | |
| 99 class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): | |
| 100 """ | |
| 101 Reader wrapper that reads GFF intervals/lines and automatically converts | |
| 102 them to BED format. | |
| 103 """ | |
| 104 | |
| 105 def parse_row( self, line ): | |
| 106 # HACK: this should return a GFF interval, but bx-python operations | |
| 107 # require GenomicInterval objects and subclasses will not work. | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
108 interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
109 self.end_col, self.strand_col, self.default_strand, |
| 0 | 110 fix_strand=self.fix_strand ) |
| 111 interval = convert_gff_coords_to_bed( interval ) | |
| 112 return interval | |
| 113 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
114 |
| 0 | 115 class GFFReaderWrapper( NiceReaderWrapper ): |
| 116 """ | |
| 117 Reader wrapper for GFF files. | |
| 118 | |
| 119 Wrapper has two major functions: | |
| 120 | |
| 121 1. group entries for GFF file (via group column), GFF3 (via id attribute), | |
| 122 or GTF (via gene_id/transcript id); | |
| 123 2. convert coordinates from GFF format--starting and ending coordinates | |
| 124 are 1-based, closed--to the 'traditional'/BED interval format--0 based, | |
| 125 half-open. This is useful when using GFF files as inputs to tools that | |
| 126 expect traditional interval format. | |
| 127 """ | |
| 128 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
129 def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, |
| 0 | 130 end_col=4, strand_col=6, score_col=5, fix_strand=False, convert_to_bed_coord=False, **kwargs ): |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
131 NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, |
| 0 | 132 strand_col=strand_col, fix_strand=fix_strand, **kwargs ) |
| 133 self.feature_col = feature_col | |
| 134 self.score_col = score_col | |
| 135 self.convert_to_bed_coord = convert_to_bed_coord | |
| 136 self.last_line = None | |
| 137 self.cur_offset = 0 | |
| 138 self.seed_interval = None | |
| 139 self.seed_interval_line_len = 0 | |
| 140 | |
| 141 def parse_row( self, line ): | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
142 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
143 self.start_col, self.end_col, self.strand_col, self.score_col, |
| 0 | 144 self.default_strand, fix_strand=self.fix_strand ) |
| 145 return interval | |
| 146 | |
| 147 def next( self ): | |
| 148 """ Returns next GFFFeature. """ | |
| 149 | |
| 150 # | |
| 151 # Helper function. | |
| 152 # | |
| 153 | |
| 154 def handle_parse_error( parse_error ): | |
| 155 """ Actions to take when ParseError found. """ | |
| 156 if self.outstream: | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
157 if self.print_delegate and hasattr(self.print_delegate, "__call__"): |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
158 self.print_delegate( self.outstream, e, self ) |
| 0 | 159 self.skipped += 1 |
| 160 # no reason to stuff an entire bad file into memmory | |
| 161 if self.skipped < 10: | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
162 self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) |
| 0 | 163 |
| 164 # For debugging, uncomment this to propogate parsing exceptions up. | |
| 165 # I.e. the underlying reason for an unexpected StopIteration exception | |
| 166 # can be found by uncommenting this. | |
| 167 # raise e | |
| 168 | |
| 169 # | |
| 170 # Get next GFFFeature | |
| 171 # | |
| 172 raw_size = self.seed_interval_line_len | |
| 173 | |
| 174 # If there is no seed interval, set one. Also, if there are no more | |
| 175 # intervals to read, this is where iterator dies. | |
| 176 if not self.seed_interval: | |
| 177 while not self.seed_interval: | |
| 178 try: | |
| 179 self.seed_interval = GenomicIntervalReader.next( self ) | |
| 180 except ParseError, e: | |
| 181 handle_parse_error( e ) | |
| 182 # TODO: When no longer supporting python 2.4 use finally: | |
| 183 #finally: | |
| 184 raw_size += len( self.current_line ) | |
| 185 | |
| 186 # If header or comment, clear seed interval and return it with its size. | |
| 187 if isinstance( self.seed_interval, ( Header, Comment ) ): | |
| 188 return_val = self.seed_interval | |
| 189 return_val.raw_size = len( self.current_line ) | |
| 190 self.seed_interval = None | |
| 191 self.seed_interval_line_len = 0 | |
| 192 return return_val | |
| 193 | |
| 194 # Initialize feature identifier from seed. | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
195 feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF |
| 0 | 196 # For GFF3 |
| 197 feature_id = self.seed_interval.attributes.get( 'ID', None ) | |
| 198 # For GTF. | |
| 199 feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) | |
| 200 | |
| 201 # Read all intervals associated with seed. | |
| 202 feature_intervals = [] | |
| 203 feature_intervals.append( self.seed_interval ) | |
| 204 while True: | |
| 205 try: | |
| 206 interval = GenomicIntervalReader.next( self ) | |
| 207 raw_size += len( self.current_line ) | |
| 208 except StopIteration, e: | |
| 209 # No more intervals to read, but last feature needs to be | |
| 210 # returned. | |
| 211 interval = None | |
| 212 raw_size += len( self.current_line ) | |
| 213 break | |
| 214 except ParseError, e: | |
| 215 handle_parse_error( e ) | |
| 216 raw_size += len( self.current_line ) | |
| 217 continue | |
| 218 # TODO: When no longer supporting python 2.4 use finally: | |
| 219 #finally: | |
| 220 #raw_size += len( self.current_line ) | |
| 221 | |
| 222 # Ignore comments. | |
| 223 if isinstance( interval, Comment ): | |
| 224 continue | |
| 225 | |
| 226 # Determine if interval is part of feature. | |
| 227 part_of = False | |
| 228 group = interval.attributes.get( 'group', None ) | |
| 229 # GFF test: | |
| 230 if group and feature_group == group: | |
| 231 part_of = True | |
| 232 # GFF3 test: | |
| 233 parent_id = interval.attributes.get( 'Parent', None ) | |
| 234 cur_id = interval.attributes.get( 'ID', None ) | |
| 235 if ( cur_id and cur_id == feature_id ) or ( parent_id and parent_id == feature_id ): | |
| 236 part_of = True | |
| 237 # GTF test: | |
| 238 transcript_id = interval.attributes.get( 'transcript_id', None ) | |
| 239 if transcript_id and transcript_id == feature_transcript_id: | |
| 240 part_of = True | |
| 241 | |
| 242 # If interval is not part of feature, clean up and break. | |
| 243 if not part_of: | |
| 244 # Adjust raw size because current line is not part of feature. | |
| 245 raw_size -= len( self.current_line ) | |
| 246 break | |
| 247 | |
| 248 # Interval associated with feature. | |
| 249 feature_intervals.append( interval ) | |
| 250 | |
| 251 # Last interval read is the seed for the next interval. | |
| 252 self.seed_interval = interval | |
| 253 self.seed_interval_line_len = len( self.current_line ) | |
| 254 | |
| 255 # Return feature. | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
256 feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
257 self.end_col, self.strand_col, self.score_col, |
|
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
258 self.default_strand, fix_strand=self.fix_strand, |
| 0 | 259 intervals=feature_intervals, raw_size=raw_size ) |
| 260 | |
| 261 # Convert to BED coords? | |
| 262 if self.convert_to_bed_coord: | |
| 263 convert_gff_coords_to_bed( feature ) | |
| 264 | |
| 265 return feature | |
| 266 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
267 |
| 0 | 268 def convert_bed_coords_to_gff( interval ): |
| 269 """ | |
| 270 Converts an interval object's coordinates from BED format to GFF format. | |
| 271 Accepted object types include GenomicInterval and list (where the first | |
| 272 element in the list is the interval's start, and the second element is | |
| 273 the interval's end). | |
| 274 """ | |
| 275 if isinstance( interval, GenomicInterval ): | |
| 276 interval.start += 1 | |
| 277 if isinstance( interval, GFFFeature ): | |
| 278 for subinterval in interval.intervals: | |
| 279 convert_bed_coords_to_gff( subinterval ) | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
280 elif type( interval ) is list: |
| 0 | 281 interval[ 0 ] += 1 |
| 282 return interval | |
| 283 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
284 |
| 0 | 285 def convert_gff_coords_to_bed( interval ): |
| 286 """ | |
| 287 Converts an interval object's coordinates from GFF format to BED format. | |
| 288 Accepted object types include GFFFeature, GenomicInterval, and list (where | |
| 289 the first element in the list is the interval's start, and the second | |
| 290 element is the interval's end). | |
| 291 """ | |
| 292 if isinstance( interval, GenomicInterval ): | |
| 293 interval.start -= 1 | |
| 294 if isinstance( interval, GFFFeature ): | |
| 295 for subinterval in interval.intervals: | |
| 296 convert_gff_coords_to_bed( subinterval ) | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
297 elif type( interval ) is list: |
| 0 | 298 interval[ 0 ] -= 1 |
| 299 return interval | |
| 300 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
301 |
| 0 | 302 def parse_gff_attributes( attr_str ): |
| 303 """ | |
| 304 Parses a GFF/GTF attribute string and returns a dictionary of name-value | |
| 305 pairs. The general format for a GFF3 attributes string is | |
| 306 | |
| 307 name1=value1;name2=value2 | |
| 308 | |
| 309 The general format for a GTF attribute string is | |
| 310 | |
| 311 name1 "value1" ; name2 "value2" | |
| 312 | |
| 313 The general format for a GFF attribute string is a single string that | |
| 314 denotes the interval's group; in this case, method returns a dictionary | |
| 315 with a single key-value pair, and key name is 'group' | |
| 316 """ | |
| 317 attributes_list = attr_str.split(";") | |
| 318 attributes = {} | |
| 319 for name_value_pair in attributes_list: | |
| 320 # Try splitting by '=' (GFF3) first because spaces are allowed in GFF3 | |
| 321 # attribute; next, try double quotes for GTF. | |
| 322 pair = name_value_pair.strip().split("=") | |
| 323 if len( pair ) == 1: | |
| 324 pair = name_value_pair.strip().split("\"") | |
| 325 if len( pair ) == 1: | |
| 326 # Could not split for some reason -- raise exception? | |
| 327 continue | |
| 328 if pair == '': | |
| 329 continue | |
| 330 name = pair[0].strip() | |
| 331 if name == '': | |
| 332 continue | |
| 333 # Need to strip double quote from values | |
| 334 value = pair[1].strip(" \"") | |
| 335 attributes[ name ] = value | |
| 336 | |
| 337 if len( attributes ) == 0: | |
| 338 # Could not split attributes string, so entire string must be | |
| 339 # 'group' attribute. This is the case for strictly GFF files. | |
| 340 attributes['group'] = attr_str | |
| 341 return attributes | |
| 342 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
343 |
| 0 | 344 def gff_attributes_to_str( attrs, gff_format ): |
| 345 """ | |
| 346 Convert GFF attributes to string. Supported formats are GFF3, GTF. | |
| 347 """ | |
| 348 if gff_format == 'GTF': | |
| 349 format_string = '%s "%s"' | |
| 350 # Convert group (GFF) and ID, parent (GFF3) attributes to transcript_id, gene_id | |
| 351 id_attr = None | |
| 352 if 'group' in attrs: | |
| 353 id_attr = 'group' | |
| 354 elif 'ID' in attrs: | |
| 355 id_attr = 'ID' | |
| 356 elif 'Parent' in attrs: | |
| 357 id_attr = 'Parent' | |
| 358 if id_attr: | |
| 359 attrs['transcript_id'] = attrs['gene_id'] = attrs[id_attr] | |
| 360 elif gff_format == 'GFF3': | |
| 361 format_string = '%s=%s' | |
| 362 attrs_strs = [] | |
| 363 for name, value in attrs.items(): | |
| 364 attrs_strs.append( format_string % ( name, value ) ) | |
| 365 return " ; ".join( attrs_strs ) | |
| 366 | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
367 |
| 0 | 368 def read_unordered_gtf( iterator, strict=False ): |
| 369 """ | |
| 370 Returns GTF features found in an iterator. GTF lines need not be ordered | |
| 371 or clustered for reader to work. Reader returns GFFFeature objects sorted | |
| 372 by transcript_id, chrom, and start position. | |
| 373 """ | |
| 374 | |
| 375 # -- Get function that generates line/feature key. -- | |
| 376 | |
| 377 get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ] | |
| 378 if strict: | |
| 379 # Strict GTF parsing uses transcript_id only to group lines into feature. | |
| 380 key_fn = get_transcript_id | |
| 381 else: | |
| 382 # Use lenient parsing where chromosome + transcript_id is the key. This allows | |
| 383 # transcripts with same ID on different chromosomes; this occurs in some popular | |
| 384 # datasources, such as RefGenes in UCSC. | |
| 385 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) | |
| 386 | |
| 387 # Aggregate intervals by transcript_id and collect comments. | |
| 388 feature_intervals = odict() | |
| 389 comments = [] | |
| 390 for count, line in enumerate( iterator ): | |
| 391 if line.startswith( '#' ): | |
| 392 comments.append( Comment( line ) ) | |
| 393 continue | |
| 394 | |
| 395 line_key = key_fn( line.split('\t') ) | |
| 396 if line_key in feature_intervals: | |
| 397 feature = feature_intervals[ line_key ] | |
| 398 else: | |
| 399 feature = [] | |
| 400 feature_intervals[ line_key ] = feature | |
| 401 feature.append( GFFInterval( None, line.split( '\t' ) ) ) | |
| 402 | |
| 403 # Create features. | |
| 404 chroms_features = {} | |
| 405 for count, intervals in enumerate( feature_intervals.values() ): | |
| 406 # Sort intervals by start position. | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
407 intervals.sort( lambda a, b: cmp( a.start, b.start ) ) |
| 0 | 408 feature = GFFFeature( None, intervals=intervals ) |
| 409 if feature.chrom not in chroms_features: | |
| 410 chroms_features[ feature.chrom ] = [] | |
| 411 chroms_features[ feature.chrom ].append( feature ) | |
| 412 | |
| 413 # Sort features by chrom, start position. | |
| 414 chroms_features_sorted = [] | |
| 415 for chrom_features in chroms_features.values(): | |
| 416 chroms_features_sorted.append( chrom_features ) | |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
417 chroms_features_sorted.sort( lambda a, b: cmp( a[0].chrom, b[0].chrom ) ) |
| 0 | 418 for features in chroms_features_sorted: |
|
1
8307665c4b6c
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/flanking_features commit a1517c9d22029095120643bbe2c8fa53754dd2b7
devteam
parents:
0
diff
changeset
|
419 features.sort( lambda a, b: cmp( a.start, b.start ) ) |
| 0 | 420 |
| 421 # Yield comments first, then features. | |
| 422 # FIXME: comments can appear anywhere in file, not just the beginning. | |
| 423 # Ideally, then comments would be associated with features and output | |
| 424 # just before feature/line. | |
| 425 for comment in comments: | |
| 426 yield comment | |
| 427 | |
| 428 for chrom_features in chroms_features_sorted: | |
| 429 for feature in chrom_features: | |
| 430 yield feature |
