Mercurial > repos > devteam > intersect
comparison utils/gff_util.py @ 5:33b3f3688db4 draft
planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tool_collections/gops/intersect commit cae3e05d02e60f595bb8b6d77a84f030e9bd1689
author | devteam |
---|---|
date | Thu, 22 Jun 2017 18:52:23 -0400 |
parents | 8ddabc73af92 |
children |
comparison
equal
deleted
inserted
replaced
4:8ddabc73af92 | 5:33b3f3688db4 |
---|---|
1 """ | 1 """ |
2 Provides utilities for working with GFF files. | 2 Provides utilities for working with GFF files. |
3 """ | 3 """ |
4 | |
5 import copy | 4 import copy |
| 5 |
6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper | 6 from bx.intervals.io import GenomicInterval, GenomicIntervalReader, MissingFieldError, NiceReaderWrapper |
7 from bx.tabular.io import Header, Comment, ParseError | 7 from bx.tabular.io import Comment, Header, ParseError |
8 from utils.odict import odict | 8 |
| 9 from .odict import odict |
9 | 10 |
10 | 11 |
11 class GFFInterval( GenomicInterval ): | 12 class GFFInterval( GenomicInterval ): |
12 """ | 13 """ |
13 A GFF interval, including attributes. If file is strictly a GFF file, | 14 A GFF interval, including attributes. If file is strictly a GFF file, |
142 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, | 143 interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, |
143 self.start_col, self.end_col, self.strand_col, self.score_col, | 144 self.start_col, self.end_col, self.strand_col, self.score_col, |
144 self.default_strand, fix_strand=self.fix_strand ) | 145 self.default_strand, fix_strand=self.fix_strand ) |
145 return interval | 146 return interval |
146 | 147 |
147 def next( self ): | 148 def __next__( self ): |
148 """ Returns next GFFFeature. """ | 149 """ Returns next GFFFeature. """ |
149 | 150 |
150 # | 151 # |
151 # Helper function. | 152 # Helper function. |
152 # | 153 # |
175 # intervals to read, this is where iterator dies. | 176 # intervals to read, this is where iterator dies. |
176 if not self.seed_interval: | 177 if not self.seed_interval: |
177 while not self.seed_interval: | 178 while not self.seed_interval: |
178 try: | 179 try: |
179 self.seed_interval = GenomicIntervalReader.next( self ) | 180 self.seed_interval = GenomicIntervalReader.next( self ) |
180 except ParseError, e: | 181 except ParseError as e: |
181 handle_parse_error( e ) | 182 handle_parse_error( e ) |
182 # TODO: When no longer supporting python 2.4 use finally: | 183 # TODO: When no longer supporting python 2.4 use finally: |
183 #finally: | 184 # finally: |
184 raw_size += len( self.current_line ) | 185 raw_size += len( self.current_line ) |
185 | 186 |
186 # If header or comment, clear seed interval and return it with its size. | 187 # If header or comment, clear seed interval and return it with its size. |
187 if isinstance( self.seed_interval, ( Header, Comment ) ): | 188 if isinstance( self.seed_interval, ( Header, Comment ) ): |
188 return_val = self.seed_interval | 189 return_val = self.seed_interval |
203 feature_intervals.append( self.seed_interval ) | 204 feature_intervals.append( self.seed_interval ) |
204 while True: | 205 while True: |
205 try: | 206 try: |
206 interval = GenomicIntervalReader.next( self ) | 207 interval = GenomicIntervalReader.next( self ) |
207 raw_size += len( self.current_line ) | 208 raw_size += len( self.current_line ) |
208 except StopIteration, e: | 209 except StopIteration as e: |
209 # No more intervals to read, but last feature needs to be | 210 # No more intervals to read, but last feature needs to be |
210 # returned. | 211 # returned. |
211 interval = None | 212 interval = None |
212 raw_size += len( self.current_line ) | 213 raw_size += len( self.current_line ) |
213 break | 214 break |
214 except ParseError, e: | 215 except ParseError as e: |
215 handle_parse_error( e ) | 216 handle_parse_error( e ) |
216 raw_size += len( self.current_line ) | 217 raw_size += len( self.current_line ) |
217 continue | 218 continue |
218 # TODO: When no longer supporting python 2.4 use finally: | 219 # TODO: When no longer supporting python 2.4 use finally: |
219 #finally: | 220 # finally: |
220 #raw_size += len( self.current_line ) | 221 # raw_size += len( self.current_line ) |
221 | 222 |
222 # Ignore comments. | 223 # Ignore comments. |
223 if isinstance( interval, Comment ): | 224 if isinstance( interval, Comment ): |
224 continue | 225 continue |
225 | 226 |
261 # Convert to BED coords? | 262 # Convert to BED coords? |
262 if self.convert_to_bed_coord: | 263 if self.convert_to_bed_coord: |
263 convert_gff_coords_to_bed( feature ) | 264 convert_gff_coords_to_bed( feature ) |
264 | 265 |
265 return feature | 266 return feature |
| 267 next = __next__ # This line should be removed once the bx-python port to Python3 is finished |
266 | 268 |
267 | 269 |
268 def convert_bed_coords_to_gff( interval ): | 270 def convert_bed_coords_to_gff( interval ): |
269 """ | 271 """ |
270 Converts an interval object's coordinates from BED format to GFF format. | 272 Converts an interval object's coordinates from BED format to GFF format. |
372 by transcript_id, chrom, and start position. | 374 by transcript_id, chrom, and start position. |
373 """ | 375 """ |
374 | 376 |
375 # -- Get function that generates line/feature key. -- | 377 # -- Get function that generates line/feature key. -- |
376 | 378 |
377 get_transcript_id = lambda fields: parse_gff_attributes( fields[8] )[ 'transcript_id' ] | 379 def get_transcript_id(fields): |
| 380 return parse_gff_attributes( fields[8] )[ 'transcript_id' ] |
| 381 |
378 if strict: | 382 if strict: |
379 # Strict GTF parsing uses transcript_id only to group lines into feature. | 383 # Strict GTF parsing uses transcript_id only to group lines into feature. |
380 key_fn = get_transcript_id | 384 key_fn = get_transcript_id |
381 else: | 385 else: |
382 # Use lenient parsing where chromosome + transcript_id is the key. This allows | 386 # Use lenient parsing where chromosome + transcript_id is the key. This allows |
383 # transcripts with same ID on different chromosomes; this occurs in some popular | 387 # transcripts with same ID on different chromosomes; this occurs in some popular |
384 # datasources, such as RefGenes in UCSC. | 388 # datasources, such as RefGenes in UCSC. |
385 key_fn = lambda fields: fields[0] + '_' + get_transcript_id( fields ) | 389 def key_fn(fields): |
| 390 return fields[0] + '_' + get_transcript_id( fields ) |
386 | 391 |
387 # Aggregate intervals by transcript_id and collect comments. | 392 # Aggregate intervals by transcript_id and collect comments. |
388 feature_intervals = odict() | 393 feature_intervals = odict() |
389 comments = [] | 394 comments = [] |
390 for count, line in enumerate( iterator ): | 395 for count, line in enumerate( iterator ): |