Mercurial > repos > devteam > fasta_concatenate_by_species
changeset 2:16df616b39e5 draft
"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/fasta_concatenate_by_species commit cd1ed08574b749eee2a3f6e6151dbb0c8ca15bbf"
author | devteam |
---|---|
date | Sun, 01 Mar 2020 07:24:26 -0500 |
parents | 717aee069681 |
children | 25b8736c627a |
files | fasta_concatenate_by_species.py fasta_concatenate_by_species.xml tool_dependencies.xml utils/maf_utilities.py utils/odict.py |
diffstat | 5 files changed, 442 insertions(+), 453 deletions(-) [+] |
line wrap: on
line diff
--- a/fasta_concatenate_by_species.py Mon Nov 17 10:15:05 2014 -0500 +++ b/fasta_concatenate_by_species.py Sun Mar 01 07:24:26 2020 -0500 @@ -1,39 +1,43 @@ #!/usr/bin/env python -#Dan Blankenberg +# Dan Blankenberg """ -Takes a Multiple Alignment FASTA file and concatenates -sequences for each species, resulting in one sequence +Takes a Multiple Alignment FASTA file and concatenates +sequences for each species, resulting in one sequence alignment per species. """ -import sys, tempfile +import sys +import tempfile +from collections import OrderedDict + from utils.maf_utilities import iter_fasta_alignment -from utils.odict import odict + def __main__(): input_filename = sys.argv[1] output_filename = sys.argv[2] - species = odict() + species = OrderedDict() cur_size = 0 - for components in iter_fasta_alignment( input_filename ): - species_not_written = species.keys() + for components in iter_fasta_alignment(input_filename): + species_not_written = list(species.keys()) for component in components: if component.species not in species: - species[component.species] = tempfile.TemporaryFile() - species[component.species].write( "-" * cur_size ) - species[component.species].write( component.text ) + species[component.species] = tempfile.TemporaryFile(mode="r+") + species[component.species].write("-" * cur_size) + species[component.species].write(component.text) try: - species_not_written.remove( component.species ) + species_not_written.remove(component.species) except ValueError: - #this is a new species + # this is a new species pass for spec in species_not_written: - species[spec].write( "-" * len( components[0].text ) ) - cur_size += len( components[0].text ) - out = open( output_filename, 'wb' ) - for spec, f in species.iteritems(): - f.seek( 0 ) - out.write( ">%s\n%s\n" % ( spec, f.read() ) ) - out.close() + species[spec].write("-" * len(components[0].text)) + cur_size += len(components[0].text) + with open(output_filename, 'w') as out: + for spec, f in species.items(): + f.seek(0) + out.write(">%s\n%s\n" % (spec, f.read())) -if __name__ == "__main__" : __main__() + +if __name__ == "__main__": + __main__()
--- a/fasta_concatenate_by_species.xml Mon Nov 17 10:15:05 2014 -0500 +++ b/fasta_concatenate_by_species.xml Sun Mar 01 07:24:26 2020 -0500 @@ -1,9 +1,13 @@ -<tool id="fasta_concatenate0" name="Concatenate" version="0.0.0"> +<tool id="fasta_concatenate0" name="Concatenate" version="0.0.1" profile="16.04"> <description>FASTA alignment by species</description> <requirements> - <requirement type="package" version="0.7.1">bx-python</requirement> + <requirement type="package" version="0.8.8">bx-python</requirement> </requirements> - <command interpreter="python">fasta_concatenate_by_species.py $input1 $out_file1</command> + <command> + python '$__tool_directory__/fasta_concatenate_by_species.py' + '$input1' + '$out_file1' + </command> <inputs> <param name="input1" type="data" format="fasta" label="FASTA alignment"/> </inputs> @@ -16,7 +20,7 @@ <output name="out_file1" file="fasta_concatenate_out.fasta" /> </test> </tests> - <help> + <help><![CDATA[ **What it does** @@ -71,5 +75,5 @@ This tool will only work properly on files with Galaxy style FASTA headers. -</help> + ]]></help> </tool>
--- a/tool_dependencies.xml Mon Nov 17 10:15:05 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,6 +0,0 @@ -<?xml version="1.0"?> -<tool_dependency> - <package name="bx-python" version="0.7.1"> - <repository changeset_revision="2d0c08728bca" name="package_bx_python_0_7" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" /> - </package> -</tool_dependency>
--- a/utils/maf_utilities.py Mon Nov 17 10:15:05 2014 -0500 +++ b/utils/maf_utilities.py Sun Mar 01 07:24:26 2020 -0500 @@ -2,197 +2,226 @@ """ Provides wrappers and utilities for working with MAF files and alignments. """ -#Dan Blankenberg +# Dan Blankenberg import bx.align.maf import bx.intervals import bx.interval_index_file -import sys, os, string, tempfile +import sys +import os +import tempfile import logging from copy import deepcopy -assert sys.version_info[:2] >= ( 2, 4 ) +try: + maketrans = str.maketrans +except AttributeError: + from string import maketrans + +assert sys.version_info[:2] >= (2, 4) log = logging.getLogger(__name__) -GAP_CHARS = [ '-' ] +GAP_CHARS = ['-'] SRC_SPLIT_CHAR = '.' -def src_split( src ): - fields = src.split( SRC_SPLIT_CHAR, 1 ) - spec = fields.pop( 0 ) + +def src_split(src): + fields = src.split(SRC_SPLIT_CHAR, 1) + spec = fields.pop(0) if fields: - chrom = fields.pop( 0 ) + chrom = fields.pop(0) else: chrom = spec return spec, chrom -def src_merge( spec, chrom, contig = None ): - if None in [ spec, chrom ]: + +def src_merge(spec, chrom, contig=None): + if None in [spec, chrom]: spec = chrom = spec or chrom - return bx.align.maf.src_merge( spec, chrom, contig ) + return bx.align.maf.src_merge(spec, chrom, contig) -def get_species_in_block( block ): + +def get_species_in_block(block): species = [] for c in block.components: - spec, chrom = src_split( c.src ) + spec, chrom = src_split(c.src) if spec not in species: - species.append( spec ) + species.append(spec) return species -def tool_fail( msg = "Unknown Error" ): - print >> sys.stderr, "Fatal Error: %s" % msg - sys.exit() + +def tool_fail(msg="Unknown Error"): + msg = "Fatal Error: %s" % msg + sys.exit(msg) + +# an object corresponding to a reference layered alignment -#an object corresponding to a reference layered alignment -class RegionAlignment( object ): + +class RegionAlignment(object): - DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) - MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed + DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca") - def __init__( self, size, species = [] ): - assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE ) + def __init__(self, size, species=[]): self.size = size self.sequences = {} - if not isinstance( species, list ): + if not isinstance(species, list): species = [species] for spec in species: - self.add_species( spec ) + self.add_species(spec) - #add a species to the alignment - def add_species( self, species ): - #make temporary sequence files + # add a species to the alignment + def add_species(self, species): + # make temporary sequence files self.sequences[species] = tempfile.TemporaryFile() - self.sequences[species].write( "-" * self.size ) + self.sequences[species].write("-" * self.size) - #returns the names for species found in alignment, skipping names as requested - def get_species_names( self, skip = [] ): - if not isinstance( skip, list ): skip = [skip] + # returns the names for species found in alignment, skipping names as requested + def get_species_names(self, skip=[]): + if not isinstance(skip, list): + skip = [skip] names = self.sequences.keys() for name in skip: - try: names.remove( name ) - except: pass + try: + names.remove(name) + except Exception: + pass return names - #returns the sequence for a species - def get_sequence( self, species ): - self.sequences[species].seek( 0 ) + # returns the sequence for a species + def get_sequence(self, species): + self.sequences[species].seek(0) return self.sequences[species].read() - #returns the reverse complement of the sequence for a species - def get_sequence_reverse_complement( self, species ): - complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] + # returns the reverse complement of the sequence for a species + def get_sequence_reverse_complement(self, species): + complement = [base for base in self.get_sequence(species).translate(self.DNA_COMPLEMENT)] complement.reverse() - return "".join( complement ) + return "".join(complement) + + # sets a position for a species + def set_position(self, index, species, base): + if len(base) != 1: + raise Exception("A genomic position can only have a length of 1.") + return self.set_range(index, species, base) + # sets a range for a species - #sets a position for a species - def set_position( self, index, species, base ): - if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." ) - return self.set_range( index, species, base ) - #sets a range for a species - def set_range( self, index, species, bases ): - if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) ) - if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." ) - if species not in self.sequences.keys(): self.add_species( species ) - self.sequences[species].seek( index ) - self.sequences[species].write( bases ) + def set_range(self, index, species, bases): + if index >= self.size or index < 0: + raise Exception("Your index (%i) is out of range (0 - %i)." % (index, self.size - 1)) + if len(bases) == 0: + raise Exception("A set of genomic positions can only have a positive length.") + if species not in self.sequences.keys(): + self.add_species(species) + self.sequences[species].seek(index) + self.sequences[species].write(bases) - #Flush temp file of specified species, or all species - def flush( self, species = None ): + # Flush temp file of specified species, or all species + def flush(self, species=None): if species is None: species = self.sequences.keys() - elif not isinstance( species, list ): + elif not isinstance(species, list): species = [species] for spec in species: self.sequences[spec].flush() -class GenomicRegionAlignment( RegionAlignment ): + +class GenomicRegionAlignment(RegionAlignment): - def __init__( self, start, end, species = [] ): - RegionAlignment.__init__( self, end - start, species ) + def __init__(self, start, end, species=[]): + RegionAlignment.__init__(self, end - start, species) self.start = start self.end = end -class SplicedAlignment( object ): - DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" ) +class SplicedAlignment(object): + + DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca") - def __init__( self, exon_starts, exon_ends, species = [] ): - if not isinstance( exon_starts, list ): + def __init__(self, exon_starts, exon_ends, species=[]): + if not isinstance(exon_starts, list): exon_starts = [exon_starts] - if not isinstance( exon_ends, list ): + if not isinstance(exon_ends, list): exon_ends = [exon_ends] - assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes." + assert len(exon_starts) == len(exon_ends), "The number of starts does not match the number of sizes." self.exons = [] - for i in range( len( exon_starts ) ): - self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) ) + for i in range(len(exon_starts)): + self.exons.append(GenomicRegionAlignment(exon_starts[i], exon_ends[i], species)) - #returns the names for species found in alignment, skipping names as requested - def get_species_names( self, skip = [] ): - if not isinstance( skip, list ): skip = [skip] + # returns the names for species found in alignment, skipping names as requested + def get_species_names(self, skip=[]): + if not isinstance(skip, list): + skip = [skip] names = [] for exon in self.exons: - for name in exon.get_species_names( skip = skip ): + for name in exon.get_species_names(skip=skip): if name not in names: - names.append( name ) + names.append(name) return names - #returns the sequence for a species - def get_sequence( self, species ): + # returns the sequence for a species + def get_sequence(self, species): sequence = tempfile.TemporaryFile() for exon in self.exons: if species in exon.get_species_names(): - sequence.write( exon.get_sequence( species ) ) + sequence.write(exon.get_sequence(species)) else: - sequence.write( "-" * exon.size ) - sequence.seek( 0 ) + sequence.write("-" * exon.size) + sequence.seek(0) return sequence.read() - #returns the reverse complement of the sequence for a species - def get_sequence_reverse_complement( self, species ): - complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )] + # returns the reverse complement of the sequence for a species + def get_sequence_reverse_complement(self, species): + complement = [base for base in self.get_sequence(species).translate(self.DNA_COMPLEMENT)] complement.reverse() - return "".join( complement ) + return "".join(complement) - #Start and end of coding region + # Start and end of coding region @property - def start( self ): + def start(self): return self.exons[0].start + @property - def end( self ): + def end(self): return self.exons[-1].end -#Open a MAF index using a UID -def maf_index_by_uid( maf_uid, index_location_file ): - for line in open( index_location_file ): +# Open a MAF index using a UID + + +def maf_index_by_uid(maf_uid, index_location_file): + for line in open(index_location_file): try: - #read each line, if not enough fields, go to next line - if line[0:1] == "#" : continue + # read each line, if not enough fields, go to next line + if line[0:1] == "#": + continue fields = line.split('\t') if maf_uid == fields[1]: try: - maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," ) - return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False ) - except Exception, e: - raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) ) - except: + maf_files = fields[4].replace("\n", "").replace("\r", "").split(",") + return bx.align.maf.MultiIndexed(maf_files, keep_open=True, parse_e_rows=False) + except Exception as e: + raise Exception('MAF UID (%s) found, but configuration appears to be malformed: %s' % (maf_uid, e)) + except Exception: pass return None -#return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created -def open_or_build_maf_index( maf_file, index_filename, species = None ): +# return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created + + +def open_or_build_maf_index(maf_file, index_filename, species=None): try: - return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None ) - except: - return build_maf_index( maf_file, species = species ) + return (bx.align.maf.Indexed(maf_file, index_filename=index_filename, keep_open=True, parse_e_rows=False), None) + except Exception: + return build_maf_index(maf_file, species=species) -def build_maf_index_species_chromosomes( filename, index_species = None ): + +def build_maf_index_species_chromosomes(filename, index_species=None): species = [] species_chromosomes = {} indexes = bx.interval_index_file.Indexes() blocks = 0 try: - maf_reader = bx.align.maf.Reader( open( filename ) ) + maf_reader = bx.align.maf.Reader(open(filename)) while True: pos = maf_reader.file.tell() block = maf_reader.next() @@ -203,398 +232,441 @@ spec = c.src chrom = None if "." in spec: - spec, chrom = spec.split( ".", 1 ) + spec, chrom = spec.split(".", 1) if spec not in species: - species.append( spec ) + species.append(spec) species_chromosomes[spec] = [] if chrom and chrom not in species_chromosomes[spec]: - species_chromosomes[spec].append( chrom ) + species_chromosomes[spec].append(chrom) if index_species is None or spec in index_species: forward_strand_start = c.forward_strand_start forward_strand_end = c.forward_strand_end try: - forward_strand_start = int( forward_strand_start ) - forward_strand_end = int( forward_strand_end ) + forward_strand_start = int(forward_strand_start) + forward_strand_end = int(forward_strand_end) except ValueError: - continue #start and end are not integers, can't add component to index, goto next component - #this likely only occurs when parse_e_rows is True? - #could a species exist as only e rows? should the + continue # start and end are not integers, can't add component to index, goto next component + # this likely only occurs when parse_e_rows is True? + # could a species exist as only e rows? should the if forward_strand_end > forward_strand_start: - #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed - indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) - except Exception, e: - #most likely a bad MAF - log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) - return ( None, [], {}, 0 ) - return ( indexes, species, species_chromosomes, blocks ) + # require positive length; i.e. certain lines have start = end = 0 and cannot be indexed + indexes.add(c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size) + except Exception as e: + # most likely a bad MAF + log.debug('Building MAF index on %s failed: %s' % (filename, e)) + return (None, [], {}, 0) + return (indexes, species, species_chromosomes, blocks) -#builds and returns ( index, index_filename ) for specified maf_file -def build_maf_index( maf_file, species = None ): - indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species ) +# builds and returns ( index, index_filename ) for specified maf_file + + +def build_maf_index(maf_file, species=None): + indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes(maf_file, species) if indexes is not None: fd, index_filename = tempfile.mkstemp() - out = os.fdopen( fd, 'w' ) - indexes.write( out ) + out = os.fdopen(fd, 'w') + indexes.write(out) out.close() - return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename ) - return ( None, None ) + return (bx.align.maf.Indexed(maf_file, index_filename=index_filename, keep_open=True, parse_e_rows=False), index_filename) + return (None, None) + -def component_overlaps_region( c, region ): - if c is None: return False +def component_overlaps_region(c, region): + if c is None: + return False start, end = c.get_forward_strand_start(), c.get_forward_strand_end() if region.start >= end or region.end <= start: return False return True -def chop_block_by_region( block, src, region, species = None, mincols = 0 ): + +def chop_block_by_region(block, src, region, species=None, mincols=0): # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far: # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end ) # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency # comments welcome - slice_start = block.text_size #max for the min() - slice_end = 0 #min for the max() - old_score = block.score #save old score for later use + slice_start = block.text_size # max for the min() + slice_end = 0 # min for the max() + old_score = block.score # save old score for later use # We no longer assume only one occurance of src per block, so we need to check them all - for c in iter_components_by_src( block, src ): - if component_overlaps_region( c, region ): + for c in iter_components_by_src(block, src): + if component_overlaps_region(c, region): if c.text is not None: rev_strand = False if c.strand == "-": - #We want our coord_to_col coordinates to be returned from positive stranded component + # We want our coord_to_col coordinates to be returned from positive stranded component rev_strand = True c = c.reverse_complement() - start = max( region.start, c.start ) - end = min( region.end, c.end ) - start = c.coord_to_col( start ) - end = c.coord_to_col( end ) + start = max(region.start, c.start) + end = min(region.end, c.end) + start = c.coord_to_col(start) + end = c.coord_to_col(end) if rev_strand: - #need to orient slice coordinates to the original block direction + # need to orient slice coordinates to the original block direction slice_len = end - start - end = len( c.text ) - start + end = len(c.text) - start start = end - slice_len - slice_start = min( start, slice_start ) - slice_end = max( end, slice_end ) + slice_start = min(start, slice_start) + slice_end = max(end, slice_end) if slice_start < slice_end: - block = block.slice( slice_start, slice_end ) + block = block.slice(slice_start, slice_end) if block.text_size > mincols: # restore old score, may not be accurate, but it is better than 0 for everything? block.score = old_score if species is not None: - block = block.limit_to_species( species ) + block = block.limit_to_species(species) block.remove_all_gap_columns() return block return None -def orient_block_by_region( block, src, region, force_strand = None ): - #loop through components matching src, - #make sure each of these components overlap region - #cache strand for each of overlaping regions - #if force_strand / region.strand not in strand cache, reverse complement - ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing - strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ] - if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ): + +def orient_block_by_region(block, src, region, force_strand=None): + # loop through components matching src, + # make sure each of these components overlap region + # cache strand for each of overlaping regions + # if force_strand / region.strand not in strand cache, reverse complement + # we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing + strands = [c.strand for c in iter_components_by_src(block, src) if component_overlaps_region(c, region)] + if strands and (force_strand is None and region.strand not in strands) or (force_strand is not None and force_strand not in strands): block = block.reverse_complement() return block -def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): - for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ): + +def get_oriented_chopped_blocks_for_region(index, src, region, species=None, mincols=0, force_strand=None): + for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols, force_strand): yield block -def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ): - for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): - yield orient_block_by_region( block, src, region, force_strand ), idx, offset + + +def get_oriented_chopped_blocks_with_index_offset_for_region(index, src, region, species=None, mincols=0, force_strand=None): + for block, idx, offset in get_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols): + yield orient_block_by_region(block, src, region, force_strand), idx, offset -#split a block with multiple occurances of src into one block per src -def iter_blocks_split_by_src( block, src ): - for src_c in iter_components_by_src( block, src ): - new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) +# split a block with multiple occurances of src into one block per src + + +def iter_blocks_split_by_src(block, src): + for src_c in iter_components_by_src(block, src): + new_block = bx.align.Alignment(score=block.score, attributes=deepcopy(block.attributes)) new_block.text_size = block.text_size for c in block.components: if c == src_c or c.src != src: - new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components + new_block.add_component(deepcopy(c)) # components have reference to alignment, dont want to loose reference to original alignment block in original components yield new_block -#split a block into multiple blocks with all combinations of a species appearing only once per block -def iter_blocks_split_by_species( block, species = None ): - def __split_components_by_species( components_by_species, new_block ): +# split a block into multiple blocks with all combinations of a species appearing only once per block + + +def iter_blocks_split_by_species(block, species=None): + def __split_components_by_species(components_by_species, new_block): if components_by_species: - #more species with components to add to this block - components_by_species = deepcopy( components_by_species ) - spec_comps = components_by_species.pop( 0 ) + # more species with components to add to this block + components_by_species = deepcopy(components_by_species) + spec_comps = components_by_species.pop(0) for c in spec_comps: - newer_block = deepcopy( new_block ) - newer_block.add_component( deepcopy( c ) ) - for value in __split_components_by_species( components_by_species, newer_block ): + newer_block = deepcopy(new_block) + newer_block.add_component(deepcopy(c)) + for value in __split_components_by_species(components_by_species, newer_block): yield value else: - #no more components to add, yield this block + # no more components to add, yield this block yield new_block - #divide components by species + # divide components by species spec_dict = {} if not species: species = [] for c in block.components: - spec, chrom = src_split( c.src ) + spec, chrom = src_split(c.src) if spec not in spec_dict: - spec_dict[ spec ] = [] - species.append( spec ) - spec_dict[ spec ].append( c ) + spec_dict[spec] = [] + species.append(spec) + spec_dict[spec].append(c) else: for spec in species: - spec_dict[ spec ] = [] - for c in iter_components_by_src_start( block, spec ): - spec_dict[ spec ].append( c ) + spec_dict[spec] = [] + for c in iter_components_by_src_start(block, spec): + spec_dict[spec].append(c) - empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes? + empty_block = bx.align.Alignment(score=block.score, attributes=deepcopy(block.attributes)) # should we copy attributes? empty_block.text_size = block.text_size - #call recursive function to split into each combo of spec/blocks - for value in __split_components_by_species( spec_dict.values(), empty_block ): - sort_block_components_by_block( value, block ) #restore original component order + # call recursive function to split into each combo of spec/blocks + for value in __split_components_by_species(spec_dict.values(), empty_block): + sort_block_components_by_block(value, block) # restore original component order yield value -#generator yielding only chopped and valid blocks for a specified region -def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ): - for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ): +# generator yielding only chopped and valid blocks for a specified region +def get_chopped_blocks_for_region(index, src, region, species=None, mincols=0): + for block, idx, offset in get_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols): yield block -def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ): - for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ): - block = chop_block_by_region( block, src, region, species, mincols ) + + +def get_chopped_blocks_with_index_offset_for_region(index, src, region, species=None, mincols=0): + for block, idx, offset in index.get_as_iterator_with_index_and_offset(src, region.start, region.end): + block = chop_block_by_region(block, src, region, species, mincols) if block is not None: yield block, idx, offset -#returns a filled region alignment for specified regions -def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): - if species is not None: alignment = RegionAlignment( end - start, species ) - else: alignment = RegionAlignment( end - start, primary_species ) - return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps ) +# returns a filled region alignment for specified regions + -#reduces a block to only positions exisiting in the src provided -def reduce_block_by_primary_genome( block, species, chromosome, region_start ): - #returns ( startIndex, {species:texts} - #where texts' contents are reduced to only positions existing in the primary genome - src = "%s.%s" % ( species, chromosome ) - ref = block.get_component_by_src( src ) +def get_region_alignment(index, primary_species, chrom, start, end, strand='+', species=None, mincols=0, overwrite_with_gaps=True): + if species is not None: + alignment = RegionAlignment(end - start, species) + else: + alignment = RegionAlignment(end - start, primary_species) + return fill_region_alignment(alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps) + +# reduces a block to only positions exisiting in the src provided + + +def reduce_block_by_primary_genome(block, species, chromosome, region_start): + # returns ( startIndex, {species:texts} + # where texts' contents are reduced to only positions existing in the primary genome + src = "%s.%s" % (species, chromosome) + ref = block.get_component_by_src(src) start_offset = ref.start - region_start species_texts = {} for c in block.components: - species_texts[ c.src.split( '.' )[0] ] = list( c.text ) - #remove locations which are gaps in the primary species, starting from the downstream end - for i in range( len( species_texts[ species ] ) - 1, -1, -1 ): - if species_texts[ species ][i] == '-': + species_texts[c.src.split('.')[0]] = list(c.text) + # remove locations which are gaps in the primary species, starting from the downstream end + for i in range(len(species_texts[species]) - 1, -1, -1): + if species_texts[species][i] == '-': for text in species_texts.values(): - text.pop( i ) + text.pop(i) for spec, text in species_texts.items(): - species_texts[spec] = ''.join( text ) - return ( start_offset, species_texts ) + species_texts[spec] = ''.join(text) + return (start_offset, species_texts) -#fills a region alignment -def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): - region = bx.intervals.Interval( start, end ) +# fills a region alignment + + +def fill_region_alignment(alignment, index, primary_species, chrom, start, end, strand='+', species=None, mincols=0, overwrite_with_gaps=True): + region = bx.intervals.Interval(start, end) region.chrom = chrom region.strand = strand - primary_src = "%s.%s" % ( primary_species, chrom ) + primary_src = "%s.%s" % (primary_species, chrom) - #Order blocks overlaping this position by score, lowest first + # Order blocks overlaping this position by score, lowest first blocks = [] - for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ): - score = float( block.score ) - for i in range( 0, len( blocks ) ): + for block, idx, offset in index.get_as_iterator_with_index_and_offset(primary_src, start, end): + score = float(block.score) + for i in range(0, len(blocks)): if score < blocks[i][0]: - blocks.insert( i, ( score, idx, offset ) ) + blocks.insert(i, (score, idx, offset)) break else: - blocks.append( ( score, idx, offset ) ) + blocks.append((score, idx, offset)) - #gap_chars_tuple = tuple( GAP_CHARS ) - gap_chars_str = ''.join( GAP_CHARS ) - #Loop through ordered blocks and layer by increasing score + # gap_chars_tuple = tuple( GAP_CHARS ) + gap_chars_str = ''.join(GAP_CHARS) + # Loop through ordered blocks and layer by increasing score for block_dict in blocks: - for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately - if component_overlaps_region( block.get_component_by_src( primary_src ), region ): - block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block - block = orient_block_by_region( block, primary_src, region ) #orient block - start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start ) + for block in iter_blocks_split_by_species(block_dict[1].get_at_offset(block_dict[2])): # need to handle each occurance of sequence in block seperately + if component_overlaps_region(block.get_component_by_src(primary_src), region): + block = chop_block_by_region(block, primary_src, region, species, mincols) # chop block + block = orient_block_by_region(block, primary_src, region) # orient block + start_offset, species_texts = reduce_block_by_primary_genome(block, primary_species, chrom, start) for spec, text in species_texts.items(): - #we should trim gaps from both sides, since these are not positions in this species genome (sequence) - text = text.rstrip( gap_chars_str ) + # we should trim gaps from both sides, since these are not positions in this species genome (sequence) + text = text.rstrip(gap_chars_str) gap_offset = 0 - while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith() - #while text.startswith( gap_chars_tuple ): + while True in [text.startswith(gap_char) for gap_char in GAP_CHARS]: # python2.4 doesn't accept a tuple for .startswith() + # while text.startswith( gap_chars_tuple ): gap_offset += 1 text = text[1:] if not text: break if text: if overwrite_with_gaps: - alignment.set_range( start_offset + gap_offset, spec, text ) + alignment.set_range(start_offset + gap_offset, spec, text) else: - for i, char in enumerate( text ): + for i, char in enumerate(text): if char not in GAP_CHARS: - alignment.set_position( start_offset + gap_offset + i, spec, char ) + alignment.set_position(start_offset + gap_offset + i, spec, char) return alignment -#returns a filled spliced region alignment for specified region with start and end lists -def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ): - #create spliced alignment object - if species is not None: alignment = SplicedAlignment( starts, ends, species ) - else: alignment = SplicedAlignment( starts, ends, [primary_species] ) +# returns a filled spliced region alignment for specified region with start and end lists + + +def get_spliced_region_alignment(index, primary_species, chrom, starts, ends, strand='+', species=None, mincols=0, overwrite_with_gaps=True): + # create spliced alignment object + if species is not None: + alignment = SplicedAlignment(starts, ends, species) + else: + alignment = SplicedAlignment(starts, ends, [primary_species]) for exon in alignment.exons: - fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps ) + fill_region_alignment(exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps) return alignment -#loop through string array, only return non-commented lines -def line_enumerator( lines, comment_start = '#' ): +# loop through string array, only return non-commented lines + + +def line_enumerator(lines, comment_start='#'): i = 0 for line in lines: - if not line.startswith( comment_start ): + if not line.startswith(comment_start): i += 1 - yield ( i, line ) + yield (i, line) -#read a GeneBed file, return list of starts, ends, raw fields -def get_starts_ends_fields_from_gene_bed( line ): - #Starts and ends for exons +# read a GeneBed file, return list of starts, ends, raw fields + + +def get_starts_ends_fields_from_gene_bed(line): + # Starts and ends for exons starts = [] ends = [] fields = line.split() - #Requires atleast 12 BED columns + # Requires atleast 12 BED columns if len(fields) < 12: - raise Exception( "Not a proper 12 column BED line (%s)." % line ) - chrom = fields[0] - tx_start = int( fields[1] ) - tx_end = int( fields[2] ) - name = fields[3] - strand = fields[5] - if strand != '-': strand='+' #Default strand is + - cds_start = int( fields[6] ) - cds_end = int( fields[7] ) + raise Exception("Not a proper 12 column BED line (%s)." % line) + tx_start = int(fields[1]) + strand = fields[5] + if strand != '-': + strand = '+' # Default strand is + + cds_start = int(fields[6]) + cds_end = int(fields[7]) - #Calculate and store starts and ends of coding exons + # Calculate and store starts and ends of coding exons region_start, region_end = cds_start, cds_end - exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) ) - exon_starts = map( ( lambda x: x + tx_start ), exon_starts ) - exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) ) - exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends ); - for start, end in zip( exon_starts, exon_ends ): - start = max( start, region_start ) - end = min( end, region_end ) + exon_starts = map(int, fields[11].rstrip(',\n').split(',')) + exon_starts = map((lambda x: x + tx_start), exon_starts) + exon_ends = map(int, fields[10].rstrip(',').split(',')) + exon_ends = map((lambda x, y: x + y), exon_starts, exon_ends) + for start, end in zip(exon_starts, exon_ends): + start = max(start, region_start) + end = min(end, region_end) if start < end: - starts.append( start ) - ends.append( end ) - return ( starts, ends, fields ) + starts.append(start) + ends.append(end) + return (starts, ends, fields) -def iter_components_by_src( block, src ): + +def iter_components_by_src(block, src): for c in block.components: if c.src == src: yield c -def get_components_by_src( block, src ): - return [ value for value in iter_components_by_src( block, src ) ] + +def get_components_by_src(block, src): + return [value for value in iter_components_by_src(block, src)] -def iter_components_by_src_start( block, src ): + +def iter_components_by_src_start(block, src): for c in block.components: - if c.src.startswith( src ): + if c.src.startswith(src): yield c -def get_components_by_src_start( block, src ): - return [ value for value in iter_components_by_src_start( block, src ) ] + +def get_components_by_src_start(block, src): + return [value for value in iter_components_by_src_start(block, src)] + -def sort_block_components_by_block( block1, block2 ): - #orders the components in block1 by the index of the component in block2 - #block1 must be a subset of block2 - #occurs in-place - return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) ) +def sort_block_components_by_block(block1, block2): + # orders the components in block1 by the index of the component in block2 + # block1 must be a subset of block2 + # occurs in-place + return block1.components.sort(cmp=lambda x, y: block2.components.index(x) - block2.components.index(y)) + -def get_species_in_maf( maf_filename ): +def get_species_in_maf(maf_filename): species = [] - for block in bx.align.maf.Reader( open( maf_filename ) ): - for spec in get_species_in_block( block ): + for block in bx.align.maf.Reader(open(maf_filename)): + for spec in get_species_in_block(block): if spec not in species: - species.append( spec ) + species.append(spec) return species -def parse_species_option( species ): + +def parse_species_option(species): if species: - species = species.split( ',' ) + species = species.split(',') if 'None' not in species: return species - return None #provided species was '', None, or had 'None' in it + return None # provided species was '', None, or had 'None' in it + -def remove_temp_index_file( index_filename ): - try: os.unlink( index_filename ) - except: pass - -#Below are methods to deal with FASTA files +def remove_temp_index_file(index_filename): + try: + os.unlink(index_filename) + except Exception: + pass -def get_fasta_header( component, attributes = {}, suffix = None ): - header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() ) +# Below are methods to deal with FASTA files + + +def get_fasta_header(component, attributes={}, suffix=None): + header = ">%s(%s):%i-%i|" % (component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end()) for key, value in attributes.iteritems(): - header = "%s%s=%s|" % ( header, key, value ) + header = "%s%s=%s|" % (header, key, value) if suffix: - header = "%s%s" % ( header, suffix ) + header = "%s%s" % (header, suffix) else: - header = "%s%s" % ( header, src_split( component.src )[ 0 ] ) + header = "%s%s" % (header, src_split(component.src)[0]) return header -def get_attributes_from_fasta_header( header ): - if not header: return {} + +def get_attributes_from_fasta_header(header): + if not header: + return {} attributes = {} - header = header.lstrip( '>' ) + header = header.lstrip('>') header = header.strip() - fields = header.split( '|' ) + fields = header.split('|') try: region = fields[0] - region = region.split( '(', 1 ) - temp = region[0].split( '.', 1 ) + region = region.split('(', 1) + temp = region[0].split('.', 1) attributes['species'] = temp[0] - if len( temp ) == 2: + if len(temp) == 2: attributes['chrom'] = temp[1] else: attributes['chrom'] = temp[0] - region = region[1].split( ')', 1 ) + region = region[1].split(')', 1) attributes['strand'] = region[0] - region = region[1].lstrip( ':' ).split( '-' ) - attributes['start'] = int( region[0] ) - attributes['end'] = int( region[1] ) - except: - #fields 0 is not a region coordinate + region = region[1].lstrip(':').split('-') + attributes['start'] = int(region[0]) + attributes['end'] = int(region[1]) + except Exception: + # fields 0 is not a region coordinate pass - if len( fields ) > 2: - for i in xrange( 1, len( fields ) - 1 ): - prop = fields[i].split( '=', 1 ) - if len( prop ) == 2: - attributes[ prop[0] ] = prop[1] - if len( fields ) > 1: + if len(fields) > 2: + for i in range(1, len(fields) - 1): + prop = fields[i].split('=', 1) + if len(prop) == 2: + attributes[prop[0]] = prop[1] + if len(fields) > 1: attributes['__suffix__'] = fields[-1] return attributes -def iter_fasta_alignment( filename ): + +def iter_fasta_alignment(filename): class fastaComponent: - def __init__( self, species, text = "" ): + def __init__(self, species, text=""): self.species = species self.text = text - def extend( self, text ): - self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip() - #yields a list of fastaComponents for a FASTA file - f = open( filename, 'rb' ) - components = [] - #cur_component = None - while True: - line = f.readline() - if not line: - if components: - yield components - return - line = line.strip() - if not line: - if components: - yield components - components = [] - elif line.startswith( '>' ): - attributes = get_attributes_from_fasta_header( line ) - components.append( fastaComponent( attributes['species'] ) ) - elif components: - components[-1].extend( line ) + def extend(self, text): + self.text = self.text + text.replace('\n', '').replace('\r', '').strip() + # yields a list of fastaComponents for a FASTA file + with open(filename, 'r') as f: + components = [] + # cur_component = None + while True: + line = f.readline() + if not line: + if components: + yield components + return + line = line.strip() + if not line: + if components: + yield components + components = [] + elif line.startswith('>'): + attributes = get_attributes_from_fasta_header(line) + components.append(fastaComponent(attributes['species'])) + elif components: + components[-1].extend(line)
--- a/utils/odict.py Mon Nov 17 10:15:05 2014 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,85 +0,0 @@ -""" -Ordered dictionary implementation. -""" - -from UserDict import UserDict - -class odict(UserDict): - """ - http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747 - - This dictionary class extends UserDict to record the order in which items are - added. Calling keys(), values(), items(), etc. will return results in this - order. - """ - def __init__( self, dict = None ): - self._keys = [] - UserDict.__init__( self, dict ) - - def __delitem__( self, key ): - UserDict.__delitem__( self, key ) - self._keys.remove( key ) - - def __setitem__( self, key, item ): - UserDict.__setitem__( self, key, item ) - if key not in self._keys: - self._keys.append( key ) - - def clear( self ): - UserDict.clear( self ) - self._keys = [] - - def copy(self): - new = odict() - new.update( self ) - return new - - def items( self ): - return zip( self._keys, self.values() ) - - def keys( self ): - return self._keys[:] - - def popitem( self ): - try: - key = self._keys[-1] - except IndexError: - raise KeyError( 'dictionary is empty' ) - val = self[ key ] - del self[ key ] - return ( key, val ) - - def setdefault( self, key, failobj=None ): - if key not in self._keys: - self._keys.append( key ) - return UserDict.setdefault( self, key, failobj ) - - def update( self, dict ): - for ( key, val ) in dict.items(): - self.__setitem__( key, val ) - - def values( self ): - return map( self.get, self._keys ) - - def iterkeys( self ): - return iter( self._keys ) - - def itervalues( self ): - for key in self._keys: - yield self.get( key ) - - def iteritems( self ): - for key in self._keys: - yield key, self.get( key ) - - def __iter__( self ): - for key in self._keys: - yield key - - def reverse( self ): - self._keys.reverse() - - def insert( self, index, key, item ): - if key not in self._keys: - self._keys.insert( index, key ) - UserDict.__setitem__( self, key, item )