changeset 2:16df616b39e5 draft

"planemo upload for repository https://github.com/galaxyproject/tools-devteam/tree/master/tools/fasta_concatenate_by_species commit cd1ed08574b749eee2a3f6e6151dbb0c8ca15bbf"
author devteam
date Sun, 01 Mar 2020 07:24:26 -0500
parents 717aee069681
children 25b8736c627a
files fasta_concatenate_by_species.py fasta_concatenate_by_species.xml tool_dependencies.xml utils/maf_utilities.py utils/odict.py
diffstat 5 files changed, 442 insertions(+), 453 deletions(-) [+]
line wrap: on
line diff
--- a/fasta_concatenate_by_species.py	Mon Nov 17 10:15:05 2014 -0500
+++ b/fasta_concatenate_by_species.py	Sun Mar 01 07:24:26 2020 -0500
@@ -1,39 +1,43 @@
 #!/usr/bin/env python
-#Dan Blankenberg
+# Dan Blankenberg
 """
-Takes a Multiple Alignment FASTA file and concatenates 
-sequences for each species, resulting in one sequence 
+Takes a Multiple Alignment FASTA file and concatenates
+sequences for each species, resulting in one sequence
 alignment per species.
 """
 
-import sys, tempfile
+import sys
+import tempfile
+from collections import OrderedDict
+
 from utils.maf_utilities import iter_fasta_alignment
-from utils.odict import odict
+
 
 def __main__():
     input_filename = sys.argv[1]
     output_filename = sys.argv[2]
-    species = odict()
+    species = OrderedDict()
     cur_size = 0
-    for components in iter_fasta_alignment( input_filename ):
-        species_not_written = species.keys()
+    for components in iter_fasta_alignment(input_filename):
+        species_not_written = list(species.keys())
         for component in components:
             if component.species not in species:
-                species[component.species] = tempfile.TemporaryFile()
-                species[component.species].write( "-" * cur_size )
-            species[component.species].write( component.text )
+                species[component.species] = tempfile.TemporaryFile(mode="r+")
+                species[component.species].write("-" * cur_size)
+            species[component.species].write(component.text)
             try:
-                species_not_written.remove( component.species )
+                species_not_written.remove(component.species)
             except ValueError:
-                #this is a new species
+                # this is a new species
                 pass
         for spec in species_not_written:
-            species[spec].write( "-" * len( components[0].text ) )
-        cur_size += len( components[0].text )
-    out = open( output_filename, 'wb' )
-    for spec, f in species.iteritems():
-        f.seek( 0 )
-        out.write( ">%s\n%s\n" % ( spec, f.read() ) )
-    out.close()
+            species[spec].write("-" * len(components[0].text))
+        cur_size += len(components[0].text)
+    with open(output_filename, 'w') as out:
+        for spec, f in species.items():
+            f.seek(0)
+            out.write(">%s\n%s\n" % (spec, f.read()))
 
-if __name__ == "__main__" : __main__()
+
+if __name__ == "__main__":
+    __main__()
--- a/fasta_concatenate_by_species.xml	Mon Nov 17 10:15:05 2014 -0500
+++ b/fasta_concatenate_by_species.xml	Sun Mar 01 07:24:26 2020 -0500
@@ -1,9 +1,13 @@
-<tool id="fasta_concatenate0" name="Concatenate" version="0.0.0">
+<tool id="fasta_concatenate0" name="Concatenate" version="0.0.1" profile="16.04">
   <description>FASTA alignment by species</description>
   <requirements>
-    <requirement type="package" version="0.7.1">bx-python</requirement>
+    <requirement type="package" version="0.8.8">bx-python</requirement>
   </requirements>
-  <command interpreter="python">fasta_concatenate_by_species.py $input1 $out_file1</command>
+  <command>
+    python '$__tool_directory__/fasta_concatenate_by_species.py'
+      '$input1'
+      '$out_file1'
+  </command>
   <inputs>
     <param name="input1" type="data" format="fasta" label="FASTA alignment"/>
   </inputs>
@@ -16,7 +20,7 @@
       <output name="out_file1" file="fasta_concatenate_out.fasta" />
     </test>
   </tests>
-  <help>
+  <help><![CDATA[
   
 **What it does**
   
@@ -71,5 +75,5 @@
 
  This tool will only work properly on files with Galaxy style FASTA headers.
 
-</help>
+   ]]></help>
 </tool>
--- a/tool_dependencies.xml	Mon Nov 17 10:15:05 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-<?xml version="1.0"?>
-<tool_dependency>
-    <package name="bx-python" version="0.7.1">
-        <repository changeset_revision="2d0c08728bca" name="package_bx_python_0_7" owner="devteam" prior_installation_required="False" toolshed="http://toolshed.g2.bx.psu.edu" />
-    </package>
-</tool_dependency>
--- a/utils/maf_utilities.py	Mon Nov 17 10:15:05 2014 -0500
+++ b/utils/maf_utilities.py	Sun Mar 01 07:24:26 2020 -0500
@@ -2,197 +2,226 @@
 """
 Provides wrappers and utilities for working with MAF files and alignments.
 """
-#Dan Blankenberg
+# Dan Blankenberg
 import bx.align.maf
 import bx.intervals
 import bx.interval_index_file
-import sys, os, string, tempfile
+import sys
+import os
+import tempfile
 import logging
 from copy import deepcopy
 
-assert sys.version_info[:2] >= ( 2, 4 )
+try:
+    maketrans = str.maketrans
+except AttributeError:
+    from string import maketrans
+
+assert sys.version_info[:2] >= (2, 4)
 
 log = logging.getLogger(__name__)
 
 
-GAP_CHARS = [ '-' ]
+GAP_CHARS = ['-']
 SRC_SPLIT_CHAR = '.'
 
-def src_split( src ):
-    fields = src.split( SRC_SPLIT_CHAR, 1 )
-    spec = fields.pop( 0 )
+
+def src_split(src):
+    fields = src.split(SRC_SPLIT_CHAR, 1)
+    spec = fields.pop(0)
     if fields:
-        chrom = fields.pop( 0 )
+        chrom = fields.pop(0)
     else:
         chrom = spec
     return spec, chrom
 
-def src_merge( spec, chrom, contig = None ):
-    if None in [ spec, chrom ]:
+
+def src_merge(spec, chrom, contig=None):
+    if None in [spec, chrom]:
         spec = chrom = spec or chrom
-    return bx.align.maf.src_merge( spec, chrom, contig )
+    return bx.align.maf.src_merge(spec, chrom, contig)
 
-def get_species_in_block( block ):
+
+def get_species_in_block(block):
     species = []
     for c in block.components:
-        spec, chrom = src_split( c.src )
+        spec, chrom = src_split(c.src)
         if spec not in species:
-            species.append( spec )
+            species.append(spec)
     return species
 
-def tool_fail( msg = "Unknown Error" ):
-    print >> sys.stderr, "Fatal Error: %s" % msg
-    sys.exit()
+
+def tool_fail(msg="Unknown Error"):
+    msg = "Fatal Error: %s" % msg
+    sys.exit(msg)
+
+# an object corresponding to a reference layered alignment
 
-#an object corresponding to a reference layered alignment
-class RegionAlignment( object ):
+
+class RegionAlignment(object):
 
-    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
-    MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed
+    DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca")
 
-    def __init__( self, size, species = [] ):
-        assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
+    def __init__(self, size, species=[]):
         self.size = size
         self.sequences = {}
-        if not isinstance( species, list ):
+        if not isinstance(species, list):
             species = [species]
         for spec in species:
-            self.add_species( spec )
+            self.add_species(spec)
 
-    #add a species to the alignment
-    def add_species( self, species ):
-        #make temporary sequence files
+    # add a species to the alignment
+    def add_species(self, species):
+        # make temporary sequence files
         self.sequences[species] = tempfile.TemporaryFile()
-        self.sequences[species].write( "-" * self.size )
+        self.sequences[species].write("-" * self.size)
 
-    #returns the names for species found in alignment, skipping names as requested
-    def get_species_names( self, skip = [] ):
-        if not isinstance( skip, list ): skip = [skip]
+    # returns the names for species found in alignment, skipping names as requested
+    def get_species_names(self, skip=[]):
+        if not isinstance(skip, list):
+            skip = [skip]
         names = self.sequences.keys()
         for name in skip:
-            try: names.remove( name )
-            except: pass
+            try:
+                names.remove(name)
+            except Exception:
+                pass
         return names
 
-    #returns the sequence for a species
-    def get_sequence( self, species ):
-        self.sequences[species].seek( 0 )
+    # returns the sequence for a species
+    def get_sequence(self, species):
+        self.sequences[species].seek(0)
         return self.sequences[species].read()
 
-    #returns the reverse complement of the sequence for a species
-    def get_sequence_reverse_complement( self, species ):
-        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
+    # returns the reverse complement of the sequence for a species
+    def get_sequence_reverse_complement(self, species):
+        complement = [base for base in self.get_sequence(species).translate(self.DNA_COMPLEMENT)]
         complement.reverse()
-        return "".join( complement )
+        return "".join(complement)
+
+    # sets a position for a species
+    def set_position(self, index, species, base):
+        if len(base) != 1:
+            raise Exception("A genomic position can only have a length of 1.")
+        return self.set_range(index, species, base)
+    # sets a range for a species
 
-    #sets a position for a species
-    def set_position( self, index, species, base ):
-        if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )
-        return self.set_range( index, species, base )
-    #sets a range for a species
-    def set_range( self, index, species, bases ):
-        if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
-        if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." )
-        if species not in self.sequences.keys(): self.add_species( species )
-        self.sequences[species].seek( index )
-        self.sequences[species].write( bases )
+    def set_range(self, index, species, bases):
+        if index >= self.size or index < 0:
+            raise Exception("Your index (%i) is out of range (0 - %i)." % (index, self.size - 1))
+        if len(bases) == 0:
+            raise Exception("A set of genomic positions can only have a positive length.")
+        if species not in self.sequences.keys():
+            self.add_species(species)
+        self.sequences[species].seek(index)
+        self.sequences[species].write(bases)
 
-    #Flush temp file of specified species, or all species
-    def flush( self, species = None ):
+    # Flush temp file of specified species, or all species
+    def flush(self, species=None):
         if species is None:
             species = self.sequences.keys()
-        elif not isinstance( species, list ):
+        elif not isinstance(species, list):
             species = [species]
         for spec in species:
             self.sequences[spec].flush()
 
-class GenomicRegionAlignment( RegionAlignment ):
+
+class GenomicRegionAlignment(RegionAlignment):
 
-    def __init__( self, start, end, species = [] ):
-        RegionAlignment.__init__( self, end - start, species )
+    def __init__(self, start, end, species=[]):
+        RegionAlignment.__init__(self, end - start, species)
         self.start = start
         self.end = end
 
-class SplicedAlignment( object ):
 
-    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
+class SplicedAlignment(object):
+
+    DNA_COMPLEMENT = maketrans("ACGTacgt", "TGCAtgca")
 
-    def __init__( self, exon_starts, exon_ends, species = [] ):
-        if not isinstance( exon_starts, list ):
+    def __init__(self, exon_starts, exon_ends, species=[]):
+        if not isinstance(exon_starts, list):
             exon_starts = [exon_starts]
-        if not isinstance( exon_ends, list ):
+        if not isinstance(exon_ends, list):
             exon_ends = [exon_ends]
-        assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes."
+        assert len(exon_starts) == len(exon_ends), "The number of starts does not match the number of sizes."
         self.exons = []
-        for i in range( len( exon_starts ) ):
-            self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )
+        for i in range(len(exon_starts)):
+            self.exons.append(GenomicRegionAlignment(exon_starts[i], exon_ends[i], species))
 
-    #returns the names for species found in alignment, skipping names as requested
-    def get_species_names( self, skip = [] ):
-        if not isinstance( skip, list ): skip = [skip]
+    # returns the names for species found in alignment, skipping names as requested
+    def get_species_names(self, skip=[]):
+        if not isinstance(skip, list):
+            skip = [skip]
         names = []
         for exon in self.exons:
-            for name in exon.get_species_names( skip = skip ):
+            for name in exon.get_species_names(skip=skip):
                 if name not in names:
-                    names.append( name )
+                    names.append(name)
         return names
 
-    #returns the sequence for a species
-    def get_sequence( self, species ):
+    # returns the sequence for a species
+    def get_sequence(self, species):
         sequence = tempfile.TemporaryFile()
         for exon in self.exons:
             if species in exon.get_species_names():
-                sequence.write( exon.get_sequence( species ) )
+                sequence.write(exon.get_sequence(species))
             else:
-                sequence.write( "-" * exon.size )
-        sequence.seek( 0 )
+                sequence.write("-" * exon.size)
+        sequence.seek(0)
         return sequence.read()
 
-    #returns the reverse complement of the sequence for a species
-    def get_sequence_reverse_complement( self, species ):
-        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
+    # returns the reverse complement of the sequence for a species
+    def get_sequence_reverse_complement(self, species):
+        complement = [base for base in self.get_sequence(species).translate(self.DNA_COMPLEMENT)]
         complement.reverse()
-        return "".join( complement )
+        return "".join(complement)
 
-    #Start and end of coding region
+    # Start and end of coding region
     @property
-    def start( self ):
+    def start(self):
         return self.exons[0].start
+
     @property
-    def end( self ):
+    def end(self):
         return self.exons[-1].end
 
-#Open a MAF index using a UID
-def maf_index_by_uid( maf_uid, index_location_file ):
-    for line in open( index_location_file ):
+# Open a MAF index using a UID
+
+
+def maf_index_by_uid(maf_uid, index_location_file):
+    for line in open(index_location_file):
         try:
-            #read each line, if not enough fields, go to next line
-            if line[0:1] == "#" : continue
+            # read each line, if not enough fields, go to next line
+            if line[0:1] == "#":
+                continue
             fields = line.split('\t')
             if maf_uid == fields[1]:
                 try:
-                    maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," )
-                    return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False )
-                except Exception, e:
-                    raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) )
-        except:
+                    maf_files = fields[4].replace("\n", "").replace("\r", "").split(",")
+                    return bx.align.maf.MultiIndexed(maf_files, keep_open=True, parse_e_rows=False)
+                except Exception as e:
+                    raise Exception('MAF UID (%s) found, but configuration appears to be malformed: %s' % (maf_uid, e))
+        except Exception:
             pass
     return None
 
-#return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created
-def open_or_build_maf_index( maf_file, index_filename, species = None ):
+# return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created
+
+
+def open_or_build_maf_index(maf_file, index_filename, species=None):
     try:
-        return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None )
-    except:
-        return build_maf_index( maf_file, species = species )
+        return (bx.align.maf.Indexed(maf_file, index_filename=index_filename, keep_open=True, parse_e_rows=False), None)
+    except Exception:
+        return build_maf_index(maf_file, species=species)
 
-def build_maf_index_species_chromosomes( filename, index_species = None ):
+
+def build_maf_index_species_chromosomes(filename, index_species=None):
     species = []
     species_chromosomes = {}
     indexes = bx.interval_index_file.Indexes()
     blocks = 0
     try:
-        maf_reader = bx.align.maf.Reader( open( filename ) )
+        maf_reader = bx.align.maf.Reader(open(filename))
         while True:
             pos = maf_reader.file.tell()
             block = maf_reader.next()
@@ -203,398 +232,441 @@
                 spec = c.src
                 chrom = None
                 if "." in spec:
-                    spec, chrom = spec.split( ".", 1 )
+                    spec, chrom = spec.split(".", 1)
                 if spec not in species:
-                    species.append( spec )
+                    species.append(spec)
                     species_chromosomes[spec] = []
                 if chrom and chrom not in species_chromosomes[spec]:
-                    species_chromosomes[spec].append( chrom )
+                    species_chromosomes[spec].append(chrom)
                 if index_species is None or spec in index_species:
                     forward_strand_start = c.forward_strand_start
                     forward_strand_end = c.forward_strand_end
                     try:
-                        forward_strand_start = int( forward_strand_start )
-                        forward_strand_end = int( forward_strand_end )
+                        forward_strand_start = int(forward_strand_start)
+                        forward_strand_end = int(forward_strand_end)
                     except ValueError:
-                        continue #start and end are not integers, can't add component to index, goto next component
-                        #this likely only occurs when parse_e_rows is True?
-                        #could a species exist as only e rows? should the
+                        continue  # start and end are not integers, can't add component to index, goto next component
+                        # this likely only occurs when parse_e_rows is True?
+                        # could a species exist as only e rows? should the
                     if forward_strand_end > forward_strand_start:
-                        #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
-                        indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
-    except Exception, e:
-        #most likely a bad MAF
-        log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
-        return ( None, [], {}, 0 )
-    return ( indexes, species, species_chromosomes, blocks )
+                        # require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
+                        indexes.add(c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size)
+    except Exception as e:
+        # most likely a bad MAF
+        log.debug('Building MAF index on %s failed: %s' % (filename, e))
+        return (None, [], {}, 0)
+    return (indexes, species, species_chromosomes, blocks)
 
-#builds and returns ( index, index_filename ) for specified maf_file
-def build_maf_index( maf_file, species = None ):
-    indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species )
+# builds and returns ( index, index_filename ) for specified maf_file
+
+
+def build_maf_index(maf_file, species=None):
+    indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes(maf_file, species)
     if indexes is not None:
         fd, index_filename = tempfile.mkstemp()
-        out = os.fdopen( fd, 'w' )
-        indexes.write( out )
+        out = os.fdopen(fd, 'w')
+        indexes.write(out)
         out.close()
-        return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename )
-    return ( None, None )
+        return (bx.align.maf.Indexed(maf_file, index_filename=index_filename, keep_open=True, parse_e_rows=False), index_filename)
+    return (None, None)
+
 
-def component_overlaps_region( c, region ):
-    if c is None: return False
+def component_overlaps_region(c, region):
+    if c is None:
+        return False
     start, end = c.get_forward_strand_start(), c.get_forward_strand_end()
     if region.start >= end or region.end <= start:
         return False
     return True
 
-def chop_block_by_region( block, src, region, species = None, mincols = 0 ):
+
+def chop_block_by_region(block, src, region, species=None, mincols=0):
     # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far:
     #   behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end )
     #   whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency
     # comments welcome
-    slice_start = block.text_size #max for the min()
-    slice_end = 0 #min for the max()
-    old_score = block.score #save old score for later use
+    slice_start = block.text_size  # max for the min()
+    slice_end = 0  # min for the max()
+    old_score = block.score  # save old score for later use
     # We no longer assume only one occurance of src per block, so we need to check them all
-    for c in iter_components_by_src( block, src ):
-        if component_overlaps_region( c, region ):
+    for c in iter_components_by_src(block, src):
+        if component_overlaps_region(c, region):
             if c.text is not None:
                 rev_strand = False
                 if c.strand == "-":
-                    #We want our coord_to_col coordinates to be returned from positive stranded component
+                    # We want our coord_to_col coordinates to be returned from positive stranded component
                     rev_strand = True
                     c = c.reverse_complement()
-                start = max( region.start, c.start )
-                end = min( region.end, c.end )
-                start = c.coord_to_col( start )
-                end = c.coord_to_col( end )
+                start = max(region.start, c.start)
+                end = min(region.end, c.end)
+                start = c.coord_to_col(start)
+                end = c.coord_to_col(end)
                 if rev_strand:
-                    #need to orient slice coordinates to the original block direction
+                    # need to orient slice coordinates to the original block direction
                     slice_len = end - start
-                    end = len( c.text ) - start
+                    end = len(c.text) - start
                     start = end - slice_len
-                slice_start = min( start, slice_start )
-                slice_end = max( end, slice_end )
+                slice_start = min(start, slice_start)
+                slice_end = max(end, slice_end)
 
     if slice_start < slice_end:
-        block = block.slice( slice_start, slice_end )
+        block = block.slice(slice_start, slice_end)
         if block.text_size > mincols:
             # restore old score, may not be accurate, but it is better than 0 for everything?
             block.score = old_score
             if species is not None:
-                block = block.limit_to_species( species )
+                block = block.limit_to_species(species)
                 block.remove_all_gap_columns()
             return block
     return None
 
-def orient_block_by_region( block, src, region, force_strand = None ):
-    #loop through components matching src,
-    #make sure each of these components overlap region
-    #cache strand for each of overlaping regions
-    #if force_strand / region.strand not in strand cache, reverse complement
-    ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
-    strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ]
-    if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ):
+
+def orient_block_by_region(block, src, region, force_strand=None):
+    # loop through components matching src,
+    # make sure each of these components overlap region
+    # cache strand for each of overlaping regions
+    # if force_strand / region.strand not in strand cache, reverse complement
+    # we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
+    strands = [c.strand for c in iter_components_by_src(block, src) if component_overlaps_region(c, region)]
+    if strands and (force_strand is None and region.strand not in strands) or (force_strand is not None and force_strand not in strands):
         block = block.reverse_complement()
     return block
 
-def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
-    for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
+
+def get_oriented_chopped_blocks_for_region(index, src, region, species=None, mincols=0, force_strand=None):
+    for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols, force_strand):
         yield block
-def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
-    for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
-        yield orient_block_by_region( block, src, region, force_strand ), idx, offset
+
+
+def get_oriented_chopped_blocks_with_index_offset_for_region(index, src, region, species=None, mincols=0, force_strand=None):
+    for block, idx, offset in get_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols):
+        yield orient_block_by_region(block, src, region, force_strand), idx, offset
 
-#split a block with multiple occurances of src into one block per src
-def iter_blocks_split_by_src( block, src ):
-    for src_c in iter_components_by_src( block, src ):
-        new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) )
+# split a block with multiple occurances of src into one block per src
+
+
+def iter_blocks_split_by_src(block, src):
+    for src_c in iter_components_by_src(block, src):
+        new_block = bx.align.Alignment(score=block.score, attributes=deepcopy(block.attributes))
         new_block.text_size = block.text_size
         for c in block.components:
             if c == src_c or c.src != src:
-                new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components
+                new_block.add_component(deepcopy(c))  # components have reference to alignment, dont want to loose reference to original alignment block in original components
         yield new_block
 
-#split a block into multiple blocks with all combinations of a species appearing only once per block
-def iter_blocks_split_by_species( block, species = None ):
-    def __split_components_by_species( components_by_species, new_block ):
+# split a block into multiple blocks with all combinations of a species appearing only once per block
+
+
+def iter_blocks_split_by_species(block, species=None):
+    def __split_components_by_species(components_by_species, new_block):
         if components_by_species:
-            #more species with components to add to this block
-            components_by_species = deepcopy( components_by_species )
-            spec_comps = components_by_species.pop( 0 )
+            # more species with components to add to this block
+            components_by_species = deepcopy(components_by_species)
+            spec_comps = components_by_species.pop(0)
             for c in spec_comps:
-                newer_block = deepcopy( new_block )
-                newer_block.add_component( deepcopy( c ) )
-                for value in __split_components_by_species( components_by_species, newer_block ):
+                newer_block = deepcopy(new_block)
+                newer_block.add_component(deepcopy(c))
+                for value in __split_components_by_species(components_by_species, newer_block):
                     yield value
         else:
-            #no more components to add, yield this block
+            # no more components to add, yield this block
             yield new_block
 
-    #divide components by species
+    # divide components by species
     spec_dict = {}
     if not species:
         species = []
         for c in block.components:
-            spec, chrom = src_split( c.src )
+            spec, chrom = src_split(c.src)
             if spec not in spec_dict:
-                spec_dict[ spec ] = []
-                species.append( spec )
-            spec_dict[ spec ].append( c )
+                spec_dict[spec] = []
+                species.append(spec)
+            spec_dict[spec].append(c)
     else:
         for spec in species:
-            spec_dict[ spec ] = []
-            for c in iter_components_by_src_start( block, spec ):
-                spec_dict[ spec ].append( c )
+            spec_dict[spec] = []
+            for c in iter_components_by_src_start(block, spec):
+                spec_dict[spec].append(c)
 
-    empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes?
+    empty_block = bx.align.Alignment(score=block.score, attributes=deepcopy(block.attributes))  # should we copy attributes?
     empty_block.text_size = block.text_size
-    #call recursive function to split into each combo of spec/blocks
-    for value in __split_components_by_species( spec_dict.values(), empty_block ):
-        sort_block_components_by_block( value, block ) #restore original component order
+    # call recursive function to split into each combo of spec/blocks
+    for value in __split_components_by_species(spec_dict.values(), empty_block):
+        sort_block_components_by_block(value, block)  # restore original component order
         yield value
 
 
-#generator yielding only chopped and valid blocks for a specified region
-def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
-    for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
+# generator yielding only chopped and valid blocks for a specified region
+def get_chopped_blocks_for_region(index, src, region, species=None, mincols=0):
+    for block, idx, offset in get_chopped_blocks_with_index_offset_for_region(index, src, region, species, mincols):
         yield block
-def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
-    for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
-        block = chop_block_by_region( block, src, region, species, mincols )
+
+
+def get_chopped_blocks_with_index_offset_for_region(index, src, region, species=None, mincols=0):
+    for block, idx, offset in index.get_as_iterator_with_index_and_offset(src, region.start, region.end):
+        block = chop_block_by_region(block, src, region, species, mincols)
         if block is not None:
             yield block, idx, offset
 
-#returns a filled region alignment for specified regions
-def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
-    if species is not None: alignment = RegionAlignment( end - start, species )
-    else: alignment = RegionAlignment( end - start, primary_species )
-    return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )
+# returns a filled region alignment for specified regions
+
 
-#reduces a block to only positions exisiting in the src provided
-def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
-    #returns ( startIndex, {species:texts}
-    #where texts' contents are reduced to only positions existing in the primary genome
-    src = "%s.%s" % ( species, chromosome )
-    ref = block.get_component_by_src( src )
+def get_region_alignment(index, primary_species, chrom, start, end, strand='+', species=None, mincols=0, overwrite_with_gaps=True):
+    if species is not None:
+        alignment = RegionAlignment(end - start, species)
+    else:
+        alignment = RegionAlignment(end - start, primary_species)
+    return fill_region_alignment(alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps)
+
+# reduces a block to only positions exisiting in the src provided
+
+
+def reduce_block_by_primary_genome(block, species, chromosome, region_start):
+    # returns ( startIndex, {species:texts}
+    # where texts' contents are reduced to only positions existing in the primary genome
+    src = "%s.%s" % (species, chromosome)
+    ref = block.get_component_by_src(src)
     start_offset = ref.start - region_start
     species_texts = {}
     for c in block.components:
-        species_texts[ c.src.split( '.' )[0] ] = list( c.text )
-    #remove locations which are gaps in the primary species, starting from the downstream end
-    for i in range( len( species_texts[ species ] ) - 1, -1, -1 ):
-        if species_texts[ species ][i] == '-':
+        species_texts[c.src.split('.')[0]] = list(c.text)
+    # remove locations which are gaps in the primary species, starting from the downstream end
+    for i in range(len(species_texts[species]) - 1, -1, -1):
+        if species_texts[species][i] == '-':
             for text in species_texts.values():
-                text.pop( i )
+                text.pop(i)
     for spec, text in species_texts.items():
-        species_texts[spec] = ''.join( text )
-    return ( start_offset, species_texts )
+        species_texts[spec] = ''.join(text)
+    return (start_offset, species_texts)
 
-#fills a region alignment
-def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
-    region = bx.intervals.Interval( start, end )
+# fills a region alignment
+
+
+def fill_region_alignment(alignment, index, primary_species, chrom, start, end, strand='+', species=None, mincols=0, overwrite_with_gaps=True):
+    region = bx.intervals.Interval(start, end)
     region.chrom = chrom
     region.strand = strand
-    primary_src = "%s.%s" % ( primary_species, chrom )
+    primary_src = "%s.%s" % (primary_species, chrom)
 
-    #Order blocks overlaping this position by score, lowest first
+    # Order blocks overlaping this position by score, lowest first
     blocks = []
-    for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
-        score = float( block.score )
-        for i in range( 0, len( blocks ) ):
+    for block, idx, offset in index.get_as_iterator_with_index_and_offset(primary_src, start, end):
+        score = float(block.score)
+        for i in range(0, len(blocks)):
             if score < blocks[i][0]:
-                blocks.insert( i, ( score, idx, offset ) )
+                blocks.insert(i, (score, idx, offset))
                 break
         else:
-            blocks.append( ( score, idx, offset ) )
+            blocks.append((score, idx, offset))
 
-    #gap_chars_tuple = tuple( GAP_CHARS )
-    gap_chars_str = ''.join( GAP_CHARS )
-    #Loop through ordered blocks and layer by increasing score
+    # gap_chars_tuple = tuple( GAP_CHARS )
+    gap_chars_str = ''.join(GAP_CHARS)
+    # Loop through ordered blocks and layer by increasing score
     for block_dict in blocks:
-        for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately
-            if component_overlaps_region( block.get_component_by_src( primary_src ), region ):
-                block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block
-                block = orient_block_by_region( block, primary_src, region ) #orient block
-                start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
+        for block in iter_blocks_split_by_species(block_dict[1].get_at_offset(block_dict[2])):  # need to handle each occurance of sequence in block seperately
+            if component_overlaps_region(block.get_component_by_src(primary_src), region):
+                block = chop_block_by_region(block, primary_src, region, species, mincols)  # chop block
+                block = orient_block_by_region(block, primary_src, region)  # orient block
+                start_offset, species_texts = reduce_block_by_primary_genome(block, primary_species, chrom, start)
                 for spec, text in species_texts.items():
-                    #we should trim gaps from both sides, since these are not positions in this species genome (sequence)
-                    text = text.rstrip( gap_chars_str )
+                    # we should trim gaps from both sides, since these are not positions in this species genome (sequence)
+                    text = text.rstrip(gap_chars_str)
                     gap_offset = 0
-                    while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith()
-                    #while text.startswith( gap_chars_tuple ):
+                    while True in [text.startswith(gap_char) for gap_char in GAP_CHARS]:  # python2.4 doesn't accept a tuple for .startswith()
+                        # while text.startswith( gap_chars_tuple ):
                         gap_offset += 1
                         text = text[1:]
                         if not text:
                             break
                     if text:
                         if overwrite_with_gaps:
-                            alignment.set_range( start_offset + gap_offset, spec, text )
+                            alignment.set_range(start_offset + gap_offset, spec, text)
                         else:
-                            for i, char in enumerate( text ):
+                            for i, char in enumerate(text):
                                 if char not in GAP_CHARS:
-                                    alignment.set_position( start_offset + gap_offset + i, spec, char )
+                                    alignment.set_position(start_offset + gap_offset + i, spec, char)
     return alignment
 
-#returns a filled spliced region alignment for specified region with start and end lists
-def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
-    #create spliced alignment object
-    if species is not None: alignment = SplicedAlignment( starts, ends, species )
-    else: alignment = SplicedAlignment( starts, ends, [primary_species] )
+# returns a filled spliced region alignment for specified region with start and end lists
+
+
+def get_spliced_region_alignment(index, primary_species, chrom, starts, ends, strand='+', species=None, mincols=0, overwrite_with_gaps=True):
+    # create spliced alignment object
+    if species is not None:
+        alignment = SplicedAlignment(starts, ends, species)
+    else:
+        alignment = SplicedAlignment(starts, ends, [primary_species])
     for exon in alignment.exons:
-        fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
+        fill_region_alignment(exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps)
     return alignment
 
-#loop through string array, only return non-commented lines
-def line_enumerator( lines, comment_start = '#' ):
+# loop through string array, only return non-commented lines
+
+
+def line_enumerator(lines, comment_start='#'):
     i = 0
     for line in lines:
-        if not line.startswith( comment_start ):
+        if not line.startswith(comment_start):
             i += 1
-            yield ( i, line )
+            yield (i, line)
 
-#read a GeneBed file, return list of starts, ends, raw fields
-def get_starts_ends_fields_from_gene_bed( line ):
-    #Starts and ends for exons
+# read a GeneBed file, return list of starts, ends, raw fields
+
+
+def get_starts_ends_fields_from_gene_bed(line):
+    # Starts and ends for exons
     starts = []
     ends = []
 
     fields = line.split()
-    #Requires atleast 12 BED columns
+    # Requires atleast 12 BED columns
     if len(fields) < 12:
-        raise Exception( "Not a proper 12 column BED line (%s)." % line )
-    chrom     = fields[0]
-    tx_start  = int( fields[1] )
-    tx_end    = int( fields[2] )
-    name      = fields[3]
-    strand    = fields[5]
-    if strand != '-': strand='+' #Default strand is +
-    cds_start = int( fields[6] )
-    cds_end   = int( fields[7] )
+        raise Exception("Not a proper 12 column BED line (%s)." % line)
+    tx_start = int(fields[1])
+    strand = fields[5]
+    if strand != '-':
+        strand = '+'  # Default strand is +
+    cds_start = int(fields[6])
+    cds_end = int(fields[7])
 
-    #Calculate and store starts and ends of coding exons
+    # Calculate and store starts and ends of coding exons
     region_start, region_end = cds_start, cds_end
-    exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) )
-    exon_starts = map( ( lambda x: x + tx_start ), exon_starts )
-    exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) )
-    exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends );
-    for start, end in zip( exon_starts, exon_ends ):
-        start = max( start, region_start )
-        end = min( end, region_end )
+    exon_starts = map(int, fields[11].rstrip(',\n').split(','))
+    exon_starts = map((lambda x: x + tx_start), exon_starts)
+    exon_ends = map(int, fields[10].rstrip(',').split(','))
+    exon_ends = map((lambda x, y: x + y), exon_starts, exon_ends)
+    for start, end in zip(exon_starts, exon_ends):
+        start = max(start, region_start)
+        end = min(end, region_end)
         if start < end:
-            starts.append( start )
-            ends.append( end )
-    return ( starts, ends, fields )
+            starts.append(start)
+            ends.append(end)
+    return (starts, ends, fields)
 
-def iter_components_by_src( block, src ):
+
+def iter_components_by_src(block, src):
     for c in block.components:
         if c.src == src:
             yield c
 
-def get_components_by_src( block, src ):
-    return [ value for value in iter_components_by_src( block, src ) ]
+
+def get_components_by_src(block, src):
+    return [value for value in iter_components_by_src(block, src)]
 
-def iter_components_by_src_start( block, src ):
+
+def iter_components_by_src_start(block, src):
     for c in block.components:
-        if c.src.startswith( src ):
+        if c.src.startswith(src):
             yield c
 
-def get_components_by_src_start( block, src ):
-    return [ value for value in iter_components_by_src_start( block, src ) ]
+
+def get_components_by_src_start(block, src):
+    return [value for value in iter_components_by_src_start(block, src)]
+
 
-def sort_block_components_by_block( block1, block2 ):
-    #orders the components in block1 by the index of the component in block2
-    #block1 must be a subset of block2
-    #occurs in-place
-    return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) )
+def sort_block_components_by_block(block1, block2):
+    # orders the components in block1 by the index of the component in block2
+    # block1 must be a subset of block2
+    # occurs in-place
+    return block1.components.sort(cmp=lambda x, y: block2.components.index(x) - block2.components.index(y))
+
 
-def get_species_in_maf( maf_filename ):
+def get_species_in_maf(maf_filename):
     species = []
-    for block in bx.align.maf.Reader( open( maf_filename ) ):
-        for spec in get_species_in_block( block ):
+    for block in bx.align.maf.Reader(open(maf_filename)):
+        for spec in get_species_in_block(block):
             if spec not in species:
-                species.append( spec )
+                species.append(spec)
     return species
 
-def parse_species_option( species ):
+
+def parse_species_option(species):
     if species:
-        species = species.split( ',' )
+        species = species.split(',')
         if 'None' not in species:
             return species
-    return None #provided species was '', None, or had 'None' in it
+    return None  # provided species was '', None, or had 'None' in it
+
 
-def remove_temp_index_file( index_filename ):
-    try: os.unlink( index_filename )
-    except: pass
-
-#Below are methods to deal with FASTA files
+def remove_temp_index_file(index_filename):
+    try:
+        os.unlink(index_filename)
+    except Exception:
+        pass
 
-def get_fasta_header( component, attributes = {}, suffix = None ):
-    header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
+# Below are methods to deal with FASTA files
+
+
+def get_fasta_header(component, attributes={}, suffix=None):
+    header = ">%s(%s):%i-%i|" % (component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end())
     for key, value in attributes.iteritems():
-        header = "%s%s=%s|" % ( header, key, value )
+        header = "%s%s=%s|" % (header, key, value)
     if suffix:
-        header = "%s%s" % ( header, suffix )
+        header = "%s%s" % (header, suffix)
     else:
-        header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
+        header = "%s%s" % (header, src_split(component.src)[0])
     return header
 
-def get_attributes_from_fasta_header( header ):
-    if not header: return {}
+
+def get_attributes_from_fasta_header(header):
+    if not header:
+        return {}
     attributes = {}
-    header = header.lstrip( '>' )
+    header = header.lstrip('>')
     header = header.strip()
-    fields = header.split( '|' )
+    fields = header.split('|')
     try:
         region = fields[0]
-        region = region.split( '(', 1 )
-        temp = region[0].split( '.', 1 )
+        region = region.split('(', 1)
+        temp = region[0].split('.', 1)
         attributes['species'] = temp[0]
-        if len( temp ) == 2:
+        if len(temp) == 2:
             attributes['chrom'] = temp[1]
         else:
             attributes['chrom'] = temp[0]
-        region = region[1].split( ')', 1 )
+        region = region[1].split(')', 1)
         attributes['strand'] = region[0]
-        region = region[1].lstrip( ':' ).split( '-' )
-        attributes['start'] = int( region[0] )
-        attributes['end'] = int( region[1] )
-    except:
-        #fields 0 is not a region coordinate
+        region = region[1].lstrip(':').split('-')
+        attributes['start'] = int(region[0])
+        attributes['end'] = int(region[1])
+    except Exception:
+        # fields 0 is not a region coordinate
         pass
-    if len( fields ) > 2:
-        for i in xrange( 1, len( fields ) - 1 ):
-            prop = fields[i].split( '=', 1 )
-            if len( prop ) == 2:
-                attributes[ prop[0] ] = prop[1]
-    if len( fields ) > 1:
+    if len(fields) > 2:
+        for i in range(1, len(fields) - 1):
+            prop = fields[i].split('=', 1)
+            if len(prop) == 2:
+                attributes[prop[0]] = prop[1]
+    if len(fields) > 1:
         attributes['__suffix__'] = fields[-1]
     return attributes
 
-def iter_fasta_alignment( filename ):
+
+def iter_fasta_alignment(filename):
     class fastaComponent:
-        def __init__( self, species, text = "" ):
+        def __init__(self, species, text=""):
             self.species = species
             self.text = text
-        def extend( self, text ):
-            self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
-    #yields a list of fastaComponents for a FASTA file
-    f = open( filename, 'rb' )
-    components = []
-    #cur_component = None
-    while True:
-        line = f.readline()
-        if not line:
-            if components:
-                yield components
-            return
-        line = line.strip()
-        if not line:
-            if components:
-                yield components
-            components = []
-        elif line.startswith( '>' ):
-            attributes = get_attributes_from_fasta_header( line )
-            components.append( fastaComponent( attributes['species'] ) )
-        elif components:
-            components[-1].extend( line )
 
+        def extend(self, text):
+            self.text = self.text + text.replace('\n', '').replace('\r', '').strip()
+    # yields a list of fastaComponents for a FASTA file
+    with open(filename, 'r') as f:
+        components = []
+        # cur_component = None
+        while True:
+            line = f.readline()
+            if not line:
+                if components:
+                    yield components
+                return
+            line = line.strip()
+            if not line:
+                if components:
+                    yield components
+                components = []
+            elif line.startswith('>'):
+                attributes = get_attributes_from_fasta_header(line)
+                components.append(fastaComponent(attributes['species']))
+            elif components:
+                components[-1].extend(line)
--- a/utils/odict.py	Mon Nov 17 10:15:05 2014 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@
-"""
-Ordered dictionary implementation.
-"""
-
-from UserDict import UserDict
-
-class odict(UserDict):
-    """
-    http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/107747
-
-    This dictionary class extends UserDict to record the order in which items are
-    added. Calling keys(), values(), items(), etc. will return results in this
-    order.
-    """
-    def __init__( self, dict = None ):
-        self._keys = []
-        UserDict.__init__( self, dict )
-
-    def __delitem__( self, key ):
-        UserDict.__delitem__( self, key )
-        self._keys.remove( key )
-
-    def __setitem__( self, key, item ):
-        UserDict.__setitem__( self, key, item )
-        if key not in self._keys:
-            self._keys.append( key )
-
-    def clear( self ):
-        UserDict.clear( self )
-        self._keys = []
-
-    def copy(self):
-        new = odict()
-        new.update( self )
-        return new
-
-    def items( self ):
-        return zip( self._keys, self.values() )
-
-    def keys( self ):
-        return self._keys[:]
-
-    def popitem( self ):
-        try:
-            key = self._keys[-1]
-        except IndexError:
-            raise KeyError( 'dictionary is empty' )
-        val = self[ key ]
-        del self[ key ]
-        return ( key, val )
-
-    def setdefault( self, key, failobj=None ):
-        if key not in self._keys:
-            self._keys.append( key )
-        return UserDict.setdefault( self, key, failobj )
-
-    def update( self, dict ):
-        for ( key, val ) in dict.items():
-            self.__setitem__( key, val )
-
-    def values( self ):
-        return map( self.get, self._keys )
-
-    def iterkeys( self ):
-        return iter( self._keys )
-
-    def itervalues( self ):
-        for key in self._keys:
-            yield self.get( key )
-
-    def iteritems( self ):
-        for key in self._keys:
-            yield key, self.get( key )
-
-    def __iter__( self ):
-        for key in self._keys:
-            yield key
-
-    def reverse( self ):
-        self._keys.reverse()
-
-    def insert( self, index, key, item ):
-        if key not in self._keys:
-            self._keys.insert( index, key )
-            UserDict.__setitem__( self, key, item )