annotate utils/maf_utilities.py @ 0:2126e1b833a2

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:33:30 -0400
parents
children 717aee069681
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
2 """
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
3 Provides wrappers and utilities for working with MAF files and alignments.
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
4 """
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
5 #Dan Blankenberg
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
6 import pkg_resources; pkg_resources.require( "bx-python" )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
7 import bx.align.maf
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
8 import bx.intervals
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
9 import bx.interval_index_file
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
10 import sys, os, string, tempfile
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
11 import logging
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
12 from copy import deepcopy
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
13
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
14 assert sys.version_info[:2] >= ( 2, 4 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
15
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
16 log = logging.getLogger(__name__)
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
17
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
18
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
19 GAP_CHARS = [ '-' ]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
20 SRC_SPLIT_CHAR = '.'
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
21
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
22 def src_split( src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
23 fields = src.split( SRC_SPLIT_CHAR, 1 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
24 spec = fields.pop( 0 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
25 if fields:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
26 chrom = fields.pop( 0 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
27 else:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
28 chrom = spec
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
29 return spec, chrom
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
30
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
31 def src_merge( spec, chrom, contig = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
32 if None in [ spec, chrom ]:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
33 spec = chrom = spec or chrom
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
34 return bx.align.maf.src_merge( spec, chrom, contig )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
35
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
36 def get_species_in_block( block ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
37 species = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
38 for c in block.components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
39 spec, chrom = src_split( c.src )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
40 if spec not in species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
41 species.append( spec )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
42 return species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
43
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
44 def tool_fail( msg = "Unknown Error" ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
45 print >> sys.stderr, "Fatal Error: %s" % msg
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
46 sys.exit()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
47
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
48 #an object corresponding to a reference layered alignment
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
49 class RegionAlignment( object ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
50
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
51 DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
52 MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
53
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
54 def __init__( self, size, species = [] ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
55 assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
56 self.size = size
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
57 self.sequences = {}
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
58 if not isinstance( species, list ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
59 species = [species]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
60 for spec in species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
61 self.add_species( spec )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
62
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
63 #add a species to the alignment
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
64 def add_species( self, species ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
65 #make temporary sequence files
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
66 self.sequences[species] = tempfile.TemporaryFile()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
67 self.sequences[species].write( "-" * self.size )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
68
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
69 #returns the names for species found in alignment, skipping names as requested
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
70 def get_species_names( self, skip = [] ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
71 if not isinstance( skip, list ): skip = [skip]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
72 names = self.sequences.keys()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
73 for name in skip:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
74 try: names.remove( name )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
75 except: pass
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
76 return names
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
77
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
78 #returns the sequence for a species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
79 def get_sequence( self, species ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
80 self.sequences[species].seek( 0 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
81 return self.sequences[species].read()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
82
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
83 #returns the reverse complement of the sequence for a species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
84 def get_sequence_reverse_complement( self, species ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
85 complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
86 complement.reverse()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
87 return "".join( complement )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
88
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
89 #sets a position for a species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
90 def set_position( self, index, species, base ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
91 if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
92 return self.set_range( index, species, base )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
93 #sets a range for a species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
94 def set_range( self, index, species, bases ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
95 if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
96 if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
97 if species not in self.sequences.keys(): self.add_species( species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
98 self.sequences[species].seek( index )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
99 self.sequences[species].write( bases )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
100
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
101 #Flush temp file of specified species, or all species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
102 def flush( self, species = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
103 if species is None:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
104 species = self.sequences.keys()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
105 elif not isinstance( species, list ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
106 species = [species]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
107 for spec in species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
108 self.sequences[spec].flush()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
109
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
110 class GenomicRegionAlignment( RegionAlignment ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
111
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
112 def __init__( self, start, end, species = [] ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
113 RegionAlignment.__init__( self, end - start, species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
114 self.start = start
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
115 self.end = end
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
116
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
117 class SplicedAlignment( object ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
118
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
119 DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
120
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
121 def __init__( self, exon_starts, exon_ends, species = [] ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
122 if not isinstance( exon_starts, list ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
123 exon_starts = [exon_starts]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
124 if not isinstance( exon_ends, list ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
125 exon_ends = [exon_ends]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
126 assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes."
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
127 self.exons = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
128 for i in range( len( exon_starts ) ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
129 self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
130
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
131 #returns the names for species found in alignment, skipping names as requested
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
132 def get_species_names( self, skip = [] ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
133 if not isinstance( skip, list ): skip = [skip]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
134 names = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
135 for exon in self.exons:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
136 for name in exon.get_species_names( skip = skip ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
137 if name not in names:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
138 names.append( name )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
139 return names
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
140
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
141 #returns the sequence for a species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
142 def get_sequence( self, species ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
143 sequence = tempfile.TemporaryFile()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
144 for exon in self.exons:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
145 if species in exon.get_species_names():
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
146 sequence.write( exon.get_sequence( species ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
147 else:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
148 sequence.write( "-" * exon.size )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
149 sequence.seek( 0 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
150 return sequence.read()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
151
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
152 #returns the reverse complement of the sequence for a species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
153 def get_sequence_reverse_complement( self, species ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
154 complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
155 complement.reverse()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
156 return "".join( complement )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
157
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
158 #Start and end of coding region
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
159 @property
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
160 def start( self ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
161 return self.exons[0].start
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
162 @property
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
163 def end( self ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
164 return self.exons[-1].end
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
165
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
166 #Open a MAF index using a UID
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
167 def maf_index_by_uid( maf_uid, index_location_file ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
168 for line in open( index_location_file ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
169 try:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
170 #read each line, if not enough fields, go to next line
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
171 if line[0:1] == "#" : continue
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
172 fields = line.split('\t')
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
173 if maf_uid == fields[1]:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
174 try:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
175 maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
176 return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
177 except Exception, e:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
178 raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
179 except:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
180 pass
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
181 return None
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
182
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
183 #return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
184 def open_or_build_maf_index( maf_file, index_filename, species = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
185 try:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
186 return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
187 except:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
188 return build_maf_index( maf_file, species = species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
189
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
190 def build_maf_index_species_chromosomes( filename, index_species = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
191 species = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
192 species_chromosomes = {}
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
193 indexes = bx.interval_index_file.Indexes()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
194 blocks = 0
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
195 try:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
196 maf_reader = bx.align.maf.Reader( open( filename ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
197 while True:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
198 pos = maf_reader.file.tell()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
199 block = maf_reader.next()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
200 if block is None:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
201 break
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
202 blocks += 1
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
203 for c in block.components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
204 spec = c.src
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
205 chrom = None
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
206 if "." in spec:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
207 spec, chrom = spec.split( ".", 1 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
208 if spec not in species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
209 species.append( spec )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
210 species_chromosomes[spec] = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
211 if chrom and chrom not in species_chromosomes[spec]:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
212 species_chromosomes[spec].append( chrom )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
213 if index_species is None or spec in index_species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
214 forward_strand_start = c.forward_strand_start
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
215 forward_strand_end = c.forward_strand_end
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
216 try:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
217 forward_strand_start = int( forward_strand_start )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
218 forward_strand_end = int( forward_strand_end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
219 except ValueError:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
220 continue #start and end are not integers, can't add component to index, goto next component
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
221 #this likely only occurs when parse_e_rows is True?
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
222 #could a species exist as only e rows? should the
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
223 if forward_strand_end > forward_strand_start:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
224 #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
225 indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
226 except Exception, e:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
227 #most likely a bad MAF
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
228 log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
229 return ( None, [], {}, 0 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
230 return ( indexes, species, species_chromosomes, blocks )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
231
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
232 #builds and returns ( index, index_filename ) for specified maf_file
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
233 def build_maf_index( maf_file, species = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
234 indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
235 if indexes is not None:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
236 fd, index_filename = tempfile.mkstemp()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
237 out = os.fdopen( fd, 'w' )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
238 indexes.write( out )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
239 out.close()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
240 return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
241 return ( None, None )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
242
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
243 def component_overlaps_region( c, region ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
244 if c is None: return False
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
245 start, end = c.get_forward_strand_start(), c.get_forward_strand_end()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
246 if region.start >= end or region.end <= start:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
247 return False
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
248 return True
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
249
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
250 def chop_block_by_region( block, src, region, species = None, mincols = 0 ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
251 # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
252 # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
253 # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
254 # comments welcome
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
255 slice_start = block.text_size #max for the min()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
256 slice_end = 0 #min for the max()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
257 old_score = block.score #save old score for later use
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
258 # We no longer assume only one occurance of src per block, so we need to check them all
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
259 for c in iter_components_by_src( block, src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
260 if component_overlaps_region( c, region ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
261 if c.text is not None:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
262 rev_strand = False
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
263 if c.strand == "-":
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
264 #We want our coord_to_col coordinates to be returned from positive stranded component
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
265 rev_strand = True
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
266 c = c.reverse_complement()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
267 start = max( region.start, c.start )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
268 end = min( region.end, c.end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
269 start = c.coord_to_col( start )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
270 end = c.coord_to_col( end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
271 if rev_strand:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
272 #need to orient slice coordinates to the original block direction
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
273 slice_len = end - start
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
274 end = len( c.text ) - start
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
275 start = end - slice_len
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
276 slice_start = min( start, slice_start )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
277 slice_end = max( end, slice_end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
278
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
279 if slice_start < slice_end:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
280 block = block.slice( slice_start, slice_end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
281 if block.text_size > mincols:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
282 # restore old score, may not be accurate, but it is better than 0 for everything?
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
283 block.score = old_score
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
284 if species is not None:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
285 block = block.limit_to_species( species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
286 block.remove_all_gap_columns()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
287 return block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
288 return None
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
289
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
290 def orient_block_by_region( block, src, region, force_strand = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
291 #loop through components matching src,
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
292 #make sure each of these components overlap region
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
293 #cache strand for each of overlaping regions
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
294 #if force_strand / region.strand not in strand cache, reverse complement
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
295 ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
296 strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
297 if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
298 block = block.reverse_complement()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
299 return block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
300
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
301 def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
302 for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
303 yield block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
304 def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
305 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
306 yield orient_block_by_region( block, src, region, force_strand ), idx, offset
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
307
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
308 #split a block with multiple occurances of src into one block per src
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
309 def iter_blocks_split_by_src( block, src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
310 for src_c in iter_components_by_src( block, src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
311 new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
312 new_block.text_size = block.text_size
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
313 for c in block.components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
314 if c == src_c or c.src != src:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
315 new_block.add_component( deepcopy( c ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
316 yield new_block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
317
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
318 #split a block into multiple blocks with all combinations of a species appearing only once per block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
319 def iter_blocks_split_by_species( block, species = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
320 def __split_components_by_species( components_by_species, new_block ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
321 if components_by_species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
322 #more species with components to add to this block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
323 components_by_species = deepcopy( components_by_species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
324 spec_comps = components_by_species.pop( 0 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
325 for c in spec_comps:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
326 newer_block = deepcopy( new_block )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
327 newer_block.add_component( deepcopy( c ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
328 for value in __split_components_by_species( components_by_species, newer_block ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
329 yield value
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
330 else:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
331 #no more components to add, yield this block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
332 yield new_block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
333
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
334 #divide components by species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
335 spec_dict = {}
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
336 if not species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
337 species = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
338 for c in block.components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
339 spec, chrom = src_split( c.src )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
340 if spec not in spec_dict:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
341 spec_dict[ spec ] = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
342 species.append( spec )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
343 spec_dict[ spec ].append( c )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
344 else:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
345 for spec in species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
346 spec_dict[ spec ] = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
347 for c in iter_components_by_src_start( block, spec ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
348 spec_dict[ spec ].append( c )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
349
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
350 empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes?
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
351 empty_block.text_size = block.text_size
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
352 #call recursive function to split into each combo of spec/blocks
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
353 for value in __split_components_by_species( spec_dict.values(), empty_block ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
354 sort_block_components_by_block( value, block ) #restore original component order
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
355 yield value
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
356
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
357
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
358 #generator yielding only chopped and valid blocks for a specified region
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
359 def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
360 for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
361 yield block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
362 def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
363 for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
364 block = chop_block_by_region( block, src, region, species, mincols )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
365 if block is not None:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
366 yield block, idx, offset
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
367
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
368 #returns a filled region alignment for specified regions
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
369 def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
370 if species is not None: alignment = RegionAlignment( end - start, species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
371 else: alignment = RegionAlignment( end - start, primary_species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
372 return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
373
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
374 #reduces a block to only positions exisiting in the src provided
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
375 def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
376 #returns ( startIndex, {species:texts}
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
377 #where texts' contents are reduced to only positions existing in the primary genome
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
378 src = "%s.%s" % ( species, chromosome )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
379 ref = block.get_component_by_src( src )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
380 start_offset = ref.start - region_start
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
381 species_texts = {}
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
382 for c in block.components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
383 species_texts[ c.src.split( '.' )[0] ] = list( c.text )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
384 #remove locations which are gaps in the primary species, starting from the downstream end
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
385 for i in range( len( species_texts[ species ] ) - 1, -1, -1 ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
386 if species_texts[ species ][i] == '-':
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
387 for text in species_texts.values():
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
388 text.pop( i )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
389 for spec, text in species_texts.items():
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
390 species_texts[spec] = ''.join( text )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
391 return ( start_offset, species_texts )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
392
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
393 #fills a region alignment
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
394 def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
395 region = bx.intervals.Interval( start, end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
396 region.chrom = chrom
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
397 region.strand = strand
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
398 primary_src = "%s.%s" % ( primary_species, chrom )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
399
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
400 #Order blocks overlaping this position by score, lowest first
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
401 blocks = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
402 for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
403 score = float( block.score )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
404 for i in range( 0, len( blocks ) ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
405 if score < blocks[i][0]:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
406 blocks.insert( i, ( score, idx, offset ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
407 break
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
408 else:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
409 blocks.append( ( score, idx, offset ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
410
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
411 #gap_chars_tuple = tuple( GAP_CHARS )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
412 gap_chars_str = ''.join( GAP_CHARS )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
413 #Loop through ordered blocks and layer by increasing score
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
414 for block_dict in blocks:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
415 for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
416 if component_overlaps_region( block.get_component_by_src( primary_src ), region ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
417 block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
418 block = orient_block_by_region( block, primary_src, region ) #orient block
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
419 start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
420 for spec, text in species_texts.items():
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
421 #we should trim gaps from both sides, since these are not positions in this species genome (sequence)
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
422 text = text.rstrip( gap_chars_str )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
423 gap_offset = 0
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
424 while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
425 #while text.startswith( gap_chars_tuple ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
426 gap_offset += 1
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
427 text = text[1:]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
428 if not text:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
429 break
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
430 if text:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
431 if overwrite_with_gaps:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
432 alignment.set_range( start_offset + gap_offset, spec, text )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
433 else:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
434 for i, char in enumerate( text ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
435 if char not in GAP_CHARS:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
436 alignment.set_position( start_offset + gap_offset + i, spec, char )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
437 return alignment
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
438
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
439 #returns a filled spliced region alignment for specified region with start and end lists
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
440 def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
441 #create spliced alignment object
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
442 if species is not None: alignment = SplicedAlignment( starts, ends, species )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
443 else: alignment = SplicedAlignment( starts, ends, [primary_species] )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
444 for exon in alignment.exons:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
445 fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
446 return alignment
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
447
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
448 #loop through string array, only return non-commented lines
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
449 def line_enumerator( lines, comment_start = '#' ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
450 i = 0
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
451 for line in lines:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
452 if not line.startswith( comment_start ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
453 i += 1
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
454 yield ( i, line )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
455
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
456 #read a GeneBed file, return list of starts, ends, raw fields
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
457 def get_starts_ends_fields_from_gene_bed( line ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
458 #Starts and ends for exons
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
459 starts = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
460 ends = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
461
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
462 fields = line.split()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
463 #Requires atleast 12 BED columns
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
464 if len(fields) < 12:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
465 raise Exception( "Not a proper 12 column BED line (%s)." % line )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
466 chrom = fields[0]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
467 tx_start = int( fields[1] )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
468 tx_end = int( fields[2] )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
469 name = fields[3]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
470 strand = fields[5]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
471 if strand != '-': strand='+' #Default strand is +
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
472 cds_start = int( fields[6] )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
473 cds_end = int( fields[7] )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
474
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
475 #Calculate and store starts and ends of coding exons
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
476 region_start, region_end = cds_start, cds_end
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
477 exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
478 exon_starts = map( ( lambda x: x + tx_start ), exon_starts )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
479 exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
480 exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends );
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
481 for start, end in zip( exon_starts, exon_ends ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
482 start = max( start, region_start )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
483 end = min( end, region_end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
484 if start < end:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
485 starts.append( start )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
486 ends.append( end )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
487 return ( starts, ends, fields )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
488
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
489 def iter_components_by_src( block, src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
490 for c in block.components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
491 if c.src == src:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
492 yield c
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
493
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
494 def get_components_by_src( block, src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
495 return [ value for value in iter_components_by_src( block, src ) ]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
496
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
497 def iter_components_by_src_start( block, src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
498 for c in block.components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
499 if c.src.startswith( src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
500 yield c
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
501
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
502 def get_components_by_src_start( block, src ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
503 return [ value for value in iter_components_by_src_start( block, src ) ]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
504
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
505 def sort_block_components_by_block( block1, block2 ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
506 #orders the components in block1 by the index of the component in block2
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
507 #block1 must be a subset of block2
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
508 #occurs in-place
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
509 return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
510
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
511 def get_species_in_maf( maf_filename ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
512 species = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
513 for block in bx.align.maf.Reader( open( maf_filename ) ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
514 for spec in get_species_in_block( block ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
515 if spec not in species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
516 species.append( spec )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
517 return species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
518
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
519 def parse_species_option( species ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
520 if species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
521 species = species.split( ',' )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
522 if 'None' not in species:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
523 return species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
524 return None #provided species was '', None, or had 'None' in it
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
525
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
526 def remove_temp_index_file( index_filename ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
527 try: os.unlink( index_filename )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
528 except: pass
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
529
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
530 #Below are methods to deal with FASTA files
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
531
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
532 def get_fasta_header( component, attributes = {}, suffix = None ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
533 header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
534 for key, value in attributes.iteritems():
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
535 header = "%s%s=%s|" % ( header, key, value )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
536 if suffix:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
537 header = "%s%s" % ( header, suffix )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
538 else:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
539 header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
540 return header
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
541
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
542 def get_attributes_from_fasta_header( header ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
543 if not header: return {}
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
544 attributes = {}
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
545 header = header.lstrip( '>' )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
546 header = header.strip()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
547 fields = header.split( '|' )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
548 try:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
549 region = fields[0]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
550 region = region.split( '(', 1 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
551 temp = region[0].split( '.', 1 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
552 attributes['species'] = temp[0]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
553 if len( temp ) == 2:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
554 attributes['chrom'] = temp[1]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
555 else:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
556 attributes['chrom'] = temp[0]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
557 region = region[1].split( ')', 1 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
558 attributes['strand'] = region[0]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
559 region = region[1].lstrip( ':' ).split( '-' )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
560 attributes['start'] = int( region[0] )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
561 attributes['end'] = int( region[1] )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
562 except:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
563 #fields 0 is not a region coordinate
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
564 pass
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
565 if len( fields ) > 2:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
566 for i in xrange( 1, len( fields ) - 1 ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
567 prop = fields[i].split( '=', 1 )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
568 if len( prop ) == 2:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
569 attributes[ prop[0] ] = prop[1]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
570 if len( fields ) > 1:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
571 attributes['__suffix__'] = fields[-1]
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
572 return attributes
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
573
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
574 def iter_fasta_alignment( filename ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
575 class fastaComponent:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
576 def __init__( self, species, text = "" ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
577 self.species = species
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
578 self.text = text
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
579 def extend( self, text ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
580 self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
581 #yields a list of fastaComponents for a FASTA file
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
582 f = open( filename, 'rb' )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
583 components = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
584 #cur_component = None
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
585 while True:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
586 line = f.readline()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
587 if not line:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
588 if components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
589 yield components
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
590 return
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
591 line = line.strip()
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
592 if not line:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
593 if components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
594 yield components
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
595 components = []
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
596 elif line.startswith( '>' ):
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
597 attributes = get_attributes_from_fasta_header( line )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
598 components.append( fastaComponent( attributes['species'] ) )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
599 elif components:
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
600 components[-1].extend( line )
2126e1b833a2 Imported from capsule None
devteam
parents:
diff changeset
601