annotate annotation_profiler_for_interval.py @ 0:3b33da018e74 draft default tip

Imported from capsule None
author devteam
date Mon, 19 May 2014 12:33:42 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
1 #!/usr/bin/env python
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
2 #Dan Blankenberg
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
3 #For a set of intervals, this tool returns the same set of intervals
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
4 #with 2 additional fields: the name of a Table/Feature and the number of
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
5 #bases covered. The original intervals are repeated for each Table/Feature.
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
6
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
7 import sys, struct, optparse, os, random
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
8 import bx.intervals.io
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
9 import bx.bitset
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
10 try:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
11 import psyco
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
12 psyco.full()
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
13 except:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
14 pass
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
15
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
16 assert sys.version_info[:2] >= ( 2, 4 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
17
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
18 class CachedRangesInFile:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
19 DEFAULT_STRUCT_FORMAT = '<I'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
20 def __init__( self, filename, profiler_info ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
21 self.file_size = os.stat( filename ).st_size
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
22 self.file = open( filename, 'rb' )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
23 self.filename = filename
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
24 self.fmt = profiler_info.get( 'profiler_struct_format', self.DEFAULT_STRUCT_FORMAT )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
25 self.fmt_size = int( profiler_info.get( 'profiler_struct_size', struct.calcsize( self.fmt ) ) )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
26 self.length = int( self.file_size / self.fmt_size / 2 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
27 self._cached_ranges = [ None for i in xrange( self.length ) ]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
28 def __getitem__( self, i ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
29 if self._cached_ranges[i] is not None:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
30 return self._cached_ranges[i]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
31 if i < 0: i = self.length + i
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
32 offset = i * self.fmt_size * 2
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
33 self.file.seek( offset )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
34 try:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
35 start = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
36 end = struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
37 except Exception, e:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
38 raise IndexError, e
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
39 self._cached_ranges[i] = ( start, end )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
40 return start, end
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
41 def __len__( self ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
42 return self.length
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
43
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
44 class RegionCoverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
45 def __init__( self, filename_base, profiler_info ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
46 try:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
47 self._coverage = CachedRangesInFile( "%s.covered" % filename_base, profiler_info )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
48 except Exception, e:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
49 #print "Error loading coverage file %s: %s" % ( "%s.covered" % filename_base, e )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
50 self._coverage = []
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
51 try:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
52 self._total_coverage = int( open( "%s.total_coverage" % filename_base ).read() )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
53 except Exception, e:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
54 #print "Error loading total coverage file %s: %s" % ( "%s.total_coverage" % filename_base, e )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
55 self._total_coverage = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
56 def get_start_index( self, start ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
57 #binary search: returns index of range closest to start
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
58 if start > self._coverage[-1][1]:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
59 return len( self._coverage ) - 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
60 i = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
61 j = len( self._coverage) - 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
62 while i < j:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
63 k = ( i + j ) / 2
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
64 if start <= self._coverage[k][1]:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
65 j = k
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
66 else:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
67 i = k + 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
68 return i
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
69 def get_coverage( self, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
70 return self.get_coverage_regions_overlap( start, end )[0]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
71 def get_coverage_regions_overlap( self, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
72 return self.get_coverage_regions_index_overlap( start, end )[0:2]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
73 def get_coverage_regions_index_overlap( self, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
74 if len( self._coverage ) < 1 or start > self._coverage[-1][1] or end < self._coverage[0][0]:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
75 return 0, 0, 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
76 if self._total_coverage and start <= self._coverage[0][0] and end >= self._coverage[-1][1]:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
77 return self._total_coverage, len( self._coverage ), 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
78 coverage = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
79 region_count = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
80 start_index = self.get_start_index( start )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
81 for i in xrange( start_index, len( self._coverage ) ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
82 c_start, c_end = self._coverage[i]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
83 if c_start > end:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
84 break
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
85 if c_start <= end and c_end >= start:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
86 coverage += min( end, c_end ) - max( start, c_start )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
87 region_count += 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
88 return coverage, region_count, start_index
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
89
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
90 class CachedCoverageReader:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
91 def __init__( self, base_file_path, buffer = 10, table_names = None, profiler_info = None ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
92 self._base_file_path = base_file_path
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
93 self._buffer = buffer #number of chromosomes to keep in memory at a time
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
94 self._coverage = {}
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
95 if table_names is None: table_names = [ table_dir for table_dir in os.listdir( self._base_file_path ) if os.path.isdir( os.path.join( self._base_file_path, table_dir ) ) ]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
96 for tablename in table_names: self._coverage[tablename] = {}
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
97 if profiler_info is None: profiler_info = {}
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
98 self._profiler_info = profiler_info
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
99 def iter_table_coverage_by_region( self, chrom, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
100 for tablename, coverage, regions in self.iter_table_coverage_regions_by_region( chrom, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
101 yield tablename, coverage
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
102 def iter_table_coverage_regions_by_region( self, chrom, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
103 for tablename, coverage, regions, index in self.iter_table_coverage_regions_index_by_region( chrom, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
104 yield tablename, coverage, regions
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
105 def iter_table_coverage_regions_index_by_region( self, chrom, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
106 for tablename, chromosomes in self._coverage.iteritems():
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
107 if chrom not in chromosomes:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
108 if len( chromosomes ) >= self._buffer:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
109 #randomly remove one chromosome from this table
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
110 del chromosomes[ chromosomes.keys().pop( random.randint( 0, self._buffer - 1 ) ) ]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
111 chromosomes[chrom] = RegionCoverage( os.path.join ( self._base_file_path, tablename, chrom ), self._profiler_info )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
112 coverage, regions, index = chromosomes[chrom].get_coverage_regions_index_overlap( start, end )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
113 yield tablename, coverage, regions, index
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
114
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
115 class TableCoverageSummary:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
116 def __init__( self, coverage_reader, chrom_lengths ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
117 self.coverage_reader = coverage_reader
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
118 self.chrom_lengths = chrom_lengths
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
119 self.chromosome_coverage = {} #dict of bitset by chromosome holding user's collapsed input intervals
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
120 self.total_interval_size = 0 #total size of user's input intervals
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
121 self.total_interval_count = 0 #total number of user's input intervals
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
122 self.table_coverage = {} #dict of total coverage by user's input intervals by table
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
123 self.table_chromosome_size = {} #dict of dict of table:chrom containing total coverage of table for a chrom
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
124 self.table_chromosome_count = {} #dict of dict of table:chrom containing total number of coverage ranges of table for a chrom
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
125 self.table_regions_overlaped_count = {} #total number of table regions overlaping user's input intervals (non unique)
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
126 self.interval_table_overlap_count = {} #total number of user input intervals which overlap table
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
127 self.region_size_errors = {} #dictionary of lists of invalid ranges by chromosome
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
128 def add_region( self, chrom, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
129 chrom_length = self.chrom_lengths.get( chrom )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
130 region_start = min( start, chrom_length )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
131 region_end = min( end, chrom_length )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
132 region_length = region_end - region_start
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
133
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
134 if region_length < 1 or region_start != start or region_end != end:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
135 if chrom not in self.region_size_errors:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
136 self.region_size_errors[chrom] = []
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
137 self.region_size_errors[chrom].append( ( start, end ) )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
138 if region_length < 1: return
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
139
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
140 self.total_interval_size += region_length
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
141 self.total_interval_count += 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
142 if chrom not in self.chromosome_coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
143 self.chromosome_coverage[chrom] = bx.bitset.BitSet( chrom_length )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
144
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
145 self.chromosome_coverage[chrom].set_range( region_start, region_length )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
146 for table_name, coverage, regions in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, region_start, region_end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
147 if table_name not in self.table_coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
148 self.table_coverage[table_name] = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
149 self.table_chromosome_size[table_name] = {}
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
150 self.table_regions_overlaped_count[table_name] = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
151 self.interval_table_overlap_count[table_name] = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
152 self.table_chromosome_count[table_name] = {}
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
153 if chrom not in self.table_chromosome_size[table_name]:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
154 self.table_chromosome_size[table_name][chrom] = self.coverage_reader._coverage[table_name][chrom]._total_coverage
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
155 self.table_chromosome_count[table_name][chrom] = len( self.coverage_reader._coverage[table_name][chrom]._coverage )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
156 self.table_coverage[table_name] += coverage
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
157 if coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
158 self.interval_table_overlap_count[table_name] += 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
159 self.table_regions_overlaped_count[table_name] += regions
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
160 def iter_table_coverage( self ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
161 def get_nr_coverage():
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
162 #returns non-redundant coverage, where user's input intervals have been collapse to resolve overlaps
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
163 table_coverage = {} #dictionary of tables containing number of table bases overlaped by nr intervals
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
164 interval_table_overlap_count = {} #dictionary of tables containing number of nr intervals overlaping table
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
165 table_regions_overlap_count = {} #dictionary of tables containing number of regions overlaped (unique)
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
166 interval_count = 0 #total number of nr intervals
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
167 interval_size = 0 #holds total size of nr intervals
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
168 region_start_end = {} #holds absolute start,end for each user input chromosome
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
169 for chrom, chromosome_bitset in self.chromosome_coverage.iteritems():
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
170 #loop through user's collapsed input intervals
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
171 end = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
172 last_end_index = {}
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
173 interval_size += chromosome_bitset.count_range()
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
174 while True:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
175 if end >= chromosome_bitset.size: break
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
176 start = chromosome_bitset.next_set( end )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
177 if start >= chromosome_bitset.size: break
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
178 end = chromosome_bitset.next_clear( start )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
179 interval_count += 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
180 if chrom not in region_start_end:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
181 region_start_end[chrom] = [start, end]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
182 else:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
183 region_start_end[chrom][1] = end
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
184 for table_name, coverage, region_count, start_index in self.coverage_reader.iter_table_coverage_regions_index_by_region( chrom, start, end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
185 if table_name not in table_coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
186 table_coverage[table_name] = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
187 interval_table_overlap_count[table_name] = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
188 table_regions_overlap_count[table_name] = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
189 table_coverage[table_name] += coverage
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
190 if coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
191 interval_table_overlap_count[table_name] += 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
192 table_regions_overlap_count[table_name] += region_count
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
193 if table_name in last_end_index and last_end_index[table_name] == start_index:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
194 table_regions_overlap_count[table_name] -= 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
195 last_end_index[table_name] = start_index + region_count - 1
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
196 table_region_coverage = {} #total coverage for tables by bounding nr interval region
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
197 table_region_count = {} #total number for tables by bounding nr interval region
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
198 for chrom, start_end in region_start_end.items():
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
199 for table_name, coverage, region_count in self.coverage_reader.iter_table_coverage_regions_by_region( chrom, start_end[0], start_end[1] ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
200 if table_name not in table_region_coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
201 table_region_coverage[table_name] = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
202 table_region_count[table_name] = 0
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
203 table_region_coverage[table_name] += coverage
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
204 table_region_count[table_name] += region_count
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
205 return table_region_coverage, table_region_count, interval_count, interval_size, table_coverage, table_regions_overlap_count, interval_table_overlap_count
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
206 table_region_coverage, table_region_count, nr_interval_count, nr_interval_size, nr_table_coverage, nr_table_regions_overlap_count, nr_interval_table_overlap_count = get_nr_coverage()
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
207 for table_name in self.table_coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
208 #TODO: determine a type of statistic, then calculate and report here
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
209 yield table_name, sum( self.table_chromosome_size.get( table_name, {} ).values() ), sum( self.table_chromosome_count.get( table_name, {} ).values() ), table_region_coverage.get( table_name, 0 ), table_region_count.get( table_name, 0 ), self.total_interval_count, self.total_interval_size, self.table_coverage[table_name], self.table_regions_overlaped_count.get( table_name, 0), self.interval_table_overlap_count.get( table_name, 0 ), nr_interval_count, nr_interval_size, nr_table_coverage[table_name], nr_table_regions_overlap_count.get( table_name, 0 ), nr_interval_table_overlap_count.get( table_name, 0 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
210
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
211 def profile_per_interval( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
212 out = open( out_filename, 'wb' )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
213 for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
214 for table_name, coverage, region_count in coverage_reader.iter_table_coverage_regions_by_region( region.chrom, region.start, region.end ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
215 if keep_empty or coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
216 #only output regions that have atleast 1 base covered unless empty are requested
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
217 out.write( "%s\t%s\t%s\t%s\n" % ( "\t".join( region.fields ), table_name, coverage, region_count ) )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
218 out.close()
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
219
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
220 def profile_summary( interval_filename, chrom_col, start_col, end_col, out_filename, keep_empty, coverage_reader, chrom_lengths ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
221 out = open( out_filename, 'wb' )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
222 table_coverage_summary = TableCoverageSummary( coverage_reader, chrom_lengths )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
223 for region in bx.intervals.io.NiceReaderWrapper( open( interval_filename, 'rb' ), chrom_col = chrom_col, start_col = start_col, end_col = end_col, fix_strand = True, return_header = False, return_comments = False ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
224 table_coverage_summary.add_region( region.chrom, region.start, region.end )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
225
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
226 out.write( "#tableName\ttableChromosomeCoverage\ttableChromosomeCount\ttableRegionCoverage\ttableRegionCount\tallIntervalCount\tallIntervalSize\tallCoverage\tallTableRegionsOverlaped\tallIntervalsOverlapingTable\tnrIntervalCount\tnrIntervalSize\tnrCoverage\tnrTableRegionsOverlaped\tnrIntervalsOverlapingTable\n" )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
227 for table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count in table_coverage_summary.iter_table_coverage():
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
228 if keep_empty or total_coverage:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
229 #only output tables that have atleast 1 base covered unless empty are requested
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
230 out.write( "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % ( table_name, table_chromosome_size, table_chromosome_count, table_region_coverage, table_region_count, total_interval_count, total_interval_size, total_coverage, table_regions_overlaped_count, interval_region_overlap_count, nr_interval_count, nr_interval_size, nr_coverage, nr_table_regions_overlaped_count, nr_interval_table_overlap_count ) )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
231 out.close()
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
232
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
233 #report chrom size errors as needed:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
234 if table_coverage_summary.region_size_errors:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
235 print "Regions provided extended beyond known chromosome lengths, and have been truncated as necessary, for the following intervals:"
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
236 for chrom, regions in table_coverage_summary.region_size_errors.items():
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
237 if len( regions ) > 3:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
238 extra_region_info = ", ... "
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
239 else:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
240 extra_region_info = ""
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
241 print "%s has max length of %s, exceeded by %s%s." % ( chrom, chrom_lengths.get( chrom ), ", ".join( map( str, regions[:3] ) ), extra_region_info )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
242
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
243 class ChromosomeLengths:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
244 def __init__( self, profiler_info ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
245 self.chroms = {}
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
246 self.default_bitset_size = int( profiler_info.get( 'bitset_size', bx.bitset.MAX ) )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
247 chroms = profiler_info.get( 'chromosomes', None )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
248 if chroms:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
249 for chrom in chroms.split( ',' ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
250 for fields in chrom.rsplit( '=', 1 ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
251 if len( fields ) == 2:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
252 self.chroms[ fields[0] ] = int( fields[1] )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
253 else:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
254 self.chroms[ fields[0] ] = self.default_bitset_size
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
255 def get( self, name ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
256 return self.chroms.get( name, self.default_bitset_size )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
257
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
258 def parse_profiler_info( filename ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
259 profiler_info = {}
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
260 try:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
261 for line in open( filename ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
262 fields = line.rstrip( '\n\r' ).split( '\t', 1 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
263 if len( fields ) == 2:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
264 if fields[0] in profiler_info:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
265 if not isinstance( profiler_info[ fields[0] ], list ):
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
266 profiler_info[ fields[0] ] = [ profiler_info[ fields[0] ] ]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
267 profiler_info[ fields[0] ].append( fields[1] )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
268 else:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
269 profiler_info[ fields[0] ] = fields[1]
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
270 except:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
271 pass #likely missing file
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
272 return profiler_info
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
273
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
274 def __main__():
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
275 parser = optparse.OptionParser()
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
276 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
277 '-k','--keep_empty',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
278 action="store_true",
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
279 dest='keep_empty',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
280 default=False,
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
281 help='Keep tables with 0 coverage'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
282 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
283 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
284 '-b','--buffer',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
285 dest='buffer',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
286 type='int',default=10,
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
287 help='Number of Chromosomes to keep buffered'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
288 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
289 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
290 '-c','--chrom_col',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
291 dest='chrom_col',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
292 type='int',default=1,
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
293 help='Chromosome column'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
294 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
295 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
296 '-s','--start_col',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
297 dest='start_col',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
298 type='int',default=2,
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
299 help='Start Column'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
300 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
301 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
302 '-e','--end_col',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
303 dest='end_col',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
304 type='int',default=3,
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
305 help='End Column'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
306 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
307 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
308 '-p','--path',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
309 dest='path',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
310 type='str',default='/galaxy/data/annotation_profiler/hg18',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
311 help='Path to profiled data for this organism'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
312 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
313 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
314 '-t','--table_names',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
315 dest='table_names',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
316 type='str',default='None',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
317 help='Table names requested'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
318 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
319 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
320 '-i','--input',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
321 dest='interval_filename',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
322 type='str',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
323 help='Input Interval File'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
324 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
325 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
326 '-o','--output',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
327 dest='out_filename',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
328 type='str',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
329 help='Input Interval File'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
330 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
331 parser.add_option(
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
332 '-S','--summary',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
333 action="store_true",
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
334 dest='summary',
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
335 default=False,
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
336 help='Display Summary Results'
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
337 )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
338
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
339 options, args = parser.parse_args()
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
340
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
341 assert os.path.isdir( options.path ), IOError( "Configuration error: Table directory is missing (%s)" % options.path )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
342
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
343 #get profiler_info
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
344 profiler_info = parse_profiler_info( os.path.join( options.path, 'profiler_info.txt' ) )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
345
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
346 table_names = options.table_names.split( "," )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
347 if table_names == ['None']: table_names = None
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
348 coverage_reader = CachedCoverageReader( options.path, buffer = options.buffer, table_names = table_names, profiler_info = profiler_info )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
349
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
350 if options.summary:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
351 profile_summary( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader, ChromosomeLengths( profiler_info ) )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
352 else:
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
353 profile_per_interval( options.interval_filename, options.chrom_col - 1, options.start_col - 1, options.end_col -1, options.out_filename, options.keep_empty, coverage_reader )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
354
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
355 #print out data version info
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
356 print 'Data version (%s:%s:%s)' % ( profiler_info.get( 'dbkey', 'unknown' ), profiler_info.get( 'profiler_hash', 'unknown' ), profiler_info.get( 'dump_time', 'unknown' ) )
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
357
3b33da018e74 Imported from capsule None
devteam
parents:
diff changeset
358 if __name__ == "__main__": __main__()