0
|
1 #ifndef _MOURISL_CLASSES_SUBEXONGRAPH_HEADER
|
|
2 #define _MOURISL_CLASSES_SUBEXONGRAPH_HEADER
|
|
3
|
|
4 #include "alignments.hpp"
|
|
5 #include "blocks.hpp"
|
|
6
|
|
7 struct _subexon
|
|
8 {
|
|
9 int chrId ;
|
|
10 int geneId ;
|
|
11 int start, end ;
|
|
12 int leftType, rightType ;
|
|
13 double avgDepth ;
|
|
14 //double ratio, classifier ;
|
|
15 double leftRatio, rightRatio ;
|
|
16 double leftClassifier, rightClassifier ;
|
|
17 int lcCnt, rcCnt ;
|
|
18 int leftStrand, rightStrand ;
|
|
19
|
|
20 int nextCnt, prevCnt ;
|
|
21 int *next, *prev ;
|
|
22
|
|
23 bool canBeStart, canBeEnd ;
|
|
24 } ;
|
|
25
|
|
26 struct _geneInterval
|
|
27 {
|
|
28 int startIdx, endIdx ;
|
|
29 int start, end ; // The start and end of a gene interval might be adjusted, so it does not
|
|
30 // need to be match with the corresponding subexons
|
|
31 } ;
|
|
32
|
|
33 class SubexonGraph
|
|
34 {
|
|
35 private:
|
|
36 int *visit ;
|
|
37 double classifierThreshold ;
|
|
38
|
|
39 int usedGeneId ;
|
|
40 int baseGeneId ;
|
|
41
|
|
42 // The function to assign gene ids to subexons.
|
|
43 void SetGeneId( int tag, int strand, struct _subexon *subexons, int seCnt, int id ) ;
|
|
44 void GetGeneBoundary( int tag, int &boundary, int timeStamp ) ;
|
|
45 void UpdateGeneId( struct _subexon *subexons, int seCnt ) ;
|
|
46 public:
|
|
47 std::vector<struct _subexon> subexons ;
|
|
48 std::vector<struct _geneInterval> geneIntervals ;
|
|
49
|
|
50 ~SubexonGraph()
|
|
51 {
|
|
52 int i ;
|
|
53 int size = subexons.size() ;
|
|
54 for ( i = 0 ; i < size ; ++i )
|
|
55 {
|
|
56 if ( subexons[i].next )
|
|
57 delete[] subexons[i].next ;
|
|
58 if ( subexons[i].prev )
|
|
59 delete[] subexons[i].prev ;
|
|
60 }
|
|
61 }
|
|
62
|
|
63 SubexonGraph( double classifierThreshold, Alignments &bam, FILE *fpSubexon )
|
|
64 {
|
|
65 // Read in the subexons
|
|
66 rewind( fpSubexon ) ;
|
|
67 char buffer[2048] ;
|
|
68 int subexonCnt ;
|
|
69 int i, j, k ;
|
|
70 while ( fgets( buffer, sizeof( buffer ), fpSubexon ) != NULL )
|
|
71 {
|
|
72 if ( buffer[0] == '#' )
|
|
73 continue ;
|
|
74
|
|
75 struct _subexon se ;
|
|
76 InputSubexon( buffer, bam, se, true ) ;
|
|
77
|
|
78 // filter.
|
|
79 if ( ( se.leftType == 0 && se.rightType == 0 )
|
|
80 || ( se.leftType == 0 && se.rightType == 1 ) // overhang
|
|
81 || ( se.leftType == 2 && se.rightType == 0 ) // overhang
|
|
82 || ( se.leftType == 2 && se.rightType == 1 ) ) // ir
|
|
83 {
|
|
84 if ( ( se.leftType == 0 && se.rightType == 1 )
|
|
85 || ( se.leftType == 2 && se.rightType == 0 ) ) // if the overhang is too small
|
|
86 {
|
|
87 if ( se.end - se.start + 1 <= 7 )
|
|
88 {
|
|
89 if ( se.next )
|
|
90 delete[] se.next ;
|
|
91 if ( se.prev )
|
|
92 delete[] se.prev ;
|
|
93 continue ;
|
|
94 }
|
|
95 }
|
|
96
|
|
97 if ( se.leftClassifier >= classifierThreshold || se.leftClassifier < 0 )
|
|
98 {
|
|
99 if ( se.next )
|
|
100 delete[] se.next ;
|
|
101 if ( se.prev )
|
|
102 delete[] se.prev ;
|
|
103 continue ;
|
|
104 }
|
|
105 }
|
|
106
|
|
107 // Adjust the coordinate.
|
|
108 subexons.push_back( se ) ;
|
|
109 }
|
|
110
|
|
111 // Convert the coordinate to index
|
|
112 // Note that each coordinate can only associate with one subexon.
|
|
113 subexonCnt = subexons.size() ;
|
|
114 for ( i = 0 ; i < subexonCnt ; ++i )
|
|
115 {
|
|
116 struct _subexon &se = subexons[i] ;
|
|
117 //printf( "hi1 %d: %d %d\n", i, se.prevCnt, se.prev[0] ) ;
|
|
118 int cnt = 0 ;
|
|
119
|
|
120 // due to filter, we may not fully match the coordinate and the subexon
|
|
121 int bound = 0 ;
|
|
122 if ( se.prevCnt > 0 )
|
|
123 bound = se.prev[0] ;
|
|
124 for ( j = i - 1, k = 0 ; k < se.prevCnt && j >= 0 && subexons[j].end >= bound ; --j )
|
|
125 {
|
|
126 //printf( " %d %d: %d %d\n", j, k, se.prev[ se.prevCnt - 1 - k], subexons[j].end ) ;
|
|
127 if ( subexons[j].end == se.prev[se.prevCnt - 1 - k] ) // notice the order is reversed
|
|
128 {
|
|
129 se.prev[se.prevCnt - 1 - cnt] = j ;
|
|
130 ++k ;
|
|
131 ++cnt ;
|
|
132 }
|
|
133 else if ( subexons[j].end < se.prev[ se.prevCnt - 1 - k ] ) // the corresponding subexon gets filtered.
|
|
134 {
|
|
135 ++k ;
|
|
136 ++j ; // counter the --j in the loop
|
|
137 }
|
|
138 }
|
|
139 //printf( "hi2 %d : %d\n", i, se.prevCnt ) ;
|
|
140 // shft the list
|
|
141 for ( j = 0, k = se.prevCnt - cnt ; j < cnt ; ++j, ++k )
|
|
142 {
|
|
143 se.prev[j] = se.prev[k] ;
|
|
144 }
|
|
145 se.prevCnt = cnt ;
|
|
146 cnt = 0 ;
|
|
147 if ( se.nextCnt > 0 )
|
|
148 bound = se.next[ se.nextCnt - 1] ;
|
|
149 for ( j = i + 1, k = 0 ; k < se.nextCnt && j < subexonCnt && subexons[j].start <= bound ; ++j )
|
|
150 {
|
|
151 if ( subexons[j].start == se.next[k] )
|
|
152 {
|
|
153 se.next[cnt] = j ; // cnt is always less than k, so we don't need to worry about overwrite.
|
|
154 ++k ;
|
|
155 ++cnt ;
|
|
156 }
|
|
157 else if ( subexons[j].start > se.next[k] )
|
|
158 {
|
|
159 ++k ;
|
|
160 --j ;
|
|
161 }
|
|
162 }
|
|
163 se.nextCnt = cnt ;
|
|
164 }
|
|
165
|
|
166 // Adjust the coordinate
|
|
167 int seCnt = subexons.size() ;
|
|
168 for ( i = 0 ; i < seCnt ; ++i )
|
|
169 {
|
|
170 --subexons[i].start ;
|
|
171 --subexons[i].end ;
|
|
172 }
|
|
173 rewind( fpSubexon ) ;
|
|
174
|
|
175 // Adjust the classifier for hard boundary, if there is a overhang attached to that region.
|
|
176 for ( i = 0 ; i < seCnt ; ++i )
|
|
177 {
|
|
178 if ( subexons[i].leftType == 1 && subexons[i].leftClassifier < 1 )
|
|
179 {
|
|
180 for ( j = i - 1 ; j >= 0 ; --j )
|
|
181 if ( subexons[j].end < subexons[j + 1].start - 1 )
|
|
182 break ;
|
|
183 if ( subexons[j + 1].leftType == 0 )
|
|
184 subexons[i].leftClassifier = 1 ;
|
|
185 }
|
|
186 if ( subexons[i].rightType == 2 && subexons[i].rightClassifier < 1 )
|
|
187 {
|
|
188 for ( j = i + 1 ; j < seCnt ; ++j )
|
|
189 if ( subexons[j].start > subexons[j - 1].end + 1 )
|
|
190 break ;
|
|
191 if ( subexons[j - 1].rightType == 0 )
|
|
192 subexons[i].rightClassifier = 1 ;
|
|
193 }
|
|
194 }
|
|
195
|
|
196 // For the region of mixture of plus and minus strand subexons, if there is
|
|
197 // no overhang attached to it, we need to let the hard boundary be a candidate terminal sites.
|
|
198 for ( i = 0 ; i < seCnt ; )
|
|
199 {
|
|
200 // [i,j) is a region
|
|
201 int support[2] = {0, 0} ; // the index, 0 is for minus strand, 1 is for plus strand
|
|
202 for ( j = i + 1 ; j < seCnt ; ++j )
|
|
203 {
|
|
204 if ( subexons[j].start > subexons[j - 1].end + 1 )
|
|
205 break ;
|
|
206 }
|
|
207
|
|
208 for ( k = i ; k < j ; ++k )
|
|
209 {
|
|
210 if ( subexons[k].leftStrand != 0 )
|
|
211 ++support[ ( subexons[k].leftStrand + 1 ) / 2 ] ;
|
|
212 if ( subexons[k].rightStrand != 0 )
|
|
213 ++support[ ( subexons[k].rightStrand + 1 ) / 2 ] ;
|
|
214 }
|
|
215 if ( support[0] == 0 || support[1] == 0 )
|
|
216 {
|
|
217 i = j ;
|
|
218 continue ;
|
|
219 }
|
|
220 // a mixture region.
|
|
221 // We force a terminal site if we have only coming-in and no going-out introns.
|
|
222 int leftSupport[2] = {0, 0}, rightSupport[2] = {0, 0};
|
|
223 int l ;
|
|
224 for ( k = i ; k < j ; ++k )
|
|
225 {
|
|
226 int cnt = subexons[k].prevCnt ;
|
|
227 if ( subexons[k].leftStrand != 0 )
|
|
228 for ( l = 0 ; l < cnt ; ++l )
|
|
229 if ( subexons[k].prev[l] < i )
|
|
230 {
|
|
231 ++leftSupport[ ( subexons[k].leftStrand + 1 ) / 2 ] ;
|
|
232 break ;
|
|
233 }
|
|
234 cnt = subexons[k].nextCnt ;
|
|
235 if ( subexons[k].rightStrand != 0 )
|
|
236 for ( l = 0 ; l < cnt ; ++l )
|
|
237 if ( subexons[k].next[l] >= j )
|
|
238 {
|
|
239 ++rightSupport[ ( subexons[k].rightStrand + 1 ) / 2 ] ;
|
|
240 break ;
|
|
241 }
|
|
242 }
|
|
243
|
|
244 if ( ( ( leftSupport[0] > 0 && rightSupport[0] == 0 ) ||
|
|
245 ( leftSupport[1] > 0 && rightSupport[1] == 0 ) ) &&
|
|
246 subexons[j - 1].rightType != 0 )
|
|
247 {
|
|
248 subexons[j - 1].rightClassifier = 0 ;
|
|
249 }
|
|
250
|
|
251 if ( ( ( leftSupport[0] == 0 && rightSupport[0] > 0 ) ||
|
|
252 ( leftSupport[1] == 0 && rightSupport[1] > 0 ) ) &&
|
|
253 subexons[j - 1].leftType != 0 )
|
|
254 {
|
|
255 subexons[j - 1].leftClassifier = 0 ;
|
|
256 }
|
|
257
|
|
258 i = j ;
|
|
259 }
|
|
260
|
|
261 this->classifierThreshold = classifierThreshold ;
|
|
262
|
|
263 usedGeneId = baseGeneId = 0 ;
|
|
264 }
|
|
265
|
|
266 static bool IsSameStrand( int a, int b )
|
|
267 {
|
|
268 if ( a == 0 || b == 0 )
|
|
269 return true ;
|
|
270 if ( a != b )
|
|
271 return false ;
|
|
272 return true ;
|
|
273 }
|
|
274 // Parse the input line
|
|
275 static int InputSubexon( char *in, Alignments &alignments, struct _subexon &se, bool needPrevNext = false )
|
|
276 {
|
|
277 int i ;
|
|
278 char chrName[50] ;
|
|
279 char ls[3], rs[3] ;
|
|
280 sscanf( in, "%s %d %d %d %d %s %s %lf %lf %lf %lf %lf", chrName, &se.start, &se.end, &se.leftType, &se.rightType, ls, rs,
|
|
281 &se.avgDepth, &se.leftRatio, &se.rightRatio,
|
|
282 &se.leftClassifier, &se.rightClassifier ) ;
|
|
283 se.chrId = alignments.GetChromIdFromName( chrName ) ;
|
|
284 se.nextCnt = se.prevCnt = 0 ;
|
|
285 se.next = se.prev = NULL ;
|
|
286 se.lcCnt = se.rcCnt = 0 ;
|
|
287
|
|
288 if ( ls[0] == '+' )
|
|
289 se.leftStrand = 1 ;
|
|
290 else if ( ls[0] == '-' )
|
|
291 se.leftStrand = -1 ;
|
|
292 else
|
|
293 se.leftStrand = 0 ;
|
|
294
|
|
295 if ( rs[0] == '+' )
|
|
296 se.rightStrand = 1 ;
|
|
297 else if ( rs[0] == '-' )
|
|
298 se.rightStrand = -1 ;
|
|
299 else
|
|
300 se.rightStrand = 0 ;
|
|
301
|
|
302 if ( needPrevNext )
|
|
303 {
|
|
304 char *p = in ;
|
|
305 // Locate the offset for prevCnt
|
|
306 for ( i = 0 ; i <= 11 ; ++i )
|
|
307 {
|
|
308 p = strchr( p, ' ' ) ;
|
|
309 ++p ;
|
|
310 }
|
|
311
|
|
312 sscanf( p, "%d", &se.prevCnt ) ;
|
|
313 p = strchr( p, ' ' ) ;
|
|
314 ++p ;
|
|
315 se.prev = new int[ se.prevCnt ] ;
|
|
316 for ( i = 0 ; i < se.prevCnt ; ++i )
|
|
317 {
|
|
318 sscanf( p, "%d", &se.prev[i] ) ;
|
|
319 p = strchr( p, ' ' ) ;
|
|
320 ++p ;
|
|
321 }
|
|
322
|
|
323 sscanf( p, "%d", &se.nextCnt ) ;
|
|
324 p = strchr( p, ' ' ) ;
|
|
325 ++p ;
|
|
326 se.next = new int[ se.nextCnt ] ;
|
|
327 for ( i = 0 ; i < se.nextCnt ; ++i )
|
|
328 {
|
|
329 sscanf( p, "%d", &se.next[i] ) ;
|
|
330 p = strchr( p, ' ' ) ;
|
|
331 ++p ;
|
|
332 }
|
|
333
|
|
334 }
|
|
335 return 1 ;
|
|
336 }
|
|
337
|
|
338 int GetGeneIntervalIdx( int startIdx, int &endIdx, int timeStamp ) ;
|
|
339
|
|
340 //@return: the number of intervals found
|
|
341 int ComputeGeneIntervals() ;
|
|
342
|
|
343 // Return a list of subexons in that interval and in retList the id of subexon
|
|
344 // should be adjusted to start from 0.
|
|
345 int ExtractSubexons( int startIdx, int endIdx, struct _subexon *retList ) ;
|
|
346 } ;
|
|
347
|
|
348 #endif
|