Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/Constraints.cpp @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children | |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include "Constraints.hpp" | |
2 | |
3 // return whether this constraint is compatible with the subexons. | |
// Return whether this constraint (one read alignment, given as a list of
// [a,b] reference segments) is compatible with the subexons.
//
// On success, ct.vector has one bit set per covered subexon and
// ct.first/ct.last hold the smallest/largest covered subexon indices.
// On failure (any early "return false"), ct may be partially filled;
// the caller is responsible for releasing ct.vector.
//
// segments - the aligned blocks of one read, in coordinate order.
// segCnt   - number of entries in segments.
// subexons - subexons of the current region, in coordinate order.
// seCnt    - number of subexons.
// seStart  - index of the first subexon that can overlap this alignment.
// ct       - output constraint (bit table over subexons).
bool Constraints::ConvertAlignmentToBitTable( struct _pair *segments, int segCnt,
	struct _subexon *subexons, int seCnt, int seStart, struct _constraint &ct )
{
	int i, j, k ;
	k = seStart ;
	// NOTE(review): the caller (BuildConstraints) already Init'ed ct.vector
	// before calling us — confirm BitTable::Init tolerates re-initialization
	// without leaking the first allocation.
	ct.vector.Init( seCnt ) ;
	// Each segment of an alignment can cover several subexons.
	// But the first and last segment can partially cover a subexon.
	for ( i = 0 ; i < segCnt ; ++i )
	{
		int leftIdx, rightIdx ; // the range of subexons covered by this segment.
		leftIdx = -1 ;
		rightIdx = -1 ;

		// k intentionally persists across segments: both lists are sorted, so
		// the subexons are scanned left-to-right exactly once overall.
		for ( ; k < seCnt ; ++k )
		{
			//if ( segments[0].b == 110282529 && segCnt == 2 )
			// printf( "(%d:%d %d):(%d:%d %d)\n", i, (int)segments[i].a, (int)segments[i].b, k, (int)subexons[k].start, (int)subexons[k].end ) ;
			if ( subexons[k].start > segments[i].b ) // subexon lies past this segment
				break ;
			if ( segments[i].a > subexons[k].end ) // subexon lies before this segment
				continue ;

			// NOTE(review): with relaxedWidth == 0, "start - relaxedWidth < a"
			// is just "start < a" — the boundary relaxation is effectively
			// disabled; confirm whether a nonzero slack was intended.
			int relaxedWidth = 0 ;
			// An overlapping subexon must be fully contained in the segment,
			// except that the read's first segment may start inside a subexon
			// and its last segment may end inside one.
			if ( ( subexons[k].start >= segments[i].a && subexons[k].end <= segments[i].b )
				|| ( i == 0 && subexons[k].start - relaxedWidth < segments[i].a && subexons[k].end <= segments[i].b )
				|| ( i == segCnt - 1 && subexons[k].start >= segments[i].a && subexons[k].end + relaxedWidth > segments[i].b )
				|| ( i == 0 && i == segCnt - 1 && subexons[k].start - relaxedWidth < segments[i].a && subexons[k].end + relaxedWidth > segments[i].b ) )
			{
				if ( leftIdx == -1 )
					leftIdx = k ;
				rightIdx = k ;
				ct.vector.Set( k ) ;
			}
			else
			{
				return false ;
			}
		}

		if ( leftIdx == -1 ) // the segment overlapped no subexon at all
			return false ;

		// The cover contradict the boundary.
		// (A hard left/right boundary — type != 0 — must not be crossed by
		// the segment; type 0 means the boundary is soft/open.)
		if ( !( ( subexons[leftIdx].leftType == 0 || subexons[leftIdx].start <= segments[i].a )
			&& ( subexons[rightIdx].rightType == 0 || subexons[rightIdx].end >= segments[i].b ) ) )
			return false ;

		// The intron must exists in the subexon graph.
		// ct.last still holds the previous segment's rightmost subexon here.
		if ( i > 0 )
		{
			for ( j = 0 ; j < subexons[ ct.last ].nextCnt ; ++j )
				if ( subexons[ct.last].next[j] == leftIdx )
					break ;
			if ( j >= subexons[ ct.last ].nextCnt )
				return false ;
		}

		// The subexons must be consecutive
		for ( j = leftIdx + 1 ; j <= rightIdx ; ++j )
			if ( subexons[j].start > subexons[j - 1].end + 1 )
				return false ;

		if ( i == 0 )
			ct.first = leftIdx ;
		ct.last = rightIdx ;
	}
	return true ;
}
73 | |
74 void Constraints::CoalesceSameConstraints() | |
75 { | |
76 int i, k ; | |
77 int size = constraints.size() ; | |
78 for ( i = 0 ; i < size ; ++i ) | |
79 { | |
80 constraints[i].info = i ; | |
81 //printf( "constraints %d: %d %d %d\n", i, constraints[i].vector.Test( 0 ), constraints[i].vector.Test(1), constraints[i].support ) ; | |
82 } | |
83 | |
84 std::vector<int> newIdx ; | |
85 newIdx.resize( size, 0 ) ; | |
86 | |
87 // Update the constraints. | |
88 if ( size > 0 ) | |
89 { | |
90 std::sort( constraints.begin(), constraints.end(), CompSortConstraints ) ; | |
91 | |
92 k = 0 ; | |
93 newIdx[ constraints[0].info ] = 0 ; | |
94 for ( i = 1 ; i < size ; ++i ) | |
95 { | |
96 if ( constraints[k].vector.IsEqual( constraints[i].vector ) ) | |
97 { | |
98 constraints[k].weight += constraints[i].weight ; | |
99 constraints[k].support += constraints[i].support ; | |
100 constraints[k].uniqSupport += constraints[i].uniqSupport ; | |
101 constraints[k].maxReadLen = ( constraints[k].maxReadLen > constraints[i].maxReadLen ) ? | |
102 constraints[k].maxReadLen : constraints[i].maxReadLen ; | |
103 constraints[i].vector.Release() ; | |
104 } | |
105 else | |
106 { | |
107 ++k ; | |
108 if ( k != i ) | |
109 constraints[k] = constraints[i] ; | |
110 } | |
111 newIdx[ constraints[i].info ] = k ; | |
112 } | |
113 constraints.resize( k + 1 ) ; | |
114 } | |
115 | |
116 // Update the mate pairs. | |
117 size = matePairs.size() ; | |
118 if ( size > 0 ) | |
119 { | |
120 for ( i = 0 ; i < size ; ++i ) | |
121 { | |
122 //printf( "%d %d: %d => %d | %d =>%d\n", i, newIdx.size(), matePairs[i].i, newIdx[ matePairs[i].i ], | |
123 // matePairs[i].j, newIdx[ matePairs[i].j ] ) ; | |
124 matePairs[i].i = newIdx[ matePairs[i].i ] ; | |
125 matePairs[i].j = newIdx[ matePairs[i].j ] ; | |
126 } | |
127 | |
128 std::sort( matePairs.begin(), matePairs.end(), CompSortMatePairs ) ; | |
129 | |
130 k = 0 ; | |
131 for ( i = 1 ; i < size ; ++i ) | |
132 { | |
133 if ( matePairs[i].i == matePairs[k].i && matePairs[i].j == matePairs[k].j ) | |
134 { | |
135 matePairs[k].support += matePairs[i].support ; | |
136 matePairs[k].uniqSupport += matePairs[i].uniqSupport ; | |
137 } | |
138 else | |
139 { | |
140 ++k ; | |
141 matePairs[k] = matePairs[i] ; | |
142 } | |
143 } | |
144 //printf( "%s: %d\n", __func__, matePairs[1].i) ; | |
145 matePairs.resize( k + 1 ) ; | |
146 } | |
147 | |
148 // Update the data structure for future mate pairs. | |
149 mateReadIds.UpdateIdx( newIdx ) ; | |
150 } | |
151 | |
152 void Constraints::ComputeNormAbund( struct _subexon *subexons ) | |
153 { | |
154 int i, j ; | |
155 int ctSize = constraints.size() ; | |
156 for ( i = 0 ; i < ctSize ; ++i ) | |
157 { | |
158 // spanned more than 2 subexon | |
159 int readLen = constraints[i].maxReadLen ; | |
160 if ( constraints[i].first + 1 < constraints[i].last ) | |
161 { | |
162 std::vector<int> subexonInd ; | |
163 constraints[i].vector.GetOnesIndices( subexonInd ) ; | |
164 int size = subexonInd.size() ; | |
165 for ( j = 1 ; j < size - 1 ; ++j ) | |
166 { | |
167 int a = subexonInd[j] ; | |
168 readLen -= ( subexons[a].end - subexons[a].start + 1 ) ; | |
169 } | |
170 } | |
171 | |
172 int effectiveLength ; | |
173 if ( constraints[i].first == constraints[i].last ) | |
174 { | |
175 effectiveLength = ( subexons[ constraints[i].first ].end - readLen + 1 )- subexons[ constraints[i].first ].start + 1 ; | |
176 if ( effectiveLength <= 0 ) // this happens in the 3',5'-end subexon, where we trimmed the length | |
177 effectiveLength = ( subexons[ constraints[i].first ].end - subexons[ constraints[i].first ].start + 1 ) / 2 + 1 ; | |
178 } | |
179 else | |
180 { | |
181 int a = constraints[i].first ; | |
182 int b = constraints[i].last ; | |
183 int start, end ; // the range of the possible start sites of a read in subexons[a]. | |
184 start = subexons[a].end + 1 - ( readLen - 1 ) ; | |
185 if ( start < subexons[a].start ) | |
186 start = subexons[a].start ; | |
187 | |
188 if ( subexons[b].end - subexons[b].start + 1 >= readLen - 1 || subexons[b].rightType == 0 ) | |
189 end = subexons[a].end ; | |
190 else | |
191 { | |
192 end = subexons[a].end + 1 - ( readLen - ( subexons[b].end - subexons[b].start + 1 ) ) ; | |
193 } | |
194 | |
195 if ( end < start ) // when we trimmed the subexon. | |
196 end = subexons[a].start ; | |
197 | |
198 effectiveLength = end - start + 1 ; | |
199 } | |
200 //printf( "%d: effectiveLength=%d support=%d\n", i, effectiveLength, constraints[i].support ) ; | |
201 constraints[i].normAbund = (double)constraints[i].weight / (double)effectiveLength ; | |
202 | |
203 if ( ( subexons[ constraints[i].first ].leftType == 0 && subexons[ constraints[i].first ].end - subexons[ constraints[i].first ].start + 1 >= 8 * pAlignments->readLen ) | |
204 || ( subexons[ constraints[i].last ].rightType == 0 && subexons[ constraints[i].last ].end - subexons[ constraints[i].last ].start + 1 >= 8 * pAlignments->readLen ) ) // some random elongation of the sequence might make unnecessary long effective length. | |
205 { | |
206 constraints[i].normAbund *= 2 ; | |
207 } | |
208 constraints[i].abundance = constraints[i].normAbund ; | |
209 } | |
210 | |
211 ctSize = matePairs.size() ; | |
212 for ( i = 0 ; i < ctSize ; ++i ) | |
213 { | |
214 double a = constraints[ matePairs[i].i ].normAbund ; | |
215 double b = constraints[ matePairs[i].j ].normAbund ; | |
216 | |
217 matePairs[i].normAbund = a < b ? a : b ; | |
218 | |
219 if ( matePairs[i].i != matePairs[i].j ) | |
220 { | |
221 if ( subexons[ constraints[ matePairs[i].i ].first ].leftType == 0 | |
222 && constraints[ matePairs[i].i ].first == constraints[ matePairs[i].i ].last | |
223 && a < b ) | |
224 { | |
225 matePairs[i].normAbund = b ; | |
226 } | |
227 else if ( subexons[ constraints[ matePairs[i].j ].last ].rightType == 0 | |
228 && constraints[ matePairs[i].j ].first == constraints[ matePairs[i].j ].last | |
229 && a > b ) | |
230 { | |
231 matePairs[i].normAbund = a ; | |
232 } | |
233 } | |
234 //matePairs[i].normAbund = sqrt( a * b ) ; | |
235 | |
236 matePairs[i].abundance = matePairs[i].normAbund ; | |
237 } | |
238 } | |
239 | |
// Build the constraint set for one gene region: walk the alignments that
// overlap the region's subexons, convert each compatible primary alignment
// into a constraint (bit vector over subexons), record mate-pair links,
// coalesce duplicates, and compute normalized abundances. Always returns 0.
//
// subexons - subexons of the region, in coordinate order (all on one chrId).
// seCnt    - number of subexons.
// start, end - region coordinate range. NOTE(review): neither is referenced
//              in this body; the scan is driven by the alignment iterator
//              and the subexon coordinates — confirm they are kept only for
//              interface compatibility.
int Constraints::BuildConstraints( struct _subexon *subexons, int seCnt, int start, int end )
{
	int i ;
	int tag = 0 ; // index of the first subexon that can overlap the current alignment
	int coalesceThreshold = 16384 ; // coalesce once this many constraints accumulate
	Alignments &alignments = *pAlignments ;
	// Release the memory from previous gene.
	int size = constraints.size() ;

	if ( size > 0 )
	{
		for ( i = 0 ; i < size ; ++i )
			constraints[i].vector.Release() ;
		// swap with a temporary to actually release vector capacity
		std::vector<struct _constraint>().swap( constraints ) ;
	}
	std::vector<struct _matePairConstraint>().swap( matePairs ) ;
	mateReadIds.Clear() ;

	// Start to build the constraints.
	// The iterator is shared across regions: the alignment left over from the
	// previous region must be processed before advancing.
	bool callNext = false ; // the last used alignment
	if ( alignments.IsAtBegin() )
		callNext = true ;
	while ( !alignments.IsAtEnd() )
	{
		if ( callNext )
		{
			if ( !alignments.Next() )
				break ;
		}
		else
			callNext = true ;

		// Skip alignments on earlier chromosomes; stop at later ones.
		if ( alignments.GetChromId() < subexons[0].chrId )
			continue ;
		else if ( alignments.GetChromId() > subexons[0].chrId )
			break ;
		// locate the first subexon in this region that overlapps with current alignment.
		for ( ; tag < seCnt && subexons[tag].end < alignments.segments[0].a ; ++tag )
			;

		if ( tag >= seCnt )
			break ;
		if ( alignments.segments[ alignments.segCnt - 1 ].b < subexons[tag].start )
			continue ;

		// An alignment counts as "unique" support either by primary flag or
		// by the aligner's uniqueness flag, depending on configuration.
		int uniqSupport = 0 ;
		if ( usePrimaryAsUnique )
			uniqSupport = alignments.IsPrimary() ? 1 : 0 ;
		else
			uniqSupport = alignments.IsUnique() ? 1 : 0 ;

		struct _constraint ct ;
		// NOTE(review): ConvertAlignmentToBitTable Init's ct.vector again;
		// confirm BitTable::Init frees or reuses a prior allocation, otherwise
		// this double Init leaks. The Init here is still needed for the
		// Release calls on the non-primary / incompatible paths below.
		ct.vector.Init( seCnt ) ;
		//printf( "%s %d: %lld-%lld | %d-%d\n", __func__, alignments.segCnt, alignments.segments[0].a, alignments.segments[0].b, subexons[tag].start, subexons[tag].end ) ;
		// Multi-mapped reads contribute fractional weight; GC-rich reads are
		// up-weighted to counter sequencing bias.
		ct.weight = 1.0 / alignments.GetNumberOfHits() ;
		if ( alignments.IsGCRich() )
			ct.weight *= 10 ;
		ct.normAbund = 0 ;
		ct.support = 1 ;
		ct.uniqSupport = uniqSupport ;
		ct.maxReadLen = alignments.GetRefCoverLength() ;

		if ( alignments.IsPrimary() && ConvertAlignmentToBitTable( alignments.segments, alignments.segCnt,
			subexons, seCnt, tag, ct ) )
		{

			//printf( "%s ", alignments.GetReadId() ) ;
			//ct.vector.Print() ;


			// If the alignment has clipped end or tail. We only keep those clipped in the 3'/5'-end
			bool validClip = true ;
			if ( alignments.HasClipHead() )
			{
				// Reject a head clip when the first subexon abuts another
				// subexon, has incoming edges, or the first segment is too
				// short relative to the read's reference cover length.
				if ( ( ct.first < seCnt - 1 && subexons[ct.first].end + 1 == subexons[ct.first + 1].start )
					|| subexons[ct.first].prevCnt > 0
					|| alignments.segments[0].b - alignments.segments[0].a + 1 <= alignments.GetRefCoverLength() / 3.0 )
					validClip = false ;
			}
			if ( alignments.HasClipTail() )
			{
				// Mirror of the head-clip test, for the last segment.
				int tmp = alignments.segCnt - 1 ;
				if ( ( ct.last > 0 && subexons[ct.last].start - 1 == subexons[ct.last - 1].end )
					|| subexons[ct.last].nextCnt > 0
					|| alignments.segments[tmp].b - alignments.segments[tmp].a + 1 <= alignments.GetRefCoverLength() / 3.0 )
					validClip = false ;
			}

			if ( validClip )
			{
				constraints.push_back( ct ) ; // if we just coalesced but the list size does not decrease, this will force capacity increase.
				//if ( !strcmp( alignments.GetReadId(), "ERR188021.8489052" ) )
				//	ct.vector.Print() ;
				// Add the mate-pair information.
				int mateChrId ;
				int64_t matePos ;
				alignments.GetMatePosition( mateChrId, matePos ) ;
				if ( alignments.GetChromId() == mateChrId )
				{
					if ( matePos < alignments.segments[0].a )
					{
						// Mate is upstream: it was seen earlier, so look up the
						// constraint it produced and link the pair.
						int mateIdx = mateReadIds.Query( alignments.GetReadId(), alignments.segments[0].a ) ;
						if ( mateIdx != -1 )
						{
							struct _matePairConstraint nm ;
							nm.i = mateIdx ;
							nm.j = constraints.size() - 1 ;
							nm.abundance = 0 ;
							nm.support = 1 ;
							nm.uniqSupport = uniqSupport ;
							nm.effectiveCount = 2 ;
							matePairs.push_back( nm ) ;
						}
					}
					else if ( matePos > alignments.segments[0].a )
					{
						// Mate is downstream: remember this read so the mate
						// can find us when it is processed.
						mateReadIds.Insert( alignments.GetReadId(), alignments.segments[0].a, constraints.size() - 1, matePos ) ;
					}
					else // two mates have the same coordinate.
					{
						// Only the first mate records the pair, so it is not
						// counted twice.
						if ( alignments.IsFirstMate() )
						{
							struct _matePairConstraint nm ;
							nm.i = constraints.size() - 1 ;
							nm.j = constraints.size() - 1 ;
							nm.abundance = 0 ;
							nm.support = 1 ;
							nm.uniqSupport = uniqSupport ;
							nm.effectiveCount = 2 ;
							matePairs.push_back( nm ) ;
						}
					}
				}
			}
			else
				ct.vector.Release() ;

			// Coalesce if necessary.
			// Only bother when the vector is both large and full, i.e. the
			// next push_back would reallocate anyway.
			size = constraints.size() ;
			if ( (int)size > coalesceThreshold && size == (int)constraints.capacity() )
			{

				//printf( "start coalescing. %d\n", constraints.capacity() ) ;
				CoalesceSameConstraints() ;

				// Not coalesce enough
				if ( constraints.size() >= constraints.capacity() / 2 )
				{
					coalesceThreshold *= 2 ;
				}
			}
		}
		else
		{
			//printf( "not compatible\n" ) ;
			ct.vector.Release() ;
		}
	}
	//printf( "start coalescing. %d %d\n", constraints.size(), matePairs.size() ) ;
	CoalesceSameConstraints() ;
	//printf( "after coalescing. %d %d\n", constraints.size(), matePairs.size() ) ;
	//for ( i = 0 ; i < matePairs.size() ; ++i )
	//	printf( "matePair: %d %d %d\n", matePairs[i].i, matePairs[i].j, matePairs[i].support ) ;
	// single-end data set
	//if ( matePairs.size() == 0 )
	// fragStdev == 0 is used as the marker for single-end data: pair each
	// constraint with itself so downstream code has mate pairs to work with.
	if ( alignments.fragStdev == 0 )
	{
		int size = constraints.size() ;
		matePairs.clear() ;

		for ( i = 0 ; i < size ; ++i )
		{
			struct _matePairConstraint nm ;
			nm.i = i ;
			nm.j = i ;
			nm.abundance = 0 ;
			nm.support = constraints[i].support ;
			nm.uniqSupport = constraints[i].uniqSupport ;
			nm.effectiveCount = 1 ;

			matePairs.push_back( nm ) ;
		}
	}

	ComputeNormAbund( subexons ) ;

	/*for ( i = 0 ; i < constraints.size() ; ++i )
	{
		printf( "constraints %d: %lf %d %d %d ", i, constraints[i].normAbund, constraints[i].first, constraints[i].last, constraints[i].support ) ;
		constraints[i].vector.Print() ;
	}

	for ( i = 0 ; i < matePairs.size() ; ++i )
	{
		printf( "mates %d: %lf %d %d %d %d\n", i, matePairs[i].normAbund, matePairs[i].i, matePairs[i].j, matePairs[i].support, matePairs[i].uniqSupport ) ;
	}*/

	return 0 ;
}