Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/TranscriptDecider.hpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #ifndef _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER | |
2 #define _MOURISL_CLASSES_TRANSCRIPTDECIDER_HEADER | |
3 | |
4 #include <pthread.h> | |
5 #include <map> | |
6 #include <time.h> | |
7 #include <stdarg.h> | |
8 | |
9 #include "alignments.hpp" | |
10 #include "SubexonGraph.hpp" | |
11 #include "SubexonCorrelation.hpp" | |
12 #include "BitTable.hpp" | |
13 #include "Constraints.hpp" | |
14 | |
15 #define HASH_MAX 100003 // default HASH_MAX | |
16 #define USE_DP 200000 | |
17 | |
18 struct _transcript | |
19 { | |
20 BitTable seVector ; | |
21 double abundance ; | |
22 double correlationScore ; | |
23 double FPKM ; | |
24 double *constraintsSupport ; // record the assign ment of constraints. | |
25 | |
26 int first, last ; // indicate the index of the first and last subexons. | |
27 bool partial ; // wehther this is a partial transcript. | |
28 int id ; // the id for various usage: i.e transcript index in the alltranscripts. | |
29 } ; | |
30 | |
31 struct _outputTranscript | |
32 { | |
33 int chrId ; | |
34 int geneId, transcriptId ; | |
35 struct _pair32 *exons ; | |
36 int ecnt ; | |
37 char strand ; | |
38 int sampleId ; | |
39 | |
40 double FPKM ; | |
41 double TPM ; | |
42 double cov ; | |
43 } ; | |
44 | |
45 struct _dp | |
46 { | |
47 BitTable seVector ; | |
48 int first, last ; | |
49 // The "cnt" is for the hash structure. | |
50 // the first cnt set bits represent the subexons that are the key of the hash | |
51 // the remaining set bits are the optimal subtranscript follow the key. | |
52 int cnt ; | |
53 double cover ; | |
54 | |
55 double minAbundance ; | |
56 int timeStamp ; | |
57 int strand ; | |
58 } ; | |
59 | |
60 | |
61 struct _dpAttribute | |
62 { | |
63 struct _dp *f1, **f2 ; | |
64 struct _dp *hash ; | |
65 | |
66 struct _transcript bufferTxpt ; | |
67 | |
68 bool forAbundance ; | |
69 | |
70 struct _subexon *subexons ; | |
71 int seCnt ; | |
72 | |
73 std::map<uint64_t, int> uncoveredPair ; | |
74 | |
75 double minAbundance ; | |
76 int timeStamp ; | |
77 } ; | |
78 | |
79 class MultiThreadOutputTranscript ; | |
80 | |
81 struct _transcriptDeciderThreadArg | |
82 { | |
83 int tid ; | |
84 struct _subexon *subexons ; | |
85 int seCnt ; | |
86 int sampleCnt ; | |
87 int numThreads ; | |
88 | |
89 int maxDpConstraintSize ; | |
90 double FPKMFraction, classifierThreshold, txptMinReadDepth ; | |
91 Alignments *alignments ; | |
92 std::vector<Constraints> constraints ; | |
93 SubexonCorrelation subexonCorrelation ; | |
94 MultiThreadOutputTranscript *outputHandler ; | |
95 | |
96 int *freeThreads ; // the stack for free threads | |
97 int *ftCnt ; | |
98 pthread_mutex_t *ftLock ; | |
99 pthread_cond_t *fullWorkCond ; | |
100 } ; | |
101 | |
102 class MultiThreadOutputTranscript | |
103 { | |
104 private: | |
105 std::vector<struct _outputTranscript> outputQueue ; | |
106 pthread_t *threads ; | |
107 pthread_mutex_t outputLock ; | |
108 int sampleCnt ; | |
109 int numThreads ; | |
110 std::vector<FILE *> outputFPs ; | |
111 Alignments &alignments ; | |
112 | |
113 public: | |
114 static int CompTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b ) | |
115 { | |
116 int i ; | |
117 if ( a.geneId != b.geneId ) | |
118 return a.geneId - b.geneId ; | |
119 if ( a.ecnt != b.ecnt ) | |
120 return a.ecnt - b.ecnt ; | |
121 | |
122 for ( i = 0 ; i < a.ecnt ; ++i ) | |
123 { | |
124 if ( a.exons[i].a != b.exons[i].a ) | |
125 return a.exons[i].a - b.exons[i].a ; | |
126 | |
127 if ( a.exons[i].b != b.exons[i].b ) | |
128 return a.exons[i].b - b.exons[i].b ; | |
129 } | |
130 return 0 ; | |
131 } | |
132 | |
133 static bool CompSortTranscripts( const struct _outputTranscript &a, const struct _outputTranscript &b ) | |
134 { | |
135 int tmp = CompTranscripts( a, b ) ; | |
136 if ( tmp < 0 ) | |
137 return true ; | |
138 else if ( tmp > 0 ) | |
139 return false ; | |
140 else | |
141 return a.sampleId < b.sampleId ; | |
142 } | |
143 | |
144 MultiThreadOutputTranscript( int cnt, Alignments &a ): alignments( a ) | |
145 { | |
146 sampleCnt = cnt ; | |
147 pthread_mutex_init( &outputLock, NULL ) ; | |
148 } | |
149 ~MultiThreadOutputTranscript() | |
150 { | |
151 pthread_mutex_destroy( &outputLock ) ; | |
152 int i ; | |
153 for ( i = 0 ; i < sampleCnt ; ++i ) | |
154 fclose( outputFPs[i] ) ; | |
155 } | |
156 | |
157 void SetThreadsPointer( pthread_t *t, int n ) | |
158 { | |
159 threads = t ; | |
160 numThreads = n ; | |
161 } | |
162 | |
163 void SetOutputFPs( char *outputPrefix ) | |
164 { | |
165 int i ; | |
166 char buffer[1024] ; | |
167 for ( i = 0 ; i < sampleCnt ; ++i ) | |
168 { | |
169 if ( outputPrefix[0] ) | |
170 sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ; | |
171 else | |
172 sprintf( buffer, "sample_%d.gtf", i ) ; | |
173 FILE *fp = fopen( buffer, "w" ) ; | |
174 outputFPs.push_back( fp ) ; | |
175 } | |
176 } | |
177 | |
178 void Add( struct _outputTranscript &t ) | |
179 { | |
180 pthread_mutex_lock( &outputLock ) ; | |
181 outputQueue.push_back( t ) ; | |
182 pthread_mutex_unlock( &outputLock ) ; | |
183 } | |
184 | |
185 void Add_SingleThread( struct _outputTranscript &t ) | |
186 { | |
187 outputQueue.push_back( t ) ; | |
188 } | |
189 | |
190 void ComputeFPKMTPM( std::vector<Alignments> &alignmentFiles ) | |
191 { | |
192 int i ; | |
193 int qsize = outputQueue.size() ; | |
194 double *totalFPK = new double[ sampleCnt ] ; | |
195 memset( totalFPK, 0, sizeof( double ) * sampleCnt ) ; | |
196 for ( i = 0 ; i < qsize ; ++i ) | |
197 { | |
198 totalFPK[ outputQueue[i].sampleId ] += outputQueue[i].FPKM ; | |
199 } | |
200 | |
201 for ( i = 0 ; i < qsize ; ++i ) | |
202 { | |
203 outputQueue[i].TPM = outputQueue[i].FPKM / ( totalFPK[ outputQueue[i].sampleId ] / 1000000.0 ) ; | |
204 outputQueue[i].FPKM /= ( alignmentFiles[ outputQueue[i].sampleId ].totalReadCnt / 1000000.0 ) ; | |
205 } | |
206 | |
207 delete[] totalFPK ; | |
208 } | |
209 | |
210 void OutputCommandInfo( int argc, char *argv[] ) | |
211 { | |
212 int i ; | |
213 int j ; | |
214 for ( i = 0 ; i < sampleCnt ; ++i ) | |
215 { | |
216 fprintf( outputFPs[i], "#PsiCLASS_v1.0.1\n#" ) ; | |
217 for ( j = 0 ; j < argc - 1 ; ++j ) | |
218 { | |
219 fprintf( outputFPs[i], "%s ", argv[j] ) ; | |
220 } | |
221 fprintf( outputFPs[i], "%s\n", argv[j] ) ; | |
222 } | |
223 } | |
224 | |
225 void OutputCommentToSampleGTF( int sampleId, char *s ) | |
226 { | |
227 fprintf( outputFPs[ sampleId ], "#%s\n", s ) ; | |
228 } | |
229 | |
230 void Flush() | |
231 { | |
232 std::sort( outputQueue.begin(), outputQueue.end(), CompSortTranscripts ) ; | |
233 int i, j ; | |
234 int qsize = outputQueue.size() ; | |
235 char prefix[10] = "" ; | |
236 | |
237 // Recompute the transcript id | |
238 int gid = -1 ; | |
239 int tid = 0 ; | |
240 for ( i = 0 ; i < qsize ; ) | |
241 { | |
242 for ( j = i + 1 ; j < qsize ; ++j ) | |
243 { | |
244 if ( CompTranscripts( outputQueue[i], outputQueue[j] ) ) | |
245 break ; | |
246 } | |
247 int l ; | |
248 if ( outputQueue[i].geneId != gid ) | |
249 { | |
250 gid = outputQueue[i].geneId ; | |
251 tid = 0 ; | |
252 } | |
253 else | |
254 ++tid ; | |
255 | |
256 for ( l = i ; l < j ; ++l ) | |
257 outputQueue[l].transcriptId = tid ; | |
258 | |
259 i = j ; | |
260 } | |
261 | |
262 // output | |
263 for ( i = 0 ; i < qsize ; ++i ) | |
264 { | |
265 struct _outputTranscript &t = outputQueue[i] ; | |
266 char *chrom = alignments.GetChromName( t.chrId ) ; | |
267 | |
268 fprintf( outputFPs[t.sampleId], "%s\tPsiCLASS\ttranscript\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; transcript_id \"%s%s.%d.%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"%.6lf\";\n", | |
269 chrom, t.exons[0].a, t.exons[t.ecnt - 1].b, t.strand, | |
270 prefix, chrom, t.geneId, | |
271 prefix, chrom, t.geneId, t.transcriptId, t.FPKM, t.TPM, t.cov ) ; | |
272 for ( j = 0 ; j < t.ecnt ; ++j ) | |
273 { | |
274 fprintf( outputFPs[ t.sampleId ], "%s\tPsiCLASS\texon\t%d\t%d\t1000\t%c\t.\tgene_id \"%s%s.%d\"; " | |
275 "transcript_id \"%s%s.%d.%d\"; exon_number \"%d\"; FPKM \"%.6lf\"; TPM \"%.6lf\"; cov \"\%.6lf\";\n", | |
276 chrom, t.exons[j].a, t.exons[j].b, t.strand, | |
277 prefix, chrom, t.geneId, | |
278 prefix, chrom, t.geneId, t.transcriptId, | |
279 j + 1, t.FPKM, t.TPM, t.cov ) ; | |
280 } | |
281 delete []t.exons ; | |
282 } | |
283 } | |
284 } ; | |
285 | |
286 class TranscriptDecider | |
287 { | |
288 private: | |
289 int sampleCnt ; | |
290 int numThreads ; | |
291 double FPKMFraction ; | |
292 double txptMinReadDepth ; | |
293 int hashMax ; | |
294 int maxDpConstraintSize ; | |
295 | |
296 Constraints *constraints ; | |
297 //struct _subexon *subexons ; | |
298 //int seCnt ; | |
299 | |
300 int usedGeneId ; | |
301 int baseGeneId, defaultGeneId[2] ; | |
302 | |
303 int *transcriptId ; // the next transcript id for each gene id (we shift the gene id to 0 in this array.) | |
304 Alignments &alignments ; // for obtain the chromosome names. | |
305 | |
306 std::vector<FILE *> outputFPs ; | |
307 | |
308 BitTable compatibleTestVectorT, compatibleTestVectorC ; | |
309 double canBeSoftBoundaryThreshold ; | |
310 | |
311 MultiThreadOutputTranscript *outputHandler ; | |
312 | |
313 // Test whether subexon tag is a start subexon in a mixture region that corresponds to the start of a gene on another strand. | |
314 bool IsStartOfMixtureStrandRegion( int tag, struct _subexon *subexons, int seCnt ) ; | |
315 | |
316 // The functions to pick transcripts through dynamic programming | |
317 struct _dp *dpHash ; | |
318 void SearchSubTranscript( int tag, int strand, int parents[], int pcnt, struct _dp &pdp, int visit[], int vcnt, int extends[], int extendCnt, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) ; | |
319 struct _dp SolveSubTranscript( int visit[], int vcnt, int strand, std::vector<struct _constraint> &tc, int tcStartInd, struct _dpAttribute &attr ) ; | |
320 void PickTranscriptsByDP( struct _subexon *subexons, int seCnt, int iterBound, Constraints &constraints, SubexonCorrelation &correlation, struct _dpAttribute &attr, std::vector<struct _transcript> &allTranscripts ) ; | |
321 | |
322 void SetDpContent( struct _dp &a, struct _dp &b, const struct _dpAttribute &attr ) | |
323 { | |
324 a.seVector.Assign( b.seVector ) ; | |
325 a.first = b.first ; | |
326 a.last = b.last ; | |
327 a.cnt = b.cnt ; | |
328 a.cover = b.cover ; | |
329 | |
330 a.strand = b.strand ; | |
331 a.minAbundance = attr.minAbundance ; | |
332 a.timeStamp = attr.timeStamp ; | |
333 } | |
334 | |
335 void ResetDpContent( struct _dp &d ) | |
336 { | |
337 d.seVector.Reset() ; | |
338 d.first = -1 ; | |
339 d.last = -1 ; | |
340 d.cnt = -1 ; | |
341 d.cover = -1 ; | |
342 d.minAbundance = -1 ; | |
343 d.timeStamp = -1 ; | |
344 } | |
345 | |
346 void AugmentTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, int limit, bool extend ) ; | |
347 // Test whether a constraints is compatible with the transcript. | |
348 // Return 0 - uncompatible or does not overlap at all. 1 - fully compatible. 2 - Head of the constraints compatible with the tail of the transcript | |
349 int IsConstraintInTranscript( struct _transcript transcript, struct _constraint &c ) ; | |
350 int IsConstraintInTranscriptDebug( struct _transcript transcript, struct _constraint &c ) ; | |
351 | |
352 // Count how many transcripts are possible starting from subexons[tag]. | |
353 int SubTranscriptCount( int tag, struct _subexon *subexons, int f[] ) ; | |
354 | |
355 // The methods when there is no need for DP | |
356 void EnumerateTranscript( int tag, int strand, int visit[], int vcnt, struct _subexon *subexons, SubexonCorrelation &correlation, double correlationScore, std::vector<struct _transcript> &alltranscripts, int &atcnt ) ; | |
357 // For the simpler case, we can pick sample by sample. | |
358 void PickTranscripts( struct _subexon *subexons, std::vector<struct _transcript> &alltranscripts, Constraints &constraints, SubexonCorrelation &seCorrelation, std::vector<struct _transcript> &transcripts ) ; | |
359 | |
360 static bool CompSortTranscripts( const struct _transcript &a, const struct _transcript &b ) | |
361 { | |
362 if ( a.first < b.first ) | |
363 return true ; | |
364 else if ( a.first > b.first ) | |
365 return false ; | |
366 | |
367 int diffPos = a.seVector.GetFirstDifference( b.seVector ) ; | |
368 if ( diffPos == -1 ) | |
369 return false ; | |
370 if ( a.seVector.Test( diffPos ) ) | |
371 return true ; | |
372 else | |
373 return false ; | |
374 } | |
375 | |
376 static bool CompSortPairs( const struct _pair32 &x, const struct _pair32 &y ) | |
377 { | |
378 if ( x.a != y.a ) | |
379 return x.a < y.a ; | |
380 else | |
381 return x.b < y.b ; | |
382 } | |
383 | |
384 static bool CompSortPairsByB( const struct _pair32 &x, const struct _pair32 &y ) | |
385 { | |
386 return x.b < y.b ; | |
387 } | |
388 | |
389 static int CompPairsByB( const void *p1, const void *p2 ) | |
390 { | |
391 return ((struct _pair32 *)p1)->b - ((struct _pair32 *)p2)->b ; | |
392 } | |
393 | |
394 double ComputeScore( double cnt, double weight, double a, double A, double correlation ) | |
395 { | |
396 if ( a > A * 0.1 ) | |
397 return ( cnt * weight ) * ( 1 + pow( a / A, 0.25 ) ) + correlation ; | |
398 else | |
399 return ( cnt * weight ) * ( 1 + a / A ) + correlation ; | |
400 //return ( cnt ) * ( exp( 1 + a / A ) ) + correlation ; | |
401 } | |
402 | |
403 int GetFather( int f, int *father ) ; | |
404 | |
405 void ConvertTranscriptAbundanceToFPKM( struct _subexon *subexons, struct _transcript &t, int readCnt = 1000000 ) | |
406 { | |
407 int txptLen = 0 ; | |
408 int i, size ; | |
409 | |
410 std::vector<int> subexonInd ; | |
411 t.seVector.GetOnesIndices( subexonInd ) ; | |
412 size = subexonInd.size() ; | |
413 for ( i = 0 ; i < size ; ++i ) | |
414 txptLen += ( subexons[ subexonInd[i] ].end - subexons[ subexonInd[i] ].start + 1 ) ; | |
415 double factor = 1 ; | |
416 if ( alignments.matePaired ) | |
417 factor = 0.5 ; | |
418 t.FPKM = t.abundance * factor / ( ( readCnt / 1000000.0 ) * ( txptLen / 1000.0 ) ) ; | |
419 } | |
420 | |
421 int GetTranscriptLengthFromAbundanceAndFPKM( double abundance, double FPKM, int readCnt = 1000000 ) | |
422 { | |
423 double factor = 1 ; | |
424 if ( alignments.matePaired ) | |
425 factor = 0.5 ; | |
426 return int( abundance * factor / ( FPKM / 1000.0 ) / ( readCnt / 1000000.0 ) + 0.5 ) ; | |
427 } | |
428 | |
429 void CoalesceSameTranscripts( std::vector<struct _transcript> &t ) ; | |
430 | |
431 | |
432 // Initialize the structure to store transcript id | |
433 void InitTranscriptId() ; | |
434 | |
435 int GetTranscriptGeneId( std::vector<int> &subexonInd, struct _subexon *subexons ) ; | |
436 int GetTranscriptGeneId( struct _transcript &t, struct _subexon *subexons ) ; | |
437 int RemoveNegativeAbundTranscripts( std::vector<struct _transcript> &transcripts ) | |
438 { | |
439 int i, j ; | |
440 int tcnt = transcripts.size() ; | |
441 j = 0 ; | |
442 for ( i = 0 ; i < tcnt ; ++i ) | |
443 { | |
444 if ( transcripts[i].abundance < 0 ) | |
445 { | |
446 transcripts[i].seVector.Release() ; // Don't forget release the memory. | |
447 continue ; | |
448 } | |
449 transcripts[j] = transcripts[i] ; | |
450 ++j ; | |
451 } | |
452 transcripts.resize( j ) ; | |
453 return j ; | |
454 } | |
455 | |
456 void AbundanceEstimation( struct _subexon *subexons, int seCnt, Constraints &constraints, std::vector<struct _transcript> &transcripts ) ; | |
457 | |
458 int RefineTranscripts( struct _subexon *subexons, int seCnt, bool aggressive, std::map<int, int> *subexonChainSupport, int *txptSampleSupport, std::vector<struct _transcript> &transcripts, Constraints &constraints ) ; | |
459 | |
460 void ComputeTranscriptsScore( struct _subexon *subexons, int seCnt, std::map<int, int> *subexonChainSupport, std::vector<struct _transcript> &transcripts ) ; | |
461 | |
462 void OutputTranscript( int sampleId, struct _subexon *subexons, struct _transcript &transcript ) ; | |
463 | |
464 void PrintLog( const char *fmt, ... ) | |
465 { | |
466 char buffer[10021] ; | |
467 va_list args ; | |
468 va_start( args, fmt ) ; | |
469 vsprintf( buffer, fmt, args ) ; | |
470 | |
471 time_t mytime = time(NULL) ; | |
472 struct tm *localT = localtime( &mytime ) ; | |
473 char stime[500] ; | |
474 strftime( stime, sizeof( stime ), "%c", localT ) ; | |
475 fprintf( stderr, "[%s] %s\n", stime, buffer ) ; | |
476 } | |
477 | |
478 | |
479 public: | |
480 TranscriptDecider( double f, double c, double d, int sampleCnt, Alignments &a ): alignments( a ) | |
481 { | |
482 FPKMFraction = f ; | |
483 canBeSoftBoundaryThreshold = c ; | |
484 txptMinReadDepth = d ; | |
485 usedGeneId = 0 ; | |
486 defaultGeneId[0] = -1 ; | |
487 defaultGeneId[1] = -1 ; | |
488 maxDpConstraintSize = -1 ; | |
489 numThreads = 1 ; | |
490 this->sampleCnt = sampleCnt ; | |
491 dpHash = new struct _dp[ HASH_MAX ] ; // pre-allocated buffer to hold dp information. | |
492 } | |
493 ~TranscriptDecider() | |
494 { | |
495 int i ; | |
496 if ( numThreads == 1 ) | |
497 { | |
498 int size = outputFPs.size() ; | |
499 for ( i = 0 ; i < size ; ++i ) | |
500 { | |
501 fclose( outputFPs[i] ) ; | |
502 } | |
503 } | |
504 delete[] dpHash ; | |
505 } | |
506 | |
507 | |
508 // @return: the number of assembled transcript | |
509 int Solve( struct _subexon *subexons, int seCnt, std::vector<Constraints> &constraints, SubexonCorrelation &subexonCorrelation ) ; | |
510 | |
511 void SetOutputFPs( char *outputPrefix ) | |
512 { | |
513 int i ; | |
514 char buffer[1024] ; | |
515 for ( i = 0 ; i < sampleCnt ; ++i ) | |
516 { | |
517 if ( outputPrefix[0] ) | |
518 sprintf( buffer, "%s_sample_%d.gtf", outputPrefix, i ) ; | |
519 else | |
520 sprintf( buffer, "sample_%d.gtf", i ) ; | |
521 FILE *fp = fopen( buffer, "w" ) ; | |
522 outputFPs.push_back( fp ) ; | |
523 } | |
524 } | |
525 | |
526 void SetMultiThreadOutputHandler( MultiThreadOutputTranscript *h ) | |
527 { | |
528 outputHandler = h ; | |
529 } | |
530 | |
531 void SetNumThreads( int t ) | |
532 { | |
533 numThreads = t ; | |
534 } | |
535 | |
536 void SetMaxDpConstraintSize(int size) | |
537 { | |
538 maxDpConstraintSize = size ; | |
539 } | |
540 } ; | |
541 | |
542 void *TranscriptDeciderSolve_Wrapper( void *arg ) ; | |
543 | |
544 #endif |