Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/classes.cpp @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #include <stdio.h> | |
2 #include <getopt.h> | |
3 #include <vector> | |
4 #include <pthread.h> | |
5 | |
6 #include "alignments.hpp" | |
7 #include "SubexonGraph.hpp" | |
8 #include "SubexonCorrelation.hpp" | |
9 #include "Constraints.hpp" | |
10 #include "TranscriptDecider.hpp" | |
11 | |
12 char usage[] = "./classes [OPTIONS]:\n" | |
13 "Required:\n" | |
14 "\t-s STRING: path to the subexon file.\n" | |
15 "\t-b STRING: path to the BAM file.\n" | |
16 "\t\tor\n" | |
17 "\t--lb STRING: path to the file of the list of BAM files.\n" | |
18 "Optional:\n" | |
19 "\t-p INT: number of threads. (default: 1)\n" | |
20 "\t-o STRING: the prefix of the output file. (default: not used)\n" | |
21 "\t-c FLOAT: only use the subexons with classifier score <= than the given number. (default: 0.05)\n" | |
22 "\t-f FLOAT: filter the transcript from the gene if its abundance is lower than the given number percent of the most abundant one. (default: 0.05)\n" | |
23 "\t-d FLOAT: filter the transcript whose average read depth is less than the given number. (default: 2.5)\n" | |
24 "\t--ls STRING: path to the file of the list of single-sample subexon files. (default: not used)\n" | |
25 "\t--hasMateIdSuffix: the read id has suffix such as .1, .2 for a mate pair. (default: false)\n" | |
26 "\t--maxDpConstraintSize: the maximum number of subexons a constraint can cover in dynamic programming. (default: 7; -1 for inf)\n" | |
27 "\t--primaryParalog: use primary alignment to retain paralog genes instead of unique alignments. (default: not used)\n" | |
28 ; | |
29 | |
30 static const char *short_options = "s:b:f:o:d:p:c:h" ; | |
31 static struct option long_options[] = | |
32 { | |
33 { "ls", required_argument, 0, 10000 }, | |
34 { "lb", required_argument, 0, 10001 }, | |
35 { "hasMateIdSuffix", no_argument, 0, 10002 }, | |
36 { "primaryParalog", no_argument, 0, 10003 }, | |
37 { "maxDpConstraintSize", required_argument, 0, 10004 }, | |
38 { (char *)0, 0, 0, 0} | |
39 } ; | |
40 | |
41 struct _getAlignmentsInfoThreadArg | |
42 { | |
43 std::vector<Alignments> *pAlignmentFiles ; | |
44 int numThreads ; | |
45 int tid ; | |
46 } ; | |
47 | |
48 struct _getConstraintsThreadArg | |
49 { | |
50 std::vector<Constraints> *pMultiSampleConstraints ; | |
51 int numThreads ; | |
52 int tid ; | |
53 | |
54 struct _subexon *subexons ; | |
55 int seCnt ; | |
56 int start, end ; | |
57 } ; | |
58 | |
59 void *GetAlignmentsInfo_Thread( void *pArg ) | |
60 { | |
61 int i ; | |
62 std::vector <Alignments> &alignmentFiles = *( ( (struct _getAlignmentsInfoThreadArg *)pArg )->pAlignmentFiles ) ; | |
63 int tid = ( (struct _getAlignmentsInfoThreadArg *)pArg )->tid ; | |
64 int numThreads = ( (struct _getAlignmentsInfoThreadArg *)pArg )->numThreads ; | |
65 int size = alignmentFiles.size() ; | |
66 for ( i = 0 ; i < size ; ++i ) | |
67 { | |
68 if ( i % numThreads == tid ) | |
69 { | |
70 alignmentFiles[i].GetGeneralInfo( true ) ; | |
71 alignmentFiles[i].Rewind() ; | |
72 } | |
73 } | |
74 | |
75 pthread_exit( NULL ) ; | |
76 } | |
77 | |
78 | |
79 void *GetConstraints_Thread( void *pArg ) | |
80 { | |
81 int i ; | |
82 struct _getConstraintsThreadArg &arg = *( (struct _getConstraintsThreadArg *)pArg ) ; | |
83 std::vector<Constraints> &multiSampleConstraints = *( arg.pMultiSampleConstraints ) ; | |
84 int tid = arg.tid ; | |
85 int numThreads = arg.numThreads ; | |
86 int size = multiSampleConstraints.size() ; | |
87 for ( i = 0 ; i < size ; ++i ) | |
88 { | |
89 if ( i % numThreads == tid ) | |
90 multiSampleConstraints[i].BuildConstraints( arg.subexons, arg.seCnt, arg.start, arg.end ) ; | |
91 } | |
92 pthread_exit( NULL ) ; | |
93 } | |
94 | |
95 int main( int argc, char *argv[] ) | |
96 { | |
97 int i, j ; | |
98 int size ; | |
99 | |
100 if ( argc <= 1 ) | |
101 { | |
102 printf( "%s", usage ) ; | |
103 return 0 ; | |
104 } | |
105 | |
106 int c, option_index ; // For getopt | |
107 option_index = 0 ; | |
108 FILE *fpSubexon = NULL ; | |
109 double FPKMFraction = 0.05 ; | |
110 double classifierThreshold ; | |
111 double txptMinReadDepth = 2.5 ; | |
112 char outputPrefix[1024] = "" ; | |
113 int numThreads = 1 ; | |
114 bool hasMateReadIdSuffix = false ; | |
115 bool usePrimaryAsUnique = false ; | |
116 int maxDpConstraintSize = 7 ; | |
117 | |
118 std::vector<Alignments> alignmentFiles ; | |
119 SubexonCorrelation subexonCorrelation ; | |
120 | |
121 classifierThreshold = 0.05 ; | |
122 while ( 1 ) | |
123 { | |
124 c = getopt_long( argc, argv, short_options, long_options, &option_index ) ; | |
125 if ( c == -1 ) | |
126 break ; | |
127 | |
128 if ( c == 's' ) | |
129 { | |
130 fpSubexon = fopen( optarg, "r" ) ; | |
131 } | |
132 else if ( c == 'b' ) | |
133 { | |
134 Alignments a ; | |
135 a.Open( optarg ) ; | |
136 //a.SetAllowClip( false ) ; | |
137 alignmentFiles.push_back( a ) ; | |
138 } | |
139 else if ( c == 'c' ) | |
140 { | |
141 classifierThreshold = atof( optarg ) ; | |
142 } | |
143 else if ( c == 'f' ) | |
144 { | |
145 FPKMFraction = atof( optarg ) ; | |
146 } | |
147 else if ( c == 'o' ) | |
148 { | |
149 strcpy( outputPrefix, optarg ) ; | |
150 } | |
151 else if ( c == 'd' ) | |
152 { | |
153 txptMinReadDepth = atof( optarg ) ; | |
154 } | |
155 else if ( c == 'p' ) | |
156 { | |
157 numThreads = atoi( optarg ) ; | |
158 } | |
159 else if ( c == 10000 ) // the list of subexon files. | |
160 { | |
161 subexonCorrelation.Initialize( optarg ) ; | |
162 } | |
163 else if ( c == 10001 ) // the list of bam files. | |
164 { | |
165 FILE *fp = fopen( optarg, "r" ) ; | |
166 char buffer[1024] ; | |
167 while ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) | |
168 { | |
169 int len = strlen( buffer ) ; | |
170 if ( buffer[len - 1] == '\n' ) | |
171 { | |
172 buffer[len - 1] = '\0' ; | |
173 --len ; | |
174 | |
175 } | |
176 Alignments a ; | |
177 a.Open( buffer ) ; | |
178 alignmentFiles.push_back( a ) ; | |
179 } | |
180 fclose( fp ) ; | |
181 } | |
182 else if ( c == 10002 ) // the mate pair read id has suffix. | |
183 { | |
184 hasMateReadIdSuffix = true ; | |
185 } | |
186 else if ( c == 10003 ) // do not check unique alignment | |
187 { | |
188 usePrimaryAsUnique = true ; | |
189 } | |
190 else if ( c == 10004 ) // maxDpConstraintSize | |
191 { | |
192 maxDpConstraintSize = atoi(optarg) ; | |
193 } | |
194 else | |
195 { | |
196 printf( "%s", usage ) ; | |
197 exit( 1 ) ; | |
198 } | |
199 } | |
200 if ( fpSubexon == NULL ) | |
201 { | |
202 printf( "Cannot find combined subexon file.\n" ) ; | |
203 exit( 1 ) ; | |
204 } | |
205 if ( alignmentFiles.size() < 1 ) | |
206 { | |
207 printf( "Must use -b option to specify BAM files.\n" ) ; | |
208 exit( 1 ) ; | |
209 } | |
210 | |
211 | |
212 if ( alignmentFiles.size() < 50 ) | |
213 { | |
214 size = alignmentFiles.size() ; | |
215 for ( i = 0 ; i < size ; ++i ) | |
216 { | |
217 alignmentFiles[i].GetGeneralInfo( true ) ; | |
218 alignmentFiles[i].Rewind() ; | |
219 } | |
220 } | |
221 else | |
222 { | |
223 struct _getAlignmentsInfoThreadArg *args = new struct _getAlignmentsInfoThreadArg[ numThreads ] ; | |
224 pthread_attr_t pthreadAttr ; | |
225 pthread_t *threads ; | |
226 | |
227 pthread_attr_init( &pthreadAttr ) ; | |
228 pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ; | |
229 | |
230 threads = new pthread_t[ numThreads ] ; | |
231 | |
232 for ( i = 0 ; i < numThreads ; ++i ) | |
233 { | |
234 args[i].pAlignmentFiles = &alignmentFiles ; | |
235 args[i].tid = i ; | |
236 args[i].numThreads = numThreads ; | |
237 | |
238 pthread_create( &threads[i], &pthreadAttr, GetAlignmentsInfo_Thread, &args[i] ) ; | |
239 } | |
240 | |
241 for ( i = 0 ; i < numThreads ; ++i ) | |
242 { | |
243 pthread_join( threads[i], NULL ) ; | |
244 } | |
245 | |
246 pthread_attr_destroy( &pthreadAttr ) ; | |
247 delete[] args ; | |
248 delete[] threads ; | |
249 } | |
250 | |
251 // Build the subexon graph | |
252 SubexonGraph subexonGraph( classifierThreshold, alignmentFiles[0], fpSubexon ) ; | |
253 subexonGraph.ComputeGeneIntervals() ; | |
254 | |
255 // Solve gene by gene | |
256 int sampleCnt = alignmentFiles.size() ; | |
257 std::vector<Constraints> multiSampleConstraints ; | |
258 for ( i = 0 ; i < sampleCnt ; ++i ) | |
259 { | |
260 Constraints constraints( &alignmentFiles[i] ) ; | |
261 constraints.SetHasMateReadIdSuffix( hasMateReadIdSuffix ) ; | |
262 constraints.SetUsePrimaryAsUnique( usePrimaryAsUnique ) ; | |
263 multiSampleConstraints.push_back( constraints ) ; | |
264 } | |
265 MultiThreadOutputTranscript outputHandler( sampleCnt, alignmentFiles[0] ) ; | |
266 outputHandler.SetOutputFPs( outputPrefix ) ; | |
267 | |
268 if ( numThreads <= 1 ) | |
269 { | |
270 TranscriptDecider transcriptDecider( FPKMFraction, classifierThreshold, txptMinReadDepth, sampleCnt, alignmentFiles[0] ) ; | |
271 | |
272 transcriptDecider.SetMultiThreadOutputHandler( &outputHandler ) ; | |
273 transcriptDecider.SetNumThreads( numThreads ) ; | |
274 transcriptDecider.SetMaxDpConstraintSize( maxDpConstraintSize ) ; | |
275 | |
276 int giCnt = subexonGraph.geneIntervals.size() ; | |
277 for ( i = 0 ; i < giCnt ; ++i ) | |
278 { | |
279 struct _geneInterval gi = subexonGraph.geneIntervals[i] ; | |
280 struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ; | |
281 subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ; | |
282 printf( "%d: %d %s %d %d\n", i, gi.endIdx - gi.startIdx + 1, | |
283 alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), | |
284 gi.start + 1, gi.end + 1 ) ; | |
285 fflush( stdout ) ; | |
286 | |
287 subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ; | |
288 for ( j = 0 ; j < sampleCnt ; ++j ) | |
289 multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ; | |
290 | |
291 transcriptDecider.Solve( intervalSubexons, gi.endIdx - gi.startIdx + 1, multiSampleConstraints, subexonCorrelation ) ; | |
292 | |
293 for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j ) | |
294 { | |
295 delete[] intervalSubexons[j].prev ; | |
296 delete[] intervalSubexons[j].next ; | |
297 } | |
298 delete[] intervalSubexons ; | |
299 } | |
300 } | |
301 else // multi-thread case. | |
302 { | |
303 --numThreads ; // one thread is used for read in the data. | |
304 | |
305 // Allocate memory | |
306 struct _transcriptDeciderThreadArg *pArgs = new struct _transcriptDeciderThreadArg[ numThreads ] ; | |
307 pthread_mutex_t ftLock ; | |
308 int *freeThreads ; | |
309 int ftCnt ; | |
310 pthread_cond_t fullWorkCond ; | |
311 pthread_attr_t pthreadAttr ; | |
312 pthread_t *threads ; | |
313 pthread_t *getConstraintsThreads ; | |
314 bool *initThreads ; | |
315 | |
316 pthread_mutex_init( &ftLock, NULL ) ; | |
317 pthread_cond_init( &fullWorkCond, NULL ) ; | |
318 pthread_attr_init( &pthreadAttr ) ; | |
319 pthread_attr_setdetachstate( &pthreadAttr, PTHREAD_CREATE_JOINABLE ) ; | |
320 | |
321 threads = new pthread_t[ numThreads ] ; | |
322 getConstraintsThreads = new pthread_t[ numThreads ] ; | |
323 initThreads = new bool[numThreads] ; | |
324 freeThreads = new int[ numThreads ] ; | |
325 ftCnt = numThreads ; | |
326 for ( i = 0 ; i < numThreads ; ++i ) | |
327 { | |
328 pArgs[i].tid = i ; | |
329 pArgs[i].sampleCnt = sampleCnt ; | |
330 pArgs[i].numThreads = numThreads ; | |
331 pArgs[i].maxDpConstraintSize = maxDpConstraintSize ; | |
332 pArgs[i].FPKMFraction = FPKMFraction ; | |
333 pArgs[i].classifierThreshold = classifierThreshold ; | |
334 pArgs[i].txptMinReadDepth = txptMinReadDepth ; | |
335 pArgs[i].alignments = &alignmentFiles[0] ; | |
336 //pArgs[i].constraints = new std::vector<Constraints> ; | |
337 pArgs[i].outputHandler = &outputHandler ; | |
338 | |
339 freeThreads[i] = i ; | |
340 pArgs[i].freeThreads = freeThreads ; | |
341 pArgs[i].ftCnt = &ftCnt ; | |
342 pArgs[i].ftLock = &ftLock ; | |
343 pArgs[i].fullWorkCond = &fullWorkCond ; | |
344 | |
345 for ( j = 0 ; j < sampleCnt ; ++j ) | |
346 { | |
347 Constraints constraints( &alignmentFiles[j] ) ; | |
348 pArgs[i].constraints.push_back( constraints ) ; | |
349 } | |
350 | |
351 initThreads[i] = false ; | |
352 } | |
353 | |
354 | |
355 // Read in and distribute the work | |
356 int giCnt = subexonGraph.geneIntervals.size() ; | |
357 for ( i = 0 ; i < giCnt ; ++i ) | |
358 { | |
359 struct _geneInterval gi = subexonGraph.geneIntervals[i] ; | |
360 struct _subexon *intervalSubexons = new struct _subexon[ gi.endIdx - gi.startIdx + 1 ] ; | |
361 subexonGraph.ExtractSubexons( gi.startIdx, gi.endIdx, intervalSubexons ) ; | |
362 subexonCorrelation.ComputeCorrelation( intervalSubexons, gi.endIdx - gi.startIdx + 1, alignmentFiles[0] ) ; | |
363 printf( "%d: %d %s %d %d. Free threads: %d/%d\n", i, gi.endIdx - gi.startIdx + 1, | |
364 alignmentFiles[0].GetChromName( intervalSubexons[0].chrId ), | |
365 gi.start + 1, gi.end + 1, ftCnt, numThreads + 1 ) ; | |
366 fflush( stdout ) ; | |
367 | |
368 int gctCnt = ftCnt ; | |
369 if ( gctCnt > 1 && sampleCnt > 1 ) | |
370 { | |
371 gctCnt = ( gctCnt < sampleCnt ? gctCnt : sampleCnt ) ; | |
372 struct _getConstraintsThreadArg *args = new struct _getConstraintsThreadArg[ gctCnt ] ; | |
373 for ( j = 0 ; j < gctCnt ; ++j ) | |
374 { | |
375 args[j].pMultiSampleConstraints = &multiSampleConstraints ; | |
376 args[j].numThreads = gctCnt ; | |
377 args[j].tid = j ; | |
378 args[j].subexons = intervalSubexons ; | |
379 args[j].seCnt = gi.endIdx - gi.startIdx + 1 ; | |
380 args[j].start = gi.start ; args[j].end = gi.end ; | |
381 pthread_create( &getConstraintsThreads[j], &pthreadAttr, GetConstraints_Thread, &args[j] ) ; | |
382 } | |
383 | |
384 | |
385 | |
386 for ( j = 0 ; j < gctCnt ; ++j ) | |
387 pthread_join( getConstraintsThreads[j], NULL ) ; | |
388 | |
389 delete[] args ; | |
390 } | |
391 else | |
392 { | |
393 for ( j = 0 ; j < sampleCnt ; ++j ) | |
394 multiSampleConstraints[j].BuildConstraints( intervalSubexons, gi.endIdx - gi.startIdx + 1, gi.start, gi.end ) ; | |
395 } | |
396 | |
397 // Search for the free queue. | |
398 int tag = -1 ; // get the working thread. | |
399 pthread_mutex_lock( &ftLock ) ; | |
400 if ( ftCnt == 0 ) | |
401 { | |
402 pthread_cond_wait( &fullWorkCond, &ftLock ) ; | |
403 } | |
404 tag = freeThreads[ ftCnt - 1 ] ; | |
405 --ftCnt ; | |
406 pthread_mutex_unlock( &ftLock ) ; | |
407 | |
408 if ( initThreads[tag] ) | |
409 pthread_join( threads[tag], NULL ) ; // Make sure the chosen thread exits. | |
410 | |
411 // Assign the subexons, the constraints and correlation content. | |
412 pArgs[tag].subexons = new struct _subexon[gi.endIdx - gi.startIdx + 1] ; | |
413 pArgs[tag].seCnt = gi.endIdx - gi.startIdx + 1 ; | |
414 for ( j = 0 ; j < pArgs[tag].seCnt ; ++j ) | |
415 { | |
416 pArgs[tag].subexons[j] = intervalSubexons[j] ; | |
417 int cnt = intervalSubexons[j].prevCnt ; | |
418 pArgs[tag].subexons[j].prev = new int[cnt] ; | |
419 memcpy( pArgs[tag].subexons[j].prev, intervalSubexons[j].prev, sizeof( int ) * cnt ) ; | |
420 cnt = intervalSubexons[j].nextCnt ; | |
421 pArgs[tag].subexons[j].next = new int[cnt] ; | |
422 memcpy( pArgs[tag].subexons[j].next, intervalSubexons[j].next, sizeof( int ) * cnt ) ; | |
423 } | |
424 | |
425 for ( j = 0 ; j < sampleCnt ; ++j ) | |
426 { | |
427 pArgs[tag].constraints[j].Assign( multiSampleConstraints[ j ] ) ; | |
428 } | |
429 pArgs[tag].subexonCorrelation.Assign( subexonCorrelation ) ; | |
430 pthread_create( &threads[tag], &pthreadAttr, TranscriptDeciderSolve_Wrapper, &pArgs[tag] ) ; | |
431 initThreads[tag] = true ; | |
432 for ( j = 0 ; j < gi.endIdx - gi.startIdx + 1 ; ++j ) | |
433 { | |
434 delete[] intervalSubexons[j].prev ; | |
435 delete[] intervalSubexons[j].next ; | |
436 } | |
437 delete[] intervalSubexons ; | |
438 } | |
439 | |
440 for ( i = 0 ; i < numThreads ; ++i ) | |
441 { | |
442 if ( initThreads[i] ) | |
443 pthread_join( threads[i], NULL ) ; | |
444 } | |
445 | |
446 // Release memory | |
447 for ( i = 0 ; i < numThreads ; ++i ) | |
448 { | |
449 std::vector<Constraints>().swap( pArgs[i].constraints ) ; | |
450 } | |
451 delete []pArgs ; | |
452 pthread_attr_destroy( &pthreadAttr ) ; | |
453 pthread_mutex_destroy( &ftLock ) ; | |
454 pthread_cond_destroy( &fullWorkCond ) ; | |
455 | |
456 delete[] threads ; | |
457 delete[] getConstraintsThreads ; | |
458 delete[] initThreads ; | |
459 delete[] freeThreads ; | |
460 } // end of else for multi-thread. | |
461 | |
462 outputHandler.OutputCommandInfo( argc, argv ) ; | |
463 for ( i = 0 ; i < sampleCnt ; ++i ) | |
464 { | |
465 char buffer[1024] ; | |
466 alignmentFiles[i].GetFileName( buffer ) ; | |
467 int separator = -1 ; | |
468 | |
469 // deprive the directory. | |
470 for ( j = 0 ; buffer[j] ; ++j ) | |
471 { | |
472 if ( buffer[j] == '/' ) | |
473 separator = j ; | |
474 } | |
475 if ( separator != -1 ) | |
476 { | |
477 ++separator ; | |
478 for ( j = separator ; buffer[j] ; ++j ) | |
479 buffer[j - separator] = buffer[j] ; | |
480 buffer[j - separator] = '\0' ; | |
481 } | |
482 outputHandler.OutputCommentToSampleGTF( i, buffer ) ; | |
483 } | |
484 outputHandler.ComputeFPKMTPM( alignmentFiles ) ; | |
485 outputHandler.Flush() ; | |
486 | |
487 for ( i = 0 ; i < sampleCnt ; ++i ) | |
488 alignmentFiles[i].Close() ; | |
489 fclose( fpSubexon ) ; | |
490 return 0 ; | |
491 } |