0
|
1 /*
|
|
2 Find junctions from the SAM file generated by Tophat2.
|
|
3 The SAM file should be sorted.
|
|
4 The junction is the start and the end coordinates of the spliced region in this program. The output of the junction is a bit different.
|
|
5
|
|
6 Usage: ./a.out input [option] >output.
|
|
7 ./a.out -h for help.
|
|
8 */
|
|
9 #include <stdio.h>
|
|
10 #include <string.h>
|
|
11 #include <stdlib.h>
|
|
12
|
|
13 #include "sam.h"
|
|
14
|
|
15 #define LINE_SIZE 4097
|
|
16 #define QUEUE_SIZE 10001
|
|
17 #define HASH_MAX 1000003
|
|
18
|
|
19 struct _readTree
|
|
20 {
|
|
21 char id[256] ;
|
|
22 int leftAnchor, rightAnchor ;
|
|
23 int pos ;
|
|
24 //bool secondary ;
|
|
25 bool valid ;
|
|
26 int editDistance ;
|
|
27 int NH, cnt ; // if cnt < NH, then it has real secondary match for this splice junction
|
|
28 struct _readTree *left, *right ;
|
|
29
|
|
30 int flag ;// The flag from sam head.
|
|
31 } ;
|
|
32
|
|
33 // The structure of a junction
|
|
34 struct _junction
|
|
35 {
|
|
36 int start, end ;
|
|
37 int readCnt ; // The # of reads containing this junction
|
|
38 int secReadCnt ; // The # of reads whose secondary alignment has this junction.
|
|
39 char strand ; // On '+' or '-' strand
|
|
40 int leftAnchor, rightAnchor ; // The longest left and right anchor
|
|
41 int oppositeAnchor ; // The longest anchor of the shorter side.
|
|
42 int uniqEditDistance, secEditDistance ;
|
|
43 struct _readTree head ;
|
|
44 } ;
|
|
45
|
|
46 char line[LINE_SIZE] ;
|
|
47 char col[11][LINE_SIZE] ; // The option fields is not needed.
|
|
48 char strand ; // Extract XS field
|
|
49 char noncanonStrandInfo ;
|
|
50 //bool secondary ;
|
|
51 int NH ;
|
|
52 int editDistance ;
|
|
53 int mateStart ;
|
|
54 int filterYS ;
|
|
55 int samFlag ;
|
|
56
|
|
57 struct _junction junctionQueue[QUEUE_SIZE] ; // Expected only a few junctions in it for each read. This queue is sorted.
|
|
58
|
|
59 int qHead, qTail ;
|
|
60
|
|
61 bool flagPrintJunction ;
|
|
62 bool flagPrintAll ;
|
|
63 bool flagStrict ;
|
|
64 int junctionCnt ;
|
|
65 bool anchorBoth ;
|
|
66 bool validRead ;
|
|
67
|
|
68 int flank ;
|
|
69 struct _cigarSeg
|
|
70 {
|
|
71 int len ;
|
|
72 char type ;
|
|
73 } ;
|
|
74
|
|
75 struct _readTree *contradictedReads ;
|
|
76
|
|
77 char nucToNum[26] = { 0, 4, 1, 4, 4, 4, 2,
|
|
78 4, 4, 4, 4, 4, 4, 4,
|
|
79 4, 4, 4, 4, 4, 3,
|
|
80 4, 4, 4, 4, 4, 4 } ;
|
|
81
|
|
82 void PrintHelp()
|
|
83 {
|
|
84 printf(
|
|
85 "Prints reads from the SAM/BAM file that containing junctions.\n"
|
|
86 "Usage: ./a.out input [option]>output\n"
|
|
87 "Options:\n"
|
|
88 "\t-j xx [-B]: Output the junctions using 1-based coordinates. The format is \"reference id\" start end \"# of read\" strand.(They are sorted)\n and the xx is an integer means the maximum unqualified anchor length for a splice junction(default=8). If -B, the splice junction must be supported by a read whose both anchors are longer than xx.\n"
|
|
89 "\t-a: Output all the junctions, and use non-positive support number to indicate unqualified junctions.\n"
|
|
90 "\t-y: If the bits from YS field of bam matches the argument, we filter the alignment (default: 4).\n"
|
|
91 ) ;
|
|
92 }
|
|
93
|
|
94 void GetJunctionInfo( struct _junction &junc, struct _readTree *p )
|
|
95 {
|
|
96 if ( p == NULL )
|
|
97 return ;
|
|
98
|
|
99 if ( p->valid )
|
|
100 {
|
|
101 //if ( junc.start == 22381343 + 1 && junc.end == 22904987 - 1 )
|
|
102 // printf( "%s %d %d %d\n", p->id, p->leftAnchor, p->rightAnchor, p->flag ) ;
|
|
103
|
|
104
|
|
105
|
|
106 if ( p->cnt < p->NH )
|
|
107 {
|
|
108 junc.secEditDistance += p->editDistance ;
|
|
109 ++junc.secReadCnt ;
|
|
110 }
|
|
111 else
|
|
112 {
|
|
113 junc.uniqEditDistance += p->editDistance ;
|
|
114 ++junc.readCnt ;
|
|
115 }
|
|
116 int l = p->leftAnchor, r = p->rightAnchor ;
|
|
117
|
|
118 if ( !anchorBoth )
|
|
119 {
|
|
120 if ( l > junc.leftAnchor )
|
|
121 junc.leftAnchor = l ;
|
|
122 if ( r > junc.rightAnchor )
|
|
123 junc.rightAnchor = r ;
|
|
124
|
|
125 if ( l <= r && l > junc.oppositeAnchor )
|
|
126 junc.oppositeAnchor = l ;
|
|
127 else if ( r < l && r > junc.oppositeAnchor )
|
|
128 junc.oppositeAnchor = r ;
|
|
129 }
|
|
130 else
|
|
131 {
|
|
132 if ( l > flank && r > flank )
|
|
133 {
|
|
134 junc.leftAnchor = l ;
|
|
135 junc.rightAnchor = r ;
|
|
136 }
|
|
137
|
|
138 if ( l <= r && l > junc.oppositeAnchor )
|
|
139 junc.oppositeAnchor = l ;
|
|
140 else if ( r < l && r > junc.oppositeAnchor )
|
|
141 junc.oppositeAnchor = r ;
|
|
142 }
|
|
143 }
|
|
144 GetJunctionInfo( junc, p->left ) ;
|
|
145 GetJunctionInfo( junc, p->right ) ;
|
|
146 }
|
|
147
|
|
148 void PrintJunctionReads( struct _junction &junc, struct _readTree *p )
|
|
149 {
|
|
150 if ( p == NULL )
|
|
151 return ;
|
|
152
|
|
153 if ( p->valid )
|
|
154 printf( "%s\n", p->id ) ;
|
|
155 PrintJunctionReads( junc, p->left ) ;
|
|
156 PrintJunctionReads( junc, p->right ) ;
|
|
157 }
|
|
158
|
|
159 void PrintJunction( char *chrome, struct _junction &junc )
|
|
160 {
|
|
161 int sum ;
|
|
162 junc.leftAnchor = 0 ;
|
|
163 junc.rightAnchor = 0 ;
|
|
164 junc.oppositeAnchor = 0 ;
|
|
165 junc.readCnt = 0 ;
|
|
166 junc.secReadCnt = 0 ;
|
|
167 junc.uniqEditDistance = 0 ;
|
|
168 junc.secEditDistance = 0 ;
|
|
169 GetJunctionInfo( junc, &junc.head ) ;
|
|
170
|
|
171 sum = junc.readCnt + junc.secReadCnt ;
|
|
172
|
|
173 if ( junc.leftAnchor <= flank || junc.rightAnchor <= flank || ( junc.readCnt + junc.secReadCnt <= 0 ) )
|
|
174 {
|
|
175 if ( flagPrintAll )
|
|
176 {
|
|
177 sum = -sum ;
|
|
178 }
|
|
179 else
|
|
180 return ;
|
|
181 }
|
|
182
|
|
183 /*if ( junc.end - junc.start + 1 >= 200000 )
|
|
184 {
|
|
185 if ( junc.secReadCnt > 0 )
|
|
186 junc.secReadCnt = 1 ;
|
|
187 }*/
|
|
188
|
|
189 /*if ( junc.oppositeAnchor <= ( ( flank / 2 < 1 ) ? flank / 2 : 1 ) && ( junc.readCnt + junc.secReadCnt ) <= 10 )
|
|
190 {
|
|
191 if ( junc.readCnt > 0 )
|
|
192 junc.readCnt = 1 ;
|
|
193 if ( junc.secReadCnt > 0 )
|
|
194 junc.secReadCnt = 0 ;
|
|
195 }*/
|
|
196
|
|
197 printf( "%s %d %d %d %c %d %d %d %d\n", chrome, junc.start - 1, junc.end + 1, sum, junc.strand,
|
|
198 junc.readCnt, junc.secReadCnt, junc.uniqEditDistance, junc.secEditDistance ) ;
|
|
199 //PrintJunctionReads( junc, &junc.head ) ;
|
|
200 }
|
|
201
|
|
202 void ClearReadTree( struct _readTree *p )
|
|
203 {
|
|
204 if ( p == NULL )
|
|
205 return ;
|
|
206 ClearReadTree( p->left ) ;
|
|
207 ClearReadTree( p->right ) ;
|
|
208 free( p ) ;
|
|
209 }
|
|
210
|
|
211 // Insert to the read tree
|
|
212 bool InsertReadTree( struct _readTree *p, char *id, int l, int r )
|
|
213 {
|
|
214 int tmp = strcmp( p->id, id ) ;
|
|
215 if ( tmp == 0 )
|
|
216 {
|
|
217 p->cnt += 1 ;
|
|
218 return true ;
|
|
219 }
|
|
220 else if ( tmp < 0 )
|
|
221 {
|
|
222 if ( p->left )
|
|
223 return InsertReadTree( p->left, id, l, r ) ;
|
|
224 else
|
|
225 {
|
|
226 p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
|
|
227 strcpy( p->left->id, id ) ;
|
|
228 p->left->leftAnchor = l ;
|
|
229 p->left->rightAnchor = r ;
|
|
230 //p->left->secondary = secondary ;
|
|
231 p->left->editDistance = editDistance ;
|
|
232 p->left->left = p->left->right = NULL ;
|
|
233 p->left->valid = validRead ;
|
|
234 p->left->cnt = 1 ;
|
|
235 p->left->NH = NH ;
|
|
236 p->left->flag = samFlag ;
|
|
237 return false ;
|
|
238 }
|
|
239 }
|
|
240 else
|
|
241 {
|
|
242 if ( p->right )
|
|
243 return InsertReadTree( p->right, id, l, r ) ;
|
|
244 else
|
|
245 {
|
|
246 p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
|
|
247 strcpy( p->right->id, id ) ;
|
|
248 p->right->leftAnchor = l ;
|
|
249 p->right->rightAnchor = r ;
|
|
250 //p->right->secondary = secondary ;
|
|
251 p->right->editDistance = editDistance ;
|
|
252 p->right->left = p->right->right = NULL ;
|
|
253 p->right->valid = validRead ;
|
|
254 p->right->cnt = 1 ;
|
|
255 p->right->NH = NH ;
|
|
256 p->right->flag = samFlag ;
|
|
257 return false ;
|
|
258 }
|
|
259 }
|
|
260 }
|
|
261
|
|
262
|
|
263 bool SearchContradictedReads( struct _readTree *p, char *id, int pos )
|
|
264 {
|
|
265 if ( p == NULL )
|
|
266 return false ;
|
|
267 int tmp = strcmp( p->id, id ) ;
|
|
268 if ( tmp == 0 && p->pos == pos )
|
|
269 return true ;
|
|
270 else if ( tmp <= 0 )
|
|
271 return SearchContradictedReads( p->left, id, pos ) ;
|
|
272 else
|
|
273 return SearchContradictedReads( p->right, id, pos ) ;
|
|
274 }
|
|
275
|
|
276 bool InsertContradictedReads( struct _readTree *p, char *id, int pos )
|
|
277 {
|
|
278 if ( p == NULL )
|
|
279 {
|
|
280 contradictedReads = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
|
|
281 strcpy( contradictedReads->id, id ) ;
|
|
282 contradictedReads->pos = pos ;
|
|
283 contradictedReads->left = contradictedReads->right = NULL ;
|
|
284 return false ;
|
|
285 }
|
|
286 int tmp = strcmp( p->id, id ) ;
|
|
287 if ( tmp == 0 && p->pos == pos )
|
|
288 return true ;
|
|
289 else if ( tmp <= 0 )
|
|
290 {
|
|
291 if ( p->left )
|
|
292 return InsertContradictedReads( p->left, id, pos ) ;
|
|
293 else
|
|
294 {
|
|
295 p->left = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
|
|
296 strcpy( p->left->id, id ) ;
|
|
297 p->left->pos = pos ;
|
|
298 p->left->left = p->left->right = NULL ;
|
|
299 return false ;
|
|
300 }
|
|
301 }
|
|
302 else
|
|
303 {
|
|
304 if ( p->right )
|
|
305 return InsertContradictedReads( p->right, id, pos ) ;
|
|
306 else
|
|
307 {
|
|
308 p->right = (struct _readTree *)malloc( sizeof( struct _readTree ) ) ;
|
|
309 strcpy( p->right->id, id ) ;
|
|
310 p->right->pos = pos ;
|
|
311 p->right->left = p->right->right = NULL ;
|
|
312 return false ;
|
|
313 }
|
|
314 }
|
|
315 }
|
|
316
|
|
317 struct _readTree *GetReadTreeNode( struct _readTree *p, char *id )
|
|
318 {
|
|
319 if ( p == NULL )
|
|
320 return NULL ;
|
|
321 int tmp = strcmp( p->id, id ) ;
|
|
322 if ( tmp == 0 )
|
|
323 return p ;
|
|
324 else if ( tmp < 0 )
|
|
325 return GetReadTreeNode( p->left, id ) ;
|
|
326 else
|
|
327 return GetReadTreeNode( p->right, id ) ;
|
|
328 }
|
|
329
|
|
330 // Insert the new junction into the queue,
|
|
331 // and make sure the queue is sorted.
|
|
332 // Assume each junction range is only on one strand.
|
|
333 // l, r is the left and right anchor from a read
|
|
334 void InsertQueue( int start, int end, int l, int r )
|
|
335 {
|
|
336 int i, j ;
|
|
337 i = qTail ;
|
|
338
|
|
339
|
|
340 while ( i != qHead )
|
|
341 {
|
|
342 j = i - 1 ;
|
|
343 if ( j < 0 )
|
|
344 j = QUEUE_SIZE - 1 ;
|
|
345
|
|
346 if ( junctionQueue[j].start < start
|
|
347 || ( junctionQueue[j].start == start && junctionQueue[j].end < end ) )
|
|
348 break ;
|
|
349
|
|
350 junctionQueue[i] = junctionQueue[j] ;
|
|
351 --i ;
|
|
352
|
|
353 if ( i < 0 )
|
|
354 i = QUEUE_SIZE - 1 ;
|
|
355 }
|
|
356
|
|
357 junctionQueue[i].start = start ;
|
|
358 junctionQueue[i].end = end ;
|
|
359 /*if ( !secondary )
|
|
360 {
|
|
361 junctionQueue[i].readCnt = 1 ;
|
|
362 junctionQueue[i].secReadCnt = 0 ;
|
|
363 }
|
|
364 else
|
|
365 {
|
|
366 junctionQueue[i].readCnt = 0 ;
|
|
367 junctionQueue[i].secReadCnt = 1 ;
|
|
368 }*/
|
|
369 junctionQueue[i].strand = strand ;
|
|
370 junctionQueue[i].leftAnchor = l ;
|
|
371 junctionQueue[i].rightAnchor = r ;
|
|
372
|
|
373 strcpy( junctionQueue[i].head.id, col[0] ) ;
|
|
374 junctionQueue[i].head.valid = validRead ;
|
|
375 junctionQueue[i].head.leftAnchor = l ;
|
|
376 junctionQueue[i].head.rightAnchor = r ;
|
|
377 junctionQueue[i].head.left = junctionQueue[i].head.right = NULL ;
|
|
378 //junctionQueue[i].head.secondary = secondary ;
|
|
379 junctionQueue[i].head.NH = NH ;
|
|
380 junctionQueue[i].head.cnt = 1 ;
|
|
381 junctionQueue[i].head.editDistance = editDistance ;
|
|
382
|
|
383 ++qTail ;
|
|
384 if ( qTail >= QUEUE_SIZE )
|
|
385 qTail = 0 ;
|
|
386 }
|
|
387
|
|
388 // Search the queue, and remove the heads if its end(start) is smaller than prune.(Because it is impossible to have that junction again)
|
|
389 // Add the junction to the queue, if it is not in it.
|
|
390 // Return true, if it finds the junction. Otherwise, return false.
|
|
391 bool SearchQueue( int start, int end, int prune, int l, int r )
|
|
392 {
|
|
393 int i ;
|
|
394
|
|
395 // Test whether this read might be a false alignment.
|
|
396 i = qHead ;
|
|
397 while ( i != qTail )
|
|
398 {
|
|
399 struct _readTree *rt ;
|
|
400 if ( ( junctionQueue[i].start == start && junctionQueue[i].end < end ) ||
|
|
401 ( junctionQueue[i].start > start && junctionQueue[i].end == end ) )
|
|
402 {
|
|
403 // This alignment is false ;
|
|
404 rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ;
|
|
405 // the commented out logic because it is handled by contradicted reads
|
|
406 if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) )
|
|
407 {
|
|
408 if ( rt->leftAnchor <= flank || rt->rightAnchor <= flank )//|| rt->secondary )
|
|
409 {
|
|
410 if ( l > flank && r > flank )//&& !secondary )
|
|
411 rt->valid = false ;
|
|
412 else
|
|
413 validRead = false ;
|
|
414 }
|
|
415 else
|
|
416 {
|
|
417 // Ignore this read
|
|
418 //return true ;
|
|
419 validRead = false ;
|
|
420 }
|
|
421 }
|
|
422 }
|
|
423 else if ( ( junctionQueue[i].start == start && junctionQueue[i].end > end ) ||
|
|
424 ( junctionQueue[i].start < start && junctionQueue[i].end == end ) )
|
|
425 {
|
|
426 // This other alignment is false ;
|
|
427 rt = GetReadTreeNode( &junctionQueue[i].head, col[0] ) ;
|
|
428 //if ( rt != NULL )
|
|
429 if ( rt != NULL ) //&& ( rt->flag & 0x40 ) != ( samFlag & 0x40 ) )
|
|
430 {
|
|
431 if ( l <= flank || r <= flank )//|| secondary )
|
|
432 {
|
|
433 if ( rt->leftAnchor > flank && rt->rightAnchor > flank )//&& !rt->secondary )
|
|
434 validRead = false ;
|
|
435 else
|
|
436 rt->valid = false ;
|
|
437 }
|
|
438 else
|
|
439 rt->valid = false ;
|
|
440 }
|
|
441 }
|
|
442 ++i ;
|
|
443 if ( i >= QUEUE_SIZE )
|
|
444 i = 0 ;
|
|
445 }
|
|
446
|
|
447 //if ( start == 8077108 && end == 8078439 )
|
|
448 // exit( 1 ) ;
|
|
449 i = qHead ;
|
|
450
|
|
451 while ( i != qTail )
|
|
452 {
|
|
453 if ( junctionQueue[i].start == start &&
|
|
454 junctionQueue[i].end == end )
|
|
455 {
|
|
456 if (junctionQueue[i].strand == '?' && junctionQueue[i].strand != strand )
|
|
457 {
|
|
458 junctionQueue[i].strand = strand ;
|
|
459 }
|
|
460 InsertReadTree( &junctionQueue[i].head, col[0], l, r ) ;
|
|
461 return true ;
|
|
462 }
|
|
463
|
|
464 if ( junctionQueue[i].end < prune && i == qHead )
|
|
465 {
|
|
466 // pop
|
|
467 PrintJunction( col[2], junctionQueue[i] ) ;
|
|
468 ClearReadTree( junctionQueue[i].head.left ) ;
|
|
469 ClearReadTree( junctionQueue[i].head.right ) ;
|
|
470 ++qHead ;
|
|
471 if ( qHead >= QUEUE_SIZE )
|
|
472 qHead = 0 ;
|
|
473 }
|
|
474 ++i ;
|
|
475 if ( i >= QUEUE_SIZE )
|
|
476 i = 0 ;
|
|
477 }
|
|
478
|
|
479 InsertQueue( start, end, l, r ) ;
|
|
480 return false ;
|
|
481 }
|
|
482
|
|
483 // Compute the junctions based on the CIGAR
|
|
484 bool CompareJunctions( int startLocation, char *cigar )
|
|
485 {
|
|
486 int currentLocation = startLocation ; // Current location on the reference genome
|
|
487 int i, j ;
|
|
488 int num ;
|
|
489 int newJuncCnt = 0 ; // The # of junctions in the read, and the # of new junctions among them.
|
|
490
|
|
491 struct _cigarSeg cigarSeg[1000] ; // A segment of the cigar.
|
|
492 int ccnt = 0 ; // cigarSeg cnt
|
|
493
|
|
494 j = 0 ;
|
|
495 num = 0 ;
|
|
496 validRead = true ;
|
|
497
|
|
498 for ( i = 0 ; cigar[i] ; ++i )
|
|
499 {
|
|
500 if ( cigar[i] >= '0' && cigar[i] <= '9' )
|
|
501 {
|
|
502 num = num * 10 + cigar[i] - '0' ;
|
|
503 }
|
|
504 else
|
|
505 {
|
|
506 cigarSeg[ccnt].len = num ;
|
|
507 cigarSeg[ccnt].type = cigar[i] ;
|
|
508 ++ccnt ;
|
|
509 num = 0 ;
|
|
510 }
|
|
511 }
|
|
512
|
|
513 // Filter low complex alignment.
|
|
514 // Only applies this to alignments does not have a strand information.
|
|
515 if ( strand == '?' )
|
|
516 {
|
|
517 if ( noncanonStrandInfo != -1 )
|
|
518 {
|
|
519 if ( ( noncanonStrandInfo & filterYS ) != 0 )
|
|
520 validRead = false ;
|
|
521 }
|
|
522 else
|
|
523 {
|
|
524 int softStart = -1 ;
|
|
525 int softEnd = 0 ;
|
|
526 if ( cigarSeg[0].type == 'S' )
|
|
527 softStart = cigarSeg[0].len ;
|
|
528 if ( cigarSeg[ ccnt - 1 ].type == 'S' )
|
|
529 softEnd = cigarSeg[ ccnt - 1 ].len ;
|
|
530 int readLen = strlen( col[9] ) ;
|
|
531 int count[5] = { 0, 0, 0, 0, 0 } ;
|
|
532
|
|
533 int pos = 0 ;
|
|
534 for ( i = 0 ; i < ccnt ; ++i )
|
|
535 {
|
|
536 switch ( cigarSeg[i].type )
|
|
537 {
|
|
538 case 'S':
|
|
539 pos += cigarSeg[i].len ;
|
|
540 case 'M':
|
|
541 case 'I':
|
|
542 {
|
|
543 for ( j = 0 ; j < cigarSeg[i].len ; ++j )
|
|
544 ++count[ nucToNum[ col[9][pos + j] - 'A' ] ] ;
|
|
545 pos += j ;
|
|
546 } break ;
|
|
547 case 'N':
|
|
548 {
|
|
549 int max = 0 ;
|
|
550 int sum = 0 ;
|
|
551 for ( j = 0 ; j < 5 ; ++j )
|
|
552 {
|
|
553 if ( count[j] > max )
|
|
554 max = count[j] ;
|
|
555 sum += count[j] ;
|
|
556 }
|
|
557 if ( max > 0.8 * sum )
|
|
558 validRead = false ;
|
|
559 count[0] = count[1] = count[2] = count[3] = count[4] = 0 ;
|
|
560 } break ;
|
|
561 case 'H':
|
|
562 case 'P':
|
|
563 case 'D':
|
|
564 default: break ;
|
|
565 }
|
|
566 }
|
|
567
|
|
568 int max = 0 ;
|
|
569 int sum = 0 ;
|
|
570 for ( j = 0 ; j < 5 ; ++j )
|
|
571 {
|
|
572 if ( count[j] > max )
|
|
573 max = count[j] ;
|
|
574 sum += count[j] ;
|
|
575 }
|
|
576 if ( max > 0.8 * sum )
|
|
577 validRead = false ;
|
|
578 /* count[0] = count[1] = count[2] = count[3] = count[4] = 0 ;
|
|
579
|
|
580
|
|
581 for ( i = softStart + 1 ; i < readLen - softEnd ; ++i )
|
|
582 {
|
|
583 switch ( col[9][i] )
|
|
584 {
|
|
585 case 'A': ++count[0] ; break ;
|
|
586 case 'C': ++count[1] ; break ;
|
|
587 case 'G': ++count[2] ; break ;
|
|
588 case 'T': ++count[3] ; break ;
|
|
589 default: ++count[4] ;
|
|
590 }
|
|
591 }
|
|
592 int max = 0 ;
|
|
593 for ( j = 0 ; j < 5 ; ++j )
|
|
594 if ( count[j] > max )
|
|
595 max = count[j] ;
|
|
596 if ( max > 0.6 * ( readLen - softEnd - softStart - 1 ) )
|
|
597 validRead = false ;*/
|
|
598 }
|
|
599 }
|
|
600
|
|
601 // Test whether contradict with mate pair
|
|
602 if ( col[6][0] == '=' )
|
|
603 {
|
|
604 currentLocation = startLocation ;
|
|
605 for ( i = 0 ; i < ccnt ; ++i )
|
|
606 {
|
|
607 if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H'
|
|
608 || cigarSeg[i].type == 'P' )
|
|
609 continue ;
|
|
610 else if ( cigarSeg[i].type == 'N' )
|
|
611 {
|
|
612 if ( mateStart >= currentLocation && mateStart <= currentLocation + cigarSeg[i].len - 1 )
|
|
613 {
|
|
614 /*if ( cigarSeg[i].len == 91486 )
|
|
615 {
|
|
616 printf( "%s %d %d\n", currentLocation, mateStart ) ;
|
|
617 exit( 1 ) ;
|
|
618 }*/
|
|
619 // ignore this read
|
|
620 //return false ;
|
|
621 InsertContradictedReads( contradictedReads, col[0], mateStart ) ;
|
|
622 validRead = false ;
|
|
623 break ;
|
|
624 }
|
|
625 }
|
|
626 currentLocation += cigarSeg[i].len ;
|
|
627 }
|
|
628
|
|
629 i = qHead ;
|
|
630 // Search if it falls in the splices junction created by its mate.
|
|
631 while ( i != qTail )
|
|
632 {
|
|
633 if ( mateStart < junctionQueue[i].start && junctionQueue[i].start <= startLocation
|
|
634 && startLocation <= junctionQueue[i].end
|
|
635 && SearchContradictedReads( contradictedReads, col[0], startLocation ) )
|
|
636 {
|
|
637 validRead = false ;
|
|
638 break ;
|
|
639 }
|
|
640 ++i ;
|
|
641 if ( i >= QUEUE_SIZE )
|
|
642 i = 0 ;
|
|
643 }
|
|
644 }
|
|
645
|
|
646 currentLocation = startLocation ;
|
|
647 for ( i = 0 ; i < ccnt ; ++i )
|
|
648 {
|
|
649 if ( cigarSeg[i].type == 'I' || cigarSeg[i].type == 'S' || cigarSeg[i].type == 'H'
|
|
650 || cigarSeg[i].type == 'P' )
|
|
651 continue ;
|
|
652 else if ( cigarSeg[i].type == 'N' )
|
|
653 {
|
|
654 int left, right ;
|
|
655 int tmp ;
|
|
656 tmp = i ;
|
|
657 while ( i > 0 && ( cigarSeg[i - 1].type == 'I' || cigarSeg[i - 1].type == 'D' ) )
|
|
658 --i ;
|
|
659 if ( i > 0 && cigarSeg[i - 1].type == 'M' )
|
|
660 {
|
|
661 left = cigarSeg[i - 1].len ;
|
|
662 if ( i >= 2 && cigarSeg[i - 2].type == 'N' && left <= flank )
|
|
663 {
|
|
664 left = flank + 1 ;
|
|
665 }
|
|
666 }
|
|
667 else
|
|
668 left = 0 ;
|
|
669
|
|
670 i = tmp ;
|
|
671 while ( i < ccnt && ( cigarSeg[i + 1].type == 'I' || cigarSeg[i + 1].type == 'D' ) )
|
|
672 ++i ;
|
|
673 if ( i < ccnt && cigarSeg[i + 1].type == 'M' )
|
|
674 {
|
|
675 right = cigarSeg[i + 1].len ;
|
|
676 if ( i + 2 < ccnt && cigarSeg[i + 2].type == 'N' && right <= flank )
|
|
677 {
|
|
678 right = flank + 1 ;
|
|
679 }
|
|
680 }
|
|
681 else
|
|
682 right = 0 ;
|
|
683 i = tmp ;
|
|
684 if ( !SearchQueue( currentLocation, currentLocation + cigarSeg[i].len - 1, startLocation, left, right ) )
|
|
685 {
|
|
686 ++newJuncCnt ;
|
|
687 }
|
|
688 }
|
|
689 currentLocation += cigarSeg[i].len ;
|
|
690 }
|
|
691 /*else if ( cigar[i] == 'I' )
|
|
692 {
|
|
693 num = 0 ;
|
|
694 }
|
|
695 else if ( cigar[i] == 'N' )
|
|
696 {
|
|
697 if ( !SearchQueue( currentLocation, currentLocation + num - 1, startLocation ) )
|
|
698 {
|
|
699 ++newJuncCnt ;
|
|
700 //if ( flagPrintJunction )
|
|
701 // printf( "%s %d %d\n", col[2], currentLocation - 2, currentLocation + len - 1 ) ;
|
|
702 }
|
|
703 currentLocation += num ;
|
|
704 num = 0 ;
|
|
705 }
|
|
706 else // Other operations, like M, D,...,
|
|
707 {
|
|
708 currentLocation += num ;
|
|
709 num = 0 ;
|
|
710 }*/
|
|
711
|
|
712 junctionCnt += newJuncCnt ;
|
|
713
|
|
714 if ( newJuncCnt )
|
|
715 return true ;
|
|
716 else
|
|
717 return false ;
|
|
718 }
|
|
719
|
|
720 void cigar2string( bam1_core_t *c, uint32_t *in_cigar, char *out_cigar )
|
|
721 {
|
|
722 int k, op, l ;
|
|
723 char opcode ;
|
|
724
|
|
725 *out_cigar = '\0' ;
|
|
726 for ( k = 0 ; k < c->n_cigar ; ++k )
|
|
727 {
|
|
728 op = in_cigar[k] & BAM_CIGAR_MASK ;
|
|
729 l = in_cigar[k] >> BAM_CIGAR_SHIFT ;
|
|
730 switch (op)
|
|
731 {
|
|
732 case BAM_CMATCH: opcode = 'M' ; break ;
|
|
733 case BAM_CINS: opcode = 'I' ; break ;
|
|
734 case BAM_CDEL: opcode = 'D' ; break ;
|
|
735 case BAM_CREF_SKIP: opcode = 'N' ; break ;
|
|
736 case BAM_CSOFT_CLIP: opcode = 'S' ; break ;
|
|
737 case BAM_CHARD_CLIP: opcode = 'H' ; break ;
|
|
738 case BAM_CPAD: opcode = 'P' ; break ;
|
|
739 }
|
|
740 sprintf( out_cigar + strlen( out_cigar ), "%d%c", l, opcode ) ;
|
|
741 }
|
|
742 }
|
|
743
|
|
744
|
|
745
|
|
746 int main( int argc, char *argv[] )
|
|
747 {
|
|
748 FILE *fp ;
|
|
749 samfile_t *fpsam ;
|
|
750 bam1_t *b = NULL ;
|
|
751 bool useSam = true ;
|
|
752
|
|
753 int i, len ;
|
|
754 int startLocation ;
|
|
755 bool flagRemove = false ;
|
|
756 char prevChrome[103] ;
|
|
757
|
|
758 anchorBoth = false ;
|
|
759
|
|
760 flagPrintJunction = false ;
|
|
761 flagPrintAll = false ;
|
|
762 flagStrict = false ;
|
|
763 junctionCnt = 0 ;
|
|
764 bool hasMateReadIdSuffix = false ;
|
|
765
|
|
766 strcpy( prevChrome, "" ) ;
|
|
767 flagRemove = true ;
|
|
768 flagPrintJunction = true ;
|
|
769 flank = 8 ;
|
|
770 filterYS = 4 ;
|
|
771
|
|
772 contradictedReads = NULL ;
|
|
773
|
|
774 // processing the argument list
|
|
775 for ( i = 1 ; i < argc ; ++i )
|
|
776 {
|
|
777 if ( !strcmp( argv[i], "-h" ) )
|
|
778 {
|
|
779 PrintHelp() ;
|
|
780 return 0 ;
|
|
781 }
|
|
782 else if ( !strcmp( argv[i], "-r" ) )
|
|
783 {
|
|
784 flagRemove = true ;
|
|
785 }
|
|
786 else if ( !strcmp( argv[i], "-j" ) )
|
|
787 {
|
|
788 strcpy( prevChrome, "" ) ;
|
|
789 flagRemove = true ;
|
|
790 flagPrintJunction = true ;
|
|
791 flank = atoi( argv[i + 1] ) ;
|
|
792 if ( i + 2 < argc && !strcmp( argv[i+2], "-B" ) )
|
|
793 {
|
|
794 anchorBoth = true ;
|
|
795 ++i ;
|
|
796 }
|
|
797 ++i ;
|
|
798 }
|
|
799 else if ( !strcmp( argv[i], "-a" ) )
|
|
800 {
|
|
801 flagPrintAll = true ;
|
|
802 }
|
|
803 else if ( !strcmp( argv[i], "--strict" ) )
|
|
804 {
|
|
805 flagStrict = true ;
|
|
806 }
|
|
807 else if ( !strcmp( argv[i], "-y" ) )
|
|
808 {
|
|
809 filterYS = atoi( argv[i + 1] ) ;
|
|
810 ++i ;
|
|
811 }
|
|
812 else if ( !strcmp( argv[i], "--hasMateIdSuffix" ) )
|
|
813 {
|
|
814 hasMateReadIdSuffix = true ;
|
|
815 ++i ;
|
|
816 }
|
|
817 else if ( i > 1 )
|
|
818 {
|
|
819 printf( "Unknown option %s\n", argv[i] ) ;
|
|
820 exit( 1 ) ;
|
|
821 }
|
|
822 }
|
|
823 if ( argc == 1 )
|
|
824 {
|
|
825 PrintHelp() ;
|
|
826 return 0 ;
|
|
827 }
|
|
828
|
|
829 len = strlen( argv[1] ) ;
|
|
830 if ( argv[1][len-3] == 'b' || argv[1][len-3] == 'B' )
|
|
831 {
|
|
832 if ( !( fpsam = samopen( argv[1], "rb", 0 ) ) )
|
|
833 return 0 ;
|
|
834
|
|
835 if ( !fpsam->header )
|
|
836 {
|
|
837 //samclose( fpsam ) ;
|
|
838 //fpsam = samopen( argv[1], "r", 0 ) ;
|
|
839 //if ( !fpsam->header )
|
|
840 //{
|
|
841 useSam = false ;
|
|
842 fp = fopen( argv[1], "r" ) ;
|
|
843 //}
|
|
844 }
|
|
845 }
|
|
846 else
|
|
847 {
|
|
848 useSam = false ;
|
|
849 fp = NULL ;
|
|
850 if ( !strcmp( argv[1], "-" ) )
|
|
851 fp = stdin ;
|
|
852 else
|
|
853 fp = fopen( argv[1], "r" ) ;
|
|
854 if ( fp == NULL )
|
|
855 {
|
|
856 printf( "Could not open file %s\n", argv[1] ) ;
|
|
857 return 0 ;
|
|
858 }
|
|
859 }
|
|
860
|
|
861 while ( 1 )
|
|
862 {
|
|
863 int flag = 0 ;
|
|
864 if ( useSam )
|
|
865 {
|
|
866 if ( b )
|
|
867 bam_destroy1( b ) ;
|
|
868 b = bam_init1() ;
|
|
869 if ( samread( fpsam, b ) <= 0 )
|
|
870 break ;
|
|
871 if ( b->core.tid >= 0 )
|
|
872 strcpy( col[2], fpsam->header->target_name[b->core.tid] ) ;
|
|
873 else
|
|
874 strcpy( col[2], "-1" ) ;
|
|
875 cigar2string( &(b->core), bam1_cigar( b ), col[5] ) ;
|
|
876 strcpy( col[0], bam1_qname( b ) ) ;
|
|
877 flag = b->core.flag ;
|
|
878 if ( bam_aux_get( b, "NH" ) )
|
|
879 {
|
|
880 /*if ( bam_aux2i( bam_aux_get(b, "NH" ) ) >= 2 )
|
|
881 {
|
|
882 secondary = true ;
|
|
883 }
|
|
884 else
|
|
885 secondary = false ;*/
|
|
886
|
|
887 NH = bam_aux2i( bam_aux_get( b, "NH" ) ) ;
|
|
888 }
|
|
889 else
|
|
890 {
|
|
891 //secondary = false ;
|
|
892 NH = 1 ;
|
|
893 }
|
|
894
|
|
895 if ( bam_aux_get( b, "NM" ) )
|
|
896 {
|
|
897 editDistance = bam_aux2i( bam_aux_get( b, "NM" ) ) ;
|
|
898 }
|
|
899 else if ( bam_aux_get( b, "nM" ) )
|
|
900 {
|
|
901 editDistance = bam_aux2i( bam_aux_get( b, "nM" ) ) ;
|
|
902 }
|
|
903 else
|
|
904 editDistance = 0 ;
|
|
905
|
|
906 mateStart = b->core.mpos + 1 ;
|
|
907 if ( b->core.mtid == b->core.tid )
|
|
908 col[6][0] = '=' ;
|
|
909 else
|
|
910 col[6][0] = '*' ;
|
|
911
|
|
912 if ( b->core.l_qseq < 20 )
|
|
913 continue ;
|
|
914
|
|
915 for ( i = 0 ; i < b->core.l_qseq ; ++i )
|
|
916 {
|
|
917 int bit = bam1_seqi( bam1_seq( b ), i ) ;
|
|
918 switch ( bit )
|
|
919 {
|
|
920 case 1: col[9][i] = 'A' ; break ;
|
|
921 case 2: col[9][i] = 'C' ; break ;
|
|
922 case 4: col[9][i] = 'G' ; break ;
|
|
923 case 8: col[9][i] = 'T' ; break ;
|
|
924 case 15: col[9][i] = 'N' ; break ;
|
|
925 default: col[9][i] = 'A' ; break ;
|
|
926 }
|
|
927 }
|
|
928 col[9][i] = '\0' ;
|
|
929
|
|
930 /*if ( flag & 0x100 )
|
|
931 secondary = true ;
|
|
932 else
|
|
933 secondary = false ;*/
|
|
934 }
|
|
935 else
|
|
936 {
|
|
937 char *p ;
|
|
938 if ( !fgets( line, LINE_SIZE, fp ) )
|
|
939 break ;
|
|
940 if ( line[0] == '\0' || line[0] == '@' )
|
|
941 continue ;
|
|
942 sscanf( line, "%s%s%s%s%s%s%s%s%s%s%s", col, col + 1, col + 2, col + 3, col + 4,
|
|
943 col + 5, col + 6, col + 7, col + 8, col + 9, col + 10 ) ;
|
|
944
|
|
945 flag = atoi( col[1] ) ;
|
|
946 if ( p = strstr( line, "NH" ) )
|
|
947 {
|
|
948 int k = 0 ;
|
|
949 p += 5 ;
|
|
950 while ( *p != ' ' && *p != '\t' && *p != '\0')
|
|
951 {
|
|
952 k = k * 10 + *p - '0' ;
|
|
953 ++p ;
|
|
954 }
|
|
955 /*if ( k >= 2 )
|
|
956 {
|
|
957 secondary = true ;
|
|
958 }
|
|
959 else
|
|
960 secondary = false ;*/
|
|
961 NH = k ;
|
|
962 }
|
|
963 else
|
|
964 {
|
|
965 //secondary = false ;
|
|
966 NH = 1 ;
|
|
967 }
|
|
968 if ( ( p = strstr( line, "NM" ) ) || ( p = strstr( line, "nM" ) ) )
|
|
969 {
|
|
970 int k = 0 ;
|
|
971 p += 5 ;
|
|
972 while ( *p != ' ' && *p != '\t' && *p != '\0')
|
|
973 {
|
|
974 k = k * 10 + *p - '0' ;
|
|
975 ++p ;
|
|
976 }
|
|
977 editDistance = k ;
|
|
978 }
|
|
979 else
|
|
980 editDistance = 0 ;
|
|
981
|
|
982 mateStart = atoi( col[7] ) ;
|
|
983 /*if ( flag & 0x100 )
|
|
984 secondary = true ;
|
|
985 else
|
|
986 secondary = false ;*/
|
|
987
|
|
988 }
|
|
989 samFlag = flag ;
|
|
990 for ( i = 0 ; col[5][i] ; ++i )
|
|
991 if ( col[5][i] == 'N' )
|
|
992 break ;
|
|
993
|
|
994 if ( !col[5][i] )
|
|
995 continue ;
|
|
996
|
|
997 // remove .1, .2 or /1, /2 suffix
|
|
998 if ( hasMateReadIdSuffix )
|
|
999 {
|
|
1000 char *s = col[0] ;
|
|
1001 int len = strlen( s ) ;
|
|
1002 if ( len >= 2 && ( s[len - 1] == '1' || s[len - 1] == '2' )
|
|
1003 && ( s[len - 2] == '.' || s[len - 2] == '/' ) )
|
|
1004 {
|
|
1005 s[len - 2] = '\0' ;
|
|
1006 }
|
|
1007 }
|
|
1008
|
|
1009 if ( useSam )
|
|
1010 {
|
|
1011 if ( bam_aux_get( b, "XS" ) )
|
|
1012 {
|
|
1013 strand = bam_aux2A( bam_aux_get( b, "XS" ) ) ;
|
|
1014 if ( bam_aux_get( b, "YS" ) )
|
|
1015 {
|
|
1016 noncanonStrandInfo = bam_aux2i( bam_aux_get( b, "YS" ) ) ;
|
|
1017 }
|
|
1018 else
|
|
1019 {
|
|
1020 noncanonStrandInfo = -1 ;
|
|
1021 }
|
|
1022 }
|
|
1023 else
|
|
1024 {
|
|
1025 strand = '?' ;
|
|
1026 noncanonStrandInfo = -1 ;
|
|
1027 }
|
|
1028 startLocation = b->core.pos + 1 ;
|
|
1029 }
|
|
1030 else
|
|
1031 {
|
|
1032 if ( strstr( line, "XS:A:-" ) ) // on negative strand
|
|
1033 strand = '-' ;
|
|
1034 else if ( strstr( line, "XS:A:+" ) )
|
|
1035 strand = '+' ;
|
|
1036 else
|
|
1037 {
|
|
1038 strand = '?' ;
|
|
1039 char *p = strstr( line, "YS:i:" ) ;
|
|
1040 if ( p != NULL )
|
|
1041 {
|
|
1042 p += 5 ;
|
|
1043 noncanonStrandInfo = atoi( p ) ;
|
|
1044 }
|
|
1045 else
|
|
1046 noncanonStrandInfo = -1 ;
|
|
1047 }
|
|
1048 startLocation = atoi( col[3] ) ;
|
|
1049 }
|
|
1050
|
|
1051 // Found the junctions from the read.
|
|
1052 if ( strcmp( prevChrome, col[2] ) )
|
|
1053 {
|
|
1054 if ( flagPrintJunction )
|
|
1055 {
|
|
1056 // Print the remaining elements in the queue
|
|
1057 i = qHead ;
|
|
1058 while ( i != qTail )
|
|
1059 {
|
|
1060 PrintJunction( prevChrome, junctionQueue[i] ) ;
|
|
1061 ++i ;
|
|
1062 if ( i >= QUEUE_SIZE )
|
|
1063 i = 0 ;
|
|
1064 }
|
|
1065 }
|
|
1066
|
|
1067 // new chromosome
|
|
1068 ClearReadTree( contradictedReads ) ;
|
|
1069 contradictedReads = NULL ;
|
|
1070 qHead = qTail = 0 ;
|
|
1071 strcpy( prevChrome, col[2] ) ;
|
|
1072 }
|
|
1073
|
|
1074 if ( flagRemove )
|
|
1075 {
|
|
1076 if ( CompareJunctions( startLocation, col[5] ) )
|
|
1077 {
|
|
1078 // Test whether this read has new junctions
|
|
1079 //++junctionCnt ;
|
|
1080 if ( !flagPrintJunction )
|
|
1081 printf( "%s", line ) ;
|
|
1082 }
|
|
1083 }
|
|
1084 else
|
|
1085 {
|
|
1086 ++junctionCnt ;
|
|
1087 printf( "%s", line ) ;
|
|
1088 }
|
|
1089 //printf( "hi2 %s\n", col[0] ) ;
|
|
1090 }
|
|
1091
|
|
1092 if ( flagPrintJunction )
|
|
1093 {
|
|
1094 // Print the remaining elements in the queue
|
|
1095 i = qHead ;
|
|
1096 while ( i != qTail )
|
|
1097 {
|
|
1098 PrintJunction( prevChrome, junctionQueue[i] ) ;
|
|
1099 ++i ;
|
|
1100 if ( i >= QUEUE_SIZE )
|
|
1101 i = 0 ;
|
|
1102 }
|
|
1103 }
|
|
1104
|
|
1105 //fprintf( stderr, "The number of junctions: %d\n", junctionCnt ) ;
|
|
1106 return 0 ;
|
|
1107 }
|
|
1108
|