0
|
1 // The class handles reading the bam file
|
|
2
|
|
3 #ifndef _LSONG_RSCAF_ALIGNMENT_HEADER
|
|
4 #define _LSONG_RSCAF_ALIGNMENT_HEADER
|
|
5
|
|
6 #include "samtools-0.1.19/sam.h"
|
|
7 #include <map>
|
|
8 #include <string>
|
|
9 #include <assert.h>
|
|
10 #include <iostream>
|
|
11 #include <stdlib.h>
|
|
12 #include <math.h>
|
|
13
|
|
14 #include "defs.h"
|
|
15
|
|
16 class Alignments
|
|
17 {
|
|
18 private:
|
|
19 samfile_t *fpSam ;
|
|
20 bam1_t *b ;
|
|
21
|
|
22 char fileName[1024] ;
|
|
23 bool opened ;
|
|
24 std::map<std::string, int> chrNameToId ;
|
|
25 bool allowSupplementary ;
|
|
26 bool allowClip ;
|
|
27 bool hasClipHead, hasClipTail ;
|
|
28 int segmentsSum ; // the sum of segments.
|
|
29
|
|
30 bool atBegin ;
|
|
31 bool atEnd ;
|
|
32
|
|
33 static int CompInt( const void *p1, const void *p2 )
|
|
34 {
|
|
35 return (*(int *)p1 ) - (*(int *)p2 ) ;
|
|
36 }
|
|
37
|
|
38 void Open()
|
|
39 {
|
|
40 fpSam = samopen( fileName, "rb", 0 ) ;
|
|
41 if ( !fpSam->header )
|
|
42 {
|
|
43 fprintf( stderr, "Can not open %s.\n", fileName ) ;
|
|
44 exit( 1 ) ;
|
|
45 }
|
|
46
|
|
47 // Collect the chromosome information
|
|
48 for ( int i = 0 ; i < fpSam->header->n_targets ; ++i )
|
|
49 {
|
|
50 std::string s( fpSam->header->target_name[i] ) ;
|
|
51 chrNameToId[s] = i ;
|
|
52 }
|
|
53 opened = true ;
|
|
54 atBegin = true ;
|
|
55 atEnd = false ;
|
|
56 }
|
|
57 public:
|
|
58 struct _pair segments[MAX_SEG_COUNT] ;
|
|
59 int segCnt ;
|
|
60
|
|
61 int totalReadCnt ;
|
|
62 int fragLen, fragStdev ;
|
|
63 int readLen ;
|
|
64 bool matePaired ;
|
|
65
|
|
66 Alignments()
|
|
67 {
|
|
68 b = NULL ;
|
|
69 opened = false ;
|
|
70 atBegin = true ;
|
|
71 atEnd = false ;
|
|
72 allowSupplementary = false ;
|
|
73 allowClip = true ;
|
|
74
|
|
75 totalReadCnt = 0 ;
|
|
76 fragLen = 0 ;
|
|
77 fragStdev = 0 ;
|
|
78 readLen = 0 ;
|
|
79 matePaired = false ;
|
|
80 }
|
|
81 ~Alignments()
|
|
82 {
|
|
83 if ( b )
|
|
84 bam_destroy1( b ) ;
|
|
85 }
|
|
86
|
|
87 void Open( char *file )
|
|
88 {
|
|
89 strcpy( fileName, file ) ;
|
|
90 Open() ;
|
|
91 }
|
|
92
|
|
93 void Rewind()
|
|
94 {
|
|
95 Close() ;
|
|
96 Open() ;
|
|
97
|
|
98 atBegin = true ;
|
|
99 atEnd = false ;
|
|
100 }
|
|
101
|
|
102 void Close()
|
|
103 {
|
|
104 samclose( fpSam ) ;
|
|
105 fpSam = NULL ;
|
|
106 }
|
|
107
|
|
108 bool IsOpened()
|
|
109 {
|
|
110 return opened ;
|
|
111 }
|
|
112
|
|
113 bool IsAtBegin()
|
|
114 {
|
|
115 return atBegin ;
|
|
116 }
|
|
117
|
|
118 bool IsAtEnd()
|
|
119 {
|
|
120 return atEnd ;
|
|
121 }
|
|
122
|
|
123 bool HasClip()
|
|
124 {
|
|
125 return hasClipHead || hasClipTail ;
|
|
126 }
|
|
127
|
|
128 bool HasClipHead()
|
|
129 {
|
|
130 return hasClipHead ;
|
|
131 }
|
|
132 bool HasClipTail()
|
|
133 {
|
|
134 return hasClipTail ;
|
|
135 }
|
|
136
|
|
137 int Next()
|
|
138 {
|
|
139 int i ;
|
|
140 int start = 0, len = 0 ;
|
|
141 uint32_t *rawCigar ;
|
|
142
|
|
143 if ( atBegin == true )
|
|
144 totalReadCnt = 0 ;
|
|
145
|
|
146 atBegin = false ;
|
|
147 while ( 1 )
|
|
148 {
|
|
149 while ( 1 )
|
|
150 {
|
|
151 if ( b )
|
|
152 bam_destroy1( b ) ;
|
|
153 b = bam_init1() ;
|
|
154
|
|
155 if ( samread( fpSam, b ) <= 0 )
|
|
156 {
|
|
157 atEnd = true ;
|
|
158 return 0 ;
|
|
159 }
|
|
160
|
|
161 if ( ( b->core.flag & 0x900 ) == 0 )
|
|
162 ++totalReadCnt ;
|
|
163
|
|
164 if ( b->core.flag & 0xC )
|
|
165 continue ;
|
|
166
|
|
167 //if ( ( b->core.flag & 0x900 ) == 0 )
|
|
168 break ;
|
|
169 }
|
|
170
|
|
171 // to many repeat.
|
|
172 if ( bam_aux_get( b, "NH" ) )
|
|
173 {
|
|
174 if ( bam_aux2i( bam_aux_get( b, "NH" ) ) >= 5 )
|
|
175 continue ;
|
|
176 }
|
|
177
|
|
178 // Compute the exons segments from the reads
|
|
179 segCnt = 0 ;
|
|
180 start = b->core.pos ; //+ 1 ;
|
|
181 rawCigar = bam1_cigar( b ) ;
|
|
182 // Check whether the query length is compatible with the read
|
|
183 if ( bam_cigar2qlen( &b->core, rawCigar ) != b->core.l_qseq )
|
|
184 continue ;
|
|
185
|
|
186 bool clipMiddle = false ;
|
|
187 int clipSum = 0 ;
|
|
188 hasClipHead = hasClipTail = false ;
|
|
189 len = 0 ;
|
|
190 segmentsSum = 0 ;
|
|
191 for ( i = 0 ; i < b->core.n_cigar ; ++i )
|
|
192 {
|
|
193 int op = rawCigar[i] & BAM_CIGAR_MASK ;
|
|
194 int num = rawCigar[i] >> BAM_CIGAR_SHIFT ;
|
|
195
|
|
196 switch ( op )
|
|
197 {
|
|
198 case BAM_CMATCH:
|
|
199 case BAM_CDEL:
|
|
200 len += num ; break ;
|
|
201 case BAM_CSOFT_CLIP:
|
|
202 case BAM_CHARD_CLIP:
|
|
203 case BAM_CPAD:
|
|
204 {
|
|
205 if ( i == 0 )
|
|
206 hasClipHead = true ;
|
|
207 else if ( i == b->core.n_cigar - 1 )
|
|
208 hasClipTail = true ;
|
|
209 else
|
|
210 clipMiddle = true ;
|
|
211
|
|
212 clipSum += num ;
|
|
213 }
|
|
214 case BAM_CINS:
|
|
215 num = 0 ; break ;
|
|
216 case BAM_CREF_SKIP:
|
|
217 {
|
|
218 segments[ segCnt ].a = start ;
|
|
219 segments[ segCnt ].b = start + len - 1 ;
|
|
220 ++segCnt ;
|
|
221 segmentsSum += len ;
|
|
222 start = start + len + num ;
|
|
223 len = 0 ;
|
|
224 } break ;
|
|
225 default:
|
|
226 len += num ; break ;
|
|
227 }
|
|
228 }
|
|
229 if ( clipMiddle ) // should never happend
|
|
230 continue ;
|
|
231
|
|
232 if ( clipSum >= 2 && !allowClip )
|
|
233 continue ;
|
|
234
|
|
235 if ( len > 0 )
|
|
236 {
|
|
237 segments[ segCnt ].a = start ;
|
|
238 segments[ segCnt ].b = start + len - 1 ;
|
|
239 ++segCnt ;
|
|
240 segmentsSum += len ;
|
|
241 }
|
|
242 /*if ( !strcmp( bam1_qname( b ), "chr1:109656301-109749401W:ENST00000490758.2:381:1480:1090:1290:X" ) )
|
|
243 {
|
|
244 for ( i = 0 ; i < segCnt ; ++i )
|
|
245 printf( "(%d %d) ", segments[i].a, segments[i].b ) ;
|
|
246 printf( "\n" ) ;
|
|
247 }*/
|
|
248
|
|
249 // Check whether the mates are compatible
|
|
250 //int mChrId = b->core.mtid ;
|
|
251 int64_t mPos = b->core.mpos ;
|
|
252
|
|
253 if ( b->core.mtid == b->core.tid )
|
|
254 {
|
|
255 for ( i = 0 ; i < segCnt - 1 ; ++i )
|
|
256 {
|
|
257 if ( mPos >= segments[i].b && mPos <= segments[i + 1].a )
|
|
258 break ;
|
|
259 }
|
|
260 if ( i < segCnt - 1 )
|
|
261 continue ;
|
|
262 }
|
|
263
|
|
264 break ;
|
|
265 }
|
|
266
|
|
267 return 1 ;
|
|
268 }
|
|
269
|
|
270
|
|
271 int GetChromId()
|
|
272 {
|
|
273 return b->core.tid ;
|
|
274 }
|
|
275
|
|
276 char* GetChromName( int tid )
|
|
277 {
|
|
278 return fpSam->header->target_name[ tid ] ;
|
|
279 }
|
|
280
|
|
281 int GetChromIdFromName( const char *s )
|
|
282 {
|
|
283 std::string ss( s ) ;
|
|
284 if ( chrNameToId.find( ss ) == chrNameToId.end() )
|
|
285 {
|
|
286 printf( "Unknown genome name: %s\n", s ) ;
|
|
287 exit( 1 ) ;
|
|
288 }
|
|
289 return chrNameToId[ss] ;
|
|
290 }
|
|
291
|
|
292 int GetChromLength( int tid )
|
|
293 {
|
|
294 return fpSam->header->target_len[ tid ] ;
|
|
295 }
|
|
296
|
|
297 int GetChromCount()
|
|
298 {
|
|
299 return fpSam->header->n_targets ;
|
|
300 }
|
|
301
|
|
302 void GetMatePosition( int &chrId, int64_t &pos )
|
|
303 {
|
|
304 if ( b->core.flag & 0x8 )
|
|
305 {
|
|
306 chrId = -1 ;
|
|
307 pos = -1 ;
|
|
308 }
|
|
309 else
|
|
310 {
|
|
311 chrId = b->core.mtid ;
|
|
312 pos = b->core.mpos ; //+ 1 ;
|
|
313 }
|
|
314 }
|
|
315
|
|
316 int GetRepeatPosition( int &chrId, int64_t &pos )
|
|
317 {
|
|
318 // Look at the CC field.
|
|
319 if ( !bam_aux_get( b, "CC" ) || !bam_aux_get( b, "CP" ) )
|
|
320 {
|
|
321 chrId = -1 ;
|
|
322 pos = -1 ;
|
|
323 return 0 ;
|
|
324 }
|
|
325
|
|
326 std::string s( bam_aux2Z( bam_aux_get(b, "CC" ) ) ) ;
|
|
327 chrId = chrNameToId[ s ] ;
|
|
328 pos = bam_aux2i( bam_aux_get( b, "CP" ) ) ;// Possible error for 64bit
|
|
329 return 1 ;
|
|
330 }
|
|
331
|
|
332 void GetFileName( char *buffer )
|
|
333 {
|
|
334 strcpy( buffer, fileName ) ;
|
|
335 }
|
|
336
|
|
337 int GetReadLength()
|
|
338 {
|
|
339 return b->core.l_qseq ;
|
|
340 }
|
|
341
|
|
342 int GetRefCoverLength()
|
|
343 {
|
|
344 return segmentsSum ;
|
|
345 }
|
|
346
|
|
347 bool IsFirstMate()
|
|
348 {
|
|
349 if ( b->core.flag & 0x40 )
|
|
350 return true ;
|
|
351 return false ;
|
|
352 }
|
|
353
|
|
354 bool IsReverse()
|
|
355 {
|
|
356 if ( b->core.flag & 0x10 )
|
|
357 return true ;
|
|
358 return false ;
|
|
359 }
|
|
360
|
|
361 bool IsMateReverse()
|
|
362 {
|
|
363 if ( b->core.flag & 0x20 )
|
|
364 return true ;
|
|
365 return false ;
|
|
366 }
|
|
367
|
|
368 char *GetReadId()
|
|
369 {
|
|
370 return bam1_qname( b ) ;
|
|
371 }
|
|
372
|
|
373 bool IsUnique()
|
|
374 {
|
|
375 if ( bam_aux_get( b, "NH" ) )
|
|
376 {
|
|
377 if ( bam_aux2i( bam_aux_get( b, "NH" ) ) > 1 )
|
|
378 return false ;
|
|
379 }
|
|
380 if ( IsSupplementary() && GetFieldZ( (char *)"XZ" ) != NULL )
|
|
381 return false ;
|
|
382 return true ;
|
|
383 }
|
|
384
|
|
385 bool IsPrimary()
|
|
386 {
|
|
387 if ( ( b->core.flag & 0x900 ) == 0 )
|
|
388 return true ;
|
|
389 else
|
|
390 return false ;
|
|
391 }
|
|
392
|
|
393 // -1:minus, 0: unknown, 1:plus
|
|
394 int GetStrand()
|
|
395 {
|
|
396 if ( segCnt == 1 )
|
|
397 return 0 ;
|
|
398 if ( bam_aux_get( b, "XS" ) )
|
|
399 {
|
|
400 if ( bam_aux2A( bam_aux_get( b, "XS" ) ) == '-' )
|
|
401 return -1 ;
|
|
402 else
|
|
403 return 1 ;
|
|
404 }
|
|
405 else
|
|
406 return 0 ;
|
|
407 }
|
|
408
|
|
409 int GetFieldI( char *f )
|
|
410 {
|
|
411 if ( bam_aux_get( b, f ) )
|
|
412 {
|
|
413 return bam_aux2i( bam_aux_get( b, f ) ) ;
|
|
414 }
|
|
415 return -1 ;
|
|
416 }
|
|
417
|
|
418 char *GetFieldZ( char *f )
|
|
419 {
|
|
420 if ( bam_aux_get( b, f ) )
|
|
421 {
|
|
422 return bam_aux2Z( bam_aux_get( b, f ) ) ;
|
|
423 }
|
|
424 return NULL ;
|
|
425 }
|
|
426
|
|
427 int GetNumberOfHits()
|
|
428 {
|
|
429 if ( bam_aux_get( b, "NH" ) )
|
|
430 {
|
|
431 return bam_aux2i( bam_aux_get( b, "NH" ) ) ;
|
|
432 }
|
|
433 return 1 ;
|
|
434 }
|
|
435
|
|
436 bool IsSupplementary()
|
|
437 {
|
|
438 if ( ( b->core.flag & 0x800 ) == 0 )
|
|
439 return false ;
|
|
440 else
|
|
441 return true ;
|
|
442 }
|
|
443
|
|
444 void SetAllowSupplementary( bool in )
|
|
445 {
|
|
446 allowSupplementary = in ;
|
|
447 }
|
|
448
|
|
449 void SetAllowClip( bool in )
|
|
450 {
|
|
451 allowClip = in ;
|
|
452 }
|
|
453
|
|
454 bool IsGCRich( bool threshold = 0.9 )
|
|
455 {
|
|
456 int i = 0 ;
|
|
457 int gcCnt = 0 ;
|
|
458 for ( i = 0 ; i < b->core.l_qseq ; ++i )
|
|
459 {
|
|
460 int bit = bam1_seqi( bam1_seq( b ), i ) ;
|
|
461 if ( bit == 2 || bit == 4 )
|
|
462 ++gcCnt ;
|
|
463 }
|
|
464 if ( gcCnt >= threshold * b->core.l_qseq )
|
|
465 return true ;
|
|
466 }
|
|
467
|
|
468 void GetGeneralInfo( bool stopEarly = false )
|
|
469 {
|
|
470 int i, k ;
|
|
471
|
|
472 const int sampleMax = 1000000 ;
|
|
473 int *lens = new int[sampleMax] ;
|
|
474 int *mateDiff = new int[sampleMax] ;
|
|
475 int lensCnt = 0 ;
|
|
476 int mateDiffCnt = 0 ;
|
|
477 bool end = false ;
|
|
478
|
|
479 while ( 1 )
|
|
480 {
|
|
481 while ( 1 )
|
|
482 {
|
|
483 if ( b )
|
|
484 bam_destroy1( b ) ;
|
|
485 b = bam_init1() ;
|
|
486
|
|
487 if ( samread( fpSam, b ) <= 0 )
|
|
488 {
|
|
489 end = true ;
|
|
490 break ;
|
|
491 }
|
|
492 if ( b->core.flag & 0xC )
|
|
493 continue ;
|
|
494
|
|
495 if ( ( b->core.flag & 0x900 ) == 0 )
|
|
496 break ;
|
|
497 }
|
|
498 if ( end )
|
|
499 break ;
|
|
500
|
|
501 if ( lensCnt < sampleMax )
|
|
502 {
|
|
503 lens[ lensCnt ] = b->core.l_qseq ;
|
|
504 ++lensCnt ;
|
|
505 }
|
|
506
|
|
507 if ( mateDiffCnt < sampleMax && b->core.tid == b->core.mtid
|
|
508 && b->core.pos < b->core.mpos )
|
|
509 {
|
|
510 mateDiff[ mateDiffCnt ] = b->core.mpos - b->core.pos ;
|
|
511 ++mateDiffCnt ;
|
|
512 }
|
|
513 ++totalReadCnt ;
|
|
514 if ( totalReadCnt >= sampleMax && stopEarly )
|
|
515 break ;
|
|
516 }
|
|
517
|
|
518 // Get the read length info and fragment length info
|
|
519 qsort( lens, lensCnt, sizeof( int ), CompInt ) ;
|
|
520 readLen = lens[ lensCnt - 1 ] ;
|
|
521
|
|
522 if ( mateDiffCnt > 0 )
|
|
523 {
|
|
524 matePaired = true ;
|
|
525
|
|
526 qsort( mateDiff, mateDiffCnt, sizeof( int ), CompInt ) ;
|
|
527 long long int sum = 0 ;
|
|
528 long long int sumsq = 0 ;
|
|
529
|
|
530 for ( i = 0 ; i < mateDiffCnt * 0.7 ; ++i )
|
|
531 {
|
|
532 sum += ( mateDiff[i] + readLen );
|
|
533 sumsq += ( mateDiff[i] + readLen ) * ( mateDiff[i] + readLen ) ;
|
|
534 }
|
|
535 k = i ;
|
|
536 fragLen = (int)( sum / k ) ;
|
|
537 fragStdev = (int)sqrt( sumsq / k - fragLen * fragLen ) ;
|
|
538 }
|
|
539 else
|
|
540 {
|
|
541 fragLen = readLen ;
|
|
542 fragStdev = 0 ;
|
|
543 }
|
|
544 //printf( "readLen = %d\nfragLen = %d, fragStdev = %d\n", readLen, fragLen, fragStdev ) ;
|
|
545 delete[] lens ;
|
|
546 delete[] mateDiff ;
|
|
547 }
|
|
548 } ;
|
|
549 #endif
|